Example #1
 def get_last_amount_recurrency(recurrents_filter_worktable, group_array):
     """
     Compute statistics (mean, standard deviation, and coefficient of variation) on the transaction_amount field, grouped by customer and commerce.
     :param recurrents_filter_worktable: recurrents DataFrame
     :param group_array: list of fields to group by, e.g. ['customer_id', 'commerce_affiliation_id']
     :return: recurrents DataFrame with transaction_amount statistics grouped by the group_array fields
     """
     return recurrents_filter_worktable.groupBy(group_array).agg(
         # means
         avg(
             when(
                 c('order2') <= 4,
                 c(fields.transaction_amount).cast('Float'))
         ).cast('Float').alias("MEAN_transaction_amount"),
         # SD
         stddev(
             when(
                 c('order2') <= 4,
                 c(fields.transaction_amount).cast('Float'))
         ).cast('Float').alias("SD_transaction_amount"),
         # CV
         ((stddev(
             when(
                 c('order2') <= 4,
                 c(fields.transaction_amount).cast('Float'))).cast('Float'))
          / (avg(
              when(
                  c('order2') <= 4,
                  c(fields.transaction_amount).cast('Float'))).cast('Float')
             )).cast('Float').alias("CV_transaction_amount"))
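The three aggregations above recompute the same filtered column. A minimal sketch of an equivalent approach, on hypothetical toy data standing in for recurrents_filter_worktable, that builds the when-expression once and derives the coefficient of variation from the aliased mean and standard deviation:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# Hypothetical stand-in for recurrents_filter_worktable.
df = spark.createDataFrame(
    [("c1", "m1", 1, 10.0), ("c1", "m1", 2, 12.0), ("c1", "m1", 5, 99.0)],
    ["customer_id", "commerce_affiliation_id", "order2", "transaction_amount"],
)

# Build the filtered amount once, then derive CV = stddev / mean from the aggregated columns.
amount = F.when(F.col("order2") <= 4, F.col("transaction_amount").cast("float"))
stats = df.groupBy("customer_id", "commerce_affiliation_id").agg(
    F.avg(amount).alias("MEAN_transaction_amount"),
    F.stddev(amount).alias("SD_transaction_amount"),
).withColumn(
    "CV_transaction_amount",
    F.col("SD_transaction_amount") / F.col("MEAN_transaction_amount"),
)
stats.show()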
Example #2
def bikes_summary(bikes, stations):
    member_summary = bikes.groupBy('member_type').count().sort(F.desc("count"))
    member_summary.show()
    bikes_summary = bikes.groupBy('bike_number').count().sort(F.desc("count"))
    bikes_summary.show()
    duration_summary = bikes.select([
        F.min("duration"),
        F.max("duration"),
        F.mean("duration"),
        F.stddev("duration")
    ])
    duration_summary.show()

    # print statistics for station check-ins and check-outs
    check_in_summary = bikes.groupBy('start_station_id').count().sort(
        F.desc("count"))
    check_in_summary.show()
    check_out_summary = bikes.groupBy('end_station_id').count().sort(
        F.desc("count"))
    check_out_summary.show()

    #print statistics for distance
    distance_summary = bikes.select([
        F.min("abs_distance"),
        F.max("abs_distance"),
        F.mean("abs_distance"),
        F.stddev("abs_distance")
    ])
    distance_summary.show()
Example #3
def addUserFeatures(df: DataFrame) -> DataFrame:
    '''
    Extract user features
    :param df:
    :return:
    '''

    extractGenresUdf = udf(extractGenres, returnType=ArrayType(IntegerType()))
    print('start user feature')
    samplesWithUserFeatures = df.withColumn('userPositiveHistory',collect_list(when(col('label')==1,col('movieId')).otherwise(lit(None))).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100,-1)))\
        .withColumn('userPositiveHistory',reverse(col('userPositiveHistory'))) \
        .withColumn("userRatedMovie1", col("userPositiveHistory").getItem(0))  \
        .withColumn("userRatedMovie2", col("userPositiveHistory").getItem(1)) \
        .withColumn("userRatedMovie3", col("userPositiveHistory").getItem(2)) \
        .withColumn("userRatedMovie4", col("userPositiveHistory").getItem(3)) \
        .withColumn("userRatedMovie5", col("userPositiveHistory").getItem(4)) \
        .withColumn('userRatingCount',count(lit(1)).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100,-1))) \
        .withColumn('userAvgReleaseYear',avg(col('releaseYear')).over(Window.partitionBy("userId").orderBy(col('timestamp')).rowsBetween(-100, -1)).cast('int')) \
        .withColumn('userReleaseYearStddev',stddev(col('releaseYear')).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100,-1))) \
        .withColumn('userAvgRating',format_number(avg(col('rating')).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100,-1)),2)) \
        .withColumn('userRatingStddev',    stddev(col("rating")).over(Window.partitionBy("userId").orderBy(col("timestamp")).rowsBetween(-100, -1))) \
        .withColumn('userGenres',extractGenresUdf(collect_list(when(col('label') == 1,col('genres')).otherwise(lit(None))).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100,-1)))).na.fill(0) \
        .withColumn("userRatingStddev", format_number(col("userRatingStddev"), 2)) \
        .withColumn("userReleaseYearStddev", format_number(col("userReleaseYearStddev"), 2)) \
        .withColumn("userGenre1", col("userGenres").getItem(0)) \
        .withColumn("userGenre2", col("userGenres").getItem(1)) \
        .withColumn("userGenre3", col("userGenres").getItem(2)) \
        .withColumn("userGenre4", col("userGenres").getItem(3)) \
        .withColumn("userGenre5", col("userGenres").getItem(4))\
        .drop("genres", "userGenres", "userPositiveHistory").filter(col('userRatingCount') > 1)

    # samplesWithUserFeatures.printSchema()
    samplesWithUserFeatures.show(10, truncate=True)
    return samplesWithUserFeatures
Example #4
def get_metrics_by_source(df: SparkDataFrame) -> SparkDataFrame:
    """

    Parameters
    ----------
    df: SparkDataFrame

    Returns
    -------
    return: SparkDataFrame
    """
    frame_len = col("frame_len")
    time_diff = col("Source_time_diff")
    metrics = [
        F.count("*").alias("Spc"),
        F.sum(frame_len).alias("Tss"),
        F.min(frame_len).alias("Smin"),
        F.max(frame_len).alias("Smax"),
        F.mean(frame_len).alias("Savg"),
        F.stddev(frame_len).alias("Svar"),
        F.min(time_diff).alias("SITmin"),
        F.max(time_diff).alias("SITmax"),
        F.mean(time_diff).alias("SITavg"),
        F.stddev(time_diff).alias("SITvar"),
    ]
    grouped_by_source = group_by_source(df)
    return grouped_by_source.agg(*metrics)
Example #5
def get_metrics_by_destination(df: SparkDataFrame) -> SparkDataFrame:
    """

    Parameters
    ----------
    df: SparkDataFrame

    Returns
    -------
    return: SparkDataFrame
    """
    frame_len = col("frame_len")
    time_diff = col("Destination_time_diff")
    metrics = [
        F.count("*").alias("Rpc"),
        F.sum(frame_len).alias("Tsr"),
        F.mean(frame_len).alias("Ravg"),
        F.stddev(frame_len).alias("Rvar"),
        F.min(time_diff).alias("RITmin"),
        F.max(time_diff).alias("RITmax"),
        F.mean(time_diff).alias("RITavg"),
        F.stddev(time_diff).alias("RITvar"),
    ]
    grouped_by_destination = group_by_destination(df)
    return grouped_by_destination.agg(*metrics)
Example #6
def groupby_statistics(df):
    ret = df.groupby('passenger_count').agg(f.mean('fare_amount'),
                                            f.stddev('fare_amount'),
                                            f.mean('tip_amount'),
                                            f.stddev('tip_amount'))
    ret.take(3)
    return ret
Example #7
def print_avg_predictions(predictions_df, label):
    result_row = (predictions_df.agg(
        F.avg('rating').alias('avg_rating'),
        F.stddev('rating').alias('stddev_rating'),
        F.avg('prediction').alias('avg_prediction'),
        F.stddev('prediction').alias('stddev_prediction')).head())
    print('[{} Prediction] Rating Avg: {} Stddev: {}'.format(
        label, result_row[0], result_row[1]))
    print('[{} Prediction] Prediction Avg: {} Stddev: {}'.format(
        label, result_row[2], result_row[3]))
Example #8
def stats():
    stats1DF = joinDF.orderBy('fhours').groupBy('fhours').agg(
        count('fhours').alias('count'),
        mean('tempD').alias('tempM'),
        stddev('tempD').alias('tempSD'),
        mean('humidityD').alias('humidityM'),
        stddev('humidityD').alias('humiditySD'),
        mean('wind_mphD').alias('wind_mphM'),
        stddev('wind_mphD').alias('wind_mphSD'),
        mean('feelslikeD').alias('feelslikeM'),
        stddev('feelslikeD').alias('feelslikeSD'))
    return stats1DF
Example #9
def addUserFeatures(samplesWithMovieFeatures):
    extractGenresUdf = udf(extractGenres, ArrayType(StringType()))
    '''
     userPositiveHistory: movie IDs the user rated positively before this sample, ordered from most recent to oldest
     userRatedMovie1~5: derived from userPositiveHistory, the 5 most recently rated movies
     userRatingCount: number of recent ratings
     userAvgReleaseYear: average release year of recently rated movies
     userReleaseYearStddev: standard deviation of the release years of recently rated movies
     userAvgRating: average rating given to recently rated movies
     userRatingStddev: standard deviation of the ratings given to recently rated movies
     userGenres: genre counts of recently liked movies, ordered from most to least frequent
     userGenre1~5: top 5 genres taken from userGenres

     Only samples with userRatingCount greater than 1 are kept.
    '''
    samplesWithUserFeatures = samplesWithMovieFeatures \
        .withColumn('userPositiveHistory',
                    F.collect_list(when(F.col('label') == 1, F.col('movieId')).otherwise(F.lit(None))).over(
                        sql.Window.partitionBy("userId").orderBy(F.col("timestamp")).rowsBetween(-100, -1))) \
        .withColumn("userPositiveHistory", reverse(F.col("userPositiveHistory"))) \
        .withColumn('userRatedMovie1', F.col('userPositiveHistory')[0]) \
        .withColumn('userRatedMovie2', F.col('userPositiveHistory')[1]) \
        .withColumn('userRatedMovie3', F.col('userPositiveHistory')[2]) \
        .withColumn('userRatedMovie4', F.col('userPositiveHistory')[3]) \
        .withColumn('userRatedMovie5', F.col('userPositiveHistory')[4]) \
        .withColumn('userRatingCount',
                    F.count(F.lit(1)).over(sql.Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))) \
        .withColumn('userAvgReleaseYear', F.avg(F.col('releaseYear')).over(
        sql.Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)).cast(IntegerType())) \
        .withColumn('userReleaseYearStddev', F.stddev(F.col("releaseYear")).over(
        sql.Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))) \
        .withColumn("userReleaseYearStddev", format_number(F.col("userReleaseYearStddev"), NUMBER_PRECISION)) \
        .withColumn("userAvgRating", format_number(F.avg(F.col("rating")).over(sql.Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)), NUMBER_PRECISION)) \
        .withColumn("userRatingStddev", F.stddev(F.col("rating")).over(sql.Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))) \
        .withColumn("userRatingStddev", format_number(F.col("userRatingStddev"), NUMBER_PRECISION)) \
        .withColumn("userGenres", extractGenresUdf(F.collect_list(when(F.col('label') == 1, F.col('genres')).otherwise(F.lit(None))).over(
            sql.Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)))) \
        .withColumn("userGenre1", F.col("userGenres")[0]) \
        .withColumn("userGenre2", F.col("userGenres")[1]) \
        .withColumn("userGenre3", F.col("userGenres")[2]) \
        .withColumn("userGenre4", F.col("userGenres")[3]) \
        .withColumn("userGenre5", F.col("userGenres")[4]) \
        .drop("genres", "userGenres", "userPositiveHistory") \
        .filter(F.col("userRatingCount") > 1)

    samplesWithUserFeatures.printSchema()
    samplesWithUserFeatures.show(10)
    # pick one user and inspect their samples
    samplesWithUserFeatures.filter(
        samplesWithMovieFeatures['userId'] == 1).orderBy(
            F.col('timestamp').asc()).show(truncate=False)
    return samplesWithUserFeatures
Example #10
 def featurize(df, col_name):
     df_stats = df.select(
         F.mean(F.col(col_name)).alias('mean'),
         F.stddev(F.col(col_name)).alias('std')).collect()
     mean = df_stats[0]['mean']
     std = df_stats[0]['std']
     data = df.withColumn(col_name, (df[col_name] - mean) / std)
     # re-compute mean/std of the standardized column (new_mean/new_std are not used further)
     data_stats = data.select(
         F.mean(F.col(col_name)).alias('mean'),
         F.stddev(F.col(col_name)).alias('std')).collect()
     new_mean = data_stats[0]['mean']
     new_std = data_stats[0]['std']
     return data
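A usage sketch for featurize above, on a hypothetical toy DataFrame; it assumes pyspark.sql.functions is imported as F, as the snippet expects. After the call, the chosen column should have mean ≈ 0 and stddev ≈ 1:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (2.0,), (3.0,), (4.0,)], ["score"])

# 'score' is replaced in place by its z-scored values (x - mean) / std.
featurize(df, "score").show()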
Example #11
def testAgg_1() -> None:
    spark = SparkSession.builder.appName("aggregation").getOrCreate()
    df = spark.read.csv("./datasets/sales_info.csv",
                        inferSchema=True,
                        header=True)
    df.printSchema()
    df.show()
    print("Sales average")
    df.groupBy("Company").mean().show()
    print("Count")
    df.groupBy("Company").count().show()
    print("Aggregation")
    df.agg({"Sales": "sum"}).show()
    df.agg({"Sales": "max"}).show()
    print("Group and aggregation")
    group_data = df.groupBy("Company")
    group_data.agg({"Sales": "max"}).show()
    print("Functions")
    df.select(countDistinct("Sales")).show()
    df.select(avg("Sales")).show()
    df.select(avg("Sales").alias("Average Sales")).show()
    print("Format")
    sales_std = df.select(stddev("Sales").alias("std"))
    sales_std.select(format_number("std",
                                   2).alias("Standard Deviation")).show()
    print("Order By")
    df.orderBy("Sales").show()  # Ascending order.
    df.orderBy(df["Sales"].desc()).show()  # Descending order.
Example #12
def election_donations_analysis(input, output1, output2):
    from pyspark.sql import SparkSession
    import pyspark.sql.functions as F
    spark = SparkSession.builder.getOrCreate()

    reader = spark.read
    reader.option("header", True).option("inferSchema", True)
    df = reader.csv(input)

    cand_info = ['cand_id', 'cand_nm']

    grouped_df = df.groupBy(cand_info)\
        .agg(
            F.count(F.when(F.col('contb_receipt_amt') >= '0', 1)).alias("NumDon"),
            F.sum('contb_receipt_amt').alias("Total_cntrb"),
            F.countDistinct(F.when(F.col('contb_receipt_amt') >= '0', F.array('contbr_nm', 'contbr_zip', 'contbr_employer', 'contbr_occupation'))).alias('UnqCtb'),
            F.mean("contb_receipt_amt").alias("Mean"),
            F.stddev("contb_receipt_amt").alias("SD"),
            F.count(F.when((F.col('contb_receipt_amt') < '50') & (F.col('contb_receipt_amt') >= '0'), 1)).alias("smCtb")
        ).withColumn("%", (F.col("smCtb") / F.col("NumDon")) * 100)

    grouped_df.sort(F.desc('cand_id')).show()
    grouped_df.write.format('csv').mode('overwrite').save(output1)

    required_cand_ids = ["P80001571", "P80000722"]
    df_6 = df.filter(df.cand_id.isin(required_cand_ids)).filter(
        "contb_receipt_amt >= '0'").select('cand_nm', 'contb_receipt_amt')

    df_6.sort(F.desc('contb_receipt_amt')).show()
    df_6.coalesce(1).write.format('csv').mode('overwrite').save(output2)
Example #13
def _calculate_poi_stats(df):
    """
    Calculate all the relevant statistics for a given position of interest (POI)

    :param df: A Spark dataframe containing information mapping a request ID to a POI together with the distance
    between the ID and the POI
    :return: A Spark dataframe containing information about the mean and standard deviations of the distances of all
    various requests, the maximum distance (radius), request counts, and density associated to a POI
    """
    poi_stats_df = df \
        .groupBy('POIID') \
        .agg(
            count('*').alias('req_count'),
            avg('min_distance').alias('mean_distance'),
            stddev('min_distance').alias('stddev_distance'),
            max('min_distance').alias('radius')
    ) \
        .select(
            'POIID',
            'req_count',
            'mean_distance',
            'stddev_distance',
            'radius',
            round(col('req_count') / (pi * col('radius') ** 2), 6).alias('density')
    )

    return poi_stats_df
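A usage sketch for _calculate_poi_stats on a hypothetical toy DataFrame; it assumes the snippet's module-level imports (count, avg, stddev, max, round, col from pyspark.sql.functions and pi from math) are in place:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Each row is one request, already mapped to its nearest POI with the distance to it.
poi_df = spark.createDataFrame(
    [("POI1", 1.0), ("POI1", 2.0), ("POI1", 3.0), ("POI2", 0.5)],
    ["POIID", "min_distance"],
)

# For POI1: req_count = 3, radius = 3.0, density = 3 / (pi * 3.0 ** 2) ≈ 0.106103
_calculate_poi_stats(poi_df).show()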
Example #14
class AverageEstimator(Estimator, HasInputCol, HasOutputCol):
    def fit(self, X):
        '''
        Creates a Spark dataframe containing the average and standard deviation on a per-game basis for all games
        in the input dataframe X.

        Parameters: self and a dataframe X that contains steamid, steamid_int, appid, and playtime_forever.

        Outputs: a self object with a Spark dataframe (self.table) containing the appid, average playtime for each
                 appid, and standard deviation of the playtime for each appid.

        '''

        grp_window = Window.partitionBy('appid')
        avg_func = func.mean('playtime_forever')
        stddev_func = func.stddev('playtime_forever')
        
        X = X.withColumn('avg', avg_func.over(grp_window))
        X = X.withColumn('std_dev', stddev_func.over(grp_window))
        
        self.table = X
        for col in ['steamid', 'steamid_int', 'playtime_forever']:
            self.table = self.table.drop(col)
        
        self.table = self.table.dropDuplicates(subset=['appid'])
        
        return self
Example #15
def standardize(df):
    '''
    Standardize all columns
    formula = [(x - mean) / std]
    CREDITS: https://gist.github.com/morganmcg1/15a9de711b9c5e8e1bd142b4be80252d
    Based on the solution on stackoverflow: 
    https://stackoverflow.com/questions/44580644/subtract-mean-from-pyspark-dataframe
    Parameters:
        - dataframe
    Returns:
        - dataframe with standardized variables
    '''

    # create list to aggregate all mean and stds
    aggAvg = []
    aggStd = []

    # aggregate all means and stds
    for c in df.columns:
        aggAvg.append(mean(df[c]).alias(c))
        aggStd.append(stddev(df[c]).alias(c + '_stddev'))

    averages = df.agg(*aggAvg).collect()[0]
    std_devs = df.agg(*aggStd).collect()[0]

    for c in df.columns:
        df = df.withColumn(c + '_norm',
                           ((df[c] - averages[c]) / std_devs[c + '_stddev']))

    return df
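A usage sketch for standardize above, on a hypothetical two-column DataFrame; it assumes mean and stddev are imported from pyspark.sql.functions, as the snippet expects:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0, 10.0), (2.0, 20.0), (3.0, 30.0)], ["x", "y"])

# Each column c gets a companion c_norm column holding (x - mean) / stddev.
standardize(df).select("x", "x_norm", "y", "y_norm").show()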
Example #16
    def occCalc(self, channelID, testing=False):
        """ Calculates occupancy for the user defined month
		"""
        if type(channelID) != list:
            raise TypeError('ChannelID is required to be a list')

        conf = SparkConf()\
          .setAppName("Occupancy Calc")\
          .set("spark.master", "local[*]")\
          .set("spark.driver.maxResultSize", "15G")
        sc = SparkContext(conf=conf)
        sql = SQLContext(sc)
        path = 'AZURE PATH' + self.month +\
          '/*/*/' + self.sensor + '*'
        data = sql.read.parquet(path)

        timeCount = data.select('scan_time').distinct().count()
        timeCount = sc.broadcast(timeCount)
        subData = data.select('scan_time', 'channel_id', 'power_dbm').filter(
            data.channel_id.isin(channelID))
        subData = subData.groupBy('channel_id').agg(
            (count(column('power_dbm')) / timeCount.value).alias('freq'),
            stddev(column('power_dbm')).alias('sd')).sort(
                asc('freq'), desc('sd'))

        if testing:
            subData.toPandas().to_csv('C:/path/freq.csv', sep='\t')
            sc.stop()
        else:
            sc.stop()
            return (subData.toPandas())
Example #17
    def matching_set(self, scores, thresh=1.5):
        """Best-guess method with eccentricity threshold.
        
        Scores is a spark dataframe that may contain multiple custId.
        Parameters:
            - scores: DF(custId_1, custId_2, value)
        """
        # Sigma (custId_1, std): Standard deviation for each attacked customer.
        sigma = scores.groupBy('custId_1').agg(
            F.stddev(scores.value).alias('std'))
        window = Window.partitionBy(scores.custId_1).orderBy(
            scores.value.desc())
        # Top scores (custId_1, custId_2, value, rank):
        # For each customer, the two closest customers in the dataset.
        top_scores = scores.select(
            '*',
            F.row_number().over(window).alias('rank')).filter(
                F.col('rank') <= 2)
        # First match: (custId_1, custId_2, value_1)
        top_1 = top_scores.filter('rank == 1').drop('rank').withColumnRenamed(
            'value', 'value_1')
        # Second match: (custId_1, custId_3, value_2)
        top_2 = top_scores.filter('rank == 2').drop('rank').withColumnRenamed(
            'value', 'value_2').withColumnRenamed('custId_2', 'custId_3')

        # (custId_1, custId_2, custId_3, value_1, value_2, std)
        scores = top_1.join(top_2, ['custId_1']).join(sigma, 'custId_1')
        # (..., eccentricity)
        # For each customer, the two closest customers in the dataset, along with the
        # eccentricity measure.
        scores_w_eccentricity = scores.withColumn(
            'eccentricity',
            (F.col('value_1') - F.col('value_2')) / F.col('std'))
        return scores_w_eccentricity.filter(
            'eccentricity >= {}'.format(thresh))
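A minimal, self-contained sketch of the eccentricity measure used above, on hypothetical similarity scores; it keeps only the two best score values per attacked customer rather than the matched IDs:

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# Hypothetical scores: attacked customer -> candidate customer -> similarity value.
scores = spark.createDataFrame(
    [("a", "x", 9.0), ("a", "y", 2.0), ("a", "z", 1.0),
     ("b", "x", 3.0), ("b", "y", 2.9), ("b", "z", 3.1)],
    ["custId_1", "custId_2", "value"],
)

w = Window.partitionBy("custId_1").orderBy(F.col("value").desc())
top = scores.withColumn("rank", F.row_number().over(w)).filter("rank <= 2") \
    .groupBy("custId_1").agg(
        F.max(F.when(F.col("rank") == 1, F.col("value"))).alias("value_1"),
        F.max(F.when(F.col("rank") == 2, F.col("value"))).alias("value_2"))
sigma = scores.groupBy("custId_1").agg(F.stddev("value").alias("std"))

# eccentricity = (best score - second-best score) / stddev of all scores for that customer;
# a large value means the best match stands out clearly from the rest.
top.join(sigma, "custId_1").withColumn(
    "eccentricity", (F.col("value_1") - F.col("value_2")) / F.col("std")).show()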
Example #18
def get_session_stats(data_in, name):
    # Limit to song plays
    session_vars = data_in.filter(data_in['page'] == 'NextSong')

    # Get play time for each song
    session_window = ssw.Window.partitionBy(
        session_vars['userId'], session_vars['sessionId']).orderBy('ts')
    session_vars = session_vars.withColumn(
        'nextPlayStart',
        ssf.lead(session_vars['ts']).over(session_window))

    session_vars = session_vars.withColumn(
        'playTime', session_vars['nextPlayStart'] - session_vars['ts'])

    # Get play time for each session
    session_vars = session_vars.groupby('userId', 'sessionId').agg(
        #pylint: disable=no-member
        ssf.sum('playTime').alias('sessionLength'))

    # Get play time stats for each user
    session_vars = session_vars.groupby('userId').agg(
        #pylint: disable=no-member
        ssf.mean('sessionLength').alias(f'lengthMean{name}'),
        ssf.stddev('sessionLength').alias(f'lengthStd{name}'),
        ssf.sum('sessionLength').alias(f'lengthSum{name}'))

    return session_vars
Example #19
    def baseline_model(self, df_processed):
        cond_dvs = F.when(col("dvs_p30days") > 400,
                          400).otherwise(col("dvs_p30days"))


        stats=df_processed\
        .withColumn("dvs_p30days",cond_dvs)\
        .filter(col("recent_event_date")==1)\
        .filter(col("event_date")>=F.date_add(F.current_date(),-15))\
        .groupby(['grt_l2_cat_name'])\
        .agg(F.avg(col("dvs_p30days")).alias("avg_dvs_p30days")
            ,F.stddev(col("dvs_p30days")).alias("std_deal_view")
            ,F.round(F.avg(col("dvs_p30days")).cast('integer')).alias("avg_deal_view")
            ,F.max("dvs_p30days").alias("max_deal_view"))

        w = Window.partitionBy(F.col('consumer_id')).orderBy(
            F.col('normalized_dvs_p30days').desc())

        df_final=df_processed\
        .filter(col("recent_event_date")==1)\
        .filter(col("event_date")>=F.date_add(F.current_date(),-15))\
        .join(stats,on='grt_l2_cat_name')\
        .withColumn('normalized_dvs_p30days',(col('dvs_p30days')-col('avg_deal_view'))/col('std_deal_view'))\
        .withColumn('normalized_dvs_p30days_rank',F.row_number().over(w))


        df_micro=df_final\
        .filter(col('normalized_dvs_p30days')>=0)\
        .filter(col('normalized_dvs_p30days_rank')==1)\
        .filter(col('grt_l2_purchaser_14d')==0)

        return df_micro
Example #20
def addMovieFeatures(movieSamples, ratingSamplesWithLabel):
    # add movie basic features
    samplesWithMovies1 = ratingSamplesWithLabel.join(movieSamples,
                                                     on=['movieId'],
                                                     how='left')
    # extract releaseYear from the title, then drop the title column
    samplesWithMovies2 = samplesWithMovies1.withColumn('releaseYear',
                                                       udf(extractReleaseYearUdf, IntegerType())('title')) \
        .withColumn('title', udf(lambda x: x.strip()[:-6].strip(), StringType())('title')) \
        .drop('title')
    # split genres
    samplesWithMovies3 = samplesWithMovies2 \
        .withColumn('movieGenre1', split(F.col('genres'), "\\|")[0]) \
        .withColumn('movieGenre2', split(F.col('genres'), "\\|")[1]) \
        .withColumn('movieGenre3', split(F.col('genres'), "\\|")[2])
    # add rating features
    movieRatingFeatures = samplesWithMovies3.groupBy('movieId') \
        .agg(F.count(F.lit(1)).alias('movieRatingCount'),
             format_number(F.avg(F.col('rating')), NUMBER_PRECISION) \
             .alias('movieAvgRating'), F.stddev(F.col('rating')) \
             .alias('movieRatingStddev')).fillna(0) \
        .withColumn('movieRatingStddev', format_number(F.col('movieRatingStddev'), NUMBER_PRECISION))
    # join movie rating features
    samplesWithMovies4 = samplesWithMovies3.join(movieRatingFeatures,
                                                 on=['movieId'],
                                                 how='left')
    samplesWithMovies4.printSchema()
    samplesWithMovies4.show(5, truncate=False)
    return samplesWithMovies4
Example #21
def main(inputs):
    reviews_df = utilities.get_completereviews_dataframe(spark).dropna()
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)
    reviews_df.cache()
    helpful_df = reviews_df.filter(reviews_df.product_id.isNotNull()).filter(reviews_df.total_votes > 1).filter(reviews_df.helpful_votes > 0).cache()
    helpful_df = helpful_df.withColumn('helpfulness_percentage',(helpful_df.helpful_votes / helpful_df.total_votes) * 100)
    helpful_df = helpful_df.filter(helpful_df.helpfulness_percentage > 90).select('review_body')            # take most helpful reviews

    helpful_df = helpful_df.withColumn('word_count', word_count(helpful_df.review_body)).cache()

    ## median of most helpful reviews
    median = helpful_df.approxQuantile('word_count', [0.5], 0.25)
    sc.parallelize(median).repartition(1).saveAsTextFile('median_words_in_helpfulreviews.csv')

    ## average, std of most helpful reviews
    aggregations = [functions.avg('word_count').alias('average_wordcount'), functions.stddev('word_count').alias('std_wordcount')]

    stats_df = helpful_df.agg(*aggregations)
    stats_df.repartition(1).write.mode('overwrite').csv('average_stddev_wordcount_mosthelpful')


    count = reviews_df.filter(reviews_df.customer_id.isNotNull()).select('customer_id').agg(functions.countDistinct('customer_id'))
    count.repartition(1).write.mode('overwrite').csv('total_customersin_dataset')

    product_count = reviews_df.filter(reviews_df.product_id.isNotNull()).select('product_id').agg(functions.countDistinct('product_id'))
    product_count.repartition(1).write.mode('overwrite').csv('total_productsin_dataset')
Example #22
        def standard():
            self.logger.info("conducting Standard Scaling")
            if (self.parameterDF is None) & (self.columns is not None):
                # no parameter table was provided, so fit and transform in one pass
                mmParamList = []
                self.transformedDF = self.originalDF
                for col in self.columns:
                    mmRow = self.originalDF.select(
                        functions.avg(col), functions.stddev(col)).first()
                    mmAvg = mmRow["avg(" + col + ")"]
                    mmStd = mmRow["stddev_samp(" + col + ")"]
                    mmParamList.append([col, float(mmAvg), float(mmStd)])
                    self.transformedDF = self.transformedDF\
                        .withColumn(col + "ss", (functions.col(col) - mmAvg) / mmStd) \
                        .drop(col)\
                        .withColumnRenamed(col + "ss", col)

                self.parameterDF = self.spark.createDataFrame(
                    mmParamList, ['col', 'avg', 'std'])

            elif self.parameterDF is not None:
                # read the parameters from the parameter table and use them to transform the original table
                self.transformedDF = self.originalDF
                self.parameterDFP = self.parameterDF.toPandas()
                for col in self.parameterDFP["col"]:
                    mmParamList = [
                        self.parameterDFP.loc[self.parameterDFP["col"] == col,
                                              "avg"].tolist()[0],
                        self.parameterDFP.loc[self.parameterDFP["col"] == col,
                                              "std"].tolist()[0]
                    ]
                    self.transformedDF = self.transformedDF\
                        .withColumn(col+"ss", (functions.col(col) - mmParamList[0])/(mmParamList[1] - mmParamList[0])) \
                        .drop(col)\
                        .withColumnRenamed(col + "ss", col)
Example #23
    def describe_float_1d(df, column, current_result, nrows):
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            variance(col(column)).alias("variance"),
            kurtosis(col(column)).alias("kurtosis"),
            stddev(col(column)).alias("std"),
            skewness(col(column)).alias("skewness"),
            df_sum(col(column)).alias("sum")).toPandas()

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (df.select(column).na.drop().selectExpr(
                "percentile_approx(`{col}`,CAST({n} AS DOUBLE))".format(
                    col=column, n=x)).toPandas().iloc[:, 0])
        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column).na.drop().select(
            df_abs(col(column) - stats["mean"]).alias("delta")).agg(
                df_sum(col("delta"))).toPandas().iloc[0, 0] /
                        float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        hist_data = create_hist_data(df, column, stats["min"], stats["max"],
                                     bins)

        return stats
Example #24
 def replaceByStandardDeviation(self, feature, df, stddev_=-1):
     stddevValue = df.select(stddev(col(feature.name)).alias(
         'stddev')).collect()[0]["stddev"]
     df = df.fillna(stddevValue, subset=[feature.name])
     df = df.withColumn(feature.name,
                        when(col(feature.name) == " ", stddevValue).otherwise(col(feature.name)))
     return df
Example #25
def _partial_dependence_num(vals, df, model, feature_column, assembler, model_type, n_samples):
    """Compute PD values for numerical feature"""
    vals_df = spark.createDataFrame(list(map(lambda x: (float(x),), vals)),
                                    schema=StructType([StructField(feature_column, DoubleType())]))
    vals_df.cache()

    result_df = spark.createDataFrame(sc.emptyRDD(), schema=StructType([StructField(feature_column, DoubleType()),
                                                                        StructField('pd', DoubleType())]))

    df = df.drop(feature_column)

    for i in range(n_samples):
        cross = df.sample(withReplacement=False, fraction=1 / n_samples).crossJoin(vals_df)
        if model_type == 'classification':
            pred = model.transform(assembler.transform(cross))\
                        .select(feature_column, p1_proba('probability').alias('pred'))
        else:
            pred = model.transform(assembler.transform(cross))\
                        .select(feature_column, F.col('prediction').alias('pred'))
        pd = pred.groupBy(feature_column).agg(F.avg('pred').alias('pd'))
        result_df = result_df.union(pd)

    stats = result_df.groupBy(feature_column).agg(F.avg('pd'), F.stddev('pd')).orderBy(feature_column).toPandas().values
    means = stats[:, 1]
    cis = np.apply_along_axis(lambda arr: st.norm.interval(0.95, loc=arr[0], scale=arr[1] / np.sqrt(n_samples)),
                              1, stats[:, 1:])

    return means, cis
Example #26
def addMovieFeatures(movieSamples: DataFrame,
                     ratingSamplesWithLabel: DataFrame) -> DataFrame:
    # equi-join on movieId, with ratingSamplesWithLabel as the left table and movieSamples as the right table
    samplesWithMovie = ratingSamplesWithLabel.join(movieSamples,
                                                   on=['movieId'],
                                                   how='left')
    # extract the releaseYear field from the parentheses at the end of the title column
    samplesWithMovie = samplesWithMovie.withColumn('releaseYear',
                                                   F.udf(extractReleaseYearUDF, IntegerType())('title')) \
        .withColumn('title', F.udf(lambda x: x.strip()[:-6].strip(), StringType())('title')) \
        .drop('title')  # drop the title column, which is no longer needed (movieId is enough)
    # split the genres column into individual genre fields using '|' as the delimiter
    samplesWithMovie = samplesWithMovie.withColumn('genre0', F.split(F.col('genres'), "\\|")[0]) \
        .withColumn('genre1', F.split(F.col('genres'), "\\|")[1]) \
        .withColumn('genre2', F.split(F.col('genres'), "\\|")[2])
    # add rating features
    movieRatingFeatures = samplesWithMovie.groupBy('movieId').agg(F.count(F.lit(1)).alias('movieRatingCount'),
                                                                  F.format_number(F.avg(F.col('rating')),
                                                                                  NUMBER_PRECISION).alias(
                                                                      'movieAvgRating'),
                                                                  F.stddev(F.col('rating')).alias(
                                                                      'movieRatingStddev')) \
        .fillna(0) \
        .withColumn('movieRatingStddev', F.format_number(F.col('movieRatingStddev'), NUMBER_PRECISION))

    # equi-join samplesWithMovie (left) with movieRatingFeatures (right) on movieId
    return samplesWithMovie.join(movieRatingFeatures,
                                 on=['movieId'],
                                 how='left')
Example #27
def preprocessing(spark_df):
    smart_feature_columns = [column for column in spark_df.columns if 'smart' in column]


    window_spec_7 = Window.partitionBy('model', 'serial_number').orderBy(
        F.datediff(F.col('dt'), F.lit('2017-07-01'))).rangeBetween(-7, 0)
    prefix_window7='window_7_'
    for smart_col in smart_feature_columns:
        spark_df=spark_df.withColumn(smart_col,F.col(smart_col).cast(DoubleType()))
        if smart_col in ['smart_1_normalized','smart_5raw','smart_7_normalized','smart_194raw','smart_199raw',
                         'smart_190raw','smart_191raw','smart_193raw','smart_195_normalized','smart_195raw']:
            spark_df = spark_df.withColumn(prefix_window7 + 'range_' + smart_col,
                                         F.max(F.col(smart_col)).over(window_spec_7) - F.min(F.col(smart_col)).over(
                                             window_spec_7))
            spark_df = spark_df.withColumn(prefix_window7 + 'std_' + smart_col,
                                         F.stddev(F.col(smart_col)).over(window_spec_7))
        #if smart_col in ['smart_187raw','smart_188raw','smart_197raw','smart_198raw']:
        #    spark_df=spark_df.withColumn(smart_col,F.when(F.col(smart_col)>0,1).otherwise(0))
        #if smart_col in ['smart_187_normalized','smart_188_normalized','smart_197_normalized','smart_198_normalized']:
        #    spark_df=spark_df.withColumn(smart_col,F.when(F.col(smart_col)<100,1).otherwise(0))
        if smart_col in ['smart_4raw','smart_5raw','smart_191raw',
                         'smart_187raw','smart_197raw','smart_198raw',
                         'smart_199raw','window_7_range_smart_199raw']:
            spark_df=spark_df.withColumn(smart_col,F.log2(F.col(smart_col)+F.lit(1.)))

    spark_df=spark_df.withColumn('smart_199raw',F.col('smart_199raw')*F.col('window_7_range_smart_199raw'))

    spark_df = spark_df.withColumn('anomaly_sum',
                                   F.col('smart_4raw') / 12 + F.col('smart_5raw') / 16  + F.col('smart_191raw') / 18
                                    + F.col('smart_198raw')/18 +F.col('smart_197raw')/18+F.col('smart_187raw')/15)

    return spark_df
Example #28
    def output(self, scores, thresh=1.5, mode="best-guess"):
        """Standard output of the algorithm

        De-anonymisation has two modes: entropic (keeps the full distribution) or 
        best-guess (matching with threshold).
        """
        if mode == "best-guess":
            return self.matching_set(scores, thresh)
        elif mode == "entropic":
            # (custId_1, std)
            sigma = scores.groupBy('custId_1').agg(
                F.stddev(scores.value).alias('std'))
            # (custId_1, custId_2, probas_raw)
            probas_raw = scores\
                .join(sigma, ['custId_1'])\
                .withColumn("probas_raw", F.exp(F.col('value')/F.col('std')))\
                .select(['custId_1', 'custId_2', 'probas_raw', 'std'])
            # (custId_1, probas_z)
            probas_z = probas_raw.groupBy('custId_1').agg(
                F.sum(probas_raw.probas_raw).alias('probas_z'))
            # (custId_1, custId_2, probas)
            return scores\
                .join(probas_raw, ['custId_1', 'custId_2'])\
                .join(probas_z, ['custId_1'])\
                .withColumn("probas", F.col('probas_raw')/F.col('probas_z'))\
                .select(['custId_1', 'custId_2', 'probas', 'value', 'std'])
        else:
            raise "Mode '{}' is invalid.".format(mode)
Example #29
def get_standardized_residuals(res):
    resstd = res.select(
        fns.stddev('residuals').alias('std')).collect()[0]['std']
    resstd = res.withColumn('std', fns.lit(resstd))
    resstd = resstd.withColumn('resstd', fns.col('residuals') / fns.col('std'))
    resstd = resstd.drop('residuals')
    return resstd
Example #30
def threeSTD(df, column):

    df_non_nan = df.filter(df[column].isNotNull())
    avg = df_non_nan.select(mean(column).alias(column)).collect()
    avg = round(avg[0][column], 2)
    std = df_non_nan.select(stddev(column).alias(column)).collect()
    std = round(std[0][column], 2)

    down_limit = avg - (3 * std)
    up_limit = avg + (3 * std)

    def udf_XiGeMa(s):
        if s is None or s == 'NULL':
            return s
        else:
            s = float(s)
            if (s >= down_limit) & (s <= up_limit):
                return float(s)
            else:
                return None

    udf_transf = udf(udf_XiGeMa, FloatType())
    df = df_non_nan.select('*',
                           udf_transf(df_non_nan[column]).alias('temp_name'))
    df = df.drop(column)

    df = df.withColumnRenamed('temp_name', column)
    df = df.filter(df[column].isNotNull())

    return df
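The UDF above can usually be replaced by a plain filter on the three-sigma bounds; a minimal sketch under that assumption (out-of-range rows are dropped directly, matching the final isNotNull filter in threeSTD):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (2.0,), (3.0,), (100.0,)], ["value"])

stats = df.agg(F.mean("value").alias("avg"), F.stddev("value").alias("std")).first()
down_limit = stats["avg"] - 3 * stats["std"]
up_limit = stats["avg"] + 3 * stats["std"]

# keep only rows within mean ± 3 * stddev
df.filter(F.col("value").between(down_limit, up_limit)).show()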
Example #31
    def describe_float_1d(df, column, current_result, nrows):
        if spark_version == "1.6+":
            stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                       df_min(col(column)).alias("min"),
                                                       df_max(col(column)).alias("max"),
                                                       variance(col(column)).alias("variance"),
                                                       kurtosis(col(column)).alias("kurtosis"),
                                                       stddev(col(column)).alias("std"),
                                                       skewness(col(column)).alias("skewness"),
                                                       df_sum(col(column)).alias("sum")
                                                       ).toPandas()
        else:
            stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                       df_min(col(column)).alias("min"),
                                                       df_max(col(column)).alias("max"),
                                                       df_sum(col(column)).alias("sum")
                                                       ).toPandas()
            stats_df["variance"] = df.select(column).na.drop().agg(variance_custom(col(column),
                                                                                   stats_df["mean"].ix[0],
                                                                                   current_result["count"])).toPandas().ix[0][0]
            stats_df["std"] = np.sqrt(stats_df["variance"])
            stats_df["skewness"] = df.select(column).na.drop().agg(skewness_custom(col(column),
                                                                                   stats_df["mean"].ix[0],
                                                                                   current_result["count"])).toPandas().ix[0][0]
            stats_df["kurtosis"] = df.select(column).na.drop().agg(kurtosis_custom(col(column),
                                                                                   stats_df["mean"].ix[0],
                                                                                   current_result["count"])).toPandas().ix[0][0]

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (df.select(column)
                                        .na.drop()
                                        .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                    .format(col=column, n=x)).toPandas().ix[:,0]
                                        )
        stats = stats_df.ix[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column)
                        .na.drop()
                        .select(df_abs(col(column)-stats["mean"]).alias("delta"))
                        .agg(df_sum(col("delta"))).toPandas().ix[0,0] / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column)==0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        # Large histogram
        imgdata = BytesIO()
        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
        figure = plt.figure(figsize=(6, 4))
        plot = plt.subplot()
        plt.bar(hist_data["left_edge"],
                hist_data["count"],
                width=hist_data["width"],
                facecolor='#337ab7')
        plot.set_ylabel("Frequency")
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        #TODO Think about writing this to disk instead of caching them in strings
        plt.close(plot.figure)

        stats['mini_histogram'] = mini_histogram(hist_data)

        return stats
        weather_df.humidity,
        weather_df.pressure,
        weather_df.visibility,
        weather_df.precipIntensity,
        weather_df.windSpeed
    ))

status_joined_df.show()

stats_df = (status_joined_df
    .agg(
        F.mean(status_joined_df.temperature).alias("avgTemp"),
        F.mean(status_joined_df.humidity).alias("avgHumidity"),
        F.mean(status_joined_df.pressure).alias("avgPressure"),

        F.stddev(status_joined_df.temperature).alias("stddevTemp"),
        F.stddev(status_joined_df.humidity).alias("stddevHumidity"),
        F.stddev(status_joined_df.pressure).alias("stddevPressure"),
        F.stddev(status_joined_df.visibility).alias("stddevVisibility"),
        F.stddev(status_joined_df.precipIntensity).alias("stddevPrecipitation"),
        F.stddev(status_joined_df.windSpeed).alias("stddevWindSpeed")))

stats_df.write.mode('overwrite').parquet("hdfs://hadoop:9000/models/weather-stats")

stats = stats_df.collect()[0]

print "Statistics: %s" % (stats,)

day_of_week = F.udf(
    lambda d: datetime.datetime.strptime(d, "%Y-%m-%d").weekday(),
    IntegerType())