def cond_fluent_window(pyData):
    dfData = spark.createDataFrame(pyData)
    dfData = dfData \
        .withColumn("cond", func.when(dfData.E < 0, -1).otherwise( +1))
    dfData = dfData \
        .orderBy(dfData.grp, dfData.subgrp, dfData.cond, dfData.id)
    window = Window \
        .partitionBy(dfData.grp, dfData.subgrp, dfData.cond) \
        .orderBy(dfData.id)\
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    dfData = dfData \
        .withColumn("cond_var_of_E_2_pre1",
                    func.when(dfData.cond < 0,
                            func.variance(dfData.E)\
                            .over(window)))
    dfData = dfData \
        .groupBy(dfData.grp, dfData.subgrp, dfData.cond)\
        .agg(func.sum(dfData.C).alias("sum_of_C_pre"),
            func.count(dfData.C).alias("count_of_C_pre"),
            func.max(dfData.D).alias("max_of_D_pre"),
            func.variance(func.when(dfData.E < 0, dfData.E)).alias("cond_var_of_E_1_pre"),
            func.last(dfData.cond_var_of_E_2_pre1).alias("cond_var_of_E_2_pre2"))

    dfData = dfData \
        .groupBy(dfData.grp, dfData.subgrp)\
        .agg((func.sum(dfData.sum_of_C_pre) \
            / func.sum(dfData.count_of_C_pre)\
            ).alias("mean_of_C"),
            func.max(dfData.max_of_D_pre).alias("max_of_D"),
            func.max(dfData.cond_var_of_E_1_pre).alias("cond_var_of_E_1"),
            func.max(dfData.cond_var_of_E_2_pre2).alias("cond_var_of_E_2"))\
        .orderBy(dfData.grp, dfData.subgrp)\
        .collect()
def bi_fluent_window(pyData):
    df = spark.createDataFrame(pyData)
    window = Window \
        .partitionBy(df.grp, df.subgrp) \
        .orderBy(df.id)
    df = df \
        .orderBy(df.grp, df.subgrp, df.id)\
        .withColumn("sub_var_of_E",
                    func.variance(df.E)\
                              .over(window))
    df = df \
        .groupBy(df.grp, df.subgrp)\
        .agg(func.sum(df.C).alias("sub_sum_of_C"),
            func.count(df.C).alias("sub_count"),
            func.max(df.D).alias("sub_max_of_D"),
            func.last(df.sub_var_of_E).alias("sub_var_of_E1"),
            func.variance(df.E).alias("sub_var_of_E2"))
    df \
        .groupBy(df.grp)\
        .agg(
            (func.sum(df.sub_sum_of_C)/
             func.sum(df.sub_count)).alias("mean_of_C"),
            func.max(df.sub_max_of_D).alias("max_of_D"),
            func.avg(df.sub_var_of_E1).alias("avg_var_of_E1"),
            func.avg(df.sub_var_of_E2).alias("avg_var_of_E2"))\
        .orderBy(df.grp)\
        .collect()
Example #3
def task_1(data_io, review_data, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    asin_column = 'asin'
    overall_column = 'overall'
    # Outputs:
    mean_rating_column = 'meanRating'
    count_rating_column = 'countRating'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    data = review_data.groupBy(F.col(asin_column)).agg(
        F.avg(F.col(overall_column)).alias(mean_rating_column),
        F.count("*").alias(count_rating_column))

    merged = product_data.join(data, on=asin_column, how='left')

    aggregate_func = merged.agg(
        F.count("*"), F.avg(F.col(mean_rating_column)),
        F.variance(F.col(mean_rating_column)),
        F.sum(F.isnull(F.col(mean_rating_column)).astype("int")),
        F.avg(F.col(count_rating_column)),
        F.variance(F.col(count_rating_column)),
        F.sum(F.isnull(F.col(count_rating_column)).astype("int"))).collect()[0]

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    # Calculate the values programmatically. Do not change the keys and do not
    # hard-code values in the dict. Your submission will be evaluated with
    # different inputs.
    # Modify the values of the following dictionary accordingly.
    res = {
        'count_total': None,
        'mean_meanRating': None,
        'variance_meanRating': None,
        'numNulls_meanRating': None,
        'mean_countRating': None,
        'variance_countRating': None,
        'numNulls_countRating': None
    }
    # Modify res:

    res['count_total'] = aggregate_func[0]
    res['mean_meanRating'] = aggregate_func[1]
    res['variance_meanRating'] = aggregate_func[2]
    res['numNulls_meanRating'] = aggregate_func[3]
    res['mean_countRating'] = aggregate_func[4]
    res['variance_countRating'] = aggregate_func[5]
    res['numNulls_countRating'] = aggregate_func[6]

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_1')
    return res
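
# A small illustration (toy column, illustrative values) of the null-counting
# pattern task_1 uses above: casting isnull() to int and summing counts the
# null entries of a column.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(1.0,), (None,), (3.0,)], ["meanRating"])
toy.agg(F.sum(F.isnull("meanRating").cast("int")).alias("numNulls")).show()  # numNulls = 1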
def bi_fluent_join(pyData):
    df = spark.createDataFrame(pyData)
    level1 = df \
        .groupBy(df.grp) \
        .agg(
            func.mean(df.C).alias("mean_of_C"),
            func.max(df.D).alias("max_of_D"))
    level2 = df \
        .groupBy(df.grp, df.subgrp) \
        .agg(
            func.variance(df.E).alias("var_of_E"),
            ((func.sum(df.E * df.E)-
              func.sum(df.E) * func.avg(df.E))
             /(func.count(df.E)-1)).alias("var_of_E2")
        )
    level3 = level2 \
        .join(level1, "grp") \
        .groupBy(level1.grp) \
        .agg(
            func.last(level1.mean_of_C).alias("mean_of_C"),
            func.last(level1.max_of_D).alias("max_of_D"),
            func.avg(level2.var_of_E).alias("avg_var_of_E"),
            func.avg(level2.var_of_E2).alias("avg_var_of_E2")
        ) \
        .orderBy(level1.grp)
    # .collect()
    return level3, None
def ratingFeatures(ratingSamples):
    ratingSamples.printSchema()
    ratingSamples.show()

    # calculate average movie rating score and rating count
    # aggregate by movieId: count(1) as ratingCount (number of ratings per movie),
    # avg(rating) as avgRating,
    # variance(rating) as ratingVar  -- sample variance of the rating
    movieFeatures = ratingSamples.groupBy('movieId').agg(F.count(F.lit(1)).alias('ratingCount'),
                                                         F.avg("rating").alias("avgRating"),
                                                         F.variance('rating').alias('ratingVar')) \
        .withColumn('avgRatingVec', udf(lambda x: Vectors.dense(x), VectorUDT())('avgRating'))  # wrap the average rating in a one-element vector; the MinMaxScaler below requires vector input
    movieFeatures.show(10)

    ######## feature processing via Pipeline ########
    # bucketing
    # bucket the continuous ratingCount into 100 quantile-based buckets
    ratingCountDiscretizer = QuantileDiscretizer(numBuckets=100,
                                                 inputCol="ratingCount",
                                                 outputCol="ratingCountBucket")
    # Normalization
    # min-max scale the average-rating vector
    ratingScaler = MinMaxScaler(inputCol="avgRatingVec",
                                outputCol="scaleAvgRating")

    # build the pipeline
    pipelineStage = [ratingCountDiscretizer, ratingScaler]
    featurePipeline = Pipeline(stages=pipelineStage)
    movieProcessedFeatures = featurePipeline.fit(movieFeatures).transform(
        movieFeatures)

    # cast the bucket id to an integer and extract the scaled vector back to a plain float
    movieProcessedFeatures = movieProcessedFeatures.withColumn('ratingCountBucket', F.col('ratingCountBucket').cast(IntegerType()))\
        .withColumn('scaleAvgRating', udf(lambda v: float(v[0]), FloatType())(F.col('scaleAvgRating'))).drop(F.col('avgRatingVec'))
    movieProcessedFeatures.show(10)
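
# A minimal sketch of the imports and input that ratingFeatures above assumes
# (spark session, F / udf helpers, Spark ML feature classes). The sample rows
# are illustrative only.
from pyspark.sql import Row, SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType, IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler, QuantileDiscretizer
from pyspark.ml.linalg import Vectors, VectorUDT

spark = SparkSession.builder.appName("rating-features").getOrCreate()
ratingSamples = spark.createDataFrame([
    Row(userId=1, movieId=10, rating=3.5),
    Row(userId=2, movieId=10, rating=4.0),
    Row(userId=1, movieId=20, rating=2.0),
])
ratingFeatures(ratingSamples)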
def bi_fluent_nested(pyData):
    df = spark.createDataFrame(pyData)
    df = df.groupBy(df.grp, df.subgrp)\
        .agg(func.mean(df.C).alias("sub_mean_of_C"),
            func.count(df.C).alias("sub_count"),
            func.sum(df.C).alias("sub_sum_of_C"),
            func.max(df.D).alias("sub_max_of_D"),
            func.variance(df.E).alias("sub_var_of_E"),
            func.sum(df.E * df.E).alias("sub_sum_of_E_squared"),
            func.sum(df.E).alias("sub_sum_of_E"))
    df = df.groupBy(df.grp) \
        .agg(
            (
                func.sum(df.sub_mean_of_C * df.sub_count)
                / func.sum(df.sub_count)
            ).alias("mean_of_C"),
            func.max(df.sub_max_of_D).alias("max_of_D"),
            func.avg(df.sub_var_of_E).alias("cond_var_of_E1"),
            func.avg(
                (df.sub_sum_of_E_squared
                 - df.sub_sum_of_E * df.sub_sum_of_E / df.sub_count)
                / (df.sub_count - 1)).alias("cond_var_of_E2"))
    df.select('grp', 'mean_of_C', 'max_of_D',
                   'cond_var_of_E1', 'cond_var_of_E2')\
        .orderBy(df.grp)\
        .collect()
def cond_fluent_nested(pyData):
    dfData = spark.createDataFrame(pyData)
    dfInter = dfData\
        .withColumn('cond', func.when(dfData.E < 0, -1).otherwise(1))
    dfInter = dfInter.groupBy(dfInter.grp, dfInter.subgrp, dfInter.cond)\
        .agg(func.mean(dfData.C).alias("sub_mean_of_C"),
            func.count(dfData.C).alias("sub_count"),
            func.sum(dfData.C).alias("sub_sum_of_C"),
            func.max(dfData.D).alias("sub_max_of_D"),
            func.variance(dfData.E).alias("sub_var_of_E"),
            func.sum(dfData.E * dfData.E).alias("sub_sum_of_E_squared"),
            func.sum(dfData.E).alias("sub_sum_of_E"))
    dfInter = dfInter.groupBy(dfInter.grp, dfInter.subgrp) \
        .agg(func.mean(dfInter.sub_mean_of_C).alias("wrong_mean_of_C"),
            (
                func.sum(dfInter.sub_mean_of_C * dfInter.sub_count)
                / func.sum(dfInter.sub_count)
            ).alias("mean_of_C2"),
            func.sum(dfInter.sub_count).alias("uncond_count"),
            func.sum(func.when(dfInter.cond < 0,dfInter.sub_count)\
                    .otherwise(0)).alias("cond_count"),
            func.sum(dfInter.sub_sum_of_C).alias("sum_of_C"),
            func.max(dfInter.sub_max_of_D).alias("max_of_D"),
            func.sum(func.when(dfInter.cond < 0,dfInter.sub_var_of_E)\
                    .otherwise(0)).alias("cond_var_of_E"))
    dfInter = dfInter\
        .withColumn('mean_of_C', dfInter.sum_of_C / dfInter.uncond_count)
    dfInter.select('grp', 'subgrp', 'mean_of_C', 'mean_of_C2', 'wrong_mean_of_C',
                'max_of_D', 'cond_var_of_E')\
        .orderBy(dfInter.grp, dfInter.subgrp)\
        .collect()
Example #8
    def _transform(self, df):
        input = self.getInputCol()
        prefix = self.getPrefix()
        outputs = self.getOutputCols()
        stats = self.getStatistics()

        groupByCol = self.getGroupByCol()

        aggs = []

        for stat in stats:
            name = "{}_{}".format(prefix, stat)

            if stat == 'var':
                agg = F.variance(input).alias(name)
            elif stat == 'mean':
                agg = F.mean(input).alias(name)
            elif stat == 'count':
                agg = F.count(input).alias(name)
            elif stat == 'sum':
                agg = F.sum(input).alias(name)
            elif stat == 'nunique' or stat == 'distinct':
                agg = F.countDistinct(input).alias(name)
            else:
                # guard against silently re-appending the previous aggregation
                raise ValueError("unsupported statistic: {}".format(stat))

            aggs.append(agg)

        temp = df.groupBy(groupByCol).agg(*aggs)
        temp = temp.select(*groupByCol, *outputs)

        temp = temp.na.fill(0.0)

        df = df.join(temp, groupByCol, how='left')
        df = df.coalesce(1)

        return df
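
# A standalone sketch (illustrative column and stat names) of the pattern the
# _transform above implements: aggregate per group, then left-join the group
# statistics back onto the original rows.
from pyspark.sql import Row, SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([
    Row(user="a", amount=1.0),
    Row(user="a", amount=3.0),
    Row(user="b", amount=2.0),
])
aggs = [
    F.variance("amount").alias("amount_var"),
    F.mean("amount").alias("amount_mean"),
    F.count("amount").alias("amount_count"),
]
stats = df.groupBy("user").agg(*aggs).na.fill(0.0)
df = df.join(stats, "user", how="left")
df.show()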
Example #9
    def __init__(self):
        super(FeatureRequestIntervalVariance, self).__init__()

        self.w = Window.partitionBy(
            F.col('client_request_host'), F.col('client_ip')
        ).orderBy(F.col("@timestamp"))
        self.group_by_aggs = {
            'request_interval_var': F.variance(
                F.col('request_interval').cast('float') / 60.
            ),
        }
        self.pre_group_by_calcs = {
            'row_num_per_group':
                F.row_number().over(self.w),
            'prev_ts': F.lag(F.col('@timestamp')).over(
                self.w),
            'request_interval': F.when(
                F.col('row_num_per_group') > 1,
                F.when(
                    F.isnull(
                        F.col('@timestamp').cast('long') -
                        F.col('prev_ts').cast('long')
                    ), 0
                ).otherwise(
                    F.col('@timestamp').cast('long') -
                    F.col('prev_ts').cast('long')
                )).otherwise(None),
        }
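
# The base feature class that consumes group_by_aggs / pre_group_by_calcs is not
# shown here. Below is a standalone sketch (assumed column names, illustrative
# epoch-second timestamps) of the same idea: lag() per host/ip gives the previous
# request time, from which an interval and its per-group variance are derived.
from pyspark.sql import Row, SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
logs = spark.createDataFrame([
    Row(client_request_host="h", client_ip="1.2.3.4", ts=100),
    Row(client_request_host="h", client_ip="1.2.3.4", ts=160),
    Row(client_request_host="h", client_ip="1.2.3.4", ts=400),
])
w = Window.partitionBy("client_request_host", "client_ip").orderBy("ts")
intervals = logs.withColumn("prev_ts", F.lag("ts").over(w)) \
                .withColumn("request_interval",
                            (F.col("ts") - F.col("prev_ts")) / 60.0)
intervals.groupBy("client_request_host", "client_ip") \
         .agg(F.variance("request_interval").alias("request_interval_var")).show()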
Example #10
    def describe_float_1d(df, column, current_result, nrows):
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            variance(col(column)).alias("variance"),
            kurtosis(col(column)).alias("kurtosis"),
            stddev(col(column)).alias("std"),
            skewness(col(column)).alias("skewness"),
            df_sum(col(column)).alias("sum")).toPandas()

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (df.select(column).na.drop().selectExpr(
                "percentile_approx(`{col}`,CAST({n} AS DOUBLE))".format(
                    col=column, n=x)).toPandas().iloc[:, 0])
        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column).na.drop().select(
            df_abs(col(column) - stats["mean"]).alias("delta")).agg(
                df_sum(col("delta"))).toPandas().iloc[0, 0] /
                        float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        hist_data = create_hist_data(df, column, stats["min"], stats["max"],
                                     bins)

        return stats
Example #11
    def dataStatistics(self, categoricalFeatures, numericalFeatures):
        # self.dataTranform()
        self.categoricalFeatures = categoricalFeatures
        self.numericalFeatures = numericalFeatures
        summaryList = ['mean', 'stddev', 'min', 'max']
        summaryDict = {}
        dataset = self.dataset
        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')
        for colm in self.numericalFeatures:
            summaryListTemp = []
            for value in summaryList:
                summ = list(
                    dataset.select(colm).summary(value).toPandas()[colm])
                summaryListSubTemp = []
                for val in summ:
                    summaryListSubTemp.append(round(float(val), 4))
                summaryListTemp.append(summaryListSubTemp)
            summaryDict[colm] = summaryListTemp
        summaryList.extend(['skewness', 'kurtosis', 'variance'])
        summaryDict['summaryName'] = summaryList
        summaryDict['categoricalColumn'] = self.categoricalFeatures
        skewnessList = []
        kurtosisList = []
        varianceList = []
        skewKurtVarDict = {}
        for colm in self.numericalFeatures:
            skewness = (dataset.select(F.skewness(dataset[colm])).toPandas())
            for i, row in skewness.iterrows():
                for j, column in row.items():
                    skewnessList.append(round(column, 4))
            kurtosis = (dataset.select(F.kurtosis(dataset[colm])).toPandas())
            for i, row in kurtosis.iterrows():
                for j, column in row.items():
                    kurtosisList.append(round(column, 4))
            variance = (dataset.select(F.variance(dataset[colm])).toPandas())
            for i, row in variance.iterrows():
                for j, column in row.items():
                    varianceList.append(round(column, 4))

        for skew, kurt, var, colm in zip(skewnessList, kurtosisList,
                                         varianceList, self.numericalFeatures):
            print(skew, kurt, var)
            skewKurtVarList = []
            skewKurtVarList.append(skew)
            skewKurtVarList.append(kurt)
            skewKurtVarList.append(var)
            skewKurtVarDict[colm] = skewKurtVarList

        for (keyOne, valueOne), (keyTwo,
                                 valueTwo) in zip(summaryDict.items(),
                                                  skewKurtVarDict.items()):
            print(keyOne, valueOne, keyTwo, valueTwo)
            if keyOne == keyTwo:
                valueOne.extend(valueTwo)
                summaryDict[keyOne] = valueOne
        return summaryDict
Example #12
 def __init__(self):
     super(FeaturePathDepthVariance, self).__init__()
     self.group_by_aggs = {
         'client_url_slash_count_variance':
         F.variance(F.col('client_url_slash_count'))
     }
     self.pre_group_by_calcs = {
         'client_url_slash_count':
         (F.size(F.split(F.col('client_url'), '/')) - 1)
     }
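
# A standalone sketch (illustrative URLs) of the slash-count calculation above:
# F.size(F.split(col, '/')) - 1 equals the number of '/' characters in client_url.
from pyspark.sql import Row, SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
urls = spark.createDataFrame([Row(client_url="/a/b/c"), Row(client_url="/a")])
urls.withColumn("client_url_slash_count",
                F.size(F.split(F.col("client_url"), "/")) - 1) \
    .agg(F.variance("client_url_slash_count")
          .alias("client_url_slash_count_variance")).show()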
Example #13
    def __calc_stats(self, df, resolution):
        """
        Calculates statistics for every column in the Spark DF and returns a separate DF with the results.
        Statistics: sum, min, max, count, mean, kurtosis, skewness, stddev, variance.
        :param df: DF containing the columns that you want to run your statistics calculations on
        :param resolution: int resolution in milli or microseconds OR string '5m'/'1h'/'1d'
        :return: aggregation dataframe containing statistics
        """

        if type(resolution) is str:
            # resolution to microseconds
            res_dict = {'5m': 300000000, '1h': 3600000000, '1d': 86400000000}
            agg_interval = res_dict[resolution]

        elif type(resolution) is int:
            if len(str(resolution)) < 16:
                resolution = int(str(resolution).ljust(16, '0'))
            agg_interval = resolution

        ts_col = F.col('timestamp')
        df_ori_cols = list(set(df.columns) - set(['timestamp']))

        df = df.withColumn('interval_start',
                           (F.floor(ts_col / agg_interval) * agg_interval))  #\
        #.withColumn('interval_stop', F.ceil(ts_col/agg_interval) * agg_interval)\
        #.orderBy(F.col('interval_start'))
        agg_df = df.groupBy('interval_start').agg(
            F.max(ts_col).alias('max_ts'))

        # TODO Column type checking: string columns are automatically ignored and parse as NaN, so
        # TODO drop NaN columns?

        # TODO: interval_stop ignore, as well as drop max_ts
        # TODO: filter out NaN columns

        # TODO: question: run the statistics job as a separate job without having to make a udf script

        stat_cols = df_ori_cols  #[c for c in df_ori_cols if c not in ['interval_start', 'interval_stop', 'timestamp', 'max_ts']]
        for column in stat_cols:
            grouped_df = df.groupBy('interval_start')\
                           .agg(F.sum(column).alias('sum_%s' % column),
                                F.min(column).alias('min_%s' % column),
                                F.max(column).alias('max_%s' % column),
                                F.count(column).alias('count_%s' % column),
                                F.kurtosis(column).alias('kurtosis_%s' % column),
                                F.mean(column).alias('mean_%s' % column),
                                F.skewness(column).alias('skewness_%s' % column),
                                F.stddev(column).alias('stddev_%s' % column),
                                F.variance(column).alias('var_%s' % column))
            agg_df = grouped_df.join(agg_df, on='interval_start')
        #agg_df = agg_df.drop('max_ts').drop(F.when(F.col('*').isna())).dropna(how='all').drop_duplicates()

        return agg_df
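
# A small illustration of the interval bucketing used above, assuming microsecond
# timestamps: floor(timestamp / agg_interval) * agg_interval maps each row onto
# the start of its aggregation window (here '5m' = 300000000 microseconds).
agg_interval = 300000000
for ts in [0, 299999999, 300000000, 650000000]:
    print(ts, (ts // agg_interval) * agg_interval)
# -> 0, 0, 300000000, 600000000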
Example #14
def vanilla_fluent(pyData):
    df = spark.createDataFrame(pyData, schema=DataPointSchema)
    df = df \
        .groupBy(df.grp, df.subgrp) \
        .agg(
            func.mean(df.C).alias("mean_of_C"),
            func.max(df.D).alias("max_of_D"),
            func.variance(df.E).alias("var_of_E"),
            ((
                func.sum(df.E *df.E)
                - func.pow(func.sum(df.E),2)/func.count(df.E)
            )/(func.count(df.E)-1)).alias("var_of_E2")
        )\
        .orderBy(df.grp, df.subgrp)
    return None, df
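
# A quick numeric check (pure Python, illustrative values) that the explicit
# var_of_E2 formula above matches the definition of sample variance:
#   var = (sum(E^2) - sum(E)^2 / n) / (n - 1)
vals = [-0.5, 1.5, -2.0, 0.75]
n = len(vals)
mean = sum(vals) / n
by_definition = sum((v - mean) ** 2 for v in vals) / (n - 1)
by_identity = (sum(v * v for v in vals) - sum(vals) ** 2 / n) / (n - 1)
assert abs(by_definition - by_identity) < 1e-9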
Example #15
def describe_numeric_1d(df,
                        bins,
                        column,
                        current_result,
                        nrows,
                        k=2,
                        dtype='int'):
    stats_df = df.select(column).na.drop().agg(
        mean(col(column)).alias('mean'),
        min(col(column)).alias('min'),
        max(col(column)).alias('max'),
        variance(col(column)).alias('variance'),
        kurtosis(col(column)).alias('kurtosis'),
        stddev(col(column)).alias('std'),
        skewness(col(column)).alias('skewness'),
        sum(col(column)).alias('sum')).toPandas()

    if dtype.lower() == 'int':
        select_expr = 'percentile({c},CAST({p} AS DOUBLE))'
    else:
        select_expr = 'percentile_approx({c},CAST({p} AS DOUBLE))'
    for p in [0.05, 0.25, 0.5, 0.75, 0.95]:
        stats_df[pretty_name(p)] = (df.select(column).na.drop().selectExpr(
            select_expr.format(c=column, p=p)).toPandas().iloc[:, 0])
    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats['range'] = stats['max'] - stats['min']
    q3, q1 = stats[pretty_name(0.75)], stats[pretty_name(0.25)]
    stats['iqr'] = q3 - q1
    stats['cv'] = stats['std'] / float(stats['mean'])
    stats['mad'] = (df.select(column).na.drop().select(
        abs(col(column) - stats['mean']).alias('delta')).agg(sum(
            col('delta'))).toPandas().iloc[0, 0] /
                    float(current_result['count']))
    stats['type'] = 'NUM'
    stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)
    stats['high_idx'] = df.select(column).where(
        col(column) > q3 + k * (q3 - q1)).count()
    stats['low_idx'] = df.select(column).where(
        col(column) < q1 - k * (q3 - q1)).count()

    # generate histograms
    hist_data = generate_hist_data(df, column, stats['min'], stats['max'],
                                   bins)
    stats['histogram'] = complete_histogram(hist_data)
    stats['mini_histogram'] = mini_histogram(hist_data)
    return stats
Example #16
def numerical_example(data: DataFrame):
    movie_features = data.groupBy("movieId").agg(
        F.count(F.lit(1)).alias("ratingCount"),
        F.avg("rating").alias("avgRating"),
        F.variance("rating").alias("ratingVar")).withColumn(
            "avgRatingVec", udf_avg_rating_to_vec(F.col("avgRating")))
    print_info(movie_features)
    # bucketing
    rating_count_discretizer = QuantileDiscretizer(
        numBuckets=100, inputCol="ratingCount", outputCol="ratingCountBucket")
    # normalization
    rating_scaler = MinMaxScaler(inputCol="avgRatingVec",
                                 outputCol="scaleAvgRating")
    pipeline_stage = [rating_count_discretizer, rating_scaler]
    feature_pipeline = Pipeline(stages=pipeline_stage)
    movie_processed_features = feature_pipeline.fit(movie_features).transform(
        movie_features)
    print_info(movie_processed_features)
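
# udf_avg_rating_to_vec and print_info are defined elsewhere in the source
# project. Presumably the udf wraps avgRating in a one-element vector, as in
# ratingFeatures above; a sketch of that assumption:
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT

udf_avg_rating_to_vec = udf(lambda x: Vectors.dense(x), VectorUDT())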
def cond_fluent_join(pyData):
    dfData = spark.createDataFrame(pyData)
    uncond = dfData \
        .groupBy(dfData.grp, dfData.subgrp) \
        .agg(
            func.mean(dfData.C).alias("mean_of_C"),
            func.max(dfData.D).alias("max_of_D"))
    cond = dfData \
        .filter(dfData.E < 0) \
        .groupBy(dfData.grp, dfData.subgrp) \
        .agg(
            func.variance(dfData.E).alias("cond_var_of_E"))
    uncond \
        .join(cond,
            (uncond.grp == cond.grp) & (uncond.subgrp == cond.subgrp)) \
        .drop(cond.grp) \
        .drop(cond.subgrp) \
        .orderBy(uncond.grp, uncond.subgrp) \
        .collect()
Example #18
def ratingFeatures(ratingSamples):
    ratingSamples.printSchema()
    ratingSamples.show()
    # calculate average movie rating score and rating count
    movieFeatures = ratingSamples.groupBy('movieId').agg(F.count(F.lit(1)).alias('ratingCount'),
                                                         F.avg("rating").alias("avgRating"),
                                                         F.variance('rating').alias('ratingVar')) \
        .withColumn('avgRatingVec', udf(lambda x: Vectors.dense(x), VectorUDT())('avgRating'))
    movieFeatures.show(10)
    # bucketing
    ratingCountDiscretizer = QuantileDiscretizer(numBuckets=100,
                                                 inputCol="ratingCount",
                                                 outputCol="ratingCountBucket")
    # Normalization
    ratingScaler = MinMaxScaler(inputCol="avgRatingVec",
                                outputCol="scaleAvgRating")
    pipelineStage = [ratingCountDiscretizer, ratingScaler]
    featurePipeline = Pipeline(stages=pipelineStage)
    movieProcessedFeatures = featurePipeline.fit(movieFeatures).transform(
        movieFeatures)
    movieProcessedFeatures.show(10)
Example #19
def learn3():
    df1 = ss.read.csv('F:/Research/data/ccFraud.csv',
                      header=True,
                      inferSchema=True)
    df1.show()
    # group df by the gender column and count the rows in each group
    df2 = df1.groupby('gender').count()
    df2.show()
    df3 = df1.describe(['balance', 'numTrans', 'numIntlTrans'])
    df3.show()
    # check the skewness
    df1.agg({'balance': 'skewness'}).show()
    df1.agg(
        functions.max('balance').alias('max'),
        functions.avg('balance').alias('avg'),
        functions.mean('balance').alias('mean'),
        functions.stddev('balance').alias('stddev'),
        functions.sum('balance').alias('sum'),
        functions.skewness('balance').alias('skewness'),
        functions.variance('balance').alias('variance'),
        functions.sumDistinct('balance').alias('sumDistinct')).show()
    corr1 = df1.corr('balance', 'numTrans')
    print(corr1)
def summaryCustomized(raw_df: DataFrame):
    param_name = "countDistinct"
    mySchemaTemp = list(filter(lambda x: (x[1] != 'timestamp'), raw_df.dtypes))
    mySchema = list(map(lambda z: (z[0]), mySchemaTemp))
    ColumnListWithDistinct_count = [param_name] + mySchema
    WithDistinctCntSummaryDF = raw_df.select([
        F.countDistinct(c).alias(c) for c in mySchema
    ]).withColumn(param_name,
                  F.lit(param_name)).selectExpr(ColumnListWithDistinct_count)

    param_name = "NullValueCount"
    ColumnListWithNullValueCount = [param_name] + mySchema
    ColumnListWithNullValueCountDF = raw_df.select(
        [F.sum(F.when(F.isnull(F.col(c)), 1).otherwise(0)).name(c) for c in mySchema]).\
        withColumn(param_name, F.lit(param_name)).selectExpr(ColumnListWithNullValueCount)

    param_name = "variance"
    ColumnListWithVariance = [param_name] + mySchema
    WithVarianceSummaryDF = raw_df.select([F.variance(c).alias(c) for c in mySchema]).\
        withColumn(param_name, F.lit(param_name)).selectExpr(ColumnListWithVariance)

    return raw_df.summary().union(WithDistinctCntSummaryDF).union(
        WithVarianceSummaryDF).union(ColumnListWithNullValueCountDF)
Example #21
      return False
    else:
      return False

# Create UDF funcs
get_pval_udf = F.udf(lambda vars: get_normal_pval(vars), FloatType())
if_norm_udf = F.udf(lambda p: if_norm(p), BooleanType())

# COMMAND ----------

toneDataAll = toneData.select('ActionGeo_FullName', 'wTRA_1d', 'wTRA_60d', 'nArticles') \
                                        .groupBy('ActionGeo_FullName') \
                                        .agg( F.skewness('wTRA_1d'),
                                              F.kurtosis('wTRA_1d'),
                                              F.stddev('wTRA_1d'),
                                              F.variance('wTRA_1d'),
                                              F.collect_list('wTRA_1d').alias('list_wTRA_1d'),
                                              F.skewness('wTRA_60d'),
                                              F.kurtosis('wTRA_60d'),
                                              F.stddev('wTRA_60d'),
                                              F.variance('wTRA_60d'),
                                              F.collect_list('wTRA_60d').alias('list_wTRA_60d'),
                                              F.sum('nArticles').alias('nArticles'),
                                              F.count(F.lit(1)).alias('n_observations')
                                        )

# get p-value and define normalcy
toneDataAll = toneDataAll.withColumn('p_value_1d', get_pval_udf(toneDataAll.list_wTRA_1d))
toneDataAll = toneDataAll.withColumn('if_normal_1d', if_norm_udf(toneDataAll.p_value_1d))
toneDataAll = toneDataAll.withColumn('p_value_60d', get_pval_udf(toneDataAll.list_wTRA_60d))
toneDataAll = toneDataAll.withColumn('if_normal_60d', if_norm_udf(toneDataAll.p_value_60d))
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " + \
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))

print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)

df.select(collect_set("salary")).show(truncate=False)

df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department &amp; Salary: "+str(df2.collect()[0][0]))

print("count: "+str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"), \
    stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"),var_samp("salary"),var_pop("salary")) \
  .show(truncate=False)
Example #23
    else:
        return False


# Create UDF funcs
get_pval_udf = F.udf(lambda vars: get_normal_pval(vars), FloatType())
if_norm_udf = F.udf(lambda p: if_norm(p), BooleanType())

# COMMAND ----------

eventsDataAll = eventsData.select('ActionGeo_FullName', 'wERA_3d', 'wERA_60d', 'nArticles') \
                                        .groupBy('ActionGeo_FullName') \
                                        .agg( F.skewness('wERA_3d'),
                                              F.kurtosis('wERA_3d'),
                                              F.stddev('wERA_3d'),
                                              F.variance('wERA_3d'),
                                              F.collect_list('wERA_3d').alias('list_wERA_3d'),
                                              F.skewness('wERA_60d'),
                                              F.kurtosis('wERA_60d'),
                                              F.stddev('wERA_60d'),
                                              F.variance('wERA_60d'),
                                              F.collect_list('wERA_60d').alias('list_wERA_60d'),
                                              F.sum('nArticles').alias('nArticles'),
                                              F.count(F.lit(1)).alias('n_observations')
                                        )

# get p-value and define normalcy
eventsDataAll = eventsDataAll.withColumn(
    'p_value_3d', get_pval_udf(eventsDataAll.list_wERA_3d))
eventsDataAll = eventsDataAll.withColumn('if_normal_3d',
                                         if_norm_udf(eventsDataAll.p_value_3d))
  sum("Quantity").alias("total_purchases"),
  avg("Quantity").alias("avg_purchases"),
  expr("mean(Quantity)").alias("mean_purchases")) \
.selectExpr(
  "total_purchases/total_transactions",
  "avg_purchases",
  "mean_purchases").show()

# ----------------------------------------------------------
# Example 4 - variance and standard deviation
# ----------------------------------------------------------
                 
from pyspark.sql.functions import var_pop, stddev_pop, variance, stddev
from pyspark.sql.functions import var_samp, stddev_samp

df.select(variance("Quantity"), stddev("Quantity"),
  var_pop("Quantity"), var_samp("Quantity"),
  stddev_pop("Quantity"), stddev_samp("Quantity")).show()
    
spark.sql("""SELECT var_pop(Quantity), 
                    var_samp(Quantity),
                    stddev_pop(Quantity), 
                    stddev_samp(Quantity)
             FROM dfTable""").show()

#----------------------------------------------------------
# Example 5 - skewness & kurtosis
#----------------------------------------------------------
from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()             
Example #25
    def describe_float_1d(df, column, current_result, nrows):
        if spark_version == "1.6+":
            stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                       df_min(col(column)).alias("min"),
                                                       df_max(col(column)).alias("max"),
                                                       variance(col(column)).alias("variance"),
                                                       kurtosis(col(column)).alias("kurtosis"),
                                                       stddev(col(column)).alias("std"),
                                                       skewness(col(column)).alias("skewness"),
                                                       df_sum(col(column)).alias("sum")
                                                       ).toPandas()
        else:
            stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                       df_min(col(column)).alias("min"),
                                                       df_max(col(column)).alias("max"),
                                                       df_sum(col(column)).alias("sum")
                                                       ).toPandas()
            stats_df["variance"] = df.select(column).na.drop().agg(variance_custom(col(column),
                                                                                   stats_df["mean"].ix[0],
                                                                                   current_result["count"])).toPandas().ix[0][0]
            stats_df["std"] = np.sqrt(stats_df["variance"])
            stats_df["skewness"] = df.select(column).na.drop().agg(skewness_custom(col(column),
                                                                                   stats_df["mean"].ix[0],
                                                                                   current_result["count"])).toPandas().ix[0][0]
            stats_df["kurtosis"] = df.select(column).na.drop().agg(kurtosis_custom(col(column),
                                                                                   stats_df["mean"].ix[0],
                                                                                   current_result["count"])).toPandas().ix[0][0]

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (df.select(column)
                                        .na.drop()
                                        .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                    .format(col=column, n=x)).toPandas().iloc[:, 0]
                                        )
        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column)
                        .na.drop()
                        .select(df_abs(col(column)-stats["mean"]).alias("delta"))
                        .agg(df_sum(col("delta"))).toPandas().ix[0,0] / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column)==0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        # Large histogram
        imgdata = BytesIO()
        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
        figure = plt.figure(figsize=(6, 4))
        plot = plt.subplot()
        plt.bar(hist_data["left_edge"],
                hist_data["count"],
                width=hist_data["width"],
                facecolor='#337ab7')
        plot.set_ylabel("Frequency")
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        #TODO Think about writing this to disk instead of caching them in strings
        plt.close(plot.figure)

        stats['mini_histogram'] = mini_histogram(hist_data)

        return stats
Example #26
    def describe_float_1d(df, column, current_result, nrows):
        if spark_version == "1.6+":
            stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                       df_min(col(column)).alias("min"),
                                                       df_max(col(column)).alias("max"),
                                                       variance(col(column)).alias("variance"),
                                                       kurtosis(col(column)).alias("kurtosis"),
                                                       stddev(col(column)).alias("std"),
                                                       skewness(col(column)).alias("skewness"),
                                                       df_sum(col(column)).alias("sum"),
                                                       df_sum((col(column) == 0.0).cast("int")).alias('n_zeros')
                                                       ).toPandas()
        else:
            stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                       df_min(col(column)).alias("min"),
                                                       df_max(col(column)).alias("max"),
                                                       df_sum(col(column)).alias("sum"),
                                                       df_sum((col(column) == 0.0).cast("int")).alias('n_zeros')
                                                       ).toPandas()
            stats_df["variance"] = df.select(column).na.drop().agg(variance_custom(col(column),
                                                                                   stats_df["mean"].iloc[0],
                                                                                   current_result["count"])).toPandas().iloc[0][0]
            stats_df["std"] = np.sqrt(stats_df["variance"])
            stats_df["skewness"] = df.select(column).na.drop().agg(skewness_custom(col(column),
                                                                                   stats_df["mean"].iloc[0],
                                                                                   current_result["count"])).toPandas().iloc[0][0]
            stats_df["kurtosis"] = df.select(column).na.drop().agg(kurtosis_custom(col(column),
                                                                                   stats_df["mean"].iloc[0],
                                                                                   current_result["count"])).toPandas().iloc[0][0]

        for x in [0.05, 0.25, 0.5, 0.75, 0.95]:
            stats_df[pretty_name(x)] = (df.select(column)
                                        .na.drop()
                                        .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                    .format(col=column, n=x)).toPandas().iloc[:,0]
                                        )
        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column)
                        .na.drop()
                        .select(df_abs(col(column)-stats["mean"]).alias("delta"))
                        .agg(df_sum(col("delta"))).toPandas().iloc[0,0] / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        # Large histogram
        imgdata = BytesIO()
        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
        figure = plt.figure(figsize=(6, 4))
        plot = plt.subplot()
        plt.bar(hist_data["left_edge"],
                hist_data["count"],
                width=hist_data["width"],
                facecolor='#337ab7')
        plot.set_ylabel("Frequency")
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        #TODO Think about writing this to disk instead of caching them in strings
        plt.close(plot.figure)

        stats['mini_histogram'] = mini_histogram(hist_data)

        return stats
Example #27
def randomClassifier(dataset_add, feature_colm, label_colm, relation_list,
                     relation):
    try:
        dataset = spark.read.parquet(dataset_add)
        label = ''
        for y in label_colm:
            label = y

        Schema = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in Schema:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)

        summaryList = ['mean', 'stddev', 'min', 'max']
        summaryDict = {}

        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')
        for colm in numericalFeatures:
            summaryListTemp = []
            for value in summaryList:
                summ = list(
                    dataset.select(colm).summary(value).toPandas()[colm])
                summaryListSubTemp = []
                for val in summ:
                    summaryListSubTemp.append(round(float(val), 4))
                # print(summaryListSubTemp)
                summaryListTemp.append(summaryListSubTemp)
            # varianceListTemp = list(dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm])
            # summaryListTemp.append(varianceListTemp)
            summaryDict[colm] = summaryListTemp
        # summaryList.append('variance')
        summaryDict['summaryName'] = summaryList
        summaryDict['categoricalColumn'] = stringFeatures
        skewnessList = []
        kurtosisList = []
        varianceList = []
        skewKurtVarDict = {}
        for colm in numericalFeatures:
            skewness = (dataset.select(F.skewness(dataset[colm])).toPandas())
            for i, row in skewness.iterrows():
                for j, column in row.items():
                    skewnessList.append(round(column, 4))
            kurtosis = (dataset.select(F.kurtosis(dataset[colm])).toPandas())
            for i, row in kurtosis.iterrows():
                for j, column in row.items():
                    kurtosisList.append(round(column, 4))
            variance = (dataset.select(F.variance(dataset[colm])).toPandas())
            for i, row in variance.iterrows():
                for j, column in row.items():
                    varianceList.append(round(column, 4))

        for skew, kurt, var, colm in zip(skewnessList, kurtosisList,
                                         varianceList, numericalFeatures):
            print(skew, kurt, var)
            skewKurtVarList = []
            skewKurtVarList.append(skew)
            skewKurtVarList.append(kurt)
            skewKurtVarList.append(var)
            skewKurtVarDict[colm] = skewKurtVarList

        for (keyOne, valueOne), (keyTwo,
                                 valueTwo) in zip(summaryDict.items(),
                                                  skewKurtVarDict.items()):
            print(keyOne, valueOne, keyTwo, valueTwo)
            if keyOne == keyTwo:
                valueOne.extend(valueTwo)
                summaryDict[keyOne] = valueOne
        print(summaryDict)
        print(summaryList.extend(['skewness', 'kurtosis', 'variance']))
        print(summaryDict)
        # for colm in numericalFeatures:
        #     skewness = (dataset.select(F.skewness(dataset[colm])).alias('skewness_' + colm))
        #     kurtosis = (dataset.select(F.kurtosis(dataset[colm])).alias('kurtosis_' + colm))
        #     variance = (dataset.select(F.variance(dataset[colm]).alias('kurtosis_' + colm)))
        if relation == 'linear':
            dataset = dataset
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)

        dataset.show()
        for x in Schema:
            if (str(x.dataType) == "StringType" and x.name == label):
                for labelkey in label_colm:
                    label_indexer = StringIndexer(inputCol=label,
                                                  outputCol='indexed_' +
                                                  label).fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label
        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        final_features = numericalFeatures + indexed_features
        response_chi_test = chi_square_test(dataset=dataset,
                                            features=indexed_features,
                                            label_col=label,
                                            stringFeatures=stringFeatures)

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")
        dataset = featureassembler.transform(dataset)
        dataset.show()
        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=4).fit(dataset)
        categorical_features = vec_indexer.categoryMaps
        print("Choose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))
        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()
        finalized_data = vec_indexed.select(label, 'vec_indexed_features')
        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)
        rf = RandomForestClassifier(labelCol=label,
                                    featuresCol='vec_indexed_features',
                                    numTrees=10)
        model = rf.fit(train_data)
        predictions = model.transform(test_data)
        print(model.featureImportances)
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)
        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)
        # feature_importance = [round(x,4) for x in feature_importance]
        featureImportance = []
        for x in feature_importance:
            featureImportance.append(round(x, 4))
        print(featureImportance)

        features_column_for_user = numericalFeatures + stringFeatures
        feature_imp = {
            'feature_importance': featureImportance,
            "feature_column": features_column_for_user
        }
        response_dict = {
            'feature_importance': feature_imp,
            'ChiSquareTestData': response_chi_test,
            'summaryDict': summaryDict
        }
        return response_dict
    except Exception as e:
        print("exception is  = " + str(e))
Example #28
def task_3(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    asin_column = 'asin'
    price_column = 'price'
    attribute = 'also_viewed'
    related_column = 'related'
    # Outputs:
    meanPriceAlsoViewed_column = 'meanPriceAlsoViewed'
    countAlsoViewed_column = 'countAlsoViewed'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    data = product_data.select(
        F.col(asin_column),
        F.explode(F.col(related_column))).filter(F.col('key') == attribute)

    data = data.select(F.col(asin_column), F.explode_outer(F.col('value')))

    first_join = product_data.select(
        F.col(asin_column).alias("col"),
        F.col(price_column).alias("prices"))

    merged = data.join(first_join, on="col", how='left')

    merged = merged.groupby(F.col(asin_column)).agg(
        F.avg('prices').alias(meanPriceAlsoViewed_column),
        F.count('*').alias(countAlsoViewed_column))

    merged = merged.withColumn(
        countAlsoViewed_column,
        F.when(F.col(countAlsoViewed_column) == 0,
               None).otherwise(F.col(countAlsoViewed_column)))

    count_total = product_data.count()

    numNulls_meanPriceAlsoViewed = count_total - merged.where(
        (merged[meanPriceAlsoViewed_column].isNotNull())).count()

    numNulls_countAlsoViewed = count_total - merged.where(
        (merged[countAlsoViewed_column].isNotNull())).count()

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_meanPriceAlsoViewed': None,
        'variance_meanPriceAlsoViewed': None,
        'numNulls_meanPriceAlsoViewed': None,
        'mean_countAlsoViewed': None,
        'variance_countAlsoViewed': None,
        'numNulls_countAlsoViewed': None
    }
    # Modify res:
    res['count_total'] = count_total
    res['mean_meanPriceAlsoViewed'] = merged.select(
        F.avg(F.col(meanPriceAlsoViewed_column))).head()[0]
    res['variance_meanPriceAlsoViewed'] = merged.select(
        F.variance(F.col(meanPriceAlsoViewed_column))).head()[0]
    res['numNulls_meanPriceAlsoViewed'] = numNulls_meanPriceAlsoViewed
    res['mean_countAlsoViewed'] = merged.select(
        F.avg(F.col(countAlsoViewed_column))).head()[0]
    res['variance_countAlsoViewed'] = merged.select(
        F.variance(F.col(countAlsoViewed_column))).head()[0]
    res['numNulls_countAlsoViewed'] = numNulls_countAlsoViewed

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_3')
    return res
Example #29
def task_4(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    price_column = 'price'
    title_column = 'title'
    # Outputs:
    meanImputedPrice_column = 'meanImputedPrice'
    medianImputedPrice_column = 'medianImputedPrice'
    unknownImputedTitle_column = 'unknownImputedTitle'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    casted_data = product_data.withColumn(
        price_column,
        F.col(price_column).cast(T.FloatType()))

    mean_imputer = M.feature.Imputer(strategy='mean',
                                     inputCols=[price_column],
                                     outputCols=[meanImputedPrice_column])
    mean_model = mean_imputer.fit(casted_data)
    meanImputedPrice = mean_model.transform(
        casted_data.select('asin', price_column))

    mean_meanImputedPrice = meanImputedPrice.select(
        F.avg(F.col(meanImputedPrice_column))).head()[0]
    variance_meanImputedPrice = meanImputedPrice.select(
        F.variance(F.col(meanImputedPrice_column))).head()[0]
    numNulls_meanImputedPrice = meanImputedPrice.filter(
        F.col(meanImputedPrice_column).isNull()).count()

    median_imputer = M.feature.Imputer(strategy='median',
                                       inputCols=[price_column],
                                       outputCols=[medianImputedPrice_column])
    median_model = median_imputer.fit(casted_data)
    medianImputedPrice = median_model.transform(meanImputedPrice)

    mean_medianImputedPrice = medianImputedPrice.select(
        F.avg(F.col(medianImputedPrice_column))).head()[0]
    variance_medianImputedPrice = medianImputedPrice.select(
        F.variance(F.col(medianImputedPrice_column))).head()[0]
    numNulls_medianImputedPrice = medianImputedPrice.filter(
        F.col(medianImputedPrice_column).isNull()).count()

    unknownImputedTitle = product_data.select(
        F.col(title_column).alias(unknownImputedTitle_column)).fillna(
            'unknown').replace('', 'unknown')
    numUnknowns_unknownImputedTitle = unknownImputedTitle.filter(
        F.col(unknownImputedTitle_column) == 'unknown').count()

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_meanImputedPrice': None,
        'variance_meanImputedPrice': None,
        'numNulls_meanImputedPrice': None,
        'mean_medianImputedPrice': None,
        'variance_medianImputedPrice': None,
        'numNulls_medianImputedPrice': None,
        'numUnknowns_unknownImputedTitle': None
    }
    # Modify res:

    res['count_total'] = medianImputedPrice.count()
    res['mean_meanImputedPrice'] = mean_meanImputedPrice
    res['variance_meanImputedPrice'] = variance_meanImputedPrice
    res['numNulls_meanImputedPrice'] = numNulls_meanImputedPrice
    res['mean_medianImputedPrice'] = mean_medianImputedPrice
    res['variance_medianImputedPrice'] = variance_medianImputedPrice
    res['numNulls_medianImputedPrice'] = numNulls_medianImputedPrice
    res['numUnknowns_unknownImputedTitle'] = numUnknowns_unknownImputedTitle

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_4')
    return res
Example #30
def task_2(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    ### category
    data = product_data.withColumn(
        category_column,
        F.when(F.col(categories_column)[0][0] == '',
               None).otherwise(F.col(categories_column)[0][0]))
    category_nulls = data.filter(F.col(category_column).isNull()).count()
    category_distinct = data.agg(F.countDistinct(
        F.col(category_column))).head()[0]

    ### salesRank and salesCategory
    key_and_values = data.select(
        asin_column, category_column,
        F.map_keys(salesRank_column)[0].alias(bestSalesCategory_column),
        F.map_values(salesRank_column)[0].alias(bestSalesRank_column))

    mean_of_salesRank = key_and_values.select(
        F.avg(F.col(bestSalesRank_column))).head()[0]
    variance_of_salesRank = key_and_values.select(
        F.variance(F.col(bestSalesRank_column))).head()[0]

    salesCategory_nulls = key_and_values.filter(
        F.col(bestSalesCategory_column).isNull()).count()
    salesCategory_distinct = key_and_values.agg(
        F.countDistinct(F.col(bestSalesCategory_column))).head()[0]

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_bestSalesRank': None,
        'variance_bestSalesRank': None,
        'numNulls_category': None,
        'countDistinct_category': None,
        'numNulls_bestSalesCategory': None,
        'countDistinct_bestSalesCategory': None
    }
    # Modify res:

    res['count_total'] = data.count()
    res['mean_bestSalesRank'] = mean_of_salesRank
    res['variance_bestSalesRank'] = variance_of_salesRank
    res['numNulls_category'] = category_nulls
    res['countDistinct_category'] = category_distinct
    res['numNulls_bestSalesCategory'] = salesCategory_nulls
    res['countDistinct_bestSalesCategory'] = salesCategory_distinct

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res
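
# A small illustration (toy rows) of the map_keys/map_values pattern task_2 uses
# above to pull the first entry out of the salesRank map column; the column name
# comes from the code above, the data is illustrative only.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([({"Books": 12345},), (None,)], ["salesRank"])
toy.select(F.map_keys("salesRank")[0].alias("bestSalesCategory"),
           F.map_values("salesRank")[0].alias("bestSalesRank")).show()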
Example #31
    def run_pipeline(self):
        try:
            logging.info(
                "https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/"
            )
            # check collect_list and collect_set
            #collect_set() function returns all values from an input column with duplicate values eliminated.
            #collect_list() function returns all values from an input column with duplicates

            logging.info(
                'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/'
            )
            simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
                          ("Robert", "Sales", 4100),
                          ("Maria", "Finance", 3000), ("James", "Sales", 3000),
                          ("Scott", "Finance", 3300), ("Jen", "Finance", 3900),
                          ("Jeff", "Marketing", 3000),
                          ("Kumar", "Marketing", 2000),
                          ("Saif", "Sales", 4100)]
            schema = ["employee_name", "department", "salary"]

            df = self.spark.createDataFrame(data=simpleData,
                                            schema=schema).cache()
            df.show(truncate=False)

            from pyspark.sql.functions import approx_count_distinct, collect_list
            from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
            from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
            from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
            from pyspark.sql.functions import variance, var_samp, var_pop
            df.printSchema()
            df.show(truncate=False)

            print("approx_count_distinct: " + \
                  str(df.select(approx_count_distinct("salary")).collect()[0][0]))

            print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

            df.select(collect_list("salary")).show(truncate=False)

            df.select(collect_set("salary")).show(truncate=False)

            df2 = df.select(countDistinct("department", "salary"))
            df2.show(truncate=False)
            print("Distinct Count of Department & Salary: " +
                  str(df2.collect()[0][0]))

            print("count: " + str(df.select(count("salary")).collect()[0]))
            dffirst = df.select(first("salary"))
            dffirst.show(truncate=False)
            df.select(last("salary")).show(truncate=False)
            df.select(kurtosis("salary")).show(truncate=False)
            df.select(max("salary")).show(truncate=False)
            df.select(min("salary")).show(truncate=False)
            df.select(mean("salary")).show(truncate=False)
            df.select(skewness("salary")).show(truncate=False)
            df.select(stddev("salary"), stddev_samp("salary"), \
                      stddev_pop("salary")).show(truncate=False)
            df.select(sum("salary")).show(truncate=False)
            df.select(sumDistinct("salary")).show(truncate=False)
            df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
                .show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occured while running the pipeline > " +
                          str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return