Example #1
def is_except(df, cols: dict, groupBy: list):
    """
    :param df: 包含的columns:cust_id com_id cust_seg value
    :param cols:
    :param groupBy: list
    """
    value = cols["value"]
    abnormal = cols["abnormal"]
    mean_plus_3std = cols["mean_plus_3std"]
    mean_minus_3std = cols["mean_minus_3std"]
    mean = cols["mean"]

    # group by city and grade and compute the mean and standard deviation of value
    mean_3std = df.groupBy(groupBy) \
        .agg(f.mean(value).alias("mean"), (f.stddev_pop(value) * 3).alias("+3std"),
             (f.stddev_pop(value) * (-3)).alias("-3std")) \
        .withColumn("mean+3std", col("mean")+col("+3std")) \
        .withColumn("mean-3std", col("mean")+col("-3std"))

    result=df.join(mean_3std, groupBy)\
        .withColumn(abnormal,f.when(col(value)>col("mean+3std"),1)
                            .when(col(value)<col("mean-3std"),0)
                  )\
        .withColumnRenamed("mean+3std", mean_plus_3std) \
        .withColumnRenamed("mean-3std", mean_minus_3std) \
        .withColumnRenamed("mean", mean) \
        .dropna(subset=[abnormal])
    return result
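
A minimal call sketch for is_except, assuming the function and its module-level imports (pyspark.sql.functions as f, col) are in scope; the cols mapping and sample rows below are purely illustrative.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [("c1", "com1", 10, 5.0), ("c2", "com1", 10, 5.2), ("c3", "com1", 10, 99.0)],
    ["cust_id", "com_id", "cust_seg", "value"])
cols = {"value": "value", "abnormal": "abnormal", "mean": "value_mean",
        "mean_plus_3std": "value_mean_plus_3std", "mean_minus_3std": "value_mean_minus_3std"}
# keeps only the rows whose value falls outside mean +/- 3 * stddev_pop of their group
is_except(sample, cols, ["com_id", "cust_seg"]).show()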
Example #2
def non_lambda_standardize(df):
    # get a list of columns to standardize
    standardize_cols = [colName for colName in df.columns if colName != "ehhn"]

    # create a list of expressions to average/stddev each column
    avg_columns = [
        avg(col(colName)).alias(colName) for colName in standardize_cols
    ]
    stddev_columns = [
        stddev_pop(col(col_name)).alias(col_name)
        for col_name in standardize_cols
    ]

    # create a dataframe of the averages/stddevs
    df_avgs = df[avg_columns].na.fill(0)
    df_stddevs = df[stddev_columns].na.fill(0)
    # df_avgs.select('population_density', 'cafeteria_count', 'casual_dining_count', 'family_restaurant_count').show()

    avgs = df_avgs.first().asDict()
    stddevs = df_stddevs.first().asDict()

    # zscore(...) is a helper from the original module; avgs/stddevs hold plain
    # scalars from Row.asDict(), so they are indexed by column name only
    exprs = [
        zscore(colName, avgs[colName], stddevs[colName])
        for colName in standardize_cols
    ]
    exprs.insert(0, col("ehhn"))

    return df[exprs]
Example #3
def poi_index(spark, cust_lng_lat, coordinate, regex, is_fill=False):
    """
    :param cust_lng_lat:DataFrame,列:city cust_id lng lat lng_l lng_r lat_d lat_u
    :param coordinate:poi 列:cityname lng lat types
    :param regex:  过滤poi的正则
    :param is_fill: 是否填充
    :return DataFrame city cust_id poi_index
    """

    print(f"{str(dt.now())} poi指数  {regex}")
    # coordinate先过滤符合条件的服务 再去join零售户, 在按照city cust_id分组
    count_df = coordinate.where(col("types").rlike(regex)) \
                        .join(cust_lng_lat.drop("lng","lat"),
                              (col("cityname")==col("city")) &
                              (col("lng") >= col("lng_l")) & (col("lng") <= col("lng_r"))
                              & (col("lat") >= col("lat_d")) & (col("lat") <= col("lat_u"))) \
                        .groupBy("city", "cust_id").count()

    if is_fill:
        # retailers that do have a count of nearby matching shops
        exist_df = count_df.join(cust_lng_lat, ["city", "cust_id"]) \
            .select("city", "cust_id", "lng", "lat", "count")

        # retailers with no nearby matching shops
        not_df = cust_lng_lat.select("city", "cust_id") \
            .exceptAll(count_df.select("city", "cust_id")) \
            .join(cust_lng_lat, ["city", "cust_id"]) \
            .select("city", "cust_id", "lng", "lat")

        if not_df.count() > 0:
            # fill the missing counts with KNN
            fill_pd_df = fillWithKNN(exist_df.toPandas(), not_df.toPandas(),
                                     "count")
            fill_df = spark.createDataFrame(fill_pd_df)

            all_df = exist_df.unionByName(fill_df)
        else:
            all_df = exist_df

        # compute the threshold = mean(count) + 3 * std(count)
        threshold = all_df.groupBy("city").agg(
            (f.mean("count") + 3 * f.stddev_pop("count")).alias("threshold"),
            f.max("count"))
        # if the count exceeds the threshold, replace it with the threshold
        replace_df = all_df.join(threshold, "city") \
            .withColumn("count", f.when(col("count") > col("threshold"), col("threshold"))
                        .otherwise(col("count"))
                        )

        result = replace_df
    else:
        result = count_df

    # city-wide maximum of the count, computed the same way for all shops
    count_max = result.groupBy("city").agg(f.max("count").alias("max"))

    # result
    return result.join(count_max,
                       "city").withColumn("poi_index",
                                          col("count") / col("max") * 5)
Example #4
def standardize(df):
    # get a list of columns to standardize
    standardize_cols = list(filter(lambda c: c != "ehhn", df.columns))

    # create a list of expressions to average/stddev each column
    avg_columns = map(lambda c: avg(col(c).cast(DoubleType())).alias(c),
                      standardize_cols)
    stddev_columns = map(
        lambda c: stddev_pop(col(c).cast(DoubleType())).alias(c),
        standardize_cols)

    # create a dataframe of the averages/stddevs
    df_avgs = df.groupBy().agg(*avg_columns).na.fill(0)
    df_stddevs = df.groupBy().agg(*stddev_columns).na.fill(0)
    # df_avgs.select('population_density', 'cafeteria_count', 'casual_dining_count', 'family_restaurant_count').show()

    avgs = df_avgs.first().asDict()
    stddevs = df_stddevs.first().asDict()

    exprs = [col("ehhn")] + list(
        map(
            lambda c: lit(0).alias(c) if (stddevs[c] == 0) else
            ((col(c) - avgs[c]) / stddevs[c]).alias(c), standardize_cols))

    return df[exprs]
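
The same z-score transform can also be written in a single pass with an all-rows window instead of collecting the aggregates first; a sketch under the assumption that the frame is small enough for a single partition (the collect-based version above scales better).

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, mean, stddev_pop

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(1, 2.0), (2, 4.0), (3, 6.0)], ["ehhn", "x"])

w = Window.partitionBy()  # one global window: every row sees the same mean/stddev
z = ((col("x") - mean("x").over(w)) / stddev_pop("x").over(w)).alias("x")
toy.select(col("ehhn"), z).show()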
Example #5
    def add_additional_fields(self, table):
        days = lambda i: i * 86400

        windowSpec = Window.partitionBy("ticker").orderBy(
            col("Date").cast('long')).rangeBetween(-days(7), 0)

        windowSpec20 = Window.partitionBy("ticker").orderBy(
            col("Date").cast('long')).rangeBetween(-days(20), 0)

        union_table = table.withColumn('rolling_seven_day_average',
                                       avg("Close").over(windowSpec))

        union_table = union_table.withColumn('rolling_20_day_average',
                                             avg("Close").over(windowSpec20))

        union_table = union_table.withColumn(
            'stdev',
            stddev_pop("Close").over(windowSpec20))

        union_table = union_table.withColumn(
            'Upper',
            union_table['rolling_20_day_average'] + (union_table['stdev'] * 2))

        union_table = union_table.withColumn(
            'Lower',
            union_table['rolling_20_day_average'] - (union_table['stdev'] * 2))

        return union_table
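
A standalone sketch of the same rolling/Bollinger-band computation outside the class, assuming columns named ticker, Date and Close; the sample rows are illustrative.

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import avg, col, stddev_pop

spark = SparkSession.builder.getOrCreate()
prices = spark.createDataFrame(
    [("AAPL", "2021-01-01", 130.0), ("AAPL", "2021-01-05", 132.0), ("AAPL", "2021-01-12", 128.0)],
    ["ticker", "Date", "Close"]).withColumn("Date", col("Date").cast("timestamp"))

days = lambda i: i * 86400  # rangeBetween operates on the Date cast to seconds
w20 = Window.partitionBy("ticker").orderBy(col("Date").cast("long")).rangeBetween(-days(20), 0)

bands = (prices
         .withColumn("rolling_20_day_average", avg("Close").over(w20))
         .withColumn("stdev", stddev_pop("Close").over(w20))
         .withColumn("Upper", col("rolling_20_day_average") + 2 * col("stdev"))
         .withColumn("Lower", col("rolling_20_day_average") - 2 * col("stdev")))
bands.show()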
Example #6
 def _add_outliers(dataframe, **kwargs):
     """
     Calculate a boundary for which a data point will be considered an outlier [bool]
     The boundary is the mean plus "stddev" (number of standard derivations) * the standard derivation
     Uses pyspark's Window function to partition over the special predictions and thereby count number of data 
     points in each cluster, their number of outliers and the outlier percentage 
     
     :param dataframe: 
     :param kwargs: 
         prediction_col can be set in the function call, else it will search for the column name 'predictionCol'
         distance_col can be set in the function call, else it will search for the column name 'distance'
         no_stddev (number of standard derivations) can be set in the function call, else default sat to 2
     :return: dataframe with added 'is_outlier' bool column
     """
     assert kwargs.get('distance_col', 'distance') in dataframe.columns, 'Distances have not been computed!'
     prediction_col = F.col(kwargs.get('prediction_col', 'prediction'))
     distance_col = F.col(kwargs.get('distance_col', 'distance'))
     no_stddev = kwargs.get('no_stddev', 2.0)
     window_outlier = Window().partitionBy(prediction_col)
     computed_boundary = (F.mean(distance_col).over(window_outlier)
                          + no_stddev * F.stddev_pop(distance_col).over(window_outlier)
                          )
     return (dataframe
             .withColumn(colName='computed_boundary',
                         col=computed_boundary)
             .withColumn(colName='is_outlier',
                         col=F.when(distance_col > computed_boundary, 1).otherwise(0))
             )
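
A minimal call sketch for _add_outliers, assuming it is reachable at module level together with its imports (pyspark.sql.functions as F, Window); the clustered sample data is illustrative.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
clustered = spark.createDataFrame(
    [(0, 1.0), (0, 1.1), (0, 9.0), (1, 2.0), (1, 2.2)],
    ["prediction", "distance"])
# flags rows whose distance exceeds mean + 2 * stddev_pop within their own cluster
_add_outliers(clustered, no_stddev=2.0).show()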
Example #7
    def _apply_on_cols(self) -> List[Callable]:
        input_col = self.input_col

        return [
            f.mean(f.col(input_col)).alias(
                StandardScalarScalerConfig.mean_token),
            f.stddev_pop(f.col(input_col)).alias(
                StandardScalarScalerConfig.std_token)
        ]
Example #8
def filter_using_constraints(consistency_df, constraint_list):
    consistency_df = consistency_df.where(func.col("winner")\
                    .isin(constraint_list))                                # applying the constraint list

    consistency_df = consistency_df.groupBy("winner")\
                    .agg(func.stddev_pop("count").alias("stddev"),\
                    func.sum("count").alias("total_wins"))\
                    .orderBy("stddev","total_wins")                        # calculating the performance consistency
    return consistency_df
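
A minimal call sketch for filter_using_constraints, assuming pyspark.sql.functions is imported as func in the module and that consistency_df holds one win count per team and period; the data is illustrative.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
consistency_df = spark.createDataFrame(
    [("Team A", 10), ("Team A", 12), ("Team B", 5), ("Team B", 20)],
    ["winner", "count"])
# a lower stddev of the per-period win counts means a more consistent performer
filter_using_constraints(consistency_df, ["Team A", "Team B"]).show()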
Example #9
def get_vfr_index():
    try:
        print(f"{str(dt.now())} 零售户周边人流指数")
        # 有人流数据的零售户
        vfr = get_around_vfr(spark)

        vfr.cache()

        # retailers
        co_cust = get_co_cust(spark).select("cust_id")
        # retailers with longitude/latitude
        cust_lng_lat = get_cust_lng_lat(spark) \
            .select("city", "cust_id", "lng", "lat") \
            .join(co_cust, "cust_id")

        cust_lng_lat.cache()

        # 周边没有人流的零售户
        not_df = cust_lng_lat.select("cust_id") \
            .exceptAll(vfr.select("cust_id")) \
            .join(cust_lng_lat, "cust_id")

        exist_df = vfr.join(cust_lng_lat, ["city", "cust_id"])

        if not_df.count() > 0:
            # fill with KNN
            fill_df = fillWithKNN(exist_df.toPandas(), not_df.toPandas(), "avg_vfr")

            all_df = spark.createDataFrame(fill_df) \
                .unionByName(exist_df)
        else:
            all_df=exist_df

        # threshold
        threshold = all_df.groupBy("city") \
            .agg((f.mean("avg_vfr") + 3 * f.stddev_pop("avg_vfr")).alias("threshold"))

        truncate_df = all_df.join(threshold, "city") \
            .withColumn("avg_vfr", f.when(col("avg_vfr") > col("threshold"), col("threshold"))
                        .otherwise(col("avg_vfr"))
                        )

        log_df = truncate_df.withColumn("log", f.log(col("avg_vfr") + 1))
        log_max = log_df.groupBy("city").agg(f.max("log").alias("log_max"))

        colName = "people_count"
        log_df.join(log_max, "city") \
            .withColumn(colName, col("log") / col("log_max") * 5) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))

        vfr.unpersist()
        cust_lng_lat.unpersist()
    except Exception:
        tb.print_exc()
Example #10
def normalizeFeatures(df, cols):
	""" Normalized feature method is used to normalize each feature passed into the list cols"""
	allCols = df.columns
	# remove the cols to normalize, to set the columns of the returned dataframe
	allCols = [x for x in allCols if x not in cols]
	# calculate the avg and stddev of the features to normalize
	stats = (df.groupBy().agg(*([stddev_pop(x).alias(x + '_stddev') for x in cols] + [avg(x).alias(x + '_avg') for x in cols])))
	# broadcast and join into current DF 
	df = df.join(broadcast(stats))
	# normalize the columns and select the required columns for the final DF
	exprs = [x for x in allCols] + [((df[x] - df[x + '_avg']) / df[x + '_stddev']).alias(x) for x in cols]  
	return df.select(*exprs)
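
A usage sketch for normalizeFeatures, assuming its module-level imports (avg, stddev_pop, broadcast from pyspark.sql.functions) are present; column names are illustrative.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
people = spark.createDataFrame(
    [("a", 1.0, 10.0), ("b", 2.0, 20.0), ("c", 3.0, 30.0)],
    ["id", "height", "weight"])
# "id" is carried through unchanged; "height" and "weight" become (x - avg) / stddev_pop
normalizeFeatures(people, ["height", "weight"]).show()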
Example #11
def create_stats(
        df,
        tenant_col: str,
        value_col: str = AccessAnomalyConfig.default_output_col) -> StatsMap:
    stat_rows = df.groupBy(tenant_col).agg(
        f.count('*').alias('__count__'),
        f.min(f.col(value_col)).alias('__min__'),
        f.max(f.col(value_col)).alias('__max__'),
        f.mean(f.col(value_col)).alias('__mean__'),
        f.stddev_pop(f.col(value_col)).alias('__std__')).collect()

    stats_map = {
        row[tenant_col]:
        BasicStats(row['__count__'], row['__min__'], row['__max__'],
                   row['__mean__'], row['__std__'])
        for row in stat_rows
    }

    return StatsMap(stats_map)
Example #12
    def transform(self, dataframe):
        """Applies standardization to the specified columns.

        # Arguments
            dataframe: dataframe. Spark Dataframe.
        """
        # Compute the means of the specified columns.
        means = [mean(x) for x in self.columns]
        means = dataframe.select(means).collect()[0].asDict()
        self.means = self.clean_mean_keys(means)
        # Compute the standard deviation of the specified columns.
        stddevs = [stddev_pop(x) for x in self.columns]
        stddevs = dataframe.select(stddevs).collect()[0].asDict()
        self.stddevs = self.clean_stddev_keys(stddevs)
        # For every feature, add a new column to the dataframe.
        for column in self.columns:
            self.current_column = column
            dataframe = dataframe.rdd.map(self._transform).toDF()

        return dataframe
Example #13
def except_grade(df, cols: dict, groupBy: list, grade: list):
    """
        零售户指标与 {groupBy} 零售户相比
    :param df: 包含的columns:cust_id com_id cust_seg value
    :param cols:   列:cust_id,cols["value"]  cols["value"]:计算均值、标准差的值  如 条均价/订单额
    :param groupBy: list   [city]/[city,cluster_index]   按照city或 city和聚类结果 分组
    :param grade: [3,4,5] 按照 3/4/5倍标准差分档
    """

    value = cols["value"]

    abnormal = "warning_level_code"
    plus_one_grade = "avg_orders_plus3"
    minus_one_grade = "avg_orders_minu3"
    plus_two_grade = "avg_orders_plus4"
    minus_two_grade = "avg_orders_minu4"
    plus_three_grade = "avg_orders_plus5"
    minus_three_grade = "avg_orders_minu5"

    # group by [city] / [city, cust_seg] and compute the mean and standard deviation of value
    mean_std = df.groupBy(groupBy) \
        .agg(f.mean(value).alias("mean"), f.stddev_pop(value).alias("stddev")) \
        .withColumn(plus_one_grade, col("mean") + col("stddev") * grade[0]) \
        .withColumn(minus_one_grade, col("mean") - col("stddev") * grade[0]) \
        .withColumn(plus_two_grade, col("mean") + col("stddev") * grade[1]) \
        .withColumn(minus_two_grade, col("mean") - col("stddev") * grade[1]) \
        .withColumn(plus_three_grade, col("mean") + col("stddev") * grade[2]) \
        .withColumn(minus_three_grade, col("mean") - col("stddev") * grade[2])

    result = df.join(mean_std, groupBy) \
        .withColumn(abnormal,
                    f.when((col(value) > col(plus_one_grade)) & (col(value) < col(plus_two_grade)), "C1")
                    .when((col(value) > col(minus_two_grade)) & (col(value) < col(minus_one_grade)), "C0")
                    .when((col(value) > col(plus_two_grade)) & (col(value) < col(plus_three_grade)), "B1")
                    .when((col(value) > col(minus_three_grade)) & (col(value) < col(minus_two_grade)), "B0")
                    .when((col(value) > col(plus_three_grade)), "A1")
                    .when((col(value) < col(minus_three_grade)), "A0")
                    ).dropna(subset=[abnormal])
    return result
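
A minimal call sketch for except_grade, assuming its module imports (pyspark.sql.functions as f, col); the sample rows and grouping are illustrative.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
orders = spark.createDataFrame(
    [("c1", "com1", 5, 10.0), ("c2", "com1", 5, 10.5), ("c3", "com1", 5, 60.0)],
    ["cust_id", "com_id", "cust_seg", "value"])
# grades retailers whose value sits 3/4/5 population stddevs away from their group mean
except_grade(orders, {"value": "value"}, ["com_id"], [3, 4, 5]).show()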
Example #14
# _read csv file
flightdata = spark.read.option('inferSchema',
                               'true').option('header',
                                              'true').csv('2015-summary.csv')
flightdata.show(5)
flightdata.printSchema()

# _add new column using withColumn; we only print the updated dataframe with show, but it should be assigned to a new variable as a new dataframe.
flightdata.withColumn("newCol", col("count") + 10).show(4)

# _using select, we can also mention column names explicitly in place of *
flightdata_mod = flightdata.select("*", (col("count") + 20).alias("newCol2"))
flightdata_mod.show(5)

# _basic statistical functions
flightdata.select(mean("count")).show()
flightdata.select(min("count")).show()
flightdata.select(max("count")).show()
flightdata.select(stddev_pop("count")).show()
flightdata.select(stddev_samp("count")).show()

# _group by and aggregations
flightdata.groupBy("DEST_COUNTRY_NAME").agg(sum('count')).show(5)
dest_count_data = flightdata.groupBy("DEST_COUNTRY_NAME").agg({'count': 'sum'})

# _write the data to csv after coalesce
dest_count_data_merged = dest_count_data.coalesce(1)
dest_count_data_merged.write.format('csv').option('header',
                                                  'true').save('dest_country')
Example #15
def around_except_grade(around_cust,
                        value_df,
                        cust_cluster,
                        cols: dict,
                        grade=[3, 4, 5]):
    """
          零售户指标与周围零售户相比
    :param around_cust: 列:cust_id1,cust_id0  零售户cust_id1周边包含cust_id0这些零售户
    :param value_df:  列:cust_id,cols["value"]  cols["value"]:计算均值、标准差的值  如 条均价/订单额
    :param cust_cluster: 列:cust_id,cluster_index  零售户聚类结果
    :param cols: {"value":value,}
    :param grade: [3,4,5] 按照 3/4/5倍标准差分档
    :return:
    """

    value = cols["value"]

    abnormal = "warning_level_code"
    plus_one_grade = "avg_orders_plus3"
    minus_one_grade = "avg_orders_minu3"
    plus_two_grade = "avg_orders_plus4"
    minus_two_grade = "avg_orders_minu4"
    plus_three_grade = "avg_orders_plus5"
    minus_three_grade = "avg_orders_minu5"

    # 1. get the cluster of cust_id1
    # 2. get the cluster of cust_id0
    around_cust = around_cust.join(cust_cluster, col("cust_id") == col("cust_id1")) \
                            .drop("cust_id") \
                            .withColumnRenamed("cluster_index", "cluster_index1") \
                            .join(cust_cluster, col("cust_id") == col("cust_id0")) \
                            .withColumnRenamed("cluster_index", "cluster_index0") \
                            .where(col("cluster_index0") == col("cluster_index1")) \
                            .select("cust_id1", "cust_id0")

    # mean and standard deviation over the retailers around each retailer
    mean_std = around_cust.join(value_df, col("cust_id0") == col("cust_id")) \
        .select("cust_id1", value) \
        .groupBy("cust_id1") \
        .agg(f.mean(value).alias("mean"), f.stddev_pop(col(value)).alias("stddev"))
    """
    预警C
        均值 + 3标准差 < 条均价 < 均值 + 4标准差   过高  11
        均值 - 4标准差 < 条均价 < 均值 - 3标准差   过低  10
    预警B
        均值 + 4标准差 < 条均价 < 均值 + 5标准差   过高  21
        均值 - 5标准差 < 条均价 < 均值 - 4标准差   过低  20
    预警A
        均值 + 5标准差 < 条均价    过高  31
        条均价 < 均值 - 5标准差   过低  30
    """
    except_cust = mean_std.join(value_df, col("cust_id1") == col("cust_id")) \
        .withColumn(plus_one_grade, col("mean") + col("stddev") * grade[0]) \
        .withColumn(minus_one_grade, col("mean") - col("stddev") * grade[0]) \
        .withColumn(plus_two_grade, col("mean") + col("stddev") * grade[1]) \
        .withColumn(minus_two_grade, col("mean") - col("stddev") * grade[1]) \
        .withColumn(plus_three_grade, col("mean") + col("stddev") * grade[2]) \
        .withColumn(minus_three_grade, col("mean") - col("stddev") * grade[2]) \
        .withColumn(abnormal,
                    f.when((col(value) > col(plus_one_grade)) & (col(value) < col(plus_two_grade)), "C1")
                    .when((col(value) > col(minus_two_grade)) & (col(value) < col(minus_one_grade)), "C0")
                    .when((col(value) > col(plus_two_grade)) & (col(value) < col(plus_three_grade)), "B1")
                    .when((col(value) > col(minus_three_grade)) & (col(value) < col(minus_two_grade)), "B0")
                    .when((col(value) > col(plus_three_grade)), "A1")
                    .when((col(value) < col(minus_three_grade)), "A0")
        ).dropna(subset=[abnormal])

    return except_cust
Example #16
def fill_dp(spark, cust_lng_lat, dp, colName, coordinate, regex):
    """
        填充价格
    :param cust_lng_lat:零售户经纬度DataFrame,列: city cust_id lng lat lng_l lng_r lat_d lat_u
    :param dp:零售户房租/餐饮/酒店的平均价格DataFrame, 列: city cust_id {colName}
    :param colName:价格字段
    :param coordinate:poi数据DataFrame, 列: lng lat types
    :param regex: 过滤poi的正则
    """
    # 列:city,cust_id,poi_index
    poi_index_df = poi_index(spark, cust_lng_lat, coordinate, regex)
    # DataFrame that contains all retailers
    df0 = cust_lng_lat.join(poi_index_df, ["city", "cust_id"], "left") \
                        .join(dp, ["city", "cust_id"], "left") \
                        .select("city", "cust_id", colName, "poi_index", "lng", "lat")

    summary = df0.select(colName, "poi_index").summary().cache()

    # city-wide min, max and quantiles of the average price
    price_min = summary.where(col("summary") == "min").collect()[0][colName]
    price_25 = summary.where(col("summary") == "25%").collect()[0][colName]
    price_50 = summary.where(col("summary") == "50%").collect()[0][colName]
    price_75 = summary.where(col("summary") == "75%").collect()[0][colName]
    price_max = summary.where(col("summary") == "max").collect()[0][colName]

    # city-wide quantiles and max of the corresponding poi index
    poi_25 = summary.where(col("summary") == "25%").collect()[0]["poi_index"]
    poi_50 = summary.where(col("summary") == "50%").collect()[0]["poi_index"]
    poi_75 = summary.where(col("summary") == "75%").collect()[0]["poi_index"]
    poi_max = summary.where(col("summary") == "max").collect()[0]["poi_index"]

    # retailers with neither nearby poi nor a price: fill with 0 first
    df1 = df0.withColumn(
        colName,
        f.when(col(colName).isNull() & col("poi_index").isNull(),
               0).otherwise(col(colName)))

    # retailers with nearby poi but no price: fill the price by binning on the poi index
    df2 = df1.withColumn(
        colName,
        f.when(
            (col(colName).isNull()) & (col("poi_index") < poi_25),
            price_min).when(
                (col(colName).isNull()) & (col("poi_index") >= poi_25) &
                (col("poi_index") < poi_50), price_25).when(
                    (col(colName).isNull()) & (col("poi_index") >= poi_50) &
                    (col("poi_index") < poi_75), price_50).when(
                        (col(colName).isNull()) & (col("poi_index") >= poi_75)
                        & (col("poi_index") < poi_max),
                        price_75).otherwise(price_max))

    # ----- retailers whose price is 0: fill with KNN
    # price is not 0
    exist_df = df2.where(col(colName) > 0).drop("poi_index")
    # price is 0
    not_df = df2.where(col(colName) == 0).drop("poi_index")

    if not_df.count() > 0:
        # fill with KNN
        fill_pd_df = fillWithKNN(exist_df.toPandas(), not_df.toPandas(),
                                 colName)
        # merge
        all_df = spark.createDataFrame(fill_pd_df).unionByName(exist_df)
    else:
        all_df = exist_df

    # ----- truncate
    # compute the threshold for each city
    threshold = all_df.groupBy("city")\
                     .agg((f.mean(col(colName)) + 3 * f.stddev_pop(col(colName))).alias("threshold"))
    truncate_df = all_df.join(threshold, "city") \
                        .withColumn(colName,
                                    f.when(col(colName) > col("threshold"), col("threshold"))
                                    .otherwise(col(colName))
                                    )
    # take the log of price + 1
    log_df = truncate_df.withColumn("log", f.log(col(colName) + 1))

    log_max = log_df.groupBy("city").agg(f.max("log").alias("log_max"))

    result=log_df.join(log_max, "city") \
                .withColumn(colName+"_index", col("log") / col("log_max") * 5)
    #'city','cust_id','food_avg','poi_index','lng','lat','threshold','log','log_max','{colName}_index'
    return result
Example #17
def runModel(regressionMethodName,
             stationID,
             stationDataFrame,
             featureInputCols,
             normalize,
             splitMethod='random'):
    print("=" * 80)
    print('Station:{0}'.format(stationID))
    print(
        'Model:{0}, Normalize:{1}, LinkFunction:{2}, train/test splitMethod:{3}'
        .format(regressionMethodName, normalize, labelLinkFunction,
                splitMethod))
    print(featureInputCols)

    oneHot = OneHotEncoderEstimator(
        inputCols=["hourOfDay", "dayOfWeek"],
        outputCols=["hourOfDayVector", "dayOfWeekVector"])

    stationSummaryAll = stationDataFrame.groupBy('station_id').agg(
        count('label'), sum('label'), avg("label"), stddev_pop("label"))
    stationAvg = stationSummaryAll.select('avg(label)').where(
        col('station_id') == stationID).collect()
    stationSum = stationSummaryAll.select('sum(label)').where(
        col('station_id') == stationID).collect()
    stationStd = stationSummaryAll.select('stddev_pop(label)').where(
        col('station_id') == stationID).collect()
    stationNonZeroCount = stationSummaryAll.select('count(label)').where(
        col('station_id') == stationID).collect()
    stationCount = stationSummaryAll.select('count(label)').where(
        col('station_id') == "None").collect()

    featureInputCols.extend(["hourOfDayVector", "dayOfWeekVector"])
    assembler = VectorAssembler(inputCols=featureInputCols,
                                outputCol='features')

    if normalize:
        normalizer = Normalizer(inputCol="features",
                                outputCol="normFeatures",
                                p=1.0)
        featureName = "normFeatures"
        regressionMethod, regressionModelParameters = selectRegressionMethod(
            'rf', featureName)
        pipeline = Pipeline(
            stages=[oneHot, assembler, normalizer, regressionMethod])
    else:
        featureName = "features"
        regressionMethod, regressionModelParameters = selectRegressionMethod(
            'rf', featureName)
        pipeline = Pipeline(stages=[oneHot, assembler, regressionMethod])

    trainingDates = ['2016-10-01 00:00:00', '2017-9-30 23:59:59']

    testDates = ['2017-10-01 00:00:00', '2017-10-31 23:59:59']

    dates = {'train': trainingDates, 'test': testDates}

    if splitMethod == 'random':
        # Split the data into training and test sets (30% held out for testing)
        (trainingData, testData) = stationDataFrame.randomSplit([0.6, 0.4])

    else:
        (trainingData, testData) = timeSeriesTestTrain(stationDataFrame, dates)

    #fit model and make predictions
    model = pipeline.fit(trainingData)
    predictedData = model.transform(testData)
    #predictedData.select("prediction", "label", featureName).show(5)
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    evaluator2 = RegressionEvaluator(labelCol="label",
                                     predictionCol="prediction",
                                     metricName="r2")
    evaluator3 = RegressionEvaluator(labelCol="label",
                                     predictionCol="prediction",
                                     metricName="explainedVariance")

    rmse = evaluator.evaluate(predictedData)
    rSquared = evaluator2.evaluate(predictedData)
    varianceExplained = evaluator3.evaluate(predictedData)

    print(
        "RMSE, R2, and variance explained on test data = {0:6.3f}, {1:6.3f}, {2:6.3f}"
        .format(rmse, rSquared, varianceExplained))
    print()
    basetime = 1541216769
    experimentTimeStamp = int((time.time() - basetime) / 6)
    experiment = {
        experimentTimeStamp: {
            "station": stationID,
            'stationNonZeroCount': stationNonZeroCount,
            'stationCount': stationCount,
            'stationSum': stationSum,
            'stationAvg': stationAvg,
            'stationStd': stationStd,
            'regressionMethodName': regressionMethodName,
            'normalize': normalize,
            'linkFunctionLabel': labelLinkFunction,
            'featureInputCols': featureInputCols,
            'rmse': rmse,
            'rSquared': rSquared,
            'varianceExplained': varianceExplained,
            'version': "Added OneHotEncode for hOD, dOW",
            'trainSplitMethod': splitMethod
        }
    }
    experiments.update(experiment)
    with open(pathFigure + "experiments.json", "w") as f:
        json.dump(experiments, f)
    return ()
Example #18
def get_cpt_index():
    # ----- retailer shop competition index
    # def get_cpt_index():
    print(f"{str(dt.now())}   cust competitive index")
    try:
        # retailer grade
        co_cust = get_co_cust(spark).select("cust_id", "cust_seg") \
            .withColumn("cust_seg", f.regexp_replace(col("cust_seg"), "zz|ZZ", "31"))

        co_cust.cache()

        # 1. cust_id1 has the retailers cust_id0 around it
        # 2. distance between the retailer and every retailer within range
        around_cust = get_around_cust(spark, 0.5)\
                                 .withColumn("length", haversine(col("lng1"), col("lat1"), col("lng0"),col("lat0"))) \
                                 .select("city1", "cust_id1", "cust_id0", "length")

        around_cust.cache()

        # max and min of the distances within range
        max_min = around_cust.groupBy("city1", "cust_id1") \
            .agg(f.max("length").alias("max"), f.min("length").alias("min"))

        """
           距离系数=1/零售户与其他零售户距离的标准化后的值
           距离标准化值= ( 距离-min( 距离) )/(max( 距离 )-min( 距离 ))
        """
        # distance coefficient
        dis_coe = around_cust.join(max_min, ["city1", "cust_id1"]) \
            .withColumn("dis_coe", 1 * (col("max") - col("min")) / (col("length") - col("min"))) \
            .select("city1", "cust_id1", "cust_id0", "dis_coe")

        # 1. get the grade of cust_id1 as cust_seg1
        # 2. get the grade of cust_id0 as cust_seg0
        # 3. grade difference * distance coefficient
        # 4. weighted average grade difference = sum((other retailer's grade - this retailer's grade) * distance coefficient)
        avg_seg_diff = dis_coe.join(co_cust, col("cust_id") == col("cust_id1")) \
            .drop("cust_id") \
            .withColumnRenamed("cust_seg", "cust_seg1") \
            .join(co_cust, col("cust_id") == col("cust_id0")) \
            .withColumnRenamed("cust_seg", "cust_seg0") \
            .groupBy("city1", "cust_id1") \
            .agg(f.sum((col("cust_seg0") - col("cust_seg1")) * col("dis_coe")).alias("avg_seg_diff")) \
            .select("city1", "cust_id1", "avg_seg_diff")

        # retailer competition intensity = weighted average grade difference within 500m * number of retailers within range
        cust_cpt = around_cust.groupBy("city1", "cust_id1").count() \
            .join(avg_seg_diff, ["city1", "cust_id1"]) \
            .withColumn("cust_cpt", col("avg_seg_diff") * col("count")) \
            .select("city1", "cust_id1", "cust_cpt")

        cust_cpt.cache()

        threshold = cust_cpt.groupBy("city1").agg(
            (f.mean("cust_cpt") + 3 * f.stddev_pop(col("cust_cpt"))).alias("threshold_plus"),
            (f.mean("cust_cpt") - 3 * f.stddev_pop(col("cust_cpt"))).alias("threshold_minus")
        )

        truncate_df = cust_cpt.join(threshold, "city1") \
            .withColumn("cust_cpt",
                        f.when(col("cust_cpt") > col("threshold_plus"), col("threshold_plus"))
                        .when(col("cust_cpt") < col("threshold_minus"), col("threshold_minus")) \
                        .otherwise(col("cust_cpt"))
                        )

        # city-wide maximum (and minimum) competition intensity
        max_cpt = truncate_df.groupBy("city1").agg(f.max(col("cust_cpt")).alias("max_cpt"),
                                                   f.min(col("cust_cpt")).alias("min_cpt"))

        # shop competition index
        colName = "order_competitive_index"
        result = truncate_df.join(max_cpt, "city1") \
            .withColumnRenamed("cust_id1", "cust_id") \
            .withColumn(colName, 5 - (col("cust_cpt") - col("min_cpt")) / (col("max_cpt") - col("min_cpt")) * 5) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))

        co_cust.unpersist()
        around_cust.unpersist()
        cust_cpt.unpersist()
    except Exception:
        tb.print_exc()
Example #19
def get_builtin_aggregator_column(agg, ctx):
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]

        try:
            input = ctx.populate_values(agg["input"],
                                        aggregator["input"],
                                        preserve_column_refs=False)
        except CortexException as e:
            e.wrap("input")
            raise

        if aggregator["name"] == "approx_count_distinct":
            return F.approxCountDistinct(input["col"],
                                         input.get("rsd")).alias(agg["name"])
        if aggregator["name"] == "avg":
            return F.avg(input).alias(agg["name"])
        if aggregator["name"] in {
                "collect_set_int", "collect_set_float", "collect_set_string"
        }:
            return F.collect_set(input).alias(agg["name"])
        if aggregator["name"] == "count":
            return F.count(input).alias(agg["name"])
        if aggregator["name"] == "count_distinct":
            return F.countDistinct(*input).alias(agg["name"])
        if aggregator["name"] == "covar_pop":
            return F.covar_pop(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "covar_samp":
            return F.covar_samp(input["col1"],
                                input["col2"]).alias(agg["name"])
        if aggregator["name"] == "kurtosis":
            return F.kurtosis(input).alias(agg["name"])
        if aggregator["name"] in {"max_int", "max_float", "max_string"}:
            return F.max(input).alias(agg["name"])
        if aggregator["name"] == "mean":
            return F.mean(input).alias(agg["name"])
        if aggregator["name"] in {"min_int", "min_float", "min_string"}:
            return F.min(input).alias(agg["name"])
        if aggregator["name"] == "skewness":
            return F.skewness(input).alias(agg["name"])
        if aggregator["name"] == "stddev":
            return F.stddev(input).alias(agg["name"])
        if aggregator["name"] == "stddev_pop":
            return F.stddev_pop(input).alias(agg["name"])
        if aggregator["name"] == "stddev_samp":
            return F.stddev_samp(input).alias(agg["name"])
        if aggregator["name"] in {"sum_int", "sum_float"}:
            return F.sum(input).alias(agg["name"])
        if aggregator["name"] in {"sum_distinct_int", "sum_distinct_float"}:
            return F.sumDistinct(input).alias(agg["name"])
        if aggregator["name"] == "var_pop":
            return F.var_pop(input).alias(agg["name"])
        if aggregator["name"] == "var_samp":
            return F.var_samp(input).alias(agg["name"])
        if aggregator["name"] == "variance":
            return F.variance(input).alias(agg["name"])

        raise ValueError("missing builtin aggregator")  # unexpected

    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
Example #20
        format="parquet",
        path="/data/intermediate_data/cdr_step1/")

    df = sqlContext.read.parquet("/data/intermediate_data/cdr_step1/")
    df1 = df.sort('number').groupBy('number', 'type', 'week').agg(
        f.sum('callduration').alias('call_sum'),
        f.count('callduration').alias('call_count'),
        f.sum('iscompethot').alias('call_count_competitor'))
    df1.write.mode("overwrite").saveAsTable(
        "cdr_step2",
        format="parquet",
        path="/data/intermediate_data/cdr_step2/")

    df = sqlContext.read.parquet('/data/intermediate_data/cdr_step2/')
    df1 = df.groupBy('number', 'type').agg(
        (f.stddev_pop('call_sum') /
         f.mean('call_sum')).alias('coefficiant_of_variance'),
        f.sum('call_count').alias('call_count'),
        f.sum('call_count_competitor').alias('call_count_competitor'))
    df1.write.mode("overwrite").saveAsTable(
        "cdr_step3",
        format="parquet",
        path="/data/intermediate_data/cdr_step3/")

    df = sqlContext.read.parquet('/data/intermediate_data/cdr_step3/')
    df_out = df.where(df.type == "out").select(df.number,
                                               df.coefficiant_of_variance,
                                               df.call_count,
                                               df.call_count_competitor)
    df_in = df.where(df.type == "in").select(df.number,
                                             df.coefficiant_of_variance,
Example #21
    def run_pipeline(self):
        try:
            logging.info(
                "https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/"
            )
            # check collect_list and collect_set
            #collect_set() function returns all values from an input column with duplicate values eliminated.
            #collect_list() function returns all values from an input column with duplicates

            logging.info(
                'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/'
            )
            simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
                          ("Robert", "Sales", 4100),
                          ("Maria", "Finance", 3000), ("James", "Sales", 3000),
                          ("Scott", "Finance", 3300), ("Jen", "Finance", 3900),
                          ("Jeff", "Marketing", 3000),
                          ("Kumar", "Marketing", 2000),
                          ("Saif", "Sales", 4100)]
            schema = ["employee_name", "department", "salary"]

            df = self.spark.createDataFrame(data=simpleData,
                                            schema=schema).cache()
            df.show(truncate=False)

            from pyspark.sql.functions import approx_count_distinct, collect_list
            from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
            from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
            from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
            from pyspark.sql.functions import variance, var_samp, var_pop
            df.printSchema()
            df.show(truncate=False)

            print("approx_count_distinct: " + \
                  str(df.select(approx_count_distinct("salary")).collect()[0][0]))

            print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

            df.select(collect_list("salary")).show(truncate=False)

            df.select(collect_set("salary")).show(truncate=False)

            df2 = df.select(countDistinct("department", "salary"))
            df2.show(truncate=False)
            print("Distinct Count of Department & Salary: " +
                  str(df2.collect()[0][0]))

            print("count: " + str(df.select(count("salary")).collect()[0]))
            dffirst = df.select(first("salary"))
            dffirst.show(truncate=False)
            df.select(last("salary")).show(truncate=False)
            df.select(kurtosis("salary")).show(truncate=False)
            df.select(max("salary")).show(truncate=False)
            df.select(min("salary")).show(truncate=False)
            df.select(mean("salary")).show(truncate=False)
            df.select(skewness("salary")).show(truncate=False)
            df.select(stddev("salary"), stddev_samp("salary"), \
                      stddev_pop("salary")).show(truncate=False)
            df.select(sum("salary")).show(truncate=False)
            df.select(sumDistinct("salary")).show(truncate=False)
            df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
                .show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " +
                          str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()


# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
  stddev_pop("Quantity"), stddev_samp("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
    covar_pop("InvoiceNo", "Quantity")).show()

Example #23
    def num_iqr_filter(self,
                       data_frame,
                       inputCol="VALUE",
                       labelCol="ITEMID",
                       REPARTITION_CONST=None,
                       sc=None):
        from pyspark.sql.window import Window
        from pyspark.sql.functions import abs, percent_rank, row_number, collect_list, udf, struct, count, avg, stddev_pop
        from pyspark.sql.types import MapType, StringType, DoubleType
        self.logger.debug("[NUM_IQR_FILTER] IN")
        self.logger.debug("[NUM_IQR_FILTER] BEFORE QUANTILE TAGGED")
        if REPARTITION_CONST is None:
            value_order = Window.partitionBy(labelCol).orderBy(
                col(inputCol).cast("float"))
            Q1_percentile = Window.partitionBy(labelCol).orderBy(
                abs(0.25 - col("percentile")))
            Q2_percentile = Window.partitionBy(labelCol).orderBy(
                abs(0.5 - col("percentile")))
            Q3_percentile = Window.partitionBy(labelCol).orderBy(
                abs(0.75 - col("percentile")))
            percent_data_frame = data_frame.select(
                labelCol, inputCol,
                percent_rank().over(value_order).alias("percentile"))
            Q1_data_frame = percent_data_frame.withColumn("Q1_rn",row_number().over(Q1_percentile)).where("Q1_rn == 1")\
                            .select(labelCol,inputCol,lit("Q1").alias("quantile"))
            Q2_data_frame = percent_data_frame.withColumn("Q2_rn",row_number().over(Q2_percentile)).where("Q2_rn == 1")\
                            .select(labelCol,inputCol,lit("Q2").alias("quantile"))
            Q3_data_frame = percent_data_frame.withColumn("Q3_rn",row_number().over(Q3_percentile)).where("Q3_rn == 1")\
                            .select(labelCol,inputCol,lit("Q3").alias("quantile"))
            self.logger.debug(
                "[NUM_IQR_FILTER] REPARTITION_CONST Not Asserted")
            merge_all = Q1_data_frame.unionAll(Q2_data_frame).unionAll(
                Q3_data_frame).persist()

            self.logger.debug("[NUM_IQR_FILTER] Qs Merged")

            udf_parse_list_to_map = udf(
                lambda maps: dict(list(tuple(x) for x in maps)),
                MapType(StringType(), StringType()))

            self.logger.debug("[NUM_IQR_FILTER] Before merge quantiles")
            aggregate_quantiles = merge_all.groupBy(labelCol).agg(
                collect_list(struct("quantile", inputCol)).alias("quant_info"))
            self.logger.debug("AGG ONLY")
            aggregate_quantiles = aggregate_quantiles.select(
                labelCol,
                udf_parse_list_to_map("quant_info").alias("quant_info"))
            self.logger.debug("TRANSFORM")
            iqr_data_frame = aggregate_quantiles.withColumn("Q1",col("quant_info").getItem("Q1").cast("float"))\
                .withColumn("Q2",col("quant_info").getItem("Q2").cast("float"))\
                .withColumn("Q3",col("quant_info").getItem("Q3").cast("float"))
            self.logger.debug("QUANTILE_EXTRACTION")
        else:
            cur_label_list = data_frame.select(
                labelCol).distinct().rdd.flatMap(list).collect()
            cur_iqr_list = list()
            cnt = -1
            for cur_item in cur_label_list:
                cnt = cnt + 1
                data_frame.where(
                    col(labelCol) == cur_item).registerTempTable("cur_table")
                self.logger.debug("{0}/{1},::{2}".format(
                    cnt, len(cur_label_list),
                    sc.sql(
                        "select {0} from cur_table".format(labelCol)).count()))
                cur_iqr = sc.sql(
                    "select {0}, percentile_approx({1},0.25) as Q1, percentile_approx({2},0.5) as Q2, percentile_approx({3},0.75) as Q3 from cur_table group by {4}"
                    .format(labelCol, inputCol, inputCol, inputCol,
                            labelCol)).first().asDict()
                cur_iqr_list.append(cur_iqr)
                sc.catalog.dropTempView("cur_table")
                #percent_data_frame = data_frame.select(labelCol, inputCol, percent_rank().over(value_order).alias("percentile")).repartition(REPARTITION_CONST).cache().checkpoint()
            iqr_data_frame = sc.createDataFrame(cur_iqr_list).repartition(
                REPARTITION_CONST)

        if REPARTITION_CONST is None:
            iqr_data_frame = iqr_data_frame.withColumn("IQR",col("Q3")-col("Q1"))\
                                       .withColumn("LB",col("Q1")-1.5*col("IQR"))\
                                       .withColumn("UB",col("Q3")+1.5*col("IQR"))\
                                       .select(labelCol,"LB","UB")
        else:
            iqr_data_frame = iqr_data_frame.withColumn("IQR",col("Q3")-col("Q1"))\
                                       .withColumn("LB",col("Q1")-1.5*col("IQR"))\
                                       .withColumn("UB",col("Q3")+1.5*col("IQR"))\
                                       .select(labelCol,"LB","UB").repartition(REPARTITION_CONST).persist()

            self.logger.debug("CUR_ITEMID_ALL_COUNT:{0}".format(
                iqr_data_frame.count()))

        self.logger.debug("[NUM_IQR_FILTER] iqr_data_frame merged")
        if REPARTITION_CONST is None:
            self.logger.debug(
                "[NUM_IQR_FILTER] RETURN_PREP, REPARTITION_CONST NOT ASSERTED")
            ret_data_frame = data_frame.repartition(labelCol).join(iqr_data_frame,labelCol).where((col("LB").cast("float") <= col(inputCol).cast("float")) & (col("UB").cast("float")>=col(inputCol).cast("float")))\
                                                                 .drop("LB").drop("UB").persist()
            ref_df = ret_data_frame.repartition(labelCol).groupBy(labelCol)\
                               .agg(count(inputCol).alias("ref_count"),avg(inputCol).alias("ref_avg"),stddev_pop(inputCol).alias("ref_std")).persist()
            self.logger.debug("CHECK DF")
            self.logger.debug(ref_df.count())
            self.logger.debug(ret_data_frame.count())

            return (ret_data_frame, ref_df)
        else:
            self.logger.debug(
                "[NUM_IQR_FILTER] RETURN_PREP, REPARTITION_CONST ASSERTED: {0}"
                .format(REPARTITION_CONST))
            ret_data_frame = data_frame.join(iqr_data_frame,labelCol).where((col("LB").cast("float") <= col(inputCol).cast("float")) & (col("UB").cast("float")>=col(inputCol).cast("float")))\
                                                                 .drop("LB").drop("UB").repartition(REPARTITION_CONST)
            ref_df = ret_data_frame.groupBy(labelCol)\
                               .agg(count(inputCol).alias("ref_count"),avg(inputCol).alias("ref_avg"),stddev_pop(inputCol).alias("ref_std")).repartition(REPARTITION_CONST)

            return (ret_data_frame, ref_df)
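
For contrast with the window-based quantile extraction in num_iqr_filter, a compact sketch (not part of the original class) of the same IQR bound computed with DataFrame.approxQuantile on a single numeric column:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
vals = spark.createDataFrame([(float(v),) for v in [1, 2, 3, 4, 100]], ["VALUE"])

q1, q3 = vals.approxQuantile("VALUE", [0.25, 0.75], 0.0)  # relativeError=0.0 -> exact quantiles
iqr = q3 - q1
vals.where((col("VALUE") >= q1 - 1.5 * iqr) & (col("VALUE") <= q3 + 1.5 * iqr)).show()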
Example #24
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()
    df = spark.read.format('csv')\
                   .option('sep', ';')\
                   .option('header', 'true')\
                   .load('user.csv')

    df.repartition(5).select(max('id')).show()
    df.repartition(5).select(avg('age')).show()

    df.select(round(lit(2.5)), bround(lit(2.5))).show()
    df.select(round(lit("2.5")), bround(lit("2.5"))).show()


    spark.range(0, 10, 2)\
         .select(monotonically_increasing_id().alias('id_v2'),
                round((rand() * 5 + 5)).alias('rand')).show()

    spark.range(0, 10, 2)\
         .select(monotonically_increasing_id().alias('id'),
                 pow(col('id'), 2).alias('pow')).show()

    df.describe().show()

    df.select(count('age'), mean('age'), min('age'), max('age'),
              stddev_pop('age')).show()

    # corr needs numeric columns, and df.stat.corr takes column names and returns a float
    df_num = df.select(col('id').cast('double'), col('age').cast('double'))
    df_num.select(corr(col('age'), col('id'))).show()
    print(df_num.stat.corr('age', 'id'))
Example #25
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()


# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
  stddev_pop("Quantity"), stddev_samp("Quantity")).show()

# by default, if we use function `variance` or `stddev`,
# Spark performs the formula for sample variance or standard deviation
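
# A quick check of that default (a sketch; assumes df has a numeric Quantity column as
# in the surrounding cells): stddev is an alias for stddev_samp, so the first two columns
# below match, while stddev_pop divides by n instead of n - 1 (e.g. 1.0 vs ~0.816 for 1, 2, 3).
from pyspark.sql.functions import stddev
df.select(stddev("Quantity"), stddev_samp("Quantity"), stddev_pop("Quantity")).show()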

# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
    covar_pop("InvoiceNo", "Quantity")).show()
Example #26
df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()

# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"), stddev_pop("Quantity"),
          stddev_samp("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
          covar_pop("InvoiceNo", "Quantity")).show()

# COMMAND ----------
Example #27
#df.filter(df.SYMBOL.startswith("KOTAK")).show()
#df.filter(col("SYMBOL").startswith("KOTAKBANK")).show()
#df.columns.str.replace(' ','')
#df.select(regexp_replace("OPEN_PRICE"," ","")).show(5)
#df.where(col("SERIES").like("%EQ")).orderBy(desc(" OPEN_PRICE")).describe().show()
#df_strip_spaces.where(col(" SERIES").like("%EQ")).orderBy(to_date(col("DATE1")))

#counts aggregations
df_strip_spaces.count()
df_strip_spaces.select(count("SERIES")).show()
df_strip_spaces.select(countDistinct("SERIES")).show()
df_strip_spaces.select(approx_count_distinct("SERIES", .1)).show()
df_strip_spaces.select(first("SERIES"), last("SERIES"), min("SERIES"),
                       max("SERIES"), sum("OPEN_PRICE"),
                       sumDistinct("OPEN_PRICE")).show()
df_strip_spaces.select(mean("OPEN_PRICE")).show()
df_strip_spaces.select(avg("OPEN_PRICE")).show()
df_strip_spaces.select(("SERIES"))
df_strip_spaces.groupBy("SERIES", "SYMBOL").count().show()
df_strip_spaces.where("SYMBOL like '%BANK%'").groupBy("SERIES").avg().show()
df_strip_spaces.select(avg("OPEN_PRICE"))
df_strip_spaces.select(var_pop("OPEN_PRICE"), stddev_pop("OPEN_PRICE")).show()
#df.select(covar_pop("OPEN PRICE","CLOSE PRICE"),corr("OPEN PRICE","CLOSE PRICE")).show()

windowSpec = Window.partitionBy("SYMBOL", to_date("DATE1")).orderBy(
    to_date("DATE1")).rowsBetween(Window.unboundedPreceding, Window.currentRow)
win = sum(col("OPEN PRICE")).over(windowSpec)
df.select("DATE", "OPEN PRICE", win.alias("d")).orderBy(to_date("DATE")).show()
win1 = dense_rank().over(windowSpec)
df.select(rank().over(windowSpec)).show()
Example #28
    pw_df = pw_df.localCheckpoint(eager = True)
    print("Merged to:", pw_df.count())
    print()

#Okay now we have a list of outages, restore_times, locations, core_ids
#First let's calculate some high level metrics

#size of outages
pw_finalized_outages = pw_finalized_outages.withColumn("cluster_size", F.size(F.array_distinct("core_id")))

#standard deviation outage times
pw_finalized_outages = pw_finalized_outages.withColumn("outage_times_stddev", F.explode("outage_times"))

#this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'outage_times_stddev' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"),*exprs)

#range of outage times
pw_finalized_outages = pw_finalized_outages.withColumn("outage_times_range", F.array_max("outage_times") - F.array_min("outage_times"))

#standard deviation and range of restore times
pw_finalized_outages = pw_finalized_outages.withColumn("restore_times", col("restore_time"))
pw_finalized_outages = pw_finalized_outages.withColumn("restore_time", F.explode("restore_time"))

#this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'restore_time' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(F.avg("restore_time").alias("restore_times_mean"),*exprs)

pw_finalized_outages = pw_finalized_outages.withColumn("restore_times_stddev", F.explode("restore_times"))

#this expression essentially takes the first value of each column (which should all be the same after the explode)
Example #29
def main():
    """
    Lavora sui dati del database originale per fare ETL e caricare sulla
    dwh i dati che ci servono
    """

    spark = SparkSession \
        .builder \
        .appName('DBAnalysis') \
        .config('spark.driver.extraClassPath', 'postgresql-42.2.10.jar') \
        .getOrCreate()

    properties = {
        'driver': 'org.postgresql.Driver',
        'url': 'jdbc:postgresql://postgres:5432/postgres',
        'user': '******',
        'password': '******',
        'dbtable': ' spotify_details',
    }

    properties_dwh = {
        'driver': 'org.postgresql.Driver',
        'url': 'jdbc:postgresql://postgres_dwh:5432/postgres',
        'user': '******',
        'password': '******'
    }

    df2 = spark.read \
        .format('jdbc') \
        .option('driver', properties['driver']) \
        .option('url', properties['url']) \
        .option('user', properties['user']) \
        .option('password', properties['password']) \
        .option('dbtable', properties['dbtable']) \
        .load()

    count_tracks_distribution = df2.groupby('id_playlist')\
                                .count()

    # Create the year_month column
    df3 = df2.withColumn(
        'year_month',
        date_format(to_timestamp(df2.timestamp, "yyyy-MM-dd'T'HH:mm:ssXXX"),
                    "yyyy-MM").alias('year_month'))

    # Aggregate by playlist and count the occurrences
    df4 = df3.groupby('id_playlist', 'year_month').count()

    # To assign a month to each playlist, the month with the most song "additions" was chosen
    # For each playlist keep only the month with the highest count
    w = Window.partitionBy('id_playlist')

    df5 = df4.withColumn('max', f.max('count').over(w))\
        .where(f.col('count') == f.col('max'))\
        .drop('max', 'count')

    month_distribution = df5\
        .where(f.col('year_month')>="2020-01")\
        .groupby('year_month')\
        .count()

    # The complete df has the reference month assigned to every playlist
    spotify_complete = df2.join(df5, on=['id_playlist'], how='left')

    df_complete2 =  spotify_complete.groupBy("id_playlist", 'name_playlist' , 'year_month')\
                    .agg(f.mean('danceability'),f.stddev_pop('danceability'),f.mean('energy'),f.stddev_pop('energy'),f.mean('valence'),f.stddev_pop('valence'))\
                    .sort('year_month', ascending=True)

    newColumns = [
        "id_playlist", "name_playlist", "year_month", "avgdanceability",
        "stdddanceability", "avgenergy", "stddenergy", "avgvalence",
        "stddvalence"
    ]
    df_complete2 = df_complete2.toDF(*newColumns)

    df_complete3 = df_complete2.groupBy('year_month')\
    .agg(f.mean('avgdanceability'),f.mean('stdddanceability'),f.mean('avgenergy'),f.mean('stddenergy'),f.mean('avgvalence'),f.mean('stddvalence'))\
    .sort('year_month', ascending=True)

    newColumns = [
        "timestamp", "mean_danceability", "stdev_danceability", "mean_energy",
        "stdev_energy", "mean_valence", "stdev_valence"
    ]
    audiofeatures_stat = df_complete3.toDF(*newColumns)

    audiofeatures_stat.write.jdbc(url=properties_dwh['url'],
                                  table='audiofeatures_stat',
                                  mode='overwrite',
                                  properties=properties_dwh)
    spotify_complete.write.jdbc(url=properties_dwh['url'],
                                table='spotify_complete',
                                mode='overwrite',
                                properties=properties_dwh)
    month_distribution.write.jdbc(url=properties_dwh['url'],
                                  table='month_distribution',
                                  mode='overwrite',
                                  properties=properties_dwh)
    count_tracks_distribution.write.jdbc(url=properties_dwh['url'],
                                         table='count_tracks_distribution',
                                         mode='overwrite',
                                         properties=properties_dwh)
Example #30
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " + \
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))

print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)

df.select(collect_set("salary")).show(truncate=False)

df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department &amp; Salary: "+str(df2.collect()[0][0]))

print("count: "+str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"), \
    stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"),var_samp("salary"),var_pop("salary")) \
  .show(truncate=False)
Example #31
# get the min and max amount of calories burned
dailyActivitiesDF.select(min("CaloriesBurned"), max("CaloriesBurned")).show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Statistical functions
# MAGIC
# MAGIC - We can do some basic statistical functions as well using the Spark API

# COMMAND ----------

# standard deviation and variance
dailyActivitiesDF.select(var_pop("CaloriesBurned"), var_samp("CaloriesBurned"),
                         stddev_pop("CaloriesBurned"),
                         stddev_samp("CaloriesBurned")).show()

# COMMAND ----------

# Any extreme points in our data?
dailyActivitiesDF.select(skewness("CaloriesBurned"),
                         kurtosis("CaloriesBurned")).show()

# COMMAND ----------

# Covariance and Correlation
dailyActivitiesDF.select(corr("CaloriesBurned", "Steps"),
                         covar_samp("CaloriesBurned", "Steps"),
                         covar_pop("CaloriesBurned", "Steps")).show()