Example No. 1
    def describe_float_1d(df, column, current_result, nrows):
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            variance(col(column)).alias("variance"),
            kurtosis(col(column)).alias("kurtosis"),
            stddev(col(column)).alias("std"),
            skewness(col(column)).alias("skewness"),
            df_sum(col(column)).alias("sum")).toPandas()

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (df.select(column).na.drop().selectExpr(
                "percentile_approx(`{col}`,CAST({n} AS DOUBLE))".format(
                    col=column, n=x)).toPandas().iloc[:, 0])
        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column).na.drop().select(
            df_abs(col(column) - stats["mean"]).alias("delta")).agg(
                df_sum(col("delta"))).toPandas().iloc[0, 0] /
                        float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

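        # NOTE: `bins` here comes from the enclosing scope of this excerpt, and
        # hist_data is computed without being attached to the returned stats.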
        hist_data = create_hist_data(df, column, stats["min"], stats["max"],
                                     bins)

        return stats
Example No. 2
    def dataStatistics(self, categoricalFeatures, numericalFeatures):
        # self.dataTranform()
        self.categoricalFeatures = categoricalFeatures
        self.numericalFeatures = numericalFeatures
        summaryList = ['mean', 'stddev', 'min', 'max']
        summaryDict = {}
        dataset = self.dataset
        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')
        for colm in self.numericalFeatures:
            summaryListTemp = []
            for value in summaryList:
                summ = list(
                    dataset.select(colm).summary(value).toPandas()[colm])
                summaryListSubTemp = []
                for val in summ:
                    summaryListSubTemp.append(round(float(val), 4))
                summaryListTemp.append(summaryListSubTemp)
            summaryDict[colm] = summaryListTemp
        summaryList.extend(['skewness', 'kurtosis', 'variance'])
        summaryDict['summaryName'] = summaryList
        summaryDict['categoricalColumn'] = self.categoricalFeatures
        skewnessList = []
        kurtosisList = []
        varianceList = []
        skewKurtVarDict = {}
        for colm in self.numericalFeatures:
            skewness = (dataset.select(F.skewness(dataset[colm])).toPandas())
            for i, row in skewness.iterrows():
                for j, column in row.items():
                    skewnessList.append(round(column, 4))
            kurtosis = (dataset.select(F.kurtosis(dataset[colm])).toPandas())
            for i, row in kurtosis.iterrows():
                for j, column in row.items():
                    kurtosisList.append(round(column, 4))
            variance = (dataset.select(F.variance(dataset[colm])).toPandas())
            for i, row in variance.iterrows():
                for j, column in row.items():
                    varianceList.append(round(column, 4))

        for skew, kurt, var, colm in zip(skewnessList, kurtosisList,
                                         varianceList, self.numericalFeatures):
            print(skew, kurt, var)
            skewKurtVarList = []
            skewKurtVarList.append(skew)
            skewKurtVarList.append(kurt)
            skewKurtVarList.append(var)
            skewKurtVarDict[colm] = skewKurtVarList

        for (keyOne, valueOne), (keyTwo,
                                 valueTwo) in zip(summaryDict.items(),
                                                  skewKurtVarDict.items()):
            print(keyOne, valueOne, keyTwo, valueTwo)
            if keyOne == keyTwo:
                valueOne.extend(valueTwo)
                summaryDict[keyOne] = valueOne
        return summaryDict
Example No. 3
    def __calc_stats(self, df, resolution):
        """
        Calculates statistics for every column in the Spark DF and returns a separate DF with the results.
        Statistics: sum, min, max, count, mean, kurtosis, skewness, stddev, variance.
        :param df: DF containing the columns that you want to run your statistics calculations on
        :param resolution: int resolution in milliseconds or microseconds OR string '5m'/'1h'/'1d'
        :return: aggregation dataframe containing statistics
        """

        if type(resolution) is str:
            # resolution to microseconds
            res_dict = {'5m': 300000000, '1h': 3600000000, '1d': 86400000000}
            agg_interval = res_dict[resolution]

        elif type(resolution) is int:
            if len(str(resolution)) < 16:
                resolution = int(str(resolution).ljust(16, '0'))
            agg_interval = resolution

        ts_col = F.col('timestamp')
        df_ori_cols = list(set(df.columns) - set(['timestamp']))

        df = df.withColumn('interval_start',
                           (F.floor(ts_col / agg_interval) * agg_interval))  #\
        #.withColumn('interval_stop', F.ceil(ts_col/agg_interval) * agg_interval)\
        #.orderBy(F.col('interval_start'))
        agg_df = df.groupBy('interval_start').agg(
            F.max(ts_col).alias('max_ts'))

        # TODO Column type checking: string columns are automatically ignored and parse as NaN, so
        # TODO drop NaN columns?

        # TODO: interval_stop ignore, as well as drop max_ts
        # TODO: filter out NaN columns

        # TODO: question: run the statistics job as a separate job without having to make a udf script

        stat_cols = df_ori_cols  #[c for c in df_ori_cols if c not in ['interval_start', 'interval_stop', 'timestamp', 'max_ts']]
        for column in stat_cols:
            grouped_df = df.groupBy('interval_start')\
                           .agg(F.sum(column).alias('sum_%s' % column),
                                F.min(column).alias('min_%s' % column),
                                F.max(column).alias('max_%s' % column),
                                F.count(column).alias('count_%s' % column),
                                F.kurtosis(column).alias('kurtosis_%s' % column),
                                F.mean(column).alias('mean_%s' % column),
                                F.skewness(column).alias('skewness_%s' % column),
                                F.stddev(column).alias('stddev_%s' % column),
                                F.variance(column).alias('var_%s' % column))
            agg_df = grouped_df.join(agg_df, on='interval_start')
        #agg_df = agg_df.drop('max_ts').drop(F.when(F.col('*').isna())).dropna(how='all').drop_duplicates()

        return agg_df
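
# A hedged, standalone sketch of the interval bucketing and per-interval
# aggregation used above (toy microsecond timestamps; everything below is
# illustrative and not part of the original class):
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
agg_interval = 3_600_000_000  # '1h' expressed in microseconds
demo = spark.createDataFrame(
    [(1_600_000_100_000_000, 1.0), (1_600_000_200_000_000, 2.0),
     (1_600_000_300_000_000, 4.0), (1_600_003_700_000_000, 3.0)],
    ['timestamp', 'value'])
demo = demo.withColumn('interval_start',
                       F.floor(F.col('timestamp') / agg_interval) * agg_interval)
demo.groupBy('interval_start').agg(
    F.sum('value'), F.min('value'), F.max('value'), F.count('value'),
    F.mean('value'), F.kurtosis('value'), F.skewness('value'),
    F.stddev('value'), F.variance('value')).show()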
Example No. 4
def describe_numeric_1d(df,
                        bins,
                        column,
                        current_result,
                        nrows,
                        k=2,
                        dtype='int'):
    stats_df = df.select(column).na.drop().agg(
        mean(col(column)).alias('mean'),
        min(col(column)).alias('min'),
        max(col(column)).alias('max'),
        variance(col(column)).alias('variance'),
        kurtosis(col(column)).alias('kurtosis'),
        stddev(col(column)).alias('std'),
        skewness(col(column)).alias('skewness'),
        sum(col(column)).alias('sum')).toPandas()

    if dtype.lower() == 'int':
        select_expr = 'percentile({c},CAST({p} AS DOUBLE))'
    else:
        select_expr = 'percentile_approx({c},CAST({p} AS DOUBLE))'
    for p in [0.05, 0.25, 0.5, 0.75, 0.95]:
        stats_df[pretty_name(p)] = (df.select(column).na.drop().selectExpr(
            select_expr.format(c=column, p=p)).toPandas().iloc[:, 0])
    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats['range'] = stats['max'] - stats['min']
    q3, q1 = stats[pretty_name(0.75)], stats[pretty_name(0.25)]
    stats['iqr'] = q3 - q1
    stats['cv'] = stats['std'] / float(stats['mean'])
    stats['mad'] = (df.select(column).na.drop().select(
        abs(col(column) - stats['mean']).alias('delta')).agg(sum(
            col('delta'))).toPandas().iloc[0, 0] /
                    float(current_result['count']))
    stats['type'] = 'NUM'
    stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)
    stats['high_idx'] = df.select(column).where(
        col(column) > q3 + k * (q3 - q1)).count()
    stats['low_idx'] = df.select(column).where(
        col(column) < q1 - k * (q3 - q1)).count()

    # generate histograms
    hist_data = generate_hist_data(df, column, stats['min'], stats['max'],
                                   bins)
    stats['histogram'] = complete_histogram(hist_data)
    stats['mini_histogram'] = mini_histogram(hist_data)
    return stats
Example No. 5
def preprocess_file(input_file_name, spark, sample=True):
    ratings = []
    with open(input_file_name, "r") as fp:
      line = fp.readline()
      while line:
        (user_id,num_ratings) = line.split("|")
        num_ratings = int(num_ratings)
        for i in range(0,num_ratings):
          line = fp.readline()
          (item_id, rating, time, time2) = line.split()
          ratings.append((int(user_id), int(item_id), int(rating)))
        line = fp.readline()
    df = spark.createDataFrame(ratings, 
            ["user_id","item_id","rating"])
    # skewness
    # skewness = df.agg(f.skewness("rating"))
    # skewness.show()
    rdd1 = df.rdd.map(map_ratings)
    df2 = spark.createDataFrame(rdd1)
    if sample is True:
      sampled = df2.sampleBy(
              "rating", 
              fractions={
                  1: 0.334, 
                  2: 1, 
                  3: 0.5930, 
                  4: 0.30258, 
                  5: 0.0899145}, 
              seed=0)
      # skewness
      skewness = sampled.agg(f.skewness("rating"))
      skewness.show()
      #+--------------------+
      #|    skewness(rating)|
      #+--------------------+
      #|-8.24123249452558E-5|
      return sampled
    else:
      return df2
Example No. 6
def learn3():
    df1 = ss.read.csv('F:/Research/data/ccFraud.csv',
                      header=True,
                      inferSchema=True)
    df1.show()
    # Group df by the gender column and count the rows in each group
    df2 = df1.groupby('gender').count()
    df2.show()
    df3 = df1.describe(['balance', 'numTrans', 'numIntlTrans'])
    df3.show()
    # Check the skewness
    df1.agg({'balance': 'skewness'}).show()
    df1.agg(
        functions.max('balance').alias('max'),
        functions.avg('balance').alias('avg'),
        functions.mean('balance').alias('mean'),
        functions.stddev('balance').alias('stddev'),
        functions.sum('balance').alias('sum'),
        functions.skewness('balance').alias('skewness'),
        functions.variance('balance').alias('variance'),
        functions.sumDistinct('balance').alias('sumDistinct')).show()
    corr1 = df1.corr('balance', 'numTrans')
    print(corr1)
Example No. 7
train_data.printSchema()

# COMMAND ----------

#Summary of all the features in the file
train_data.describe().show()

# COMMAND ----------

#Summary of the numerical data available in the dataset
train_data.describe().select("summary", "outcome", "char_38").show()

# COMMAND ----------

#Finding Skewness and kurtosis for the char_38 column
train_data.select(skewness('char_38'), kurtosis('char_38')).show()

# COMMAND ----------

#Checking if there exists any Null values in the data
train_data.where(
    reduce(lambda x, y: x | y,
           (func.col(x).isNull() for x in train_data.columns))).show()
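
# An alternative hedged sketch: count the nulls per column instead of listing rows
# (same `train_data` DataFrame and `func` alias assumed):
train_data.select([func.count(func.when(func.col(c).isNull(), c)).alias(c)
                   for c in train_data.columns]).show()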

# COMMAND ----------

#Finding the count for each category of activity
train_data.groupBy('activity_category').count().sort(func.desc('count')).show()

# COMMAND ----------
Example No. 8
meta_data = {f: {} for f in origin_dtypes.keys()}

for f, v in origin_dtypes.items():
    tmp = df.withColumn(f'{f}_to_float', df[f'{f}'].cast('float'))
    if tmp.filter(fn.isnull(f'{f}_to_float')).count() < df_length * 0.4:
        num_value = tmp.groupBy(f).count().count()
        if num_value <= 10:
            col_type[f] = 'C'
            meta_data[f]['count'] = num_value
        else:
            col_type[f] = 'N'
            mi, _5, _25, median, _75, _95, mx = tmp.approxQuantile(
                f'{f}_to_float', [0.0, 0.005, 0.25, 0.5, 0.75, 0.995, 1.0], 0)
            # alias the aggregates so the dict keys do not depend on Spark's
            # auto-generated column names
            others = tmp.select(
                fn.mean(f'{f}_to_float').alias('mean'),
                fn.stddev(f'{f}_to_float').alias('std'),
                fn.skewness(f'{f}_to_float').alias('skew')).take(1)[-1].asDict()
            skew = others.get('skew', 0)
            meta_data[f]['max'] = mx
            meta_data[f]['min'] = mi
            meta_data[f]['median'] = median
            meta_data[f]['mean'] = others.get('mean', 0)
            meta_data[f]['std'] = others.get('std', 0)
            meta_data[f]['skew'] = skew

            iqr = _75 - _25
            if skew > 2.5:
                low = _5
                high = mx + 1.5 * iqr
            elif skew < -2.5:
                low = mi - 1.5 * iqr
                high = _95
Example No. 9
    percs['summary'] = [str(p) + '%' for p in percentiles]
    spark_describe = df_in.describe().toPandas()
    new_df = pd.concat([spark_describe, percs], ignore_index=True)
    new_df = new_df.round(2)
    return new_df[['summary'] + columns]

#Describe
selected = [s for s in df.columns if 'var_' in s]
print(selected)
df.select(selected).describe().show()
selected = ['var_0','var_1','var_2','var_3','var_4', 'var_5']
describe_pd(df,selected)
describe_pd(df,selected,deciles=True)

#Skewness and kurtosis
df.select(skewness('var_0'),kurtosis('var_0')).show()


#Plot histogram
var = [ 'var_108']
bins = np.arange(0,105,5.0)
df[var].describe().show()
plt.figure(figsize=(10,8))
plt.hist(df_new[var].astype(float),alpha=0.8,histtype='bar',ec='black')

df.dtypes
df[df.var_100 <14.0]

#Correlation matrix
selected = ['target','var_0','var_1','var_2','var_3','var_4', 'var_5']
features = df.select(selected).rdd.map(lambda row: row[0:])
Example No. 10
        return True
    elif np.isnan(p) == True:
        return False
    else:
        return False


# Create UDF funcs
get_pval_udf = F.udf(lambda vars: get_normal_pval(vars), FloatType())
if_norm_udf = F.udf(lambda p: if_norm(p), BooleanType())

# COMMAND ----------

eventsDataAll = eventsData.select('ActionGeo_FullName', 'wERA_3d', 'wERA_60d', 'nArticles') \
                                        .groupBy('ActionGeo_FullName') \
                                        .agg( F.skewness('wERA_3d'),
                                              F.kurtosis('wERA_3d'),
                                              F.stddev('wERA_3d'),
                                              F.variance('wERA_3d'),
                                              F.collect_list('wERA_3d').alias('list_wERA_3d'),
                                              F.skewness('wERA_60d'),
                                              F.kurtosis('wERA_60d'),
                                              F.stddev('wERA_60d'),
                                              F.variance('wERA_60d'),
                                              F.collect_list('wERA_60d').alias('list_wERA_60d'),
                                              F.sum('nArticles').alias('nArticles'),
                                              F.count(F.lit(1)).alias('n_observations')
                                        )

# get p-value and define normalcy
eventsDataAll = eventsDataAll.withColumn(
Example No. 11
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()

# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"), stddev_pop("Quantity"),
          stddev_samp("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
          covar_pop("InvoiceNo", "Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()

# COMMAND ----------

from pyspark.sql.functions import count
Example No. 12
    def run_pipeline(self):
        try:
            logging.info(
                "https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/"
            )
            # check collect_list and collect_set
            #collect_set() function returns all values from an input column with duplicate values eliminated.
            #collect_list() function returns all values from an input column with duplicates

            logging.info(
                'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/'
            )
            simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
                          ("Robert", "Sales", 4100),
                          ("Maria", "Finance", 3000), ("James", "Sales", 3000),
                          ("Scott", "Finance", 3300), ("Jen", "Finance", 3900),
                          ("Jeff", "Marketing", 3000),
                          ("Kumar", "Marketing", 2000),
                          ("Saif", "Sales", 4100)]
            schema = ["employee_name", "department", "salary"]

            df = self.spark.createDataFrame(data=simpleData,
                                            schema=schema).cache()
            df.show(truncate=False)

            from pyspark.sql.functions import approx_count_distinct, collect_list
            from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
            from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
            from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
            from pyspark.sql.functions import variance, var_samp, var_pop
            df.printSchema()
            df.show(truncate=False)

            print("approx_count_distinct: " + \
                  str(df.select(approx_count_distinct("salary")).collect()[0][0]))

            print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

            df.select(collect_list("salary")).show(truncate=False)

            df.select(collect_set("salary")).show(truncate=False)

            df2 = df.select(countDistinct("department", "salary"))
            df2.show(truncate=False)
            print("Distinct Count of Department & Salary: " +
                  str(df2.collect()[0][0]))

            print("count: " + str(df.select(count("salary")).collect()[0]))
            dffirst = df.select(first("salary"))
            dffirst.show(truncate=False)
            df.select(last("salary")).show(truncate=False)
            df.select(kurtosis("salary")).show(truncate=False)
            df.select(max("salary")).show(truncate=False)
            df.select(min("salary")).show(truncate=False)
            df.select(mean("salary")).show(truncate=False)
            df.select(skewness("salary")).show(truncate=False)
            df.select(stddev("salary"), stddev_samp("salary"), \
                      stddev_pop("salary")).show(truncate=False)
            df.select(sum("salary")).show(truncate=False)
            df.select(sumDistinct("salary")).show(truncate=False)
            df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
                .show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occured while running the pipeline > " +
                          str(exp))
            # send email notification
            # log error to database
            sys.exit(1)

        return
Example No. 13
def skewTemperature(df, spark):
    return df.select(F.skewness('temperature')).first()[0]
Example No. 14
# COMMAND ----------

# MAGIC %md
# MAGIC **Skewness**: a measure of symmetry, or more precisely, the lack of symmetry. A distribution, or data set, is symmetric if it looks the same to the left and right of the center point.
# MAGIC
# MAGIC **Kurtosis**: a measure of whether the data are heavy-tailed or light-tailed relative to a normal distribution. That is, data sets with high kurtosis tend to have heavy tails or outliers. Data sets with low kurtosis tend to have light tails or a lack of outliers.
# MAGIC
# MAGIC **Standard Deviation**: a statistical measure of the dispersion of the data relative to its mean. It is calculated with the square root of the variance. A low standard deviation indicates that the values tend to be close to the mean of the dataset, while a high standard deviation indicates that the values are spread out over a wider range.
# MAGIC
# MAGIC **Variance**: a measure of variability. It is calculated by taking the average of squared deviations from the mean. Variance tells you the degree of spread in your data set. The more spread the data, the larger the variance is in relation to the mean.
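
# COMMAND ----------

# A hedged, minimal illustration of the four measures defined above on a tiny toy
# DataFrame (the toy data is added here for illustration only; assumes the
# notebook's existing `spark` session and `import pyspark.sql.functions as F`):
toy = spark.createDataFrame([(1.0,), (2.0,), (2.0,), (3.0,), (9.0,)], ['x'])
toy.select(F.skewness('x'), F.kurtosis('x'), F.stddev('x'), F.variance('x')).show()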

# COMMAND ----------

goldsteinDataPartitioned = goldsteinData.select('ActionGeo_FullName', 'if_conflict', 'wGRA_1d', 'wGRA_60d', 'nArticles') \
                                        .groupBy('ActionGeo_FullName', 'if_conflict') \
                                        .agg( F.skewness('wGRA_1d'),
                                              F.kurtosis('wGRA_1d'),
                                              F.stddev('wGRA_1d'),
                                              F.variance('wGRA_1d'),
                                              F.collect_list('wGRA_1d').alias('list_wGRA_1d'),
                                              F.skewness('wGRA_60d'),
                                              F.kurtosis('wGRA_60d'),
                                              F.stddev('wGRA_60d'),
                                              F.variance('wGRA_60d'),
                                              F.collect_list('wGRA_60d').alias('list_wGRA_60d'),
                                              F.sum('nArticles').alias('nArticles'),
                                              F.count(F.lit(1)).alias('n_observations')
                                        )

goldsteinDataPartitioned.limit(4).toPandas()
Example No. 15
# **Note:** Use `count(lit(1))` rather than `count(1)` as an alternative to `count("*")`.
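
# For example, a minimal sketch (assuming the `rides` DataFrame used above):
from pyspark.sql.functions import count, lit
rides.agg(count(lit(1)).alias("n_rows")).show()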

# The `agg` method returns the same results and can be applied to grouped data:
rides.agg(count("*"), count("distance"), countDistinct("distance"),
          approx_count_distinct("distance")).show()

# Use the `sum` and `sumDistinct` functions to compute various column sums:
from pyspark.sql.functions import sum, sumDistinct
rides.agg(sum("distance"), sumDistinct("distance")).show()

# **Question:** When would one use the `sumDistinct` function?

# Spark SQL provides a number of summary statistics:
from pyspark.sql.functions import mean, stddev, variance, skewness, kurtosis
rides.agg(mean("distance"), stddev("distance"), variance("distance"),
          skewness("distance"), kurtosis("distance")).show()

# **Note:** `mean` is an alias for `avg`, `stddev` is an alias for the sample
# standard deviation `stddev_samp`, and `variance` is an alias for the sample
# variance `var_samp`.  The population standard deviation and population
# variance are available via `stddev_pop` and `var_pop`, respectively.
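
# A quick check of these aliases (a minimal sketch against the same `rides` data):
from pyspark.sql.functions import avg, stddev_samp, var_samp, stddev_pop, var_pop
rides.agg(mean("distance"), avg("distance"),
          stddev("distance"), stddev_samp("distance"), stddev_pop("distance"),
          variance("distance"), var_samp("distance"), var_pop("distance")).show()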

# Use the `min` and `max` functions to compute the minimum and maximum, respectively:
from pyspark.sql.functions import min, max
rides.agg(min("distance"), max("distance")).show()

# Use the `first` and `last` functions to compute the first and last values, respectively:
from pyspark.sql.functions import first, last
rides \
  .orderBy("distance") \
  .agg(first("distance", ignorenulls=False), last("distance", ignorenulls=False)) \
Example No. 16
def get_builtin_aggregator_column(agg, ctx):
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]

        try:
            input = ctx.populate_values(agg["input"],
                                        aggregator["input"],
                                        preserve_column_refs=False)
        except CortexException as e:
            e.wrap("input")
            raise

        if aggregator["name"] == "approx_count_distinct":
            return F.approxCountDistinct(input["col"],
                                         input.get("rsd")).alias(agg["name"])
        if aggregator["name"] == "avg":
            return F.avg(input).alias(agg["name"])
        if aggregator["name"] in {
                "collect_set_int", "collect_set_float", "collect_set_string"
        }:
            return F.collect_set(input).alias(agg["name"])
        if aggregator["name"] == "count":
            return F.count(input).alias(agg["name"])
        if aggregator["name"] == "count_distinct":
            return F.countDistinct(*input).alias(agg["name"])
        if aggregator["name"] == "covar_pop":
            return F.covar_pop(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "covar_samp":
            return F.covar_samp(input["col1"],
                                input["col2"]).alias(agg["name"])
        if aggregator["name"] == "kurtosis":
            return F.kurtosis(input).alias(agg["name"])
        if aggregator["name"] in {"max_int", "max_float", "max_string"}:
            return F.max(input).alias(agg["name"])
        if aggregator["name"] == "mean":
            return F.mean(input).alias(agg["name"])
        if aggregator["name"] in {"min_int", "min_float", "min_string"}:
            return F.min(input).alias(agg["name"])
        if aggregator["name"] == "skewness":
            return F.skewness(input).alias(agg["name"])
        if aggregator["name"] == "stddev":
            return F.stddev(input).alias(agg["name"])
        if aggregator["name"] == "stddev_pop":
            return F.stddev_pop(input).alias(agg["name"])
        if aggregator["name"] == "stddev_samp":
            return F.stddev_samp(input).alias(agg["name"])
        if aggregator["name"] in {"sum_int", "sum_float"}:
            return F.sum(input).alias(agg["name"])
        if aggregator["name"] in {"sum_distinct_int", "sum_distinct_float"}:
            return F.sumDistinct(input).alias(agg["name"])
        if aggregator["name"] == "var_pop":
            return F.var_pop(input).alias(agg["name"])
        if aggregator["name"] == "var_samp":
            return F.var_samp(input).alias(agg["name"])
        if aggregator["name"] == "variance":
            return F.variance(input).alias(agg["name"])

        raise ValueError("missing builtin aggregator")  # unexpected

    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
Example No. 17
    spark.read  # Our DataFrameReader
    .option("header", "true")  # Let Spark know we have a header
    .option("inferSchema",
            "false")  # Infering the schema (it is a small dataset)
    .format("com.databricks.spark.csv").csv(
        "/FileStore/tables/telecomData/churn_bigml_20-55239.csv",
        schema=schema,
        nullValue='NA')  # Enforce the Schema 
    .cache()  # Mark the DataFrame as cached.
)

testDF.printSchema()
testDF.count()

#Data skew done using pyspark functions and display as pie chart and adding to dashboard
trainSet.select(f.skewness(trainSet['total_international_charge']),
                f.skewness(trainSet['total_day_charge']),
                f.skewness(trainSet['total_evening_charge']),
                f.skewness(trainSet['total_night_charge']))

# churn is related to the total international call charges:
trainSet.groupBy("churn").sum("total_international_charge").show()

# churn is related to the total international num of calls:
trainSet.groupBy("churn").sum("total_international_num_calls").show()

#Use sparksql to analyze data
# create a temp view for persistence for this session
trainSet.createOrReplaceTempView("UserAccount")

# create a catalog as an interface that can be used to create, drop, alter, or query underlying databases, tables, functions
Example No. 18
# MAGIC
# MAGIC ## Statistical functions
# MAGIC
# MAGIC - We can do some basic statistical functions as well using the Spark API

# COMMAND ----------

# standard deviation and variance
dailyActivitiesDF.select(var_pop("CaloriesBurned"), var_samp("CaloriesBurned"),
                         stddev_pop("CaloriesBurned"),
                         stddev_samp("CaloriesBurned")).show()

# COMMAND ----------

# Any extreme points in our data?
dailyActivitiesDF.select(skewness("CaloriesBurned"),
                         kurtosis("CaloriesBurned")).show()

# COMMAND ----------

# Covariance and Correlation
dailyActivitiesDF.select(corr("CaloriesBurned", "Steps"),
                         covar_samp("CaloriesBurned", "Steps"),
                         covar_pop("CaloriesBurned", "Steps")).show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Multiple languages in one notebook
# MAGIC
Example No. 19
    def describe_float_1d(df, column, current_result, nrows):
        if spark_version == "1.6+":
            stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                       df_min(col(column)).alias("min"),
                                                       df_max(col(column)).alias("max"),
                                                       variance(col(column)).alias("variance"),
                                                       kurtosis(col(column)).alias("kurtosis"),
                                                       stddev(col(column)).alias("std"),
                                                       skewness(col(column)).alias("skewness"),
                                                       df_sum(col(column)).alias("sum")
                                                       ).toPandas()
        else:
            stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                       df_min(col(column)).alias("min"),
                                                       df_max(col(column)).alias("max"),
                                                       df_sum(col(column)).alias("sum")
                                                       ).toPandas()
            stats_df["variance"] = df.select(column).na.drop().agg(variance_custom(col(column),
                                                                                   stats_df["mean"].ix[0],
                                                                                   current_result["count"])).toPandas().ix[0][0]
            stats_df["std"] = np.sqrt(stats_df["variance"])
            stats_df["skewness"] = df.select(column).na.drop().agg(skewness_custom(col(column),
                                                                                   stats_df["mean"].ix[0],
                                                                                   current_result["count"])).toPandas().ix[0][0]
            stats_df["kurtosis"] = df.select(column).na.drop().agg(kurtosis_custom(col(column),
                                                                                   stats_df["mean"].ix[0],
                                                                                   current_result["count"])).toPandas().ix[0][0]

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (df.select(column)
                                        .na.drop()
                                        .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                    .format(col=column, n=x)).toPandas().iloc[:, 0]
                                        )
        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column)
                        .na.drop()
                        .select(df_abs(col(column)-stats["mean"]).alias("delta"))
                        .agg(df_sum(col("delta"))).toPandas().ix[0,0] / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column)==0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        # Large histogram
        imgdata = BytesIO()
        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
        figure = plt.figure(figsize=(6, 4))
        plot = plt.subplot()
        plt.bar(hist_data["left_edge"],
                hist_data["count"],
                width=hist_data["width"],
                facecolor='#337ab7')
        plot.set_ylabel("Frequency")
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        #TODO Think about writing this to disk instead of caching them in strings
        plt.close(plot.figure)

        stats['mini_histogram'] = mini_histogram(hist_data)

        return stats
Example No. 20
        np.percentile(df_in.select(x).collect(), percentiles) for x in columns
    ])
    percs = pd.DataFrame(percs, columns=columns)
    percs['summary'] = [str(p) + '%' for p in percentiles]
    spark_describe = df_in.describe().toPandas()
    new_df = pd.concat([spark_describe, percs], ignore_index=True)
    new_df = new_df.round(2)
    return new_df[['summary'] + columns]


describe_pd(ds, ['Score'])

# skewness and kurtosis
from pyspark.sql.functions import skewness, kurtosis
var = 'Score'
ds.select(skewness(var), kurtosis(var)).show()

# histogram
import matplotlib.pyplot as plt
var = 'Score'
plot_data = ds.select(var).toPandas()
x = plot_data[var]
bins = [0, 3.6, 3.8, 3.9, 4]
hist, bin_edges = np.histogram(x,
                               bins,
                               weights=np.zeros_like(x) +
                               100. / x.size)  # make the histogram
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1)
# Plot the histogram heights against integers on the x axis
ax.bar(range(len(hist)), hist, width=1, alpha=0.8, ec='black', color='gold')
Example No. 21
    def describe_float_1d(df, column, current_result, nrows):
        if spark_version == "1.6+":
            stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                       df_min(col(column)).alias("min"),
                                                       df_max(col(column)).alias("max"),
                                                       variance(col(column)).alias("variance"),
                                                       kurtosis(col(column)).alias("kurtosis"),
                                                       stddev(col(column)).alias("std"),
                                                       skewness(col(column)).alias("skewness"),
                                                       df_sum(col(column)).alias("sum"),
                                                       df_sum((col(column) == 0.0).cast("int")).alias('n_zeros')
                                                       ).toPandas()
        else:
            stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                       df_min(col(column)).alias("min"),
                                                       df_max(col(column)).alias("max"),
                                                       df_sum(col(column)).alias("sum"),
                                                       df_sum((col(column) == 0.0).cast("int")).alias('n_zeros')
                                                       ).toPandas()
            stats_df["variance"] = df.select(column).na.drop().agg(variance_custom(col(column),
                                                                                   stats_df["mean"].iloc[0],
                                                                                   current_result["count"])).toPandas().iloc[0][0]
            stats_df["std"] = np.sqrt(stats_df["variance"])
            stats_df["skewness"] = df.select(column).na.drop().agg(skewness_custom(col(column),
                                                                                   stats_df["mean"].iloc[0],
                                                                                   current_result["count"])).toPandas().iloc[0][0]
            stats_df["kurtosis"] = df.select(column).na.drop().agg(kurtosis_custom(col(column),
                                                                                   stats_df["mean"].iloc[0],
                                                                                   current_result["count"])).toPandas().iloc[0][0]

        for x in [0.05, 0.25, 0.5, 0.75, 0.95]:
            stats_df[pretty_name(x)] = (df.select(column)
                                        .na.drop()
                                        .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                    .format(col=column, n=x)).toPandas().iloc[:,0]
                                        )
        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column)
                        .na.drop()
                        .select(df_abs(col(column)-stats["mean"]).alias("delta"))
                        .agg(df_sum(col("delta"))).toPandas().iloc[0,0] / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        # Large histogram
        imgdata = BytesIO()
        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
        figure = plt.figure(figsize=(6, 4))
        plot = plt.subplot()
        plt.bar(hist_data["left_edge"],
                hist_data["count"],
                width=hist_data["width"],
                facecolor='#337ab7')
        plot.set_ylabel("Frequency")
        plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1, wspace=0, hspace=0)
        plot.figure.savefig(imgdata)
        imgdata.seek(0)
        stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
        #TODO Think about writing this to disk instead of caching them in strings
        plt.close(plot.figure)

        stats['mini_histogram'] = mini_histogram(hist_data)

        return stats
Example No. 22
def numStats(dataframe, field):
    '''
    This function works with pandas and spark dataframe
    Parameter field must be a String value, and it must make reference to a single column name
    Parameter field must make reference to a numerical variable
    This function does not consider null values on its calculations
    '''
    if isinstance(dataframe, pd.DataFrame) == True:
        df = dataframe[field].dropna() # Selecting column and dropping null values
        # Count of Values
        count = df.count()
        countNullValues = dataframe.shape[0] - count
        # Central Tendency
        mean_value = df.sum() / count
        median_value = df.median()
        # Min, max, and Percentiles
        min_value = df.min()
        max_value = df.max()
        percentile_25 = df.quantile(0.25)
        percentile_75 = df.quantile(0.75)
        # Variation
        stddev_value = df.std()
        range_value = max_value - min_value
        IQR_value = percentile_75 - percentile_25
        #Shape
        skewness_value = df.skew()
        kurtosis_value = df.kurt()
    elif isinstance(dataframe, DataFrame) == True:
        df = dataframe.select(field).dropna(how='any')
        # Count of Values
        count=df.count()
        countNullValues = dataframe.select(field).count() - count
        # Central Tendency
        mean_process=df.agg(avg(col(field))) # The result of agg is a spark dataframe
        mean_value=mean_process.collect()[0][mean_process.columns[0]] # The result of collect is a list of row. [0] first element & [mean_process.columns[0] name of the column
        median_value=df.approxQuantile(col=field,probabilities=[0.5],relativeError=0.05)[0] # The result of approxQuantile is a list. [0] first element
        # Min, Max, and Percentiles
        min_process=df.agg(min(col(field))) # The result of agg is a spark dataframe
        min_value=min_process.collect()[0][min_process.columns[0]] # The result of collect is a list of row. [0] first element & [min_process.columns[0] name of the column
        max_process=df.agg(max(col(field))) # The result of agg is a spark dataframe
        max_value=max_process.collect()[0][max_process.columns[0]] # The result of collect is a list of row. [0] first element & [max_process.columns[0] name of the column
        percentile_25=df.approxQuantile(col=field,probabilities=[0.25],relativeError=0.05)[0] # The result of approxQuantile is a list. [0] first element
        percentile_75=df.approxQuantile(col=field,probabilities=[0.75],relativeError=0.05)[0] # The result of approxQuantile is a list. [0] first element
        # Variation
        stddev_process=df.agg(stddev(col(field))) # The result of agg is a spark dataframe
        stddev_value=stddev_process.collect()[0][stddev_process.columns[0]] # The result of collect is a list of row. [0] first element & [stddev_process.columns[0] name of the column
        range_value=max_value-min_value # Calculation of the range of values
        IQR_value=percentile_75-percentile_25 # Calculation of the Interquartile range
        # Shape
        skewness_process=df.agg(skewness(col(field))) # The result of agg is a spark dataframe
        skewness_value=skewness_process.collect()[0][skewness_process.columns[0]] # The result of collect is a list of row. [0] first element & [skewness_process.columns[0] name of the column
        kurtosis_process=df.agg(kurtosis(col(field))) # The result of agg is a spark dataframe
        kurtosis_value=round(kurtosis_process.collect()[0][kurtosis_process.columns[0]],2) # The result of collect is a list of row. [0] first element & [kurtosis_process.columns[0] name of the column
    
    
    # Printing summary of statistics
    print('Summary of Descriptive Statistics - ',field)
    print('**********************************************************')
    print('Count of values          : ',count)
    print('Count of Null values     : ',countNullValues)
    print('Central tendency:-----------------------------------------')
    print('Mean(Average)            : ',round(mean_value,2))
    print('Median(Percentile 50)    : ',round(median_value,2))
    print('Min, Max, and Percentiles:--------------------------------')
    print('Minimum                  : ',round(min_value,2))
    print('Maximum                  : ',round(max_value,2))
    print('Percentile 25 (Q1)       : ',round(percentile_25,2))
    print('Percentile 75 (Q3)       : ',round(percentile_75,2))
    print('Variation:------------------------------------------------')
    print('Standard Deviation       : ',round(stddev_value,2))
    print('Range                    : ',round(range_value,2))
    print('Interquartile Range (IQR): ',round(IQR_value,2))
    print('Shape:----------------------------------------------------')
    print('Skewness                 : ',round(skewness_value,2))
    print('Kurtosis                 : ',round(kurtosis_value,2))
    print('**********************************************************')
    # Creating a dictionary with descriptive statistics
    data = {'Statistic': ['count', 'Count Null Values', 'mean', 'median', 'min', 'max', 'percentile25', 'percentile75', 'stddev', 'range', 'IQR', 'skewness', 'kurtosis'],
            'Values': [count,countNullValues,mean_value,median_value,min_value,max_value,percentile_25,percentile_75,stddev_value,range_value,IQR_value,skewness_value,kurtosis_value]}
    # Creating a pandas dataframe
    summary_stats = pd.DataFrame(data)
    return summary_stats # This function returns a pandas dataframe
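
# A hedged usage sketch with a tiny pandas DataFrame (the column name and data
# are illustrative, not from the original code):
import pandas as pd
toy = pd.DataFrame({'amount': [1.0, 2.0, 2.5, 4.0, None]})
toy_summary = numStats(toy, 'amount')
print(toy_summary)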
Example No. 23
         END AS minDist
FROM distPOI0
"""
distPOI2 = spark.sql(query)
distPOI2.registerTempTable("distPOI2")
distPOI2.show()

# Stage 2 Analysis

# grouping data by POI
by_POI = distPOI2.groupBy("POI")

by_POI.avg("minDist").show()

by_POI.agg(F.stddev("minDist")).show()

by_POI.min("minDist").show()

by_POI.max("minDist").show()

by_POI.agg(F.skewness("minDist")).show()

by_POI.agg(F.kurtosis("minDist")).show()

query = """SELECT COUNT(_ID) Requests, POI, AVG(minDist) AS Mean,  percentile_approx(minDist, 0.5) AS Median,
MAX(minDist) AS poiRadius_km, COUNT(_ID)/(3.14159*POWER(MAX(minDist),2)) AS Density_Requests_by_km2
FROM distPOI2
GROUP BY POI
"""
spark.sql(query).show()
Example No. 24
def randomClassifier(dataset_add, feature_colm, label_colm, relation_list,
                     relation):
    try:
        dataset = spark.read.parquet(dataset_add)
        label = ''
        for y in label_colm:
            label = y

        Schema = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in Schema:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)

        summaryList = ['mean', 'stddev', 'min', 'max']
        summaryDict = {}

        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')
        for colm in numericalFeatures:
            summaryListTemp = []
            for value in summaryList:
                summ = list(
                    dataset.select(colm).summary(value).toPandas()[colm])
                summaryListSubTemp = []
                for val in summ:
                    summaryListSubTemp.append(round(float(val), 4))
                # print(summaryListSubTemp)
                summaryListTemp.append(summaryListSubTemp)
            # varianceListTemp = list(dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm])
            # summaryListTemp.append(varianceListTemp)
            summaryDict[colm] = summaryListTemp
        # summaryList.append('variance')
        summaryDict['summaryName'] = summaryList
        summaryDict['categoricalColumn'] = stringFeatures
        skewnessList = []
        kurtosisList = []
        varianceList = []
        skewKurtVarDict = {}
        for colm in numericalFeatures:
            skewness = (dataset.select(F.skewness(dataset[colm])).toPandas())
            for i, row in skewness.iterrows():
                for j, column in row.items():
                    skewnessList.append(round(column, 4))
            kurtosis = (dataset.select(F.kurtosis(dataset[colm])).toPandas())
            for i, row in kurtosis.iterrows():
                for j, column in row.items():
                    kurtosisList.append(round(column, 4))
            variance = (dataset.select(F.variance(dataset[colm])).toPandas())
            for i, row in variance.iterrows():
                for j, column in row.items():
                    varianceList.append(round(column, 4))

        for skew, kurt, var, colm in zip(skewnessList, kurtosisList,
                                         varianceList, numericalFeatures):
            print(skew, kurt, var)
            skewKurtVarList = []
            skewKurtVarList.append(skew)
            skewKurtVarList.append(kurt)
            skewKurtVarList.append(var)
            skewKurtVarDict[colm] = skewKurtVarList

        for (keyOne, valueOne), (keyTwo,
                                 valueTwo) in zip(summaryDict.items(),
                                                  skewKurtVarDict.items()):
            print(keyOne, valueOne, keyTwo, valueTwo)
            if keyOne == keyTwo:
                valueOne.extend(valueTwo)
                summaryDict[keyOne] = valueOne
        print(summaryDict)
        summaryList.extend(['skewness', 'kurtosis', 'variance'])
        print(summaryList)
        print(summaryDict)
        # for colm in numericalFeatures:
        #     skewness = (dataset.select(F.skewness(dataset[colm])).alias('skewness_' + colm))
        #     kurtosis = (dataset.select(F.kurtosis(dataset[colm])).alias('kurtosis_' + colm))
        #     variance = (dataset.select(F.variance(dataset[colm]).alias('kurtosis_' + colm)))
        if relation == 'linear':
            dataset = dataset
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)

        dataset.show()
        for x in Schema:
            if (str(x.dataType) == "StringType" and x.name == label):
                for labelkey in label_colm:
                    label_indexer = StringIndexer(inputCol=label,
                                                  outputCol='indexed_' +
                                                  label).fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label
        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        final_features = numericalFeatures + indexed_features
        response_chi_test = chi_square_test(dataset=dataset,
                                            features=indexed_features,
                                            label_col=label,
                                            stringFeatures=stringFeatures)

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")
        dataset = featureassembler.transform(dataset)
        dataset.show()
        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=4).fit(dataset)
        categorical_features = vec_indexer.categoryMaps
        print("Choose %d categorical features: %s" %
              (len(categorical_features), ", ".join(
                  str(k) for k in categorical_features.keys())))
        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()
        finalized_data = vec_indexed.select(label, 'vec_indexed_features')
        train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                           seed=40)
        rf = RandomForestClassifier(labelCol=label,
                                    featuresCol='vec_indexed_features',
                                    numTrees=10)
        model = rf.fit(train_data)
        predictions = model.transform(test_data)
        print(model.featureImportances)
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)
        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)
        # feature_importance = [round(x,4) for x in feature_importance]
        featureImportance = []
        for x in feature_importance:
            featureImportance.append(round(x, 4))
        print(featureImportance)

        features_column_for_user = numericalFeatures + stringFeatures
        feature_imp = {
            'feature_importance': featureImportance,
            "feature_column": features_column_for_user
        }
        response_dict = {
            'feature_importance': feature_imp,
            'ChiSquareTestData': response_chi_test,
            'summaryDict': summaryDict
        }
        return response_dict
    except Exception as e:
        print("exception is  = " + str(e))
    "avg_purchases",
    "mean_purchases").show()


# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
  stddev_pop("Quantity"), stddev_samp("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
    covar_pop("InvoiceNo", "Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()

Example No. 26
 def skew(data_frame, measure_column_name):
     return data_frame.select(
         FN.skewness(measure_column_name)).collect()[0][0]
Example No. 27
    .option("inferSchema",
            "true")  # Infering the schema (it is a small dataset)
    .csv(fileName)  # Location of our data
    .cache()  # Mark the DataFrame as cached.
)
trainDF.count()  # Materialize the cache
trainDF.printSchema()

testDF = (spark.read.option("header", "true").option(
    "inferSchema", "true").format("com.databricks.spark.csv").load(
        "/FileStore/tables/test.csv").cache())
testDF.printSchema()
testDF.count()

#skewness
trainDF.select(f.skewness(trainDF['cont1']), f.skewness(trainDF['cont2']),
               f.skewness(trainDF['cont3']), f.skewness(trainDF['cont10']))

#show high correlation observed from scatter plot
trainDF.stat.corr("cont12", "cont11")

trainDF.stat.corr("cont1", "cont9")
trainDF.stat.corr("cont14", "loss")

trainDF.createOrReplaceTempView("insurance")

spark.sql("SELECT avg(insurance.loss) as AVG_LOSS FROM insurance").show()
spark.sql("SELECT min(insurance.loss) as MIN_LOSS FROM insurance").show()
spark.sql("SELECT max(insurance.loss) as MAX_LOSS FROM insurance").show()

#rename loss to label
Example No. 28
    def get_data(self):
        """
        Returns statistics about attributes in a data frame
        """

        from pyspark.sql import functions

        # Correlation pairs
        corr_pairs = list(
            chunks(list(itertools.product(self.attrs, self.attrs)),
                   len(self.attrs)))

        # Cache data
        self.data.cache()

        df_count = self.data.count()

        # TODO: Implement median using df.approxQuantile('col', [.5], .25)

        stats = []
        for i, name in enumerate(self.attrs):
            df_col = functions.col(name)
            stats.append(functions.lit(name))
            stats.append(functions.max(df_col).alias('max_{}'.format(name)))
            stats.append(functions.min(df_col).alias('min_{}'.format(name)))
            if name in self.numeric_attrs:
                stats.append(
                    functions.round(functions.stddev(df_col),
                                    4).alias('stddev_{}'.format(name)))
            else:
                stats.append(functions.lit('-'))
            stats.append(
                functions.count(df_col).alias('count_{}'.format(name)))
            if name in self.numeric_attrs:
                stats.append(
                    functions.round(functions.avg(df_col),
                                    4).alias('avg_{}'.format(name)))
            else:
                stats.append(functions.lit('-'))

            stats.append(
                functions.approx_count_distinct(df_col).alias(
                    'distinct_{}'.format(name)))
            stats.append((df_count - functions.count(df_col)).alias(
                'missing_{}'.format(name)))

            if name in self.numeric_attrs:
                stats.append(
                    functions.round(functions.skewness(df_col),
                                    2).alias('skewness_{}'.format(name)))
                stats.append(
                    functions.round(functions.kurtosis(df_col),
                                    2).alias('kurtosis_{}'.format(name)))
            else:
                stats.append(functions.lit('-'))
                stats.append(functions.lit('-'))

            if self.params['correlation']:
                for pair in corr_pairs[i]:
                    if all(p in self.numeric_attrs for p in pair):
                        stats.append(
                            functions.round(functions.corr(*pair),
                                            4).alias('corr_{}'.format(i)))
                    else:
                        stats.append(functions.lit('-'))

        self.data = self.data.agg(*stats)
        aggregated = self.data.take(1)[0]
        n = len(self.names)
        rows = [aggregated[i:i + n] for i in range(0, len(aggregated), n)]

        return {"rows": rows, "attributes": self.get_column_names().split(',')}
Exemplo n.º 29
0
def if_norm(p):
    # alpha is defined earlier in the (truncated) original example
    if p < alpha:  # if norm
        return True
    elif np.isnan(p):
        return False
    else:
        return False

# Create UDF funcs
get_pval_udf = F.udf(lambda vars: get_normal_pval(vars), FloatType())
if_norm_udf = F.udf(lambda p: if_norm(p), BooleanType())

# COMMAND ----------

toneDataAll = toneData.select('ActionGeo_FullName', 'wTRA_1d', 'wTRA_60d', 'nArticles') \
                                        .groupBy('ActionGeo_FullName') \
                                        .agg( F.skewness('wTRA_1d'),
                                              F.kurtosis('wTRA_1d'),
                                              F.stddev('wTRA_1d'),
                                              F.variance('wTRA_1d'),
                                              F.collect_list('wTRA_1d').alias('list_wTRA_1d'),
                                              F.skewness('wTRA_60d'),
                                              F.kurtosis('wTRA_60d'),
                                              F.stddev('wTRA_60d'),
                                              F.variance('wTRA_60d'),
                                              F.collect_list('wTRA_60d').alias('list_wTRA_60d'),
                                              F.sum('nArticles').alias('nArticles'),
                                              F.count(F.lit(1)).alias('n_observations')
                                        )

# get p-value and define normalcy
toneDataAll = toneDataAll.withColumn('p_value_1d', get_pval_udf(toneDataAll.list_wTRA_1d))
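
# The block below belongs to a separate, truncated example; `simpleData` and
# `schema` are defined in its elided beginning. A hypothetical shape, for
# illustration only:
# simpleData = [("James", "Sales", 3000), ("Anna", "Finance", 3900)]
# schema = ["employee_name", "department", "salary"]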
from pyspark.sql.functions import (approx_count_distinct, avg, collect_list,
    collect_set, count, countDistinct, first, last, kurtosis, max, mean, min,
    skewness, stddev, stddev_pop, stddev_samp, sum, sumDistinct, var_pop,
    var_samp, variance)

df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " + \
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))

print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)

df.select(collect_set("salary")).show(truncate=False)

df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department &amp; Salary: "+str(df2.collect()[0][0]))

print("count: "+str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"), \
    stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"),var_samp("salary"),var_pop("salary")) \
  .show(truncate=False)
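
# The individual show() calls above each launch a separate job; the same
# statistics can also be gathered in a single pass (a sketch over the same df):
df.select(avg("salary"), stddev("salary"), skewness("salary"),
          kurtosis("salary"), min("salary"), max("salary")).show(truncate=False)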
Exemplo n.º 31
0
 def desc_stats(self, dataframe, field):
     # Parameters:
     #   dataframe: must be a Spark DataFrame
     #   field: string value; it must match a single column name
     # Note on nulls: this function does not consider null values in its calculations
     # Importing libraries and modules
     import pandas as pd
     from pyspark.sql.functions import col, avg, min, max, stddev, skewness, kurtosis
     # Selecting column. Dropping null values
     df = dataframe.select(field).dropna(how='any')
     # Count of Values
     count = df.count()
     # Central Tendency
     # agg() returns a Spark DataFrame; collect() returns a list of Rows, so
     # [0][column_name] extracts the single aggregated value
     mean_process = df.agg(avg(col(field)))
     mean_value = mean_process.collect()[0][mean_process.columns[0]]
     # approxQuantile returns a list with one value per requested probability
     median_value = df.approxQuantile(col=field, probabilities=[0.5],
                                      relativeError=0.05)[0]
     # Min, Max, and Percentiles
     min_process = df.agg(min(col(field)))
     min_value = min_process.collect()[0][min_process.columns[0]]
     max_process = df.agg(max(col(field)))
     max_value = max_process.collect()[0][max_process.columns[0]]
     percentile_25 = df.approxQuantile(col=field, probabilities=[0.25],
                                       relativeError=0.05)[0]
     percentile_75 = df.approxQuantile(col=field, probabilities=[0.75],
                                       relativeError=0.05)[0]
     # Variation
     stddev_process = df.agg(stddev(col(field)))
     stddev_value = stddev_process.collect()[0][stddev_process.columns[0]]
     range_value = max_value - min_value  # Range of values
     IQR_value = percentile_75 - percentile_25  # Interquartile range
     # Shape
     skewness_process = df.agg(skewness(col(field)))
     skewness_value = skewness_process.collect()[0][skewness_process.columns[0]]
     kurtosis_process = df.agg(kurtosis(col(field)))
     kurtosis_value = kurtosis_process.collect()[0][kurtosis_process.columns[0]]
     # Printing summary of statistics
     print('Summary of Descriptive Statistics - ', field)
     print('**********************************************************')
     print('Count of values          : ', count)
     print('Central tendency:-----------------------------------------')
     print('Mean(Average)            : ', round(mean_value, 2))
     print('Median (Percentile 50)   : ', round(median_value, 2))
     print('Min, Max, and Percentiles:--------------------------------')
     print('Minimum                  : ', round(min_value, 2))
     print('Maximum                  : ', round(max_value, 2))
     print('Percentile 25 (Q1)       : ', round(percentile_25, 2))
     print('Percentile 75 (Q3)       : ', round(percentile_75, 2))
     print('Variation:------------------------------------------------')
     print('Standard Deviation       : ', round(stddev_value, 2))
     print('Range                    : ', round(range_value, 2))
     print('Interquartile Range (IQR): ', round(IQR_value, 2))
     print('Shape:----------------------------------------------------')
     print('Skewness                 : ', round(skewness_value, 2))
     print('Kurtosis                 : ', round(kurtosis_value, 2))
     print('**********************************************************')
     # Creating a dictionary with descriptive statistics
     data = {
         'Statistic': [
             'count', 'mean', 'median', 'min', 'max', 'percentile25',
             'percentile75', 'stddev', 'range', 'iqr', 'skewness',
             'kurtosis'
         ],
         'Values': [
             count, mean_value, median_value, min_value, max_value,
             percentile_25, percentile_75, stddev_value, range_value,
             IQR_value, skewness_value, kurtosis_value
         ]
     }
     # Creating a pandas dataframe
     summary_stats = pd.DataFrame(data)
     return summary_stats  # This function returns a pandas dataframe
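
# A minimal usage sketch (hypothetical names): `profiler` is an instance of the
# class that defines desc_stats, and `sales_df` is a Spark DataFrame with a
# numeric column "amount"; the call prints the summary and returns a pandas
# DataFrame:
#
#     stats_pdf = profiler.desc_stats(sales_df, "amount")
#     print(stats_pdf)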