Example No. 1
# standard deviation and variance
from pyspark.sql.functions import var_pop, var_samp, stddev_pop, stddev_samp
dailyActivitiesDF.select(var_pop("CaloriesBurned"), var_samp("CaloriesBurned"),
                         stddev_pop("CaloriesBurned"),
                         stddev_samp("CaloriesBurned")).show()

# COMMAND ----------

# Any extreme points in our data?
from pyspark.sql.functions import skewness, kurtosis
dailyActivitiesDF.select(skewness("CaloriesBurned"),
                         kurtosis("CaloriesBurned")).show()

# COMMAND ----------

# Covariance and Correlation
from pyspark.sql.functions import corr, covar_samp, covar_pop
dailyActivitiesDF.select(corr("CaloriesBurned", "Steps"),
                         covar_samp("CaloriesBurned", "Steps"),
                         covar_pop("CaloriesBurned", "Steps")).show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Multiple languages in one notebook
# MAGIC
# MAGIC - One cool thing about Databricks is that we can combine languages within a notebook
# MAGIC - For example, our Data Scientists who are comfortable writing Python can prototype an analysis, and our Data Engineers can then optimise it using Scala (a SQL cell is also sketched below)

# COMMAND ----------

# MAGIC %sql
# MAGIC
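# MAGIC -- A minimal sketch of what this SQL cell might contain (the original SQL is
# MAGIC -- not shown); daily_activities is a hypothetical temp view assumed to have
# MAGIC -- been registered from dailyActivitiesDF.
# MAGIC SELECT avg(CaloriesBurned)        AS avg_calories,
# MAGIC        stddev_samp(CaloriesBurned) AS stddev_calories
# MAGIC FROM daily_activities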
Example No. 2
# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"), stddev_pop("Quantity"),
          stddev_samp("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
          covar_pop("InvoiceNo", "Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()

# COMMAND ----------

from pyspark.sql.functions import count, expr

df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"), expr("count(Quantity)")).show()

# COMMAND ----------
from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
  stddev_pop("Quantity"), stddev_samp("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
    covar_pop("InvoiceNo", "Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()


# COMMAND ----------

from pyspark.sql.functions import count, expr

df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"), expr("count(Quantity)")).show()
Example No. 4
# Use the `min` and `max` functions to compute the minimum and maximum values:
from pyspark.sql.functions import min, max
rides.agg(min("distance"), max("distance")).show()

# Use the `first` and `last` functions to compute the first and last values, respectively:
from pyspark.sql.functions import first, last
rides \
  .orderBy("distance") \
  .agg(first("distance", ignorenulls=False), last("distance", ignorenulls=False)) \
  .show()

# **Note:** Null values sort before valid numerical values.
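# As a variant (assuming the same `rides` DataFrame), pass ignorenulls=True so that
# `first` returns the smallest non-null distance instead of a null:
rides \
  .orderBy("distance") \
  .agg(first("distance", ignorenulls=True), last("distance", ignorenulls=True)) \
  .show()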

# Use the `corr`, `covar_samp`, or `covar_pop` functions to measure the linear
# association between two columns:
from pyspark.sql.functions import corr, covar_samp, covar_pop
rides \
  .agg(corr("distance", "duration"), covar_samp("distance", "duration"), covar_pop("distance", "duration")) \
  .show()

# The `collect_list` and `collect_set` functions return a column of array type:
from pyspark.sql.functions import collect_list, collect_set
rides.agg(collect_set("service")).show(truncate=False)

# **Note:** `collect_list` does not remove duplicates and will return a very
# long array in this case.
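# A small sketch (same `rides` DataFrame assumed): comparing the size of the
# deduplicated set with the size of the full list shows how many duplicates
# `collect_list` keeps.
from pyspark.sql.functions import size
rides.agg(size(collect_set("service")), size(collect_list("service"))).show()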

# ## Grouping data

# Use the `agg` method with the `groupBy` (or `groupby`) method to refine your
# analysis:
# For example, count and average the distances within each group:
from pyspark.sql.functions import count, mean
rides \
  .groupBy("rider_student") \
  .agg(count("distance"), mean("distance")) \
  .show()
Example No. 5
# `F` below is pyspark.sql.functions; CortexException is defined elsewhere in this
# codebase (its import is not shown in this excerpt).
from pyspark.sql import functions as F


def get_builtin_aggregator_column(agg, ctx):
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]

        try:
            input = ctx.populate_values(agg["input"],
                                        aggregator["input"],
                                        preserve_column_refs=False)
        except CortexException as e:
            e.wrap("input")
            raise

        if aggregator["name"] == "approx_count_distinct":
            return F.approxCountDistinct(input["col"],
                                         input.get("rsd")).alias(agg["name"])
        if aggregator["name"] == "avg":
            return F.avg(input).alias(agg["name"])
        if aggregator["name"] in {
                "collect_set_int", "collect_set_float", "collect_set_string"
        }:
            return F.collect_set(input).alias(agg["name"])
        if aggregator["name"] == "count":
            return F.count(input).alias(agg["name"])
        if aggregator["name"] == "count_distinct":
            return F.countDistinct(*input).alias(agg["name"])
        if aggregator["name"] == "covar_pop":
            return F.covar_pop(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "covar_samp":
            return F.covar_samp(input["col1"],
                                input["col2"]).alias(agg["name"])
        if aggregator["name"] == "kurtosis":
            return F.kurtosis(input).alias(agg["name"])
        if aggregator["name"] in {"max_int", "max_float", "max_string"}:
            return F.max(input).alias(agg["name"])
        if aggregator["name"] == "mean":
            return F.mean(input).alias(agg["name"])
        if aggregator["name"] in {"min_int", "min_float", "min_string"}:
            return F.min(input).alias(agg["name"])
        if aggregator["name"] == "skewness":
            return F.skewness(input).alias(agg["name"])
        if aggregator["name"] == "stddev":
            return F.stddev(input).alias(agg["name"])
        if aggregator["name"] == "stddev_pop":
            return F.stddev_pop(input).alias(agg["name"])
        if aggregator["name"] == "stddev_samp":
            return F.stddev_samp(input).alias(agg["name"])
        if aggregator["name"] in {"sum_int", "sum_float"}:
            return F.sum(input).alias(agg["name"])
        if aggregator["name"] in {"sum_distinct_int", "sum_distinct_float"}:
            return F.sumDistinct(input).alias(agg["name"])
        if aggregator["name"] == "var_pop":
            return F.var_pop(input).alias(agg["name"])
        if aggregator["name"] == "var_samp":
            return F.var_samp(input).alias(agg["name"])
        if aggregator["name"] == "variance":
            return F.variance(input).alias(agg["name"])

        raise ValueError("missing builtin aggregator")  # unexpected

    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
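
# A hypothetical usage sketch (not from the original source): the Column returned by
# the dispatcher above is equivalent to building the PySpark expression directly.
# For an agg spec such as {"name": "qty_stddev", "aggregator": "stddev"} whose
# resolved input is the column "Quantity" (names invented for illustration):
from pyspark.sql import functions as F
qty_stddev_col = F.stddev("Quantity").alias("qty_stddev")
# A caller would then pass it to DataFrame.agg, e.g. df.agg(qty_stddev_col).show()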
Example No. 6
from pyspark.sql.functions import var_samp, covar_samp, count, sqrt


def simple_regression(x, y, link):
    # First, join everything together
    joined_data = x.join(link, 'SNP', 'inner') \
                   .join(y, ['GENE', 'GT_SAMPLE_NAME', 'STUDY', 'TN'], 'inner')

    # Per (SNP, GENE, STUDY, TN) group, collect the sufficient statistics for a
    # simple linear regression of ADJ_EXP on GT_dosage:
    # ss_xx = var(x), ss_yy = var(y), ss_xy = cov(x, y), n = number of samples.
    df = joined_data.groupBy('SNP', 'GENE', 'STUDY', 'TN') \
        .agg(var_samp('GT_dosage'), var_samp('ADJ_EXP'),
             covar_samp('GT_dosage', 'ADJ_EXP'), count('GT_dosage')) \
        .withColumnRenamed('var_samp(GT_dosage)', 'ss_xx') \
        .withColumnRenamed('var_samp(ADJ_EXP)', 'ss_yy') \
        .withColumnRenamed('covar_samp(GT_dosage, ADJ_EXP)', 'ss_xy') \
        .withColumnRenamed('count(GT_dosage)', 'n')

    # Slope BETA = ss_xy / ss_xx; its standard error divides the residual standard
    # deviation, sqrt((ss_yy - ss_xy^2 / ss_xx) / (n - 2)), by sqrt(ss_xx).
    return df.select('SNP', 'GENE', 'STUDY', 'TN',
                     (df.ss_xy / df.ss_xx).alias('BETA'),
                     (sqrt((df.ss_yy - (df.ss_xy * df.ss_xy / df.ss_xx)) / (df.n - 2.0))
                      / sqrt(df.ss_xx)).alias('SE_BETA'),
                     'n').na.drop()
Example No. 7
    min("StockCode"), max("StockCode")).show(2)

#sum,sumDistinct, avg
from pyspark.sql.functions import sum, sumDistinct, avg
df.select(sum("Quantity"), sumDistinct("Quantity"), avg("Quantity")).show(2)

# sample variance, sample standard deviation
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_samp("Quantity"), stddev_samp("Quantity")).show(2)

# population variance, population standard deviation
from pyspark.sql.functions import var_pop, stddev_pop
df.select(var_pop("Quantity"), stddev_pop("Quantity")).show(2)

# skewness, kurtosis
from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show(2)

# covariance and correlation
from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_pop("InvoiceNo", "Quantity"),
          covar_samp("InvoiceNo", "Quantity")).show(2)

# aggregating complex data types
from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show(2)
