Example #1
from pyspark.sql import SparkSession
from pyspark.sql.functions import corr
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression


def main():
    spark = SparkSession.builder.appName('cruise').getOrCreate()
    df = spark.read.csv('./data/cruise_ship_info.csv',
                        inferSchema=True,
                        header=True)
    df.printSchema()
    df.show()
    df.describe().show()
    df.groupBy('Cruise_line').count().show()
    indexer = StringIndexer(inputCol="Cruise_line", outputCol="cruise_cat")
    indexed = indexer.fit(df).transform(df)
    indexed.head(5)
    print(indexed.columns)
    assembler = VectorAssembler(inputCols=[
        'Age', 'Tonnage', 'passengers', 'length', 'cabins',
        'passenger_density', 'cruise_cat'
    ],
                                outputCol="features")
    output = assembler.transform(indexed)
    output.select("features", "crew").show()
    final_data = output.select("features", "crew")
    train_data, test_data = final_data.randomSplit([0.7, 0.3])
    lr = LinearRegression(labelCol='crew')
    lrModel = lr.fit(train_data)
    print("Coefficients: {} Intercept: {}".format(lrModel.coefficients,
                                                  lrModel.intercept))
    test_results = lrModel.evaluate(test_data)
    print("RMSE: {}".format(test_results.rootMeanSquaredError))
    print("MSE: {}".format(test_results.meanSquaredError))
    print("R2: {}".format(test_results.r2))
    df.select(corr('crew', 'passengers')).show()
    df.select(corr('crew', 'cabins')).show()
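Aside (not part of the original example): the two pairwise corr() checks above generalize to the full Pearson correlation matrix of the assembled feature vector via pyspark.ml.stat.Correlation (Spark 2.2+). A minimal sketch, assuming a DataFrame with a "features" vector column such as the 'output' built above:

from pyspark.ml.stat import Correlation


def feature_correlation_matrix(assembled_df, features_col="features"):
    # Pearson correlation matrix of the vector column, returned as a NumPy array
    return Correlation.corr(assembled_df, features_col).head()[0].toArray()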
Example #2
from pyspark.sql import SparkSession
from pyspark.sql.functions import (corr, count, format_number, max, mean, min,
                                   month, year)


def main():
    spark = SparkSession.builder.appName('walmart_stock').getOrCreate()
    df = spark.read.csv('./data/walmart_stock.csv', header=True,
                        inferSchema=True)
    df.printSchema()
    df.columns
    df.head(5)
    df.describe().show()
    result = df.describe()
    result.select(result['summary'],
                  format_number(result['Open'].cast('float'), 2).alias('Open'),
                  format_number(result['High'].cast('float'), 2).alias('High'),
                  format_number(result['Low'].cast('float'), 2).alias('Low'),
                  format_number(result['Close'].cast('float'), 2).alias('Close'),
                  result['Volume'].cast('int').alias('Volume')).show()
    df.withColumn('HV Ratio', df['High']/df['Volume']).select('HV Ratio').show()
    df.orderBy(df['High'].desc()).head(1)[0][0]
    df.select(mean(df['Close'])).show()
    df.select(max(df['Volume']), min(df['Volume'])).show()
    df.filter(df['Close'] < 60).select(count(df['Close'])).show()
    df.filter('Close < 60').count()
    df.filter(df['Close'] < 60).count()
    df.filter(df['High'] > 80).count()/df.count()*100
    df.select(corr(df['High'], df['Volume'])).show()
    yeardf = df.withColumn('Year', year(df['Date']))
    max_df = yeardf.groupBy('Year').max()
    max_df.select('Year', 'max(High)').show()
    monthdf = df.withColumn('Month', month('Date'))
    monthavgs = monthdf.select(['Month', 'Close']).groupBy('Month').mean()
    monthavgs.select('Month', 'avg(Close)').orderBy('Month').show()
Example #3
    def test_correlation_in_spark(self):
        self.df.stat.corr("Quantity", "UnitPrice")

        # shows all data
        # self.df.show()

        umm = self.df.select(corr(
            "Quantity", "UnitPrice")).collect()[0]['corr(Quantity, UnitPrice)']

        #  negative correlation
        self.assertLess(umm, 0)
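Aside: a self-contained variant of the same negative-correlation check, using a tiny hypothetical in-memory DataFrame instead of the retail data assumed by self.df (a sketch, not the project's own test):

import unittest

from pyspark.sql import SparkSession
from pyspark.sql.functions import corr


class CorrSmokeTest(unittest.TestCase):
    def test_negative_correlation(self):
        spark = SparkSession.builder.master("local[1]").appName("corr-test").getOrCreate()
        df = spark.createDataFrame(
            [(1, 10.0), (2, 8.0), (3, 5.0), (4, 2.0)],
            ["Quantity", "UnitPrice"])
        r = df.select(corr("Quantity", "UnitPrice")).first()[0]
        self.assertLess(r, 0)  # price falls as quantity rises in this toy data
        spark.stop()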
Example #4
def correlacion(dataset):
    data_a = dataset
    data_b = dataset
    for i in data_a.columns:
        k = data_a.columns.index(i)
        while k < len(data_a.columns):
            data_b = data_b.withColumn(
                'correlacion_temporal_' + str(i) + '_' +
                str(data_a.columns[k]),
                F.corr(i, data_a.columns[k]).over(w))
            k = k + 1

    return data_b
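Aside: the window spec 'w' (and the 'F' alias) used above are not defined in this capture. A plausible setup is sketched below; the 'id' and 'time' partition/order columns are assumptions, not names from the original code:

from pyspark.sql import Window
from pyspark.sql import functions as F

# running Pearson correlation within each 'id', ordered by 'time'
w = (Window.partitionBy("id")
           .orderBy("time")
           .rowsBetween(Window.unboundedPreceding, Window.currentRow))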
Example #5
    def raw_predicted_risks(self,
                            target_file,
                            cur_lab,
                            cur_lab_def,
                            top_lists=10,
                            ascending=False):
        import pyspark

        # TODO Labs probably can be masked in mimic_data_Abstracter.
        from pyspark.sql.functions import col, datediff, corr, isnan, count, udf
        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        # TODO need to abstract this
        udf_prob = udf(lambda x: float(x.toArray().tolist()[1]))

        self.logger.info("TARGET_FILE:{0}".format(target_file))
        try:
            te_result = self.spark.read.parquet(target_file) \
                .withColumn("Prob", udf_prob("Probability").cast("double"))
        except pyspark.sql.utils.AnalysisException as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            self.logger.info(message)
            self.logger.debug("FILE NOT EXISTS! {0}".format(target_file))
            return
        self.logger.info("CURRENT_FILE:{0}".format(target_file))
        corr_result = te_result.join(
            cur_lab, (cur_lab.ID == te_result.ID) &
            (datediff(te_result.TIME_SPAN.TIME_TO, cur_lab.TIME_OBS)
             == 0)).groupBy("ITEMID").agg(
                 corr(
                     col("Prob").cast("double"),
                     col("VALUE").cast("double")).alias("Pearson_Correlation"),
                 count("*").alias("Num_OBS")).persist()

        return_df = corr_result.join(cur_lab_def, "ITEMID")\
            .where((~col("Pearson_Correlation").isNull()) & (~isnan("Pearson_Correlation")))\
            .orderBy(col("Pearson_Correlation") if ascending else col("Pearson_Correlation").desc()).limit(top_lists).toPandas()

        corr_result.unpersist()

        return return_df
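Aside: on Spark 3.0+ the probability-extraction UDF above can be replaced by the built-in vector_to_array, avoiding Python UDF overhead. A sketch using the same column names as the snippet:

from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col


def add_positive_class_prob(df, prob_col="Probability", out_col="Prob"):
    # P(class = 1) is the second entry of the ML probability vector
    return df.withColumn(out_col, vector_to_array(col(prob_col))[1].cast("double"))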
Example #6
import itertools

from pyspark.sql import functions as F


def sdf_correlation(sdf, cols1=None, cols2=None):
    if cols1 is None or cols2 is None:
        if cols1 is None and cols2 is None:
            cols1 = sdf.columns
        elif cols1 is None:
            cols1 = cols2
        cols2 = []
    if not isinstance(cols1, list):
        cols1 = [cols1]
    if not isinstance(cols2, list):
        cols2 = [cols2]
    if len(cols2) == 0:
        all_combinations = [i for i in itertools.combinations(cols1, 2)]
    else:
        all_combinations = [(i, j) for i in cols1 for j in cols2 if i != j]
    assert len(cols1 + cols2) > 1
    agg_func_list = [
        F.corr(*x).alias('_'.join(x) + '__corr') for x in all_combinations
    ]
    sdf = sdf.agg(*agg_func_list)
    return sdf
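Aside: a hypothetical usage sketch for sdf_correlation with a tiny in-memory DataFrame; each requested pair becomes one aggregated column named like 'a_b__corr':

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("pairwise-corr").getOrCreate()
toy = spark.createDataFrame(
    [(1.0, 2.0, 9.0), (2.0, 4.1, 7.5), (3.0, 6.2, 4.0)],
    ["a", "b", "c"])
sdf_correlation(toy, cols1=["a"], cols2=["b", "c"]).show()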
Example #7
File: ld.py  Project: robertjklein/fishing
def add_geno_r2_to_gt_ld_plan(ld_plan):
    """
  Given an LD plan to which additive genotypes have been added with add_genotypes_to_ld_plan, 
  aggregates by variant to compute geno_r2 statistics
  r2 calculation comes from Rogers and Huff 2008 -- it's just Pearson's r2
  for the additive genotype
  """

    # First, remove NA
    data = ld_plan.select("filename1", "VAR_IDX1", "filename2", "VAR_IDX2",
                          "SAMPLE_IDX", "GT_ADD1", "GT_ADD2").dropna()

    # Now aggregate and compute r
    x = data.groupby(['filename1', 'VAR_IDX1', 'filename2',
                      'VAR_IDX2']).agg(corr("GT_ADD1", "GT_ADD2").alias("r"))

    # Now, square it to get r2
    geno_r2_result = x.withColumn('geno_r2', x.r * x.r)

    return (geno_r2_result.select("filename1", "VAR_IDX1", "filename2",
                                  "VAR_IDX2", "geno_r2"))
Example #8
assembler = VectorAssembler(
  inputCols=['Age',
             'Tonnage',
             'passengers',
             'length',
             'cabins',
             'passenger_density',
             'cruise_cat'],
    outputCol="features")

output = assembler.transform(indexed)
output.select("features", "crew").show()
final_data = output.select("features", "crew")
train_data,test_data = final_data.randomSplit([0.7,0.3])

# Create a Linear Regression Model object
lr = LinearRegression(labelCol='crew')

# Fit the model to the data and call this model lrModel
lrModel = lr.fit(train_data)

# Print the coefficients and intercept for linear regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept:  " + str(lrModel.intercept))

test_results = lrModel.evaluate(test_data)
print("RMSE: "+ str(test_results.rootMeanSquaredError))
print("MSE: " + str(test_results.meanSquaredError))
print("R2: " + str(test_results.r2))
# R2 of 0.86 is pretty good, let's check the data a little closer

df.select(corr('crew','passengers')).show()
df.select(corr('crew','cabins')).show()
Example #9
from pyspark.sql.functions import corr, format_number, max, min, month, year
from pyspark.sql.types import DoubleType, IntegerType


def basic_practice_exercise(path, spark):
    # Use the walmart_stock.csv file to Answer and complete the  tasks below!
    # Start a simple Spark Session

    # Load the Walmart Stock CSV File, have Spark infer the data types.
    df = spark.read.csv(path + 'walmart_stock.csv',
                        sep=',',
                        header=True,
                        inferSchema=True)

    # What are the column names?
    print(df.columns)

    # What does the Schema look like?
    df.printSchema()

    # Print out the first 5 columns.
    print(df.head(5))

    # Use describe() to learn about the DataFrame.
    df.describe().show()

    # Bonus Question!
    # There are too many decimal places for mean and stddev in the describe() dataframe. Format the numbers to just
    # show up to two decimal places. Pay careful attention to the datatypes that .describe() returns, we didn't cover
    # how to do this exact formatting, but we covered something very similar. [Check this link for a
    # hint] (http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.Column.cast)
    # If you get stuck on this, don 't worry, just view the solutions.
    new_df = df.describe()
    new_df.printSchema()
    new_df.select(
        new_df['summary'],
        format_number(new_df['Open'].cast(DoubleType()), 2).alias('Open'),
        format_number(new_df['High'].cast(DoubleType()), 2).alias('High'),
        format_number(new_df['Close'].cast(DoubleType()), 2).alias('Close'),
        format_number(new_df['Volume'].cast(IntegerType()), 2).alias('Volume'),
        format_number(new_df['Adj Close'].cast(DoubleType()),
                      2).alias('Adj Close')).show()

    # Create a new dataframe with a column called HV Ratio that is the ratio
    # of the High Price versus volume of stock traded for a day.
    df2 = df.withColumn('HV Ratio', df['High'] / df['Volume'])
    df2.show()

    # What day had the Peak High in Price?
    max_price = df2.agg({'High': 'max'}).collect()[0][0]
    df2.filter((df2['High'] == max_price)).select(df2['Date']).show()

    # What is the mean of the Close column?
    df2.agg({'Close': 'mean'}).show()
    df2.groupBy().mean('Close').show()

    # What is the max and min of the Volume column?
    print([df2['Volume'], df2['Close']])
    # print(isinstance(df2['Volume'], Column))
    df2.agg(min(df2['Volume']), max(df2['Volume'])).show()

    #### How many days was the Close lower than 60 dollars?
    df2.filter((df2.Close < 60)).groupBy().count().show()

    #### What percentage of the time was the High greater than 80 dollars ?
    #### In other words, (Number of Days High>80)/(Total Days in the dataset)
    # print(df2.filter(df2['High'] > 80).count())
    print("Percentage",
          (df2.filter(df2['High'] > 80).count() / df2.count()) * 100)

    #### What is the Pearson correlation between High and Volume?
    #### [Hint](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameStatFunctions.corr)
    df2.select(corr(df2['High'], df2['Volume'])).show()

    #### What is the max High per year?
    df2.groupBy(year(
        df2['Date']).alias('Year')).max('High').orderBy('Year').show()

    #### What is the average Close for each Calendar Month?
    #### In other words, across all the years, what is the average Close price for Jan,Feb, Mar, etc...
    #### Your result will have a value for each of these months.
    df2.groupBy(month(
        df2['Date']).alias('Months')).mean('Close').orderBy('Months').show()
Example #10
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('lin_reg').getOrCreate()

# 2 - Load the data
from pyspark.ml.regression import LinearRegression
# After uploading the scraped data to HDFS, update the file path accordingly
df = spark.read.csv('hdfs://localhost:9000/input_spark/res2.csv',
                    inferSchema=True,
                    header=True)

# 3 - Exploratory data analysis
print('-------------- Exploratory data analysis -----------------')
print((df.count(), len(df.columns)))  # dataset dimensions
df.printSchema()  # column types
df.describe().show(5, False)  # summary statistics: mean, standard deviation, counts, etc.
from pyspark.sql.functions import corr
df.select(corr('square', 'price')).show()  # correlation between square and price

# 4 - Transform the data into the format the model requires
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler  # import VectorAssembler

print('-------------- Data transformation ------------------')
# Assemble the predictor columns into one feature vector
vec_assmebler = VectorAssembler(
    inputCols=['square', 'floors', 'rooms', 'subway', 'area'],
    outputCol='features')  # packs the predictor variables of the regression into a single vector
features_df = vec_assmebler.transform(df)

features_df.printSchema()  # inspect the transformed schema

model_df = features_df.select('features', 'price')  # dataset used for linear regression
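Aside: the capture stops after building model_df; the usual next steps, mirroring the other linear-regression examples in this collection ('price' is the label column here):

train_df, test_df = model_df.randomSplit([0.7, 0.3])
lin_reg = LinearRegression(featuresCol='features', labelCol='price')
lr_model = lin_reg.fit(train_df)
print(lr_model.coefficients, lr_model.intercept)
print(lr_model.evaluate(test_df).r2)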
Example #11
final_data = output.select("features", "crew")
train_data, test_data = final_data.randomSplit([0.7, 0.3])
train_data.describe().show()
test_data.describe().show()
from pyspark.ml.regression import LinearRegression
# Create a linear regression model object
lr = LinearRegression(labelCol='crew')
# Fit the model to the training data and call this model lrModel
lrModel = lr.fit(train_data)
# Print the regression coefficients b (slopes) and the intercept a. Linear regression finds the best values for these coefficients, giving the line that best fits the data.
# The intercept (often labeled the constant a) is the expected mean value of Y when all X = 0.
# Y = a + bX
print("Coefficients: {} Intercept: {}".format(lrModel.coefficients,
                                              lrModel.intercept))
test_results = lrModel.evaluate(test_data)
print("RMSE: {}".format(
    test_results.rootMeanSquaredError))  # Raíz del error cuadrático medio
print("MSE: {}".format(
    test_results.meanSquaredError))  # Error cuadrático medio
print(
    "R2: {}".format(test_results.r2)
)  # coeficiente de determinación (valores entre 0 y 1. El 1 es ajuste perfecto)
# R2 de 0.94 es bastante bueno, revisemos los datos un poco más
from pyspark.sql.functions import corr
df.select(corr('crew', 'passengers')).show()  # correlación entre dos variables
df.select(corr('crew', 'cabins')).show()  # correlación entre dos variables
"""
De acuerdo, ¡entonces quizás tenga sentido! 
Buenas noticias para nosotros, ¡esta es la información que podemos aportar a la empresa!
"""
Example #12
# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"), stddev_pop("Quantity"),
          stddev_samp("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
          covar_pop("InvoiceNo", "Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()

# COMMAND ----------

from pyspark.sql.functions import count

df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"), expr("count(Quantity)")).show()

# COMMAND ----------
Example #13
rides_sdf \
  .groupby("rider_student", "rider_gender", "service") \
  .count() \
  .show()

# **Example:** Two categorical variables and one continuous variable
rides_sdf \
  .cube("rider_student", "rider_gender") \
  .agg(F.grouping_id(), F.mean("distance"), F.stddev("distance")) \
  .orderBy("rider_student", "rider_gender") \
  .show()

# **Example:** Two categorical variables and two continuous variables
rides_sdf \
  .groupBy("rider_student", "rider_gender") \
  .agg(F.corr("distance", "duration")) \
  .orderBy("rider_student") \
  .show()
  
# ### Faceted plots

# Generally, carefully crafted visualizations are more enlightening.  Before we
# produce more visualizations, let us fill in the missing values for rider_gender
# using pandas functionality:
rides_pdf["rider_gender"] = rides_pdf["rider_gender"].fillna("missing")

# **Question:** Does this syntax look somewhat familiar?

# **Example:** Three categorical variables

# Specify the desired order of the categories:
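Aside: the category-ordering code that followed this comment is missing from the capture; a typical way to impose an order in pandas is sketched below (the category names are assumptions):

import pandas as pd

rides_pdf["rider_gender"] = pd.Categorical(
    rides_pdf["rider_gender"],
    categories=["female", "male", "missing"],
    ordered=True)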
Example #14
    joined_shortened_date = joined.select(
        joined.stars, joined.city,
        (F.substring(joined.date, 0, 7).alias("date")))
    joined_shortened_date.printSchema()
    joined_shortened_date.show(5, False)
    joined_shortened_date.groupBy("city",
                                  "date").pivot("date").avg("stars").show()

    ###### 10.2 ####
    selected = review.select(review.text, review.useful)
    udf_applied = selected.select(selected.text, selected.useful,
                                  my_udf(selected['text']).alias("my_udf"))
    nested = udf_applied.select(udf_applied.useful, udf_applied.text,
                                "my_udf.avgl", "my_udf.medl")
    nested.show(5, True)
    corr_avgl_useful = nested.select(nested.useful, nested.avgl).agg(
        F.corr("useful", "avgl"))  # corr_avgl_useful is also a dataframe
    corr_avgl_useful.show()
    corr_medl_useful = nested.select(nested.useful, nested.medl).agg(
        F.corr("useful", "medl"))  # this is also a dataframe
    corr_medl_useful.show()

    filtered_review.write.format("json").save(output_folder + "/10.1.1")
    grouped_PBR.write.format("json").save(output_folder + "/10.1.2")
    joined_shortened_date.write.format("json").save(output_folder + "/10.1.3")
    corr_avgl_useful.write.format("json").save(output_folder +
                                               "/10.2-corr_AVG_USEFUL")
    corr_medl_useful.write.format("json").save(output_folder +
                                               "/10.2-corr_MED_USEFUL")

    # Write results into the output folder
    # counts.write.format("json").save(output_folder)
Example #15
# (UnitPrice > 600 OR instr(Description, "POSTAGE") >= 1))

df.where(col("Description").eqNullSafe("hello")).show()

# in Python
df.selectExpr(
    "CustomerId",
    "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)
# -- in SQL
# SELECT customerId, (POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity
# FROM dfTable

# compute correlation of two columns
from pyspark.sql.functions import corr
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show()
# -- in SQL
# SELECT corr(Quantity, UnitPrice) FROM dfTable

# summary statistics
df.describe().show()

# in Python
df.stat.crosstab("StockCode", "Quantity").show()
df.stat.freqItems(["StockCode", "Quantity"]).show()

# in Python
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)

# Adding an ID Field
  "CustomerId",
  "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, round, bround

df.select(round(lit("2.5")), bround(lit("2.5"))).show(2)


# COMMAND ----------

from pyspark.sql.functions import corr
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show()


# COMMAND ----------

df.describe().show()


# COMMAND ----------

from pyspark.sql.functions import count, mean, stddev_pop, min, max


# COMMAND ----------

colName = "UnitPrice"
Example #17
df.filter("Close < 60").count()
# Alternatively:
df.filter(df['Close'] < 60).count()
# Alternatively:
from pyspark.sql.functions import count
result = df.filter(df['Close'] < 60)
result.select(count('Close')).show()
# What percentage of the time was the High greater than 80 dollars ?
# In other words, (Number of Days High>80)/(Total Days in the dataset)
# 9.14 percent of the time it was over 80
# Many ways to do this
(df.filter(df["High"]>80).count()*1.0/df.count())*100
# What is the Pearson correlation between High and Volume?
# http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameStatFunctions.corr
from pyspark.sql.functions import corr
df.select(corr("High","Volume")).show()
# What is the max High per year?
from pyspark.sql.functions import year
yeardf = df.withColumn("Year",year(df["Date"])) # Añado columna year, se obtiene de Date
max_df = yeardf.groupBy('Year').max() # Selecciona los valores máximos de cada columna para cada año , Groupby lo usamos cuando tenemos varios valores repetidos en una columna, en este caso, los años en la columna Year.
# 2015
max_df.select('Year','max(High)').show()
# What is the average Close for each Calendar Month?
# In other words, across all the years, what is the average Close price for Jan,Feb, Mar, etc... Your result will have a value for each of these months.
from pyspark.sql.functions import month
monthdf = df.withColumn("Month",month("Date")) # Añado columna Month, se obtiene de Date
monthcolumn = monthdf.select("Month","Close") # Selecciono sólo columna Month donde aparece todos los meses
monthcolumn.show()
monthavgs = monthcolumn.select("Month","Close").groupBy("Month").mean() # Agrupamos por meses la columna monthcolumn y calculamos su media
monthavgs.select("Month","avg(Close)").orderBy('Month').show() # Ordenamos de menor a mayor
Example #18
# standard deviation and variance
dailyActivitiesDF.select(var_pop("CaloriesBurned"), var_samp("CaloriesBurned"),
                         stddev_pop("CaloriesBurned"),
                         stddev_samp("CaloriesBurned")).show()

# COMMAND ----------

# Any extreme points in our data?
dailyActivitiesDF.select(skewness("CaloriesBurned"),
                         kurtosis("CaloriesBurned")).show()

# COMMAND ----------

# Covariance and Correlation
dailyActivitiesDF.select(corr("CaloriesBurned", "Steps"),
                         covar_samp("CaloriesBurned", "Steps"),
                         covar_pop("CaloriesBurned", "Steps")).show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Multiple languages in one notebook
# MAGIC
# MAGIC - One cool thing about Databricks is that we can combine languages within a notebook
# MAGIC - So one example of this could be that our Data Scientists are comfortable writing Python, then our Data Engineers optimise that using Scala

# COMMAND ----------

# MAGIC %sql
Example #19
spTotal = spOut.join(spIn, joinTables).orderBy('Wafer_ID', 'Process_stage')

#Drop the Action
spTotal = spTotal.drop('Action')

# In[4]:

#Count the differences between two times
from pyspark.sql import functions as f
Duration = (f.unix_timestamp('Qtime_OUT') - f.unix_timestamp('Qtime_IN'))
spTotal = spTotal.withColumn('Duration',
                             Duration).orderBy('Wafer_ID', 'Process_stage')

#Pearson correlation coefficient
spResult = spTotal.groupBy('Process_stage').agg(
    f.abs(f.corr('yield',
                 'Duration')).alias('Pearson')).orderBy('Pearson',
                                                        ascending=False)

#Collect the result from spResult
Results = spResult.select('Process_stage').rdd.flatMap(lambda x: x).collect()
Results = Results[:5]

#Select the data into pandas
dfResults = []

for index in Results:
    dfResults.append(
        spTotal.where(spTotal.Process_stage == index).select(
            'yield', 'Duration').toPandas())

# In[5]:
Example #20
#./bin/spark-submit 1_regression.py
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('LinearRegressionPractise').getOrCreate()

from pyspark.ml.regression import LinearRegression

data = spark.read.csv("1_regression.csv", inferSchema=True, header=True)
#data.printSchema() # check datatype of each column

##############################################
## Check for correlation between features ####
##############################################
from pyspark.sql.functions import corr

data.select(corr('Avg Session Length', 'Time on Website')).show()
data.select(corr('Avg Session Length', 'Time on App')).show()
data.select(corr('Time on App', 'Time on Website')).show()

# DATA WRANGLING PACKAGES NEEDED FOR MACHINE LEARNING
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

#####################################################################
## For any machine learning algorithm							#####
## Spark needs a particular format								#####
## Where labels are in one column								#####
## And a features column - containing all selected features		#####
#####################################################################

#assembler = VectorAssembler(inputCols=['future_dif', 'held_weight', 'shell_weight'], outputCol='features')
Example #21
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('lin_reg').getOrCreate()

# 2 - Load the data
from pyspark.ml.regression import LinearRegression
df=spark.read.csv('Linear_regression_dataset.csv',inferSchema=True,header=True)

# 3 - Exploratory data analysis
print('-------------- Exploratory data analysis -----------------')
print((df.count(), len(df.columns)))                 # dataset dimensions
df.printSchema()                                     # column types
df.describe().show(5,False)                          # summary statistics: mean, standard deviation, counts, etc.
from pyspark.sql.functions import corr
df.select(corr('var_1','output')).show()             # correlation between var_1 and output

# 4 - Transform the data into the format the model requires
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler       # import VectorAssembler

print('-------------- Data transformation ------------------')
vec_assmebler=VectorAssembler(inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'],outputCol='features') # pack the predictor variables into one feature vector
features_df=vec_assmebler.transform(df)

features_df.printSchema() # inspect the transformed schema

model_df=features_df.select('features','output')     # dataset used for linear regression

# 5 - Split the data into training and test sets
train_df,test_df=model_df.randomSplit([0.7,0.3])     # 70/30 train/test split

print((train_df.count(), len(train_df.columns)))
Example #22
output = assembler.transform(indexed)

output.show()

output.select('features', 'crew').show()

final_data = output.select('features', 'crew')

final_data.describe().show()

train_data, test_data = final_data.randomSplit([0.7, 0.3])

train_data.describe().show()

test_data.describe().show()
lr = LinearRegression(labelCol='crew')

lrmodel = lr.fit(train_data)

print("Coefficients {} Intercept{}".format(lrmodel.coefficients,
                                           lrmodel.intercept))

test_results = lrmodel.evaluate(test_data)

print("RMSE{}".format(test_results.rootMeanSquaredError))
print("R2{}".format(test_results.r2))

shipdf.select(corr('crew', 'passengers')).show()

spark.stop()
Example #23
# COMMAND ----------

train_data.describe().show()

# COMMAND ----------

print(ship_Result)

# COMMAND ----------

for s in ship_Result:
    print(s)

# COMMAND ----------

df = ship_Result.residuals.show()

# COMMAND ----------

ship_Result.degreesOfFreedom

# COMMAND ----------

from pyspark.sql.functions import corr

# COMMAND ----------

data.select(corr('passengers', 'crew')).show()

# COMMAND ----------
from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
  stddev_pop("Quantity"), stddev_samp("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
    covar_pop("InvoiceNo", "Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()


# COMMAND ----------

from pyspark.sql.functions import count

df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"),
Example #25
	df4.show()
	df5 = data.select(F.sum("Close"))	
	df5.show()
	#To round() to round the decimal value
	df5 = data.select(F.round(F.avg("Close")))	
	df5.show()
	data.printSchema()
	#What day had the Peak High in Price?
	df6=data.orderBy(data["High"].desc())
	df6.show(n=5)
	#TO find out mean of close column
	df7=data.select(F.round(mean("Close")))
	df7.show()
	#To find the max and min value in volume column
	df8 = data.select(F.max("Volume"),F.min("Volume"))
	df8.show()
	#How many days was the Close lower than 60 dollars?
	#Way1
	df9 = data.filter("Close<60").count()
	print("***********************Selva******************************")
	print(df9)
	#Way2
	res=data.filter(data["Close"]<60)
	df10 = res.select(F.count("Close"))
	df10.show()
	##What is the Pearson correlation between High and Volume?
	df11 = data.select(F.corr("High","Volume"))
	df11.show()
	
	
	
Example #26
# COMMAND ----------

#How many days was the Close lower than 60 dollars?
df.filter('Close<60').count()

# COMMAND ----------

#What percentage of the time was the High greater than 80 dollars
(df.filter('High>80').count() * 1.0 / df.count()) * 100

# COMMAND ----------

#What is the Pearson correlation between High and Volume?
from pyspark.sql.functions import corr
df.select(corr('High', 'Volume')).show()

# COMMAND ----------

#What is the max High per year?
from pyspark.sql.functions import year

yeardf = df.withColumn('Year', year(df['Date']))
max_yf = yeardf.groupBy('Year').max()
max_yf.select('Year', 'max(High)').show()

# COMMAND ----------

#What is the average Close for each Calendar Month?
from pyspark.sql.functions import month
monthdf = df.withColumn("Month", month("Date"))
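Aside: the capture stops here; the usual completion of this question (compare Example #17) is:

monthdf.groupBy('Month').mean('Close').orderBy('Month').show()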
Example #27
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()
    df = spark.read.format('csv')\
                   .option('sep', ';')\
                   .option('header', 'true')\
                   .load('user.csv')

    df.repartition(5).select(max('id')).show()
    df.repartition(5).select(avg('age')).show()

    df.select(round(lit(2.5)), bround(lit(2.5))).show()
    df.select(round(lit("2.5")), bround(lit("2.5"))).show()


    spark.range(0, 10, 2)\
         .select(monotonically_increasing_id().alias('id_v2'),
                round((rand() * 5 + 5)).alias('rand')).show()

    spark.range(0, 10, 2)\
         .select(monotonically_increasing_id().alias('id'),
                 pow(col('id'), 2).alias('pow')).show()

    df.describe().show()

    df.select(count('age'), mean('age'), min('age'), max('age'),
              stddev_pop('age')).show()

    df.select(corr(col('age'), col('name'))).show()
    # DataFrame.stat.corr takes column name strings and returns a float (no .show());
    # note that both columns must be numeric for either form of corr to work
    print(df.stat.corr('age', 'name'))
Example #28
    def get_data(self):
        """
        Returns statistics about attributes in a data frame
        """

        from pyspark.sql import functions

        # Correlation pairs
        corr_pairs = list(
            chunks(list(itertools.product(self.attrs, self.attrs)),
                   len(self.attrs)))

        # Cache data
        self.data.cache()

        df_count = self.data.count()

        # TODO: Implement median using df.approxQuantile('col', [.5], .25)

        stats = []
        for i, name in enumerate(self.attrs):
            df_col = functions.col(name)
            stats.append(functions.lit(name))
            stats.append(functions.max(df_col).alias('max_{}'.format(name)))
            stats.append(functions.min(df_col).alias('min_{}'.format(name)))
            if name in self.numeric_attrs:
                stats.append(
                    functions.round(functions.stddev(df_col),
                                    4).alias('stddev_{}'.format(name)))
            else:
                stats.append(functions.lit('-'))
            stats.append(
                functions.count(df_col).alias('count_{}'.format(name)))
            if name in self.numeric_attrs:
                stats.append(
                    functions.round(functions.avg(df_col),
                                    4).alias('avg_{}'.format(name)))
            else:
                stats.append(functions.lit('-'))

            stats.append(
                functions.approx_count_distinct(df_col).alias(
                    'distinct_{}'.format(name)))
            stats.append((df_count - functions.count(df_col)).alias(
                'missing_{}'.format(name)))

            if name in self.numeric_attrs:
                stats.append(
                    functions.round(functions.skewness(df_col),
                                    2).alias('skewness_{}'.format(name)))
                stats.append(
                    functions.round(functions.kurtosis(df_col),
                                    2).alias('kurtosis_{}'.format(name)))
            else:
                stats.append(functions.lit('-'))
                stats.append(functions.lit('-'))

            if self.params['correlation']:
                for pair in corr_pairs[i]:
                    if all([
                            pair[0] in self.numeric_attrs, pair[1]
                            in self.numeric_attrs
                    ]):
                        stats.append(
                            functions.round(functions.corr(*pair),
                                            4).alias('corr_{}'.format(i)))
                    else:
                        stats.append(functions.lit('-'))

        self.data = self.data.agg(*stats)
        aggregated = self.data.take(1)[0]
        n = len(self.names)
        rows = [aggregated[i:i + n] for i in range(0, len(aggregated), n)]

        return {"rows": rows, "attributes": self.get_column_names().split(',')}
Example #29
import pyspark
import pyspark.sql.functions as fns
from pyspark.sql import SparkSession

hdfs_addr = "hdfs://ec2-54-208-153-234.compute-1.amazonaws.com:9000"

sc = pyspark.SparkContext(
    "spark://ec2-54-208-153-234.compute-1.amazonaws.com:7077", "Correlation")
#sc = pyspark.SparkContext("local", "Correlation")
spark = SparkSession(sc)

reviews = spark.read.csv(f"{hdfs_addr}/datasets/kindle_reviews.csv",
                         header=True)
#reviews = spark.read.csv(f"kindle_reviews.csv", header=True)
asinr = reviews.select('asin', 'reviewText')
asinrl = asinr.withColumn('reviewLength', fns.length('reviewText'))

meta = spark.read.json(f"{hdfs_addr}/datasets/meta_Kindle_Store.json")
#meta = spark.read.json("meta_Kindle_Store.json")

# asin_avgl = asinrl.groupBy('asin').avg('reviewLength')
asin_avgl = asinrl.groupBy('asin').avg('reviewLength')

#asin_avgl.take(5)

# join asin_avgl and meta with column asin
table = asin_avgl.join(meta.select('asin', 'price'), ['asin'])

# return Column for the Pearson Correlation Coefficient
table = table.agg(
    fns.corr('avg(reviewLength)', "price").alias('pearson_correlation'))

table.select("pearson_correlation").show()
Example #30
from pyspark.sql import SparkSession
from pyspark.sql import functions
from pyspark.sql.types import DoubleType


def main():

    # Load the CSV files directly as DataFrames
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    df1 = spark.read.format("csv").option(
        "header", "true").load("simpsons_characters.csv")
    df2 = spark.read.format("csv").option("header",
                                          "true").load("simpsons_episodes.csv")
    df3 = spark.read.format("csv").option(
        "header", "true").load("simpsons_locations.csv")
    df4 = spark.read.format("csv").option(
        "header", "true").load("simpsons_script_lines.csv")

    #A) Number of distinct locations appearing in each episode
    dfScriptLines = df4.groupBy('episode_id').agg(
        functions.approx_count_distinct(
            df4.location_id).alias("count")).selectExpr(
                "episode_id as id", "count")
    dfLocations = df2.join(dfScriptLines, on="id",
                           how='outer').select("id", "imdb_rating", "count")
    dfaux1 = dfLocations.withColumn(
        "imdb_rating_double", dfLocations["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("count_double",
                               dfLocations["count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(dfaux2.imdb_rating_double,
                       dfaux2.count_double).alias('Pearson-Imdb-NumLocations'))
    dfcof.show()

    #B) Number of distinct characters appearing in each episode
    dfe = df4.groupBy('episode_id').agg(
        functions.approx_count_distinct(
            df4.character_id).alias("character_count")).selectExpr(
                "episode_id as id", "character_count")
    dfjoin1 = df2.join(dfe, on="id",
                       how='outer').select("id", "imdb_rating",
                                           "character_count")
    dfaux1 = dfjoin1.withColumn("imdb_rating_double",
                                dfjoin1["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("character_count_double",
                               dfaux1["character_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(
            dfaux2.imdb_rating_double,
            dfaux2.character_count_double).alias('Pearson-Imdb-NumCharacters'))
    dfcof.show()

    #C and D) Number of male and female characters appearing in each episode
    dfc = df1.select('id', 'gender').selectExpr("id as character_id", "gender")
    dfe = df4.select('character_id', 'episode_id')
    dfjoin2 = dfc.join(dfe, on="character_id",
                       how='outer').select("episode_id", "character_id",
                                           "gender")
    dfCharacMasc = dfjoin2.filter(
        dfjoin2.gender == 'm').groupBy('episode_id').agg(
            functions.approx_count_distinct(
                dfjoin2.character_id).alias("masc_count")).selectExpr(
                    "episode_id as id", "masc_count")
    dfCharacFem = dfjoin2.filter(
        dfjoin2.gender == 'f').groupBy('episode_id').agg(
            functions.approx_count_distinct(
                dfjoin2.character_id).alias("fem_count")).selectExpr(
                    "episode_id as id", "fem_count")
    dfjoin3 = dfjoin1.join(dfCharacMasc, on="id",
                           how='outer').select("id", "imdb_rating",
                                               "character_count", "masc_count")
    dfjoin4 = dfjoin3.join(dfCharacFem, on="id",
                           how='outer').select("id", "imdb_rating",
                                               "character_count", "masc_count",
                                               "fem_count")
    dfaux1 = dfjoin4.withColumn("imdb_rating_double",
                                dfjoin4["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("masc_count_double",
                               dfaux1["masc_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(
            dfaux2.imdb_rating_double,
            dfaux2.masc_count_double).alias('Pearson-Imdb-NumCharactersMasc'))
    dfcof.show()
    dfaux1 = dfjoin4.withColumn("imdb_rating_double",
                                dfjoin4["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("fem_count_double",
                               dfaux1["fem_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(
            dfaux2.imdb_rating_double,
            dfaux2.fem_count_double).alias('Pearson-Imdb-NumCharactersFem'))
    dfcof.show()

    #E) Number of words per episode
    dfe = df4.groupBy('episode_id').agg(
        functions.sum(df4.word_count).alias("word_count")).selectExpr(
            "episode_id as id", "word_count")
    dfjoin1 = df2.join(dfe, on="id",
                       how='outer').select("id", "imdb_rating", "word_count")
    dfaux1 = dfjoin1.withColumn("imdb_rating_double",
                                dfjoin1["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("word_count_double",
                               dfaux1["word_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(
            dfaux2.imdb_rating_double,
            dfaux2.word_count_double).alias('Pearson-Imdb-NumWords'))
    dfcof.show()

    #F) Total number of spoken lines per episode
    dfe2 = df4.filter(df4.speaking_line == True).groupBy('episode_id').agg(
        functions.count(df4.raw_text).alias("raw_text_count")).selectExpr(
            "episode_id as id", "raw_text_count")
    dfjoin2 = dfjoin1.join(dfe2, on="id",
                           how='outer').select("id", "imdb_rating",
                                               "word_count", "raw_text_count")
    dfaux1 = dfjoin2.withColumn("imdb_rating_double",
                                dfjoin2["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("raw_text_count_double",
                               dfaux1["raw_text_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(
            dfaux2.imdb_rating_double,
            dfaux2.raw_text_count_double).alias('Pearson-Imdb-NumRawText'))
    dfcof.show()

    sc.stop()
train_data,test_data = final.randomSplit([0.7,0.3])

# -------- machine learning ------------
#initialize the LinearRegression object
regressor = LinearRegression(featuresCol='features',labelCol='label',predictionCol='predicted label')
#fit train_data
model = regressor.fit(train_data)

#evaluate the model by calling the evaluate() method, which exposes several metrics such as r2 and rootMeanSquaredError
eval_model = model.evaluate(test_data)
eval_model.rootMeanSquaredError
eval_model.r2

#predict on test_data with the model
prediction = model.transform(test_data.select('features'))

#check the coefficients and intercept of the model (helps decide which feature has the most effect on the label, i.e. the dependent variable)
coeff=model.coefficients  
intercept= model.intercept
a= zip(featcols,coeff)
b= set(a)
for x,y in b:
    print(x ,':' ,y)
    print('\n')
#analysis only
from pyspark.sql.functions import corr
data.select(corr('feature1','label')).show()
data.select(corr('feature1','feature2')).show()