from pyspark.sql import SparkSession
from pyspark.sql.functions import corr
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression


def main():
    spark = SparkSession.builder.appName('cruise').getOrCreate()
    df = spark.read.csv('./data/cruise_ship_info.csv', inferSchema=True, header=True)
    df.printSchema()
    df.show()
    df.describe().show()
    df.groupBy('Cruise_line').count().show()

    indexer = StringIndexer(inputCol="Cruise_line", outputCol="cruise_cat")
    indexed = indexer.fit(df).transform(df)
    indexed.head(5)
    print(indexed.columns)

    assembler = VectorAssembler(
        inputCols=['Age', 'Tonnage', 'passengers', 'length', 'cabins',
                   'passenger_density', 'cruise_cat'],
        outputCol="features")
    output = assembler.transform(indexed)
    output.select("features", "crew").show()

    final_data = output.select("features", "crew")
    train_data, test_data = final_data.randomSplit([0.7, 0.3])

    lr = LinearRegression(labelCol='crew')
    lrModel = lr.fit(train_data)
    print("Coefficients: {} Intercept: {}".format(lrModel.coefficients, lrModel.intercept))

    test_results = lrModel.evaluate(test_data)
    print("RMSE: {}".format(test_results.rootMeanSquaredError))
    print("MSE: {}".format(test_results.meanSquaredError))
    print("R2: {}".format(test_results.r2))

    df.select(corr('crew', 'passengers')).show()
    df.select(corr('crew', 'cabins')).show()
from pyspark.sql import SparkSession
from pyspark.sql.functions import format_number, mean, max, min, count, corr, year, month


def main():
    spark = SparkSession.builder.appName('walmart_stock').getOrCreate()
    df = spark.read.csv('./data/walmart_stock.csv', header=True, inferSchema=True)
    df.printSchema()
    df.columns
    df.head(5)
    df.describe().show()

    result = df.describe()
    result.select(result['summary'],
                  format_number(result['Open'].cast('float'), 2).alias('Open'),
                  format_number(result['High'].cast('float'), 2).alias('High'),
                  format_number(result['Low'].cast('float'), 2).alias('Low'),
                  format_number(result['Close'].cast('float'), 2).alias('Close'),
                  result['Volume'].cast('int').alias('Volume')).show()

    df.withColumn('HV Ratio', df['High'] / df['Volume']).select('HV Ratio').show()
    df.orderBy(df['High'].desc()).head(1)[0][0]
    df.select(mean(df['Close'])).show()
    df.select(max(df['Volume']), min(df['Volume'])).show()

    df.filter(df['Close'] < 60).select(count(df['Close'])).show()
    df.filter('Close < 60').count()
    df.filter(df['Close'] < 60).count()
    df.filter(df['High'] > 80).count() / df.count() * 100

    df.select(corr(df['High'], df['Volume'])).show()

    yeardf = df.withColumn('Year', year(df['Date']))
    max_df = yeardf.groupBy('Year').max()
    max_df.select('Year', 'max(High)').show()

    monthdf = df.withColumn('Month', month('Date'))
    monthavgs = monthdf.select(['Month', 'Close']).groupBy('Month').mean()
    monthavgs.select('Month', 'avg(Close)').orderBy('Month').show()
def test_correlation_in_spark(self):
    self.df.stat.corr("Quantity", "UnitPrice")
    # shows all data
    # self.df.show()
    umm = self.df.select(corr("Quantity", "UnitPrice")) \
        .collect()[0]['corr(Quantity, UnitPrice)']
    # negative correlation
    self.assertLess(umm, 0)
def correlacion(dataset):
    # Note: `w` must be a pyspark.sql.Window spec defined elsewhere in the module,
    # and `F` is pyspark.sql.functions.
    data_a = dataset
    data_b = dataset
    for i in data_a.columns:
        k = data_a.columns.index(i)
        while k < len(data_a.columns):
            data_b = data_b.withColumn(
                'correlacion_temporal_' + str(i) + '_' + str(data_a.columns[k]),
                F.corr(i, data_a.columns[k]).over(w))
            k = k + 1
    return data_b
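# A minimal usage sketch (assumptions: it runs in the same module as correlacion,
# the window `w` below spans the whole frame, and the demo columns are made up).
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.appName('corr_window_demo').getOrCreate()
w = Window.partitionBy()  # single partition: every row sees the global correlation
demo = spark.createDataFrame([(1.0, 2.0), (2.0, 4.1), (3.0, 5.9)], ['x', 'y'])
correlacion(demo).show()  # adds one correlacion_temporal_* column per column pair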
def raw_predicted_risks(self, target_file, cur_lab, cur_lab_def, top_lists=10, ascending=False):
    import pyspark
    # TODO Labs probably can be masked in mimic_data_Abstracter.
    from pyspark.sql.functions import col, datediff, corr, isnan, count, udf
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    # TODO need to abstract this
    udf_prob = udf(lambda x: float(x.toArray().tolist()[1]))
    self.logger.info("TARGET_FILE:{0}".format(target_file))
    try:
        te_result = self.spark.read.parquet(target_file) \
            .withColumn("Prob", udf_prob("Probability").cast("double"))
    except pyspark.sql.utils.AnalysisException as ex:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        self.logger.info(message)
        self.logger.debug("FILE NOT EXISTS! {0}".format(target_file))
        return
    self.logger.info("CURRENT_FILE:{0}".format(target_file))
    corr_result = te_result.join(
        cur_lab,
        (cur_lab.ID == te_result.ID) &
        (datediff(te_result.TIME_SPAN.TIME_TO, cur_lab.TIME_OBS) == 0)
    ).groupBy("ITEMID").agg(
        corr(col("Prob").cast("double"), col("VALUE").cast("double")).alias("Pearson_Correlation"),
        count("*").alias("Num_OBS")).persist()
    return_df = corr_result.join(cur_lab_def, "ITEMID") \
        .where((~col("Pearson_Correlation").isNull()) & (~isnan("Pearson_Correlation"))) \
        .orderBy(col("Pearson_Correlation") if ascending else col("Pearson_Correlation").desc()) \
        .limit(top_lists).toPandas()
    corr_result.unpersist()
    return return_df
def sdf_correlation(sdf, cols1=None, cols2=None):
    if cols1 is None or cols2 is None:
        if cols1 is None and cols2 is None:
            cols1 = sdf.columns
        elif cols1 is None:
            cols1 = cols2
        cols2 = []
    if not isinstance(cols1, list):
        cols1 = [cols1]
    if not isinstance(cols2, list):  # was `cols1`, which left a bare string in cols2
        cols2 = [cols2]
    if len(cols2) == 0:
        all_combinations = [i for i in itertools.combinations(cols1, 2)]
    else:
        all_combinations = [(i, j) for i in cols1 for j in cols2 if i != j]
    assert len(cols1 + cols2) > 1
    agg_func_list = [
        F.corr(*x).alias('_'.join(x) + '__corr') for x in all_combinations
    ]
    sdf = sdf.agg(*agg_func_list)
    return sdf
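# A minimal usage sketch (assumes itertools and pyspark.sql.functions as F are
# imported in the same module; the column names below are made up for illustration).
import itertools
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName('sdf_correlation_demo').getOrCreate()
demo = spark.createDataFrame([(1.0, 2.0, 3.0), (2.0, 3.9, 5.8), (3.0, 6.1, 9.2)],
                             ['a', 'b', 'c'])
# All pairwise correlations among the frame's columns:
sdf_correlation(demo).show()
# Correlations of 'a' against 'b' and 'c' only:
sdf_correlation(demo, cols1='a', cols2=['b', 'c']).show()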
def add_geno_r2_to_gt_ld_plan(ld_plan):
    """
    Given an LD plan to which additive genotypes have been added with
    add_genotypes_to_ld_plan, aggregates by variant to compute geno_r2 statistics.

    The r2 calculation comes from Rogers and Huff 2008 -- it's just Pearson's r2
    for the additive genotype.
    """
    # First, remove NA
    data = ld_plan.select("filename1", "VAR_IDX1", "filename2", "VAR_IDX2",
                          "SAMPLE_IDX", "GT_ADD1", "GT_ADD2").dropna()
    # Now aggregate and compute r
    x = data.groupby(['filename1', 'VAR_IDX1', 'filename2', 'VAR_IDX2']).agg(
        corr("GT_ADD1", "GT_ADD2").alias("r"))
    # Now, square it to get r2
    geno_r2_result = x.withColumn('geno_r2', x.r * x.r)
    return (geno_r2_result.select("filename1", "VAR_IDX1", "filename2",
                                  "VAR_IDX2", "geno_r2"))
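# A minimal usage sketch with a toy "LD plan" frame. The column layout below is
# inferred from the function's select(); real plans would come from add_genotypes_to_ld_plan.
from pyspark.sql import SparkSession
from pyspark.sql.functions import corr

spark = SparkSession.builder.appName('geno_r2_demo').getOrCreate()
rows = [
    # filename1, VAR_IDX1, filename2, VAR_IDX2, SAMPLE_IDX, GT_ADD1, GT_ADD2
    ('a.vcf', 0, 'b.vcf', 0, 0, 0.0, 0.0),
    ('a.vcf', 0, 'b.vcf', 0, 1, 1.0, 1.0),
    ('a.vcf', 0, 'b.vcf', 0, 2, 2.0, 1.0),
]
ld_plan = spark.createDataFrame(rows, ['filename1', 'VAR_IDX1', 'filename2',
                                       'VAR_IDX2', 'SAMPLE_IDX', 'GT_ADD1', 'GT_ADD2'])
add_geno_r2_to_gt_ld_plan(ld_plan).show()  # one row per variant pair with its geno_r2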
assembler = VectorAssembler(
    inputCols=['Age', 'Tonnage', 'passengers', 'length', 'cabins',
               'passenger_density', 'cruise_cat'],
    outputCol="features")
output = assembler.transform(indexed)
output.select("features", "crew").show()

final_data = output.select("features", "crew")
train_data, test_data = final_data.randomSplit([0.7, 0.3])

# Create a Linear Regression Model object
lr = LinearRegression(labelCol='crew')
# Fit the model to the data and call this model lrModel
lrModel = lr.fit(train_data)
# Print the coefficients and intercept for linear regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

test_results = lrModel.evaluate(test_data)
print("RMSE: " + str(test_results.rootMeanSquaredError))
print("MSE: " + str(test_results.meanSquaredError))
print("R2: " + str(test_results.r2))

# R2 of 0.86 is pretty good, let's check the data a little closer
df.select(corr('crew', 'passengers')).show()
df.select(corr('crew', 'cabins')).show()
def basic_practice_exercise(path, spark):
    # Use the walmart_stock.csv file to answer and complete the tasks below!
    # Start a simple Spark Session
    # Load the Walmart Stock CSV File, have Spark infer the data types.
    df = spark.read.csv(path + 'walmart_stock.csv', sep=',', header=True, inferSchema=True)

    # What are the column names?
    print(df.columns)

    # What does the Schema look like?
    df.printSchema()

    # Print out the first 5 rows.
    print(df.head(5))

    # Use describe() to learn about the DataFrame.
    df.describe().show()

    # Bonus Question!
    # There are too many decimal places for mean and stddev in the describe() dataframe.
    # Format the numbers to show up to two decimal places. Pay careful attention to the
    # datatypes that .describe() returns; we didn't cover this exact formatting, but we
    # covered something very similar. Check this link for a hint:
    # http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.Column.cast
    # If you get stuck on this, don't worry, just view the solutions.
    new_df = df.describe()
    new_df.printSchema()
    new_df.select(
        new_df['summary'],
        format_number(new_df['Open'].cast(DoubleType()), 2).alias('Open'),
        format_number(new_df['High'].cast(DoubleType()), 2).alias('High'),
        format_number(new_df['Close'].cast(DoubleType()), 2).alias('Close'),
        format_number(new_df['Volume'].cast(IntegerType()), 2).alias('Volume'),
        format_number(new_df['Adj Close'].cast(DoubleType()), 2).alias('Adj Close')).show()

    # Create a new dataframe with a column called HV Ratio that is the ratio
    # of the High Price versus volume of stock traded for a day.
    df2 = df.withColumn('HV Ratio', df['High'] / df['Volume'])
    df2.show()

    # What day had the Peak High in Price?
    max_price = df2.agg({'High': 'max'}).collect()[0][0]
    df2.filter((df2['High'] == max_price)).select(df2['Date']).show()

    # What is the mean of the Close column?
    df2.agg({'Close': 'mean'}).show()
    df2.groupBy().mean('Close').show()

    # What is the max and min of the Volume column?
    print([df2['Volume'], df2['Close']])
    # print(isinstance(df2['Volume'], Column))
    df2.agg(min(df2['Volume']), max(df2['Volume'])).show()

    # How many days was the Close lower than 60 dollars?
    df2.filter((df2.Close < 60)).groupBy().count().show()

    # What percentage of the time was the High greater than 80 dollars?
    # In other words, (Number of Days High > 80) / (Total Days in the dataset)
    # print(df2.filter(df2['High'] > 80).count())
    print("Percentage", (df2.filter(df2['High'] > 80).count() / df2.count()) * 100)

    # What is the Pearson correlation between High and Volume?
    # Hint: http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameStatFunctions.corr
    df2.select(corr(df2['High'], df2['Volume'])).show()

    # What is the max High per year?
    df2.groupBy(year(df2['Date']).alias('Year')).max('High').orderBy('Year').show()

    # What is the average Close for each Calendar Month?
    # In other words, across all the years, what is the average Close price for Jan, Feb, Mar, etc.?
    # Your result will have a value for each of these months.
    df2.groupBy(month(df2['Date']).alias('Months')).mean('Close').orderBy('Months').show()
spark = SparkSession.builder.appName('lin_reg').getOrCreate()

# 2 - Read the data
from pyspark.ml.regression import LinearRegression
# After uploading the scraped data to HDFS, update the file path here
df = spark.read.csv('hdfs://localhost:9000/input_spark/res2.csv', inferSchema=True, header=True)

# 3 - Exploratory analysis
print('-------------- Exploratory analysis -----------------')
print((df.count(), len(df.columns)))  # Size of the dataset
df.printSchema()  # Schema of the dataset
df.describe().show(5, False)  # Summary statistics: mean, standard deviation, count, etc.
from pyspark.sql.functions import corr
df.select(corr('square', 'price')).show()  # Correlation between square (floor area) and price

# 4 - Transform the data into the format the model expects
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler  # Import VectorAssembler
print('-------------- Data transformation ------------------')
# Adjust the input column names as needed
vec_assmebler = VectorAssembler(
    inputCols=['square', 'floors', 'rooms', 'subway', 'area'],
    outputCol='features')
# Pack the independent variables of the linear equation into a single feature vector
features_df = vec_assmebler.transform(df)
features_df.printSchema()  # Inspect the transformed schema
model_df = features_df.select('features', 'price')  # Dataset used for linear regression
final_data = output.select("features", "crew")
train_data, test_data = final_data.randomSplit([0.7, 0.3])
train_data.describe().show()
test_data.describe().show()

from pyspark.ml.regression import LinearRegression

# Create a linear regression model object
lr = LinearRegression(labelCol='crew')
# Fit the model to the data and call this model lrModel
lrModel = lr.fit(train_data)
# Print the regression coefficient b (slope) and intercept a. The linear regression model
# finds the best values for these coefficients, giving the line that best fits the data.
# The intercept (often labeled the constant a) is the expected mean value of Y when all X = 0.
# Y = a + bX
print("Coefficients: {} Intercept: {}".format(lrModel.coefficients, lrModel.intercept))

test_results = lrModel.evaluate(test_data)
print("RMSE: {}".format(test_results.rootMeanSquaredError))  # Root mean squared error
print("MSE: {}".format(test_results.meanSquaredError))  # Mean squared error
print("R2: {}".format(test_results.r2))  # Coefficient of determination (0 to 1; 1 is a perfect fit)

# An R2 of 0.94 is quite good; let's look at the data a bit more closely
from pyspark.sql.functions import corr
df.select(corr('crew', 'passengers')).show()  # correlation between two variables
df.select(corr('crew', 'cabins')).show()  # correlation between two variables
"""
OK, so it makes sense! Good news for us -- this is the kind of insight we can bring to the company!
"""
# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
          stddev_pop("Quantity"), stddev_samp("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
          covar_pop("InvoiceNo", "Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()

# COMMAND ----------

from pyspark.sql.functions import count
df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"),
    expr("count(Quantity)")).show()

# COMMAND ----------
rides_sdf \
    .groupby("rider_student", "rider_gender", "service") \
    .count() \
    .show()

# **Example:** Two categorical variables and one continuous variable

rides_sdf \
    .cube("rider_student", "rider_gender") \
    .agg(F.grouping_id(), F.mean("distance"), F.stddev("distance")) \
    .orderBy("rider_student", "rider_gender") \
    .show()

# **Example:** Two categorical variables and two continuous variables

rides_sdf \
    .groupBy("rider_student", "rider_gender") \
    .agg(F.corr("distance", "duration")) \
    .orderBy("rider_student") \
    .show()

# ### Faceted plots

# Generally, carefully crafted visualizations are more enlightening. Before we
# produce more visualizations, let us fill in the missing values for rider_gender
# using pandas functionality:

rides_pdf["rider_gender"] = rides_pdf["rider_gender"].fillna("missing")

# **Question:** Does this syntax look somewhat familiar?

# **Example:** Three categorical variables

# Specify the desired order of the categories:
joined_shortened_date = joined.select(
    joined.stars, joined.city,
    (F.substring(joined.date, 0, 7).alias("date")))
joined_shortened_date.printSchema()
joined_shortened_date.show(5, False)
joined_shortened_date.groupBy("city", "date").pivot("date").avg("stars").show()

###### 10.2 ####
selected = review.select(review.text, review.useful)
udf_applied = selected.select(selected.text, selected.useful,
                              my_udf(selected['text']).alias("my_udf"))
nested = udf_applied.select(udf_applied.useful, udf_applied.text,
                            "my_udf.avgl", "my_udf.medl")
nested.show(5, True)

corr_avgl_useful = nested.select(nested.useful, nested.avgl).agg(
    F.corr("useful", "avgl"))
# corr_avgl_useful is also a dataframe
corr_avgl_useful.show()

corr_medl_useful = nested.select(nested.useful, nested.medl).agg(
    F.corr("useful", "medl"))
# this is also a dataframe
corr_medl_useful.show()

# Write results into the output folder
filtered_review.write.format("json").save(output_folder + "/10.1.1")
grouped_PBR.write.format("json").save(output_folder + "/10.1.2")
joined_shortened_date.write.format("json").save(output_folder + "/10.1.3")
corr_avgl_useful.write.format("json").save(output_folder + "/10.2-corr_AVG_USEFUL")
corr_medl_useful.write.format("json").save(output_folder + "/10.2-corr_MED_USEFUL")
# counts.write.format("json").save(output_folder)
# (UnitPrice > 600 OR instr(Description, "POSTAGE") >= 1))

df.where(col("Description").eqNullSafe("hello")).show()

# in Python
df.selectExpr(
    "CustomerId",
    "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)

# -- in SQL
# SELECT customerId, (POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity
# FROM dfTable

# compute correlation of two columns
from pyspark.sql.functions import corr
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show()

# -- in SQL
# SELECT corr(Quantity, UnitPrice) FROM dfTable

# summary statistics
df.describe().show()

# in Python
df.stat.crosstab("StockCode", "Quantity").show()
df.stat.freqItems(["StockCode", "Quantity"]).show()

# in Python
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)  # Adding an ID Field
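# A small aside (a sketch, not part of the original snippet): DataFrame.stat.corr()
# returns a plain Python float, while selecting pyspark.sql.functions.corr() yields a
# one-row DataFrame whose single column holds the same Pearson coefficient.
r = df.stat.corr("Quantity", "UnitPrice")                      # plain float
row = df.select(corr("Quantity", "UnitPrice").alias("r")).first()
assert abs(r - row["r"]) < 1e-9                                # both APIs agree on the value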
"CustomerId", "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2) # COMMAND ---------- from pyspark.sql.functions import lit, round, bround df.select(round(lit("2.5")), bround(lit("2.5"))).show(2) # COMMAND ---------- from pyspark.sql.functions import corr df.stat.corr("Quantity", "UnitPrice") df.select(corr("Quantity", "UnitPrice")).show() # COMMAND ---------- df.describe().show() # COMMAND ---------- from pyspark.sql.functions import count, mean, stddev_pop, min, max # COMMAND ---------- colName = "UnitPrice"
df.filter("Close < 60").count() # También: df.filter (df['Close'] <60).count() # También: from pyspark.sql.functions import count result = df.filter(df['Close'] < 60) result.select(count('Close')).show() # What percentage of the time was the High greater than 80 dollars ? # In other words, (Number of Days High>80)/(Total Days in the dataset) # 9.14 percent of the time it was over 80 # Many ways to do this (df.filter(df["High"]>80).count()*1.0/df.count())*100 # What is the Pearson correlation between High and Volume? # http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameStatFunctions.corr from pyspark.sql.functions import corr df.select(corr("High","Volume")).show() # What is the max High per year? from pyspark.sql.functions import year yeardf = df.withColumn("Year",year(df["Date"])) # Añado columna year, se obtiene de Date max_df = yeardf.groupBy('Year').max() # Selecciona los valores máximos de cada columna para cada año , Groupby lo usamos cuando tenemos varios valores repetidos en una columna, en este caso, los años en la columna Year. # 2015 max_df.select('Year','max(High)').show() # What is the average Close for each Calendar Month? # In other words, across all the years, what is the average Close price for Jan,Feb, Mar, etc... Your result will have a value for each of these months. from pyspark.sql.functions import month monthdf = df.withColumn("Month",month("Date")) # Añado columna Month, se obtiene de Date monthcolumn = monthdf.select("Month","Close") # Selecciono sólo columna Month donde aparece todos los meses monthcolumn.show() monthavgs = monthcolumn.select("Month","Close").groupBy("Month").mean() # Agrupamos por meses la columna monthcolumn y calculamos su media monthavgs.select("Month","avg(Close)").orderBy('Month').show() # Ordenamos de menor a mayor
# standard deviation and variance
dailyActivitiesDF.select(var_pop("CaloriesBurned"), var_samp("CaloriesBurned"),
                         stddev_pop("CaloriesBurned"), stddev_samp("CaloriesBurned")).show()

# COMMAND ----------

# Any extreme points in our data?
dailyActivitiesDF.select(skewness("CaloriesBurned"), kurtosis("CaloriesBurned")).show()

# COMMAND ----------

# Covariance and Correlation
dailyActivitiesDF.select(corr("CaloriesBurned", "Steps"),
                         covar_samp("CaloriesBurned", "Steps"),
                         covar_pop("CaloriesBurned", "Steps")).show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Multiple languages in one notebook
# MAGIC
# MAGIC - One cool thing about Databricks is that we can combine languages within a notebook
# MAGIC - So one example of this could be that our Data Scientists are comfortable writing Python, then our Data Engineers optimise that using Scala

# COMMAND ----------

# MAGIC %sql
spTotal = spOut.join(spIn, joinTables).orderBy('Wafer_ID', 'Process_stage')

# Drop the Action column
spTotal = spTotal.drop('Action')

# In[4]:

# Compute the duration between the two timestamps
from pyspark.sql import functions as f
Duration = (f.unix_timestamp('Qtime_OUT') - f.unix_timestamp('Qtime_IN'))
spTotal = spTotal.withColumn('Duration', Duration).orderBy('Wafer_ID', 'Process_stage')

# Pearson correlation coefficient per process stage, ranked by absolute value
spResult = spTotal.groupBy('Process_stage').agg(
    f.abs(f.corr('yield', 'Duration')).alias('Pearson')).orderBy('Pearson', ascending=False)

# Collect the result from spResult and keep the top five stages
Results = spResult.select('Process_stage').rdd.flatMap(lambda x: x).collect()
Results = Results[:5]

# Pull the data for those stages into pandas
dfResults = []
for index in Results:
    dfResults.append(
        spTotal.where(spTotal.Process_stage == index).select('yield', 'Duration').toPandas())

# In[5]:
# ./bin/spark-submit 1_regression.py
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('LinearRegressionPractise').getOrCreate()

from pyspark.ml.regression import LinearRegression

data = spark.read.csv("1_regression.csv", inferSchema=True, header=True)
# data.printSchema()  # check datatype of each column

##############################################
## Check for correlation between features ####
##############################################
from pyspark.sql.functions import corr
data.select(corr('Avg Session Length', 'Time on Website')).show()
data.select(corr('Avg Session Length', 'Time on App')).show()
data.select(corr('Time on App', 'Time on Website')).show()

# DATA WRANGLING PACKAGES NEEDED FOR MACHINE LEARNING
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

#####################################################################
## For any machine learning algorithm                           #####
## Spark needs a particular format                              #####
## Where labels are in one column                               #####
## And a features column - containing all selected features    #####
#####################################################################
# assembler = VectorAssembler(inputCols=['future_dif', 'held_weight', 'shell_weight'], outputCol='features')
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('lin_reg').getOrCreate()

# 2 - Read the data
from pyspark.ml.regression import LinearRegression
df = spark.read.csv('Linear_regression_dataset.csv', inferSchema=True, header=True)

# 3 - Exploratory analysis
print('-------------- Exploratory analysis -----------------')
print((df.count(), len(df.columns)))  # Size of the dataset
df.printSchema()  # Schema of the dataset
df.describe().show(5, False)  # Summary statistics: mean, standard deviation, count, etc.
from pyspark.sql.functions import corr
df.select(corr('var_1', 'output')).show()  # Correlation between var_1 and the output

# 4 - Transform the data into the format the model expects
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler  # Import VectorAssembler
print('-------------- Data transformation ------------------')
vec_assmebler = VectorAssembler(
    inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'],
    outputCol='features')
# Pack the independent variables of the linear equation into a single feature vector
features_df = vec_assmebler.transform(df)
features_df.printSchema()  # Inspect the transformed schema
model_df = features_df.select('features', 'output')  # Dataset used for linear regression

# 5 - Split the data into training and test sets
train_df, test_df = model_df.randomSplit([0.7, 0.3])  # 70/30 train/test split
print((train_df.count(), len(train_df.columns)))
output = assembler.transform(indexed)
output.show()
output.select('features', 'crew').show()

final_data = output.select('features', 'crew')
final_data.describe().show()

train_data, test_data = final_data.randomSplit([0.7, 0.3])
train_data.describe().show()
test_data.describe().show()

lr = LinearRegression(labelCol='crew')
lrmodel = lr.fit(train_data)
print("Coefficients: {} Intercept: {}".format(lrmodel.coefficients, lrmodel.intercept))

test_results = lrmodel.evaluate(test_data)
print("RMSE: {}".format(test_results.rootMeanSquaredError))
print("R2: {}".format(test_results.r2))

shipdf.select(corr('crew', 'passengers')).show()
spark.stop()
# COMMAND ----------

train_data.describe().show()

# COMMAND ----------

print(ship_Result)

# COMMAND ----------

for s in ship_Result:
    print(s)

# COMMAND ----------

ship_Result.residuals.show()

# COMMAND ----------

ship_Result.degreesOfFreedom

# COMMAND ----------

from pyspark.sql.functions import corr

# COMMAND ----------

data.select(corr('passengers', 'crew')).show()

# COMMAND ----------
df4.show()

df5 = data.select(F.sum("Close"))
df5.show()

# Use round() to round the decimal value
df5 = data.select(F.round(F.avg("Close")))
df5.show()

data.printSchema()

# What day had the Peak High in Price?
df6 = data.orderBy(data["High"].desc())
df6.show(n=5)

# Find the mean of the Close column
df7 = data.select(F.round(F.mean("Close")))
df7.show()

# Find the max and min values in the Volume column
df8 = data.select(F.max("Volume"), F.min("Volume"))
df8.show()

# How many days was the Close lower than 60 dollars?
# Way 1
df9 = data.filter("Close<60").count()
print("***********************Selva******************************")
print(df9)

# Way 2
res = data.filter(data["Close"] < 60)
df10 = res.select(F.count("Close"))
df10.show()

# What is the Pearson correlation between High and Volume?
df11 = data.select(F.corr("High", "Volume"))
df11.show()
# COMMAND ----------

# How many days was the Close lower than 60 dollars?
df.filter('Close<60').count()

# COMMAND ----------

# What percentage of the time was the High greater than 80 dollars?
(df.filter('High>80').count() * 1.0 / df.count()) * 100

# COMMAND ----------

# What is the Pearson correlation between High and Volume?
from pyspark.sql.functions import corr
df.select(corr('High', 'Volume')).show()

# COMMAND ----------

# What is the max High per year?
from pyspark.sql.functions import year
yeardf = df.withColumn('Year', year(df['Date']))
max_yf = yeardf.groupBy('Year').max()
max_yf.select('Year', 'max(High)').show()

# COMMAND ----------

# What is the average Close for each Calendar Month?
from pyspark.sql.functions import month
monthdf = df.withColumn("Month", month("Date"))
spark = SparkSession.builder.appName("learning").master( "local").getOrCreate() df = spark.read.format('csv')\ .option('sep', ';')\ .option('header', 'true')\ .load('user.csv') df.repartition(5).select(max('id')).show() df.repartition(5).select(avg('age')).show() df.select(round(lit(2.5)), bround(lit(2.5))).show() df.select(round(lit("2.5")), bround(lit("2.5"))).show() spark.range(0, 10, 2)\ .select(monotonically_increasing_id().alias('id_v2'), round((rand() * 5 + 5)).alias('rand')).show() spark.range(0, 10, 2)\ .select(monotonically_increasing_id().alias('id'), pow(col('id'), 2).alias('pow')).show() df.describe().show() df.select(count('age'), mean('age'), min('age'), max('age'), stddev_pop('age')).show() df.select(corr(col('age'), col('name'))).show() df.stat.corr(col('age'), col('name')).show()
def get_data(self):
    """ Returns statistics about attributes in a data frame """
    from pyspark.sql import functions

    # Correlation pairs
    corr_pairs = list(
        chunks(list(itertools.product(self.attrs, self.attrs)), len(self.attrs)))

    # Cache data
    self.data.cache()
    df_count = self.data.count()

    # TODO: Implement median using df.approxQuantile('col', [.5], .25)
    stats = []
    for i, name in enumerate(self.attrs):
        df_col = functions.col(name)
        stats.append(functions.lit(name))
        stats.append(functions.max(df_col).alias('max_{}'.format(name)))
        stats.append(functions.min(df_col).alias('min_{}'.format(name)))
        if name in self.numeric_attrs:
            stats.append(
                functions.round(functions.stddev(df_col), 4).alias('stddev_{}'.format(name)))
        else:
            stats.append(functions.lit('-'))
        stats.append(functions.count(df_col).alias('count_{}'.format(name)))
        if name in self.numeric_attrs:
            stats.append(
                functions.round(functions.avg(df_col), 4).alias('avg_{}'.format(name)))
        else:
            stats.append(functions.lit('-'))
        stats.append(
            functions.approx_count_distinct(df_col).alias('distinct_{}'.format(name)))
        stats.append((df_count - functions.count(df_col)).alias('missing_{}'.format(name)))
        if name in self.numeric_attrs:
            stats.append(
                functions.round(functions.skewness(df_col), 2).alias('skewness_{}'.format(name)))
            stats.append(
                functions.round(functions.kurtosis(df_col), 2).alias('kurtosis_{}'.format(name)))
        else:
            stats.append(functions.lit('-'))
            stats.append(functions.lit('-'))
        if self.params['correlation']:
            for pair in corr_pairs[i]:
                if all([pair[0] in self.numeric_attrs,
                        pair[1] in self.numeric_attrs]):
                    stats.append(
                        functions.round(functions.corr(*pair), 4).alias('corr_{}'.format(i)))
                else:
                    stats.append(functions.lit('-'))

    self.data = self.data.agg(*stats)
    aggregated = self.data.take(1)[0]
    n = len(self.names)
    rows = [aggregated[i:i + n] for i in range(0, len(aggregated), n)]
    return {"rows": rows, "attributes": self.get_column_names().split(',')}
import pyspark.sql.functions as fns

hdfs_addr = "hdfs://ec2-54-208-153-234.compute-1.amazonaws.com:9000"
sc = pyspark.SparkContext(
    "spark://ec2-54-208-153-234.compute-1.amazonaws.com:7077", "Correlation")
# sc = pyspark.SparkContext("local", "Correlation")
spark = SparkSession(sc)

reviews = spark.read.csv(f"{hdfs_addr}/datasets/kindle_reviews.csv", header=True)
# reviews = spark.read.csv(f"kindle_reviews.csv", header=True)
asinr = reviews.select('asin', 'reviewText')
asinrl = asinr.withColumn('reviewLength', fns.length('reviewText'))

meta = spark.read.json(f"{hdfs_addr}/datasets/meta_Kindle_Store.json")
# meta = spark.read.json("meta_Kindle_Store.json")

# asin_avgl = asinrl.groupBy('asin').avg('reviewLength')
asin_avgl = asinrl.groupBy('asin').avg('reviewLength')
# asin_avgl.take(5)

# join asin_avgl and meta with column asin
table = asin_avgl.join(meta.select('asin', 'price'), ['asin'])

# return Column for the Pearson Correlation Coefficient
table = table.agg(
    fns.corr('avg(reviewLength)', "price").alias('pearson_correlation'))
table.select("pearson_correlation").show()
def main():
    # Load the csv files directly as dataframes
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    df1 = spark.read.format("csv").option("header", "true").load("simpsons_characters.csv")
    df2 = spark.read.format("csv").option("header", "true").load("simpsons_episodes.csv")
    df3 = spark.read.format("csv").option("header", "true").load("simpsons_locations.csv")
    df4 = spark.read.format("csv").option("header", "true").load("simpsons_script_lines.csv")

    # A) number of distinct locations that appear in each episode
    dfScriptLines = df4.groupBy('episode_id').agg(
        functions.approx_count_distinct(df4.location_id).alias("count")).selectExpr(
            "episode_id as id", "count")
    dfLocations = df2.join(dfScriptLines, on="id", how='outer').select("id", "imdb_rating", "count")
    dfaux1 = dfLocations.withColumn("imdb_rating_double", dfLocations["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("count_double", dfLocations["count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(dfaux2.imdb_rating_double, dfaux2.count_double).alias('Pearson-Imdb-NumLocations'))
    dfcof.show()

    # B) number of characters that appear in each episode
    dfe = df4.groupBy('episode_id').agg(
        functions.approx_count_distinct(df4.character_id).alias("character_count")).selectExpr(
            "episode_id as id", "character_count")
    dfjoin1 = df2.join(dfe, on="id", how='outer').select("id", "imdb_rating", "character_count")
    dfaux1 = dfjoin1.withColumn("imdb_rating_double", dfLocations["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("character_count_double", dfaux1["character_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(dfaux2.imdb_rating_double, dfaux2.character_count_double).alias('Pearson-Imdb-NumCharacters'))
    dfcof.show()

    # C and D) number of male and female characters that appear in each episode
    dfc = df1.select('id', 'gender').selectExpr("id as character_id", "gender")
    dfe = df4.select('character_id', 'episode_id')
    dfjoin2 = dfc.join(dfe, on="character_id", how='outer').select("episode_id", "character_id", "gender")
    dfCharacMasc = dfjoin2.filter(dfjoin2.gender == 'm').groupBy('episode_id').agg(
        functions.approx_count_distinct(dfjoin2.character_id).alias("masc_count")).selectExpr(
            "episode_id as id", "masc_count")
    dfCharacFem = dfjoin2.filter(dfjoin2.gender == 'f').groupBy('episode_id').agg(
        functions.approx_count_distinct(dfjoin2.character_id).alias("fem_count")).selectExpr(
            "episode_id as id", "fem_count")
    dfjoin3 = dfjoin1.join(dfCharacMasc, on="id", how='outer').select(
        "id", "imdb_rating", "character_count", "masc_count")
    dfjoin4 = dfjoin3.join(dfCharacFem, on="id", how='outer').select(
        "id", "imdb_rating", "character_count", "masc_count", "fem_count")
    dfaux1 = dfjoin4.withColumn("imdb_rating_double", dfjoin4["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("masc_count_double", dfaux1["masc_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(dfaux2.imdb_rating_double, dfaux2.masc_count_double).alias('Pearson-Imdb-NumCharactersMasc'))
    dfcof.show()

    dfaux1 = dfjoin4.withColumn("imdb_rating_double", dfjoin4["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("fem_count_double", dfaux1["fem_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(dfaux2.imdb_rating_double, dfaux2.fem_count_double).alias('Pearson-Imdb-NumCharactersFem'))
    dfcof.show()

    # E) number of words per episode
    dfe = df4.groupBy('episode_id').agg(
        functions.sum(df4.word_count).alias("word_count")).selectExpr(
            "episode_id as id", "word_count")
    dfjoin1 = df2.join(dfe, on="id", how='outer').select("id", "imdb_rating", "word_count")
    dfaux1 = dfjoin1.withColumn("imdb_rating_double", dfjoin1["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("word_count_double", dfaux1["word_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(dfaux2.imdb_rating_double, dfaux2.word_count_double).alias('Pearson-Imdb-NumWords'))
    dfcof.show()

    # F) total number of dialogue lines per episode
    dfe2 = df4.filter(df4.speaking_line == True).groupBy('episode_id').agg(
        functions.count(df4.raw_text).alias("raw_text_count")).selectExpr(
            "episode_id as id", "raw_text_count")
    dfjoin2 = dfjoin1.join(dfe2, on="id", how='outer').select(
        "id", "imdb_rating", "word_count", "raw_text_count")
    dfaux1 = dfjoin2.withColumn("imdb_rating_double", dfjoin1["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("raw_text_count_double", dfaux1["raw_text_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(dfaux2.imdb_rating_double, dfaux2.raw_text_count_double).alias('Pearson-Imdb-NumRawText'))
    dfcof.show()

    sc.stop()
train_data, test_data = final.randomSplit([0.7, 0.3])

# -------- machine learning ------------

# Initiate the LinearRegression object
regressor = LinearRegression(featuresCol='features', labelCol='label', predictionCol='predicted label')

# Fit train_data
model = regressor.fit(train_data)

# Evaluate the model by calling evaluate(), which exposes several metrics such as r2 and rootMeanSquaredError
eval_model = model.evaluate(test_data)
eval_model.rootMeanSquaredError
eval_model.r2

# Predict on test_data with the model
prediction = model.transform(test_data.select('features'))

# Check the coefficients and intercept of the model
# (helps decide which feature has the most effect on the label)
coeff = model.coefficients
intercept = model.intercept
a = zip(featcols, coeff)
b = set(a)
for x, y in b:
    print(x, ':', y)
    print('\n')

# Analysis only
from pyspark.sql.functions import corr
data.select(corr('feature1', 'label')).show()
data.select(corr('feature1', 'feature2')).show()