def make_new_user_predictions(user_ratings_df):
    user_factors = get_user_factors(user_ratings_df)
    # print('user_factors: {}'.format(user_factors))
    predictions = np.dot(user_factors, item_factors.T)
    prediction_df = spark.createDataFrame(
        zip(item_ids.tolist(), predictions.tolist()),
        ['item', 'res_prediction'])
    res_prediction_stats_df = (
        prediction_df.agg(
            F.avg(F.col('res_prediction')).alias('avg_res_prediction'),
            F.stddev_samp(F.col('res_prediction'))
            .alias('stddev_res_prediction')))
    predicted_rating_df = (
        prediction_df
        .crossJoin(rating_stats_df)
        .crossJoin(res_prediction_stats_df)
        .crossJoin(residual_stats_df)
        .join(item_bias_df, on='item')
        .withColumn(
            'prediction',
            (
                (F.col('res_prediction') - F.col('avg_res_prediction'))
                * F.col('stddev_residual')
                / F.col('stddev_res_prediction')
                # / 2
                + F.col('avg_residual')
                + F.col('avg_rating')
                + F.col('item_bias')
            )
            # * (1 - 1 / F.pow(F.col('count_item_rating'), 0.6))
        )
        # .filter(F.col('prediction') > 0)
    )
    predicted_rating_stats_df = (
        predicted_rating_df.agg(
            F.avg(F.col('prediction')).alias('avg_prediction'),
            F.stddev_samp(F.col('prediction')).alias('stddev_prediction')))
    # print('prediction_df')
    # prediction_df.show(truncate=False)
    print('predicted_rating_df')
    predicted_rating_df.show(truncate=False)
    print('residual_stats_df')
    residual_stats_df.show(truncate=False)
    print('res_prediction_stats_df')
    res_prediction_stats_df.show(truncate=False)
    print('rating_stats_df')
    rating_stats_df.show(truncate=False)
    print('predicted_rating_stats_df')
    predicted_rating_stats_df.show(truncate=False)
    return predicted_rating_df
def _fit(self, data):
    inputCol = self.getInputCol()
    outputCol = self.getOutputCol()
    mean, stddev = data.agg(avg(inputCol), stddev_samp(inputCol)).first()
    return ImputeNormalModel(
        mean=float(mean),
        stddev=float(stddev),
        inputCol=inputCol,
        outputCol=outputCol,
    )
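# The ImputeNormalModel returned above is not defined in this snippet. As a
# hedged sketch (the class body and its imputation strategy are assumptions,
# and the function name below is illustrative), the fitted mean/stddev could
# be applied by sampling missing values from N(mean, stddev):
from pyspark.sql import functions as F

def impute_normal(data, inputCol, outputCol, mean, stddev):
    # Keep existing values; fill nulls with a draw from the fitted normal.
    return data.withColumn(
        outputCol,
        F.coalesce(F.col(inputCol), F.lit(mean) + F.randn() * F.lit(stddev)))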
def standardize_column(self, column_name):
    def standardize_column_helper(mean, sd):
        # Standardize non-null values; pass nulls through unchanged.
        return udf(lambda x: old_div((x - mean) * 1.0, sd)
                   if x is not None else x)

    mean = self._data_frame.select(F.mean(column_name)).collect()[0][0]
    std_dev = self._data_frame.select(
        F.stddev_samp(column_name)).collect()[0][0]
    self._data_frame = self._data_frame.withColumn(
        column_name + "_fs_standardized",
        standardize_column_helper(mean, std_dev)(col(column_name)))
    self._data_frame = self._data_frame.withColumn(
        column_name + "_fs_standardized",
        self._data_frame[column_name + "_fs_standardized"].cast('float'))
    return self._data_frame
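# The Python UDF above pays per-row serialization costs and issues two
# separate collect() jobs. A minimal sketch of a UDF-free version, using one
# aggregation and native column arithmetic (nulls propagate through the
# expression, matching the UDF's pass-through behavior); the function name
# is illustrative:
from pyspark.sql import functions as F

def standardize_column_native(df, column_name):
    stats = df.agg(
        F.mean(column_name).alias('mean'),
        F.stddev_samp(column_name).alias('sd')).first()
    return df.withColumn(
        column_name + "_fs_standardized",
        ((F.col(column_name) - stats['mean']) / stats['sd']).cast('float'))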
def evaluate_demographics(self, target_file=[]):
    cur_demo = self.add_demo()
    from pyspark.sql.functions import udf
    udf_age = udf(lambda x: x.toArray().tolist()[0])
    cur_demo = cur_demo.withColumn("AGE", udf_age("demo_feature"))
    cur_target_file = self.spark.read.parquet(self.out_file_name)
    anal_df = cur_target_file.select("ID").distinct().join(cur_demo, "ID")
    from pyspark.sql.functions import avg, stddev_samp, count
    anal_df.groupBy().agg(avg("AGE"), stddev_samp("AGE")).show()
    self.logger.info(cur_target_file.count())
    cur_death = self.get_hospital_death()
    self.logger.info(anal_df.count())
    anal_df.join(cur_death, "ID").groupBy("IS_DEAD").agg(count("*")).show()
def run_pipeline(self):
    try:
        logging.info(
            "https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/"
        )
        # check collect_list and collect_set
        # collect_set() returns all values from an input column with
        # duplicate values eliminated.
        # collect_list() returns all values from an input column, with
        # duplicates.
        logging.info(
            'run_pipeline method started --> '
            'https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/'
        )
        simpleData = [("James", "Sales", 3000),
                      ("Michael", "Sales", 4600),
                      ("Robert", "Sales", 4100),
                      ("Maria", "Finance", 3000),
                      ("James", "Sales", 3000),
                      ("Scott", "Finance", 3300),
                      ("Jen", "Finance", 3900),
                      ("Jeff", "Marketing", 3000),
                      ("Kumar", "Marketing", 2000),
                      ("Saif", "Sales", 4100)]
        schema = ["employee_name", "department", "salary"]
        df = self.spark.createDataFrame(data=simpleData, schema=schema).cache()
        df.show(truncate=False)

        from pyspark.sql.functions import approx_count_distinct, collect_list
        from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
        from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
        from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
        from pyspark.sql.functions import variance, var_samp, var_pop

        df.printSchema()
        df.show(truncate=False)

        print("approx_count_distinct: " +
              str(df.select(approx_count_distinct("salary")).collect()[0][0]))
        print("avg: " + str(df.select(avg("salary")).collect()[0][0]))
        df.select(collect_list("salary")).show(truncate=False)
        df.select(collect_set("salary")).show(truncate=False)
        df2 = df.select(countDistinct("department", "salary"))
        df2.show(truncate=False)
        print("Distinct Count of Department & Salary: " +
              str(df2.collect()[0][0]))
        print("count: " + str(df.select(count("salary")).collect()[0]))
        dffirst = df.select(first("salary"))
        dffirst.show(truncate=False)
        df.select(last("salary")).show(truncate=False)
        df.select(kurtosis("salary")).show(truncate=False)
        df.select(max("salary")).show(truncate=False)
        df.select(min("salary")).show(truncate=False)
        df.select(mean("salary")).show(truncate=False)
        df.select(skewness("salary")).show(truncate=False)
        df.select(stddev("salary"), stddev_samp("salary"),
                  stddev_pop("salary")).show(truncate=False)
        df.select(sum("salary")).show(truncate=False)
        df.select(sumDistinct("salary")).show(truncate=False)
        df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
            .show(truncate=False)
        logging.info('run_pipeline method ended')
    except Exception as exp:
        logging.error("An error occurred while running the pipeline > " +
                      str(exp))
        # send email notification
        # log error to database
        sys.exit(1)
    return
# _read csv file
from pyspark.sql.functions import col, mean, min, max, stddev_pop, stddev_samp, sum

flightdata = spark.read.option('inferSchema', 'true') \
    .option('header', 'true').csv('2015-summary.csv')
flightdata.show(5)
flightdata.printSchema()

# _add new column using withColumn; show() only prints the updated
# dataframe here -- to keep it, assign the result to a new variable.
flightdata.withColumn("newCol", col("count") + 10).show(4)

# _using select, we can also list column names explicitly in place of *
flightdata_mod = flightdata.select("*", (col("count") + 20).alias("newCol2"))
flightdata_mod.show(5)

# _basic statistical functions
flightdata.select(mean("count")).show()
flightdata.select(min("count")).show()
flightdata.select(max("count")).show()
flightdata.select(stddev_pop("count")).show()
flightdata.select(stddev_samp("count")).show()

# _group by and aggregations
flightdata.groupBy("DEST_COUNTRY_NAME").agg(sum('count')).show(5)
dest_count_data = flightdata.groupBy("DEST_COUNTRY_NAME").agg({'count': 'sum'})

# _write the data to csv after coalesce
dest_count_data_merged = dest_count_data.coalesce(1)
dest_count_data_merged.write.format('csv').option('header', 'true').save('dest_country')
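# A note on the final step: coalesce(1) funnels every partition through a
# single task so that the CSV lands in one output file. That is convenient
# for a small summary like this one, but it serializes the write and should
# be avoided for large outputs.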
dailyActivitiesDF.select(min("CaloriesBurned"), max("CaloriesBurned")).show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Statistical functions
# MAGIC
# MAGIC - We can do some basic statistical functions as well using the Spark API

# COMMAND ----------

# standard deviation and variance
dailyActivitiesDF.select(var_pop("CaloriesBurned"), var_samp("CaloriesBurned"),
                         stddev_pop("CaloriesBurned"),
                         stddev_samp("CaloriesBurned")).show()

# COMMAND ----------

# Any extreme points in our data?
dailyActivitiesDF.select(skewness("CaloriesBurned"),
                         kurtosis("CaloriesBurned")).show()

# COMMAND ----------

# Covariance and Correlation
dailyActivitiesDF.select(corr("CaloriesBurned", "Steps"),
                         covar_samp("CaloriesBurned", "Steps"),
                         covar_pop("CaloriesBurned", "Steps")).show()

# COMMAND ----------
from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode", 0.1)).show(2)

# first, last, min, max
from pyspark.sql.functions import first, last, min, max
df.select(
    first("StockCode").alias("First_stock"),
    last("StockCode"),
    min("StockCode"),
    max("StockCode")).show(2)

# sum, sumDistinct, avg
from pyspark.sql.functions import sum, sumDistinct, avg
df.select(sum("Quantity"), sumDistinct("Quantity"), avg("Quantity")).show(2)

# sample variance, sample standard deviation
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_samp("Quantity"), stddev_samp("Quantity")).show(2)

# population variance, population standard deviation
from pyspark.sql.functions import var_pop, stddev_pop
df.select(var_pop("Quantity"), stddev_pop("Quantity")).show(2)

# skewness and kurtosis
from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show(2)

# covariance and correlation
from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_pop("InvoiceNo", "Quantity"),
          covar_samp("InvoiceNo", "Quantity")).show(2)

# aggregation of complex data types
def get_column_stdev(self, column):
    return self.spark_df.select(stddev_samp(col(column))).collect()[0][0]
def _transform(self, requests_df):
    '''
    Predicts the rating for requested users and restaurants.

    Parameters
    ==========
    requests_df (pyspark.sql.DataFrame)
        Data used to request predictions of ratings. Columns are 'user'
        and 'item'. Values of 'user' and 'item' must be numeric.

    Returns
    =======
    final_prediction_df (pyspark.sql.DataFrame)
        Predictions with 'user', 'item' and 'prediction'. Prediction will
        be a floating point number.
    '''
    # print('Transform starting!')
    start_time = time.monotonic()

    if self.useALS:
        self.prediction_df = self.recommender.transform(requests_df)
        self.prediction_stats_df = (
            self.prediction_df
            .dropna(how='all', subset=['prediction'])
            .agg(
                F.avg(F.col('prediction')).alias('avg_prediction'),
                F.stddev_samp(F.col('prediction')).alias('stddev_prediction')
            )
        )

        # print('prediction_df')
        # self.prediction_df.show()
        # print('prediction_stats_df')
        # self.prediction_stats_df.show()
        # print('rating_stats_df')
        # self.rating_stats_df.show()
        # print('residual_stats_df')
        # self.residual_stats_df.show()

        if self.useBias:
            final_prediction_df = (
                self.prediction_df
                .crossJoin(self.rating_stats_df)
                # .crossJoin(self.prediction_stats_df)
                # .crossJoin(self.residual_stats_df)
                .join(self.user_bias_df, on='user')
                .join(self.item_bias_df, on='item')
                .fillna({'user_bias': 0.0, 'item_bias': 0.0})
                .withColumn(
                    'prediction',
                    (
                        F.coalesce(
                            F.col('prediction')
                            # - F.col('avg_prediction')
                            , F.lit(0.0)
                        )
                        # * F.col('stddev_residual')
                        # / F.col('stddev_prediction')
                        # + F.col('avg_residual')
                        + F.col('avg_rating')
                        + F.col('user_bias')
                        + F.col('item_bias')
                    )
                    # * (1 - (1 / F.pow(F.col('count_item_rating')), self.lambda_3))
                )
                .select('user', 'item', 'rating', 'prediction')
            )
        else:
            final_prediction_df = (
                self.prediction_df
                .dropna(how='all', subset=['prediction'])
                # .fillna({'prediction': F.col('avg_prediction')})
                # .crossJoin(self.residual_stats_df)
                # .crossJoin(self.prediction_stats_df)
                # .withColumn(
                #     'prediction',
                #     (
                #         F.col('prediction')
                #         - F.col('avg_prediction')
                #     )
                #     * F.col('stddev_residual')
                #     / F.col('stddev_prediction')
                #     + F.col('avg_residual')
                # )
            )
    else:
        final_prediction_df = (
            requests_df
            .crossJoin(self.rating_stats_df)
            .join(self.user_bias_df, on='user')
            .join(self.item_bias_df, on='item')
            .fillna({'user_bias': 0.0, 'item_bias': 0.0})
            .withColumn(
                'prediction',
                F.col('avg_rating') + F.col('user_bias') + F.col('item_bias')
            )
            .select('user', 'item', 'rating', 'prediction')
        )

    print('Transform done in {} seconds'.format(
        time.monotonic() - start_time))
    # print('final_prediction_df')
    # final_prediction_df.show()
    return final_prediction_df
def _fit(self, ratings_df):
    '''
    Fit ALS model using reviews as training data.

    Parameters
    ==========
    ratings_df (pyspark.sql.DataFrame)
        Data used to train recommender model. Columns are 'user', 'item',
        and 'rating'. Values of user and item must be numeric. Values of
        rating range from 1 to 5.

    Returns
    =======
    model (RecommenderModel)
        The fitted recommender model.
    '''
    # avg_rating_df = (
    #     ratings_df
    #     .groupBy()
    #     .avg(self.getRatingCol())
    #     .withColumnRenamed('avg({})'.format(self.getRatingCol()),
    #                        'avg_rating')
    # )

    # print('Fit starting!')
    start_time = time.monotonic()

    # print('ratings_df')
    # ratings_df.show()

    rating_stats_df = (
        ratings_df
        .agg(
            F.avg(self.getRatingCol()).alias('avg_rating'),
            F.stddev_samp(self.getRatingCol()).alias('stddev_rating')
        )
    )
    # print('ratings_stats_df:')
    # rating_stats_df.show()

    # if not self.getUseALS():
    #     self.setLambda_1(0.0)
    #     self.setLambda_2(0.0)

    item_bias_df = (
        ratings_df
        .crossJoin(rating_stats_df)
        .withColumn(
            'diffs_item_rating',
            F.col(self.getRatingCol()) - F.col('avg_rating')
        )
        .groupBy(self.getItemCol())
        .agg(
            F.avg(F.col('diffs_item_rating')).alias('avg_diffs_item_rating'),
            F.nanvl(
                F.stddev_samp(F.col('diffs_item_rating')),
                F.lit(2.147483647E9)
            ).alias('stddev_diffs_item_rating'),
            F.count("*").alias('count_item_rating')
        )
        .withColumn(
            'stderr_diffs_item_rating',
            (self.getLambda_1() + F.col('stddev_diffs_item_rating'))
            / F.sqrt('count_item_rating')
        )
        .withColumn(
            'item_bias',
            F.col('avg_diffs_item_rating')
            / (1 + F.col('stderr_diffs_item_rating'))
        )
        .select(
            self.getItemCol(),
            'item_bias',
            'avg_diffs_item_rating',
            'stderr_diffs_item_rating',
            'stddev_diffs_item_rating',
            'count_item_rating'
        )
    )
    # print('item_bias_df:')
    # item_bias_df.show(5)
    # item_bias_df.printSchema()
    # print('item_bias_df NaN')
    # item_bias_df.where(F.isnan("item_bias")).show()

    user_bias_df = (
        ratings_df
        .crossJoin(rating_stats_df)
        .join(item_bias_df, on=self.getItemCol())
        .withColumn(
            'diffs_user_rating',
            F.col(self.getRatingCol()) - F.col('avg_rating')
            - F.col('item_bias')
        )
        .groupBy(self.getUserCol())
        .agg(
            F.avg(F.col('diffs_user_rating')).alias('avg_diffs_user_rating'),
            F.nanvl(
                F.stddev_samp(F.col('diffs_user_rating')),
                F.lit(2.147483647E9)
            ).alias('stddev_diffs_user_rating'),
            F.count("*").alias('count_user_rating')
        )
        .withColumn(
            'stderr_diffs_user_rating',
            (self.getLambda_2() + F.col('stddev_diffs_user_rating'))
            / F.sqrt('count_user_rating')
        )
        .withColumn(
            'user_bias',
            F.col('avg_diffs_user_rating')
            / (1 + F.col('stderr_diffs_user_rating'))
        )
        .select(
            self.getUserCol(),
            'user_bias',
            'avg_diffs_user_rating',
            'stderr_diffs_user_rating',
            'stddev_diffs_user_rating',
            'count_user_rating'
        )
    )
    # print('user_bias_df:')
    # user_bias_df.show(5)
    # print('user_bias_df NaN')
    # user_bias_df.where(F.isnan("user_bias")).show()

    if self.getUseALS():
        if self.getUseBias():
            residual_df = (
                ratings_df
                .crossJoin(rating_stats_df)
                .join(user_bias_df, on=self.getUserCol())
                .join(item_bias_df, on=self.getItemCol())
                .withColumn(
                    self.getRatingCol(),
                    F.col(self.getRatingCol()) - F.col('avg_rating')
                    - F.col('user_bias') - F.col('item_bias')
                )
                .select(
                    self.getUserCol(),
                    self.getItemCol(),
                    self.getRatingCol()
                )
            )
        else:
            residual_df = ratings_df
            # self.setColdStartStrategy('drop')

        residual_stats_df = (
            residual_df
            .agg(
                F.avg(F.col(self.getRatingCol())).alias('avg_residual'),
                F.stddev(F.col(self.getRatingCol())).alias('stddev_residual')
            )
        )

        # print('residual_df')
        # residual_df.show()
        # print('residual_df NaN')
        # residual_df.where(F.isnan("rating")).show()
        # print('residual_stats_df')
        # residual_stats_df.show()

        als_model = ALS(
            rank=self.getRank(),
            maxIter=self.getMaxIter(),
            regParam=self.getRegParam(),
            numUserBlocks=self.getNumUserBlocks(),
            numItemBlocks=self.getNumItemBlocks(),
            implicitPrefs=self.getImplicitPrefs(),
            alpha=self.getAlpha(),
            userCol=self.getUserCol(),
            itemCol=self.getItemCol(),
            ratingCol=self.getRatingCol(),
            nonnegative=self.getNonnegative(),
            checkpointInterval=self.getCheckpointInterval(),
            intermediateStorageLevel=self.getIntermediateStorageLevel(),
            finalStorageLevel=self.getFinalStorageLevel()
        )

        recommender = als_model.fit(residual_df)
    else:
        recommender = None
        residual_stats_df = None

    print('Fit done in {} seconds'.format(time.monotonic() - start_time))

    return (
        RecommenderModel(
            self.getUseALS(),
            self.getUseBias(),
            self.getLambda_3(),
            # self.getColdStartStrategy(),
            recommender,
            rating_stats_df,
            residual_stats_df,
            user_bias_df,
            item_bias_df
        )
    )
Often you will want to compute a metric over a set of values that share a common characteristic, like the average price of a house in a certain region. To achieve this, you need to group the data by region and compute an aggregate metric on that subgroup of data. We've already seen a couple of these aggregation metrics in the video, applied to landing/prices.csv. We'll inspect a few more now and apply them to ~/workspace/mnt/data_lake/landing/purchased.csv. In particular, you'll use the spark.sql aggregation functions avg() to compute the average value of some column in a group, stddev_samp() to compute the standard (sample) deviation, and max() (which we alias as sfmax so as not to shadow Python's built-in max()) to retrieve the largest value of some column in a group.

Instructions

- Use the .groupBy() method to group the data by the "Country" column.
- In these groups, compute the average of the "Salary" column and name the resulting column "average_salary".
- Compute the standard deviation of the "Salary" column in each group in the same aggregation.
- Retrieve the largest "Salary" in each group, in the same aggregation, and name the resulting column "highest_salary".

from pyspark.sql.functions import col, avg, stddev_samp, max as sfmax

aggregated = (purchased
              # Group rows by 'Country'
              .groupBy(col('Country'))
              .agg(
                  # Calculate the average salary per group
                  avg('Salary').alias('average_salary'),
                  # Calculate the standard deviation per group
                  stddev_samp('Salary'),
                  # Retain the highest salary per group
                  sfmax('Salary').alias('highest_salary')
              )
              )
aggregated.show()
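Note that the un-aliased stddev_samp('Salary') aggregate keeps Spark's generated column name, stddev_samp(Salary); give it an .alias(...) like the other two aggregates if downstream code needs a stable column name.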
def get_builtin_aggregator_column(agg, ctx):
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]

        try:
            input = ctx.populate_values(
                agg["input"], aggregator["input"], preserve_column_refs=False
            )
        except CortexException as e:
            e.wrap("input")
            raise

        if aggregator["name"] == "approx_count_distinct":
            return F.approxCountDistinct(
                input["col"], input.get("rsd")).alias(agg["name"])
        if aggregator["name"] == "avg":
            return F.avg(input).alias(agg["name"])
        if aggregator["name"] in {
            "collect_set_int", "collect_set_float", "collect_set_string"
        }:
            return F.collect_set(input).alias(agg["name"])
        if aggregator["name"] == "count":
            return F.count(input).alias(agg["name"])
        if aggregator["name"] == "count_distinct":
            return F.countDistinct(*input).alias(agg["name"])
        if aggregator["name"] == "covar_pop":
            return F.covar_pop(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "covar_samp":
            return F.covar_samp(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "kurtosis":
            return F.kurtosis(input).alias(agg["name"])
        if aggregator["name"] in {"max_int", "max_float", "max_string"}:
            return F.max(input).alias(agg["name"])
        if aggregator["name"] == "mean":
            return F.mean(input).alias(agg["name"])
        if aggregator["name"] in {"min_int", "min_float", "min_string"}:
            return F.min(input).alias(agg["name"])
        if aggregator["name"] == "skewness":
            return F.skewness(input).alias(agg["name"])
        if aggregator["name"] == "stddev":
            return F.stddev(input).alias(agg["name"])
        if aggregator["name"] == "stddev_pop":
            return F.stddev_pop(input).alias(agg["name"])
        if aggregator["name"] == "stddev_samp":
            return F.stddev_samp(input).alias(agg["name"])
        if aggregator["name"] in {"sum_int", "sum_float"}:
            return F.sum(input).alias(agg["name"])
        if aggregator["name"] in {"sum_distinct_int", "sum_distinct_float"}:
            return F.sumDistinct(input).alias(agg["name"])
        if aggregator["name"] == "var_pop":
            return F.var_pop(input).alias(agg["name"])
        if aggregator["name"] == "var_samp":
            return F.var_samp(input).alias(agg["name"])
        if aggregator["name"] == "variance":
            return F.variance(input).alias(agg["name"])

        raise ValueError("missing builtin aggregator")  # unexpected
    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
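# The if-chain above could be table-driven for the single-column aggregators.
# This is only a hedged refactoring sketch, not part of the original code;
# the helper name is illustrative and covers just the simple cases:
from pyspark.sql import functions as F

SIMPLE_AGGREGATORS = {
    "avg": F.avg, "mean": F.mean, "kurtosis": F.kurtosis,
    "skewness": F.skewness, "stddev": F.stddev, "stddev_pop": F.stddev_pop,
    "stddev_samp": F.stddev_samp, "var_pop": F.var_pop,
    "var_samp": F.var_samp, "variance": F.variance,
}

def simple_aggregator_column(name, input, agg):
    # Returns the aliased aggregate column, or None for non-simple aggregators.
    fn = SIMPLE_AGGREGATORS.get(name)
    return fn(input).alias(agg["name"]) if fn is not None else None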
def getVol(df):
    # get volatility by day
    df_std = df.agg(func.stddev_samp(df.percent))
    df_std = df_std.withColumnRenamed("stddev_samp(percent)", "volatility")
    return df_std
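# Hypothetical usage, assuming an active `spark` session; the return values
# are made up for illustration. Note that aliasing inside agg(), e.g.
# func.stddev_samp(df.percent).alias('volatility'), would avoid relying on
# the generated "stddev_samp(percent)" column name.
returns_df = spark.createDataFrame(
    [(0.012,), (-0.004,), (0.007,), (0.001,)], ['percent'])
getVol(returns_df).show()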
from pyspark.sql.functions import count, sum, avg, expr

df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()


# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
          stddev_pop("Quantity"), stddev_samp("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
          covar_pop("InvoiceNo", "Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
count("Quantity").alias("total_transactions"), sum("Quantity").alias("total_purchases"), avg("Quantity").alias("avg_purchases"), expr("mean(Quantity)").alias("mean_purchases"))\ .selectExpr( "total_purchases/total_transactions", "avg_purchases", "mean_purchases").show() # COMMAND ---------- from pyspark.sql.functions import var_pop, stddev_pop from pyspark.sql.functions import var_samp, stddev_samp df.select(var_pop("Quantity"), var_samp("Quantity"), stddev_pop("Quantity"), stddev_samp("Quantity")).show() # COMMAND ---------- from pyspark.sql.functions import skewness, kurtosis df.select(skewness("Quantity"), kurtosis("Quantity")).show() # COMMAND ---------- from pyspark.sql.functions import corr, covar_pop, covar_samp df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"), covar_pop("InvoiceNo", "Quantity")).show()
    df['column1'] + df['column2'] * df['column3'])
df3.show()

print('--- Aggregations and quick statistics -------')

# the dataframe doesn't have headers, so we need to supply the column names
ADULT_COLUMN_NAMES = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income"
]

# I downloaded the adult.data file from
# https://archive.ics.uci.edu/ml/datasets/adult
# to my data folder and renamed it to adult.data.csv
csv_df = session.read.csv('data/adult.data.csv',
                          header=False,
                          inferSchema=True)

# we'll set the column names one by one in this loop
for new_col, old_col in zip(ADULT_COLUMN_NAMES, csv_df.columns):
    csv_df = csv_df.withColumnRenamed(old_col, new_col)

# quick descriptive statistics
csv_df.describe().show()

# get average work hours per age
work_hours_df = csv_df.groupBy('age').agg(
    funcs.avg('hours_per_week'),
    funcs.stddev_samp('hours_per_week')).sort('age')
work_hours_df.show(100)

print('---- The End :) -----')
encode = OneHotEncoder(inputCol="studentIdx", outputCol="studentclassVec")

# Let's apply the same procedure to the label (target) variable.
# No need to apply one-hot encoding to the label (only string indexing is required)
label_StrIdx = StringIndexer(inputCol="default", outputCol="label")

# Build the first stages for the pipeline
stages = [strIdx, encode, label_StrIdx]

# For numerical variables, let's transform those into standard scaled variables
from pyspark.sql.functions import col, stddev_samp

numCols = ['income', 'balance']
for c in numCols:
    df = df.withColumn(c + "Scaled", col(c) / df.agg(stddev_samp(c)).first()[0])

# Finally, you can define the inputs for the model.
# In this case, the vector of the categorical variables and the scaled
# numerical variables were assigned.
inputs = ["studentclassVec", "incomeScaled", "balanceScaled"]

# As all input features need to be vectorized, the VectorAssembler function has to be used
assembler = VectorAssembler(inputCols=inputs, outputCol="features")

# Add the assembler to the previous stages
stages += [assembler]

# Put the stages together to build the Pipeline
# - The stages consist of string indexer, one-hot encoder, scaler, and vector assembler
pipeline = Pipeline(stages=stages)
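# The manual division above derives each scale from the same DataFrame it
# transforms, so the statistic is not captured as a pipeline stage and cannot
# be reapplied to unseen data. A hedged sketch of the built-in alternative
# (stage/column names here are illustrative; StandardScaler operates on an
# assembled vector rather than on individual columns):
from pyspark.ml.feature import StandardScaler, VectorAssembler

num_assembler = VectorAssembler(inputCols=numCols, outputCol="numFeatures")
num_scaler = StandardScaler(inputCol="numFeatures", outputCol="numScaled",
                            withStd=True, withMean=False)
# These two stages would replace the loop, and the final assembler would then
# take ["studentclassVec", "numScaled"] as its inputs.
stages += [num_assembler, num_scaler]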
def std_dev(data_frame, measure_column_name):
    return data_frame.select(
        FN.stddev_samp(measure_column_name)).collect()[0][0]
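# Each helper call like the one above launches its own Spark job. When
# several statistics are needed, they can be batched into one aggregation.
# A sketch, assuming a DataFrame named data_frame; the column names below
# are hypothetical:
cols = ['price', 'quantity']
stats_row = data_frame.agg(
    *[FN.stddev_samp(c).alias(c + '_stddev') for c in cols]).first()
# Access as stats_row['price_stddev'], stats_row['quantity_stddev'], ...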
def get_baseline_scores(train_df, val_df, evaluator, eval_name):
    stats_rating_df = (
        train_df
        .agg(
            F.avg('rating').alias('avg_rating'),
            F.stddev_samp('rating').alias('stddev_rating')
        )
    )
    stats_row = stats_rating_df.head()
    print('[plot_scores Train] Avg: {}'.format(stats_row[0]))
    print('[plot_scores Train] Std Dev: {}'.format(stats_row[1]))

    # Naive model: random normal rating centered on average rating and scaled
    # with standard deviation of training data.
    train_predict_df = (
        train_df
        .crossJoin(stats_rating_df)
        .withColumn(
            'prediction',
            F.col('avg_rating') + F.randn() * F.col('stddev_rating')
        )
        .select('user', 'item', 'rating', 'prediction')
    )

    val_predict_df = (
        val_df
        .crossJoin(stats_rating_df)
        .withColumn(
            'prediction',
            F.col('avg_rating') + F.randn() * F.col('stddev_rating')
        )
        .select('user', 'item', 'rating', 'prediction')
    )

    naive_score_train = evaluator.evaluate(train_predict_df)
    naive_score_val = evaluator.evaluate(val_predict_df)
    print('Train Naive {} score: {}'.format(eval_name, naive_score_train))
    print('Validation Naive {} score: {}'.format(eval_name, naive_score_val))

    estimator = Recommender(
        lambda_1=0.0,
        lambda_2=0.0,
        lambda_3=0.0,
        useALS=False,
        useBias=True,
        userCol='user',
        itemCol='item',
        ratingCol='rating'
    )

    model = estimator.fit(train_df)
    baseline_score_train = evaluator.evaluate(model.transform(train_df))
    baseline_score_val = evaluator.evaluate(model.transform(val_df))
    print('Train Baseline {} score: {}'.format(eval_name, baseline_score_train))
    print('Validation Baseline {} score: {}'.format(eval_name, baseline_score_val))

    return (
        naive_score_train,
        naive_score_val,
        baseline_score_train,
        baseline_score_val
    )
from pyspark.sql.functions import approx_count_distinct, avg, collect_list, \
    collect_set, count, countDistinct, first, kurtosis, last, max, mean, min, \
    skewness, stddev, stddev_pop, stddev_samp, sum, sumDistinct, var_pop, \
    var_samp, variance

df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " +
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))
print("avg: " + str(df.select(avg("salary")).collect()[0][0]))
df.select(collect_list("salary")).show(truncate=False)
df.select(collect_set("salary")).show(truncate=False)
df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))
print("count: " + str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"),
          stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
    .show(truncate=False)
lh = l3.union(l4)

l1.groupBy('Level_1', 'Sex').count().sort(
    'count', ascending=False).show(150, truncate=False)
l2.groupBy('Level_2', 'Sex').count().sort(
    'count', ascending=False).show(150, truncate=False)
lh.groupBy('Level', 'Sex').count().sort(
    'count', ascending=False).show(150, truncate=False)

l1_t = l1.groupBy('Level_1').agg(
    F.mean('Age_1').alias('Mean'),
    F.count('Age_1').alias('Count'),
    F.stddev_samp('Age_1').alias('StdDev'))
l1_t.sort('Count', ascending=False).show(25, truncate=False)

l2_t = l2.groupBy('Level_2').agg(
    F.mean('Age_2').alias('Mean'),
    F.count('Age_2').alias('Count'),
    F.stddev_samp('Age_2').alias('StdDev'))
l2_t.sort('Count', ascending=False).show(25, truncate=False)

lh_t = lh.groupBy('Level').agg(
    F.mean('Age').alias('Mean'),
    F.count('Age').alias('Count'),
    F.stddev_samp('Age').alias('StdDev'))
lh_t.sort('Count', ascending=False).show(25, truncate=False)

aRules = fpm.associationRules
associationRules = fpm.associationRules
freqItemsets = fpm.freqItemsets