async def cluster_by_purchase_behaivor(sbx, spark, user_cluster, date_int, kmeans=3):
    print('fetch data')
    df = await user_cluster.df_sbx_customer_purchase_behaivor(sbx, spark, date_int)
    total = df.count()
    best_score = total * 100  # start with a score larger than any achievable stddev
    best_df = None
    print('Creating clusters')
    clustering_types = [user_cluster.bisecting_means, user_cluster.k_means]
    total_clustering_types = len(clustering_types)
    k = kmeans
    for i in range(total_clustering_types):
        if i == total_clustering_types - 1:
            model = await clustering_types[i](df, k, 1)
        else:
            model = await clustering_types[i](df, k)
        transformed = await user_cluster.run_cluster(model, df)
        # score each clustering by the stddev of distinct customers per cluster;
        # a lower stddev means more evenly sized clusters
        grouped = transformed.groupBy("prediction").agg(
            countDistinct('customer').alias("count"))
        std = grouped.select(_stddev(col('count')).alias('std')).first()
        score = std['std']
        if score < best_score:
            best_score = score
            best_df = transformed
        k = len(model.clusterCenters())
    return best_df.groupBy("prediction").agg(
        collect_list(col("customer")).alias("customers"),
        countDistinct('customer').alias("count")) \
        .sort("count", ascending=False)
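# Sketch of the selection criterion used above (an illustration, not part of the
# original code): the stddev of distinct customers per cluster is the "balance"
# score, and a lower value means the clusters are more evenly sized.
# Assumes a local SparkSession; the column names mirror the function above.
from pyspark.sql import SparkSession
from pyspark.sql.functions import countDistinct, stddev as _stddev, col

spark = SparkSession.builder.master("local[*]").getOrCreate()
assignments = spark.createDataFrame(
    [(0, "c1"), (0, "c2"), (0, "c3"), (1, "c4"), (2, "c5"), (2, "c6")],
    ["prediction", "customer"])
sizes = assignments.groupBy("prediction").agg(
    countDistinct("customer").alias("count"))
balance = sizes.select(_stddev(col("count")).alias("std")).first()["std"]
print(balance)  # smaller values indicate more evenly sized clusters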
def get_mean_and_std(all_harvest_df):
    # https://stackoverflow.com/a/47995478
    df_stats = all_harvest_df.select(
        _mean(col('yield')).alias('mean'),
        _stddev(col('yield')).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    return mean, std
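# Minimal usage sketch for get_mean_and_std (an assumption, not original code):
# a local SparkSession and a small DataFrame with the 'yield' column it expects.
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col

spark = SparkSession.builder.master("local[*]").getOrCreate()
harvest_df = spark.createDataFrame([(1.2,), (1.8,), (2.4,)], ["yield"])
mean_val, std_val = get_mean_and_std(harvest_df)
print(mean_val, std_val)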
def mean_stdv(df):
    unlist = udf(lambda x: round(float(list(x)[0]), 3), DoubleType())
    for i in ["count"]:
        assembler = VectorAssembler(inputCols=[i], outputCol=i + "_Vect")
        scaler = MinMaxScaler(inputCol=i + "_Vect", outputCol=i + "_Scaled")
        pipeline = Pipeline(stages=[assembler, scaler])
        df = pipeline.fit(df).transform(df).withColumn(
            i + "_Scaled", unlist(i + "_Scaled")).drop(i + "_Vect")
    df_stats = df.select(
        _mean(col('count_Scaled')).alias('mean'),
        _stddev(col('count_Scaled')).alias('std')).collect()
    # mean = df_stats[0]['mean']
    # std = df_stats[0]['std']
    return df_stats
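# Minimal usage sketch for mean_stdv (an assumption, not original code): a local
# SparkSession and a DataFrame with the numeric 'count' column the function
# scales and summarizes; the ML/SQL imports used inside mean_stdv are assumed
# to be in scope at module level.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
stats = mean_stdv(spark.createDataFrame([(3,), (7,), (10,)], ["count"]))
print(stats[0]['mean'], stats[0]['std'])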
async def df_sbx_customer_special_box_purchased(self, sbx, spark):
    data = await sbx.with_model('cart_box') \
        .set_page_size(1000) \
        .and_where_is_not_null('purchase') \
        .and_where_is_equal('variety', os.environ['SPECIAL_BOX']).find()
    sc = spark.sparkContext

    def deleteMeta(d):
        # keep only the fields needed for the feature vector
        dt = {}
        dt['customer'] = d['customer']
        dt['total_items'] = d['total_items']
        dt['current_percentage'] = d['current_percentage']
        dt['count'] = 1
        return dt

    dit = list(map(deleteMeta, data['results']))
    tmp = sc.parallelize(dit, numSlices=100)
    df = spark.read.option("multiLine", "true").json(tmp)
    df2 = df.groupBy("customer").agg(
        func.avg("total_items").alias('total_items'),
        func.avg("current_percentage").alias('current_percentage'),
        func.sum("count").alias('count'))
    (cumean, custd, comean, costd, tmean, tstd) = df2.select(
        _mean(col('current_percentage')).alias('cumean'),
        _stddev(col('current_percentage')).alias('custd'),
        _mean(col('count')).alias('comean'),
        _stddev(col('count')).alias('costd'),
        _mean(col('total_items')).alias('tmean'),
        _stddev(col('total_items')).alias('tstd'),
    ).first()
    # z-score each feature before assembling the feature vector
    df3 = df2.withColumn(
        "acurrent_percentage", (col("current_percentage") - cumean) / custd).withColumn(
        "acount", (col("count") - comean) / costd).withColumn(
        "atotal_items", (col("total_items") - tmean) / tstd)
    vecAssembler = VectorAssembler(
        inputCols=["acurrent_percentage", "acount", "atotal_items"],
        outputCol="features")
    return vecAssembler.transform(df3)
def incomeZScore():
    df_stats = dataset.select(
        _mean(col('Average_Income')).alias('mean'),
        _stddev(col('Average_Income')).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    # alias the computed column directly instead of renaming the auto-generated
    # "((Average_Income - <mean>) / <std>)" column name, which breaks if the data changes
    df1 = dataset.select(
        ((dataset['Average_Income'] - mean) / std).alias("z_score_AvgInc"))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = dataset.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    return final_df
def AverageAndStd(time, rdd, streaming_dict, id):
    if rdd.isEmpty():
        return
    # build a DataFrame from the incoming batch of dict records
    df = rdd.map(lambda x: Row(**x)).toDF()
    columns = df.schema.names
    conditions_mean = [
        _mean(col(column)).alias(column + "_mean") for column in columns
    ]
    conditions_std = [
        _stddev(col(column)).alias(column + "_stddev") for column in columns
    ]
    # one row of per-column means and stddevs, tagged with the batch time
    df = df.select(conditions_mean + conditions_std).toPandas()
    df["time_stamp"] = time.timestamp() * 1000
    if id in streaming_dict:
        streaming_dict[id] = streaming_dict[id].append(df, ignore_index=True)
    else:
        streaming_dict[id] = df
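# Hypothetical wiring sketch for AverageAndStd (not from the original code):
# feed one small batch through a StreamingContext via queueStream and let
# foreachRDD pass (batch time, RDD) into the function above. The "sensor_1" id
# is an assumption, and the Row/_mean/_stddev/col imports used by AverageAndStd
# are assumed to be in scope.
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

spark = SparkSession.builder.master("local[2]").getOrCreate()
ssc = StreamingContext(spark.sparkContext, batchDuration=1)
streaming_stats = {}
batch = spark.sparkContext.parallelize([{"temp": 20.5}, {"temp": 21.0}])
ssc.queueStream([batch]).foreachRDD(
    lambda time, rdd: AverageAndStd(time, rdd, streaming_stats, "sensor_1"))
ssc.start()
ssc.awaitTerminationOrTimeout(5)
ssc.stop(stopSparkContext=False)
print(streaming_stats.get("sensor_1"))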
def crimeZScore():
    df2 = unemploymentZScore()
    df_stats = dataset.select(
        _mean(col('Crime_Percent')).alias('mean'),
        _stddev(col('Crime_Percent')).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    df1 = dataset.select(
        ((dataset['Crime_Percent'] - mean) / std).alias("z_score_Crime"))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df2.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    final_df.show()
    return final_df
def unemploymentZScore():
    df2 = incomeZScore()
    df_stats = dataset.select(
        _mean(col('Unemployment_Percent')).alias('mean'),
        _stddev(col('Unemployment_Percent')).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    df1 = dataset.select(
        ((dataset['Unemployment_Percent'] - mean) / std).alias("z_score_Unem"))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df2.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    final_df.show()
    return final_df
def homelessZScore():
    df2 = crimeZScore()
    df_stats = dataset.select(
        _mean(col('Homeless_Percent')).alias('mean'),
        _stddev(col('Homeless_Percent')).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    df1 = dataset.select(
        ((dataset['Homeless_Percent'] - mean) / std).alias("z_score_Homeless"))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df2.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    final_df.show()
    return final_df
def prepareData():
    df_newOpiFac = newOpioidFactor()
    df_newOpiFac.show()
    df_AvgInc = homelessZScore()
    df_stats = df_newOpiFac.select(
        _mean(col('new_opioid_factor')).alias('mean'),
        _stddev(col('new_opioid_factor')).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    df1 = df_newOpiFac.select(
        ((df_newOpiFac['new_opioid_factor'] - mean) / std).alias("z_score_opioid"))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df_AvgInc.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    return final_df
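# Sketch of the z-score pattern repeated in the functions above, written as a
# reusable helper (an illustration, not part of the original pipeline): compute
# a column's mean/stddev and append the standardized column with withColumn,
# avoiding the monotonically_increasing_id join.
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col

def with_zscore(df, column, out_col):
    stats = df.select(_mean(col(column)).alias('mean'),
                      _stddev(col(column)).alias('std')).first()
    return df.withColumn(out_col, (col(column) - stats['mean']) / stats['std'])

# e.g. dataset = with_zscore(dataset, 'Average_Income', 'z_score_AvgInc')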
df.show(5)

# modify the data structure and display the modified schema
df2 = df.withColumn("max_temp", df["max_temp"].cast(FloatType()))

# capture the time when data loading finished
end_load_time = datetime.datetime.now()

# run the task: mean max_temp per year for the selected bounding box since 1970
df2.filter((df2['latitude'] >= 41) & (df2['latitude'] <= 45) &
           (df2['longitude'] >= (-110)) & (df2['longitude'] <= (-104)) &
           (df2['year'] >= 1970)).groupBy("year").mean('max_temp').sort(
               'year').show(50)

# standard deviation of the yearly averages
df2.filter((df2['latitude'] >= 41) & (df2['latitude'] <= 45) &
           (df2['longitude'] >= (-110)) & (df2['longitude'] <= (-104)) &
           (df2['year'] >= 1970)).groupBy("year").mean('max_temp').sort(
               'year').select(_stddev(col('avg(max_temp)')).alias('std')).show(1)

# capture the time when the program/task finished
end_time = datetime.datetime.now()
sc.stop()

# build the result variables
time_load_data = end_load_time - start_time
time_of_execution = end_time - end_load_time
total_time = end_time - start_time

# display the results
print("Time load data:")
print(time_load_data)
print("Time of execution:")
print(time_of_execution)
print("Total time:")
print(total_time)
wikiCategoryFile = "gs://metcs777/wiki-categorylinks.csv.bz2"
wikiCategoryLinks = sc.textFile(wikiCategoryFile)
wikiCats = wikiCategoryLinks.map(lambda x: x.split(",")).map(
    lambda x: (x[0].replace('"', ''), x[1].replace('"', '')))
df = sqlContext.createDataFrame(wikiCats)
df.show()

""" task3.1 """
from pyspark.sql import functions as func
from pyspark.sql import DataFrameStatFunctions as statFunc
from pyspark.sql import functions as F
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col

df_cate = df.groupBy(df[1]).count()
max = df_cate.agg(func.max("count")).show()
avg = df_cate.agg(func.mean("count")).show()
med = F.expr('percentile_approx(count, 0.5)')
median = df_cate.agg(med.alias('med(count)')).show()
std = df_cate.select(_stddev(col('count')).alias('std')).show()

""" task3.2 """
top = df_cate.orderBy("count", ascending=[0]).show(10)

""" task3.3 """
top_cate = df_cate.orderBy("count", ascending=[0]).limit(10)
top_page = top_cate.join(df, df[1] == top_cate[0]).drop("count")
top_id = top_page.groupBy(top_page[1]).count()
print(top_id.select(top_id[0]).show())
bounded_df.filter(bounded_df.INDUS_outliers != 0).select(
    'INDUS_outliers', 'INDUS').show()
bounded_df.filter(bounded_df.CHAS_outliers != 0).select(
    'CHAS_outliers', 'CHAS').show()
bounded_df.filter(bounded_df.NOX_outliers != 0).select('NOX_outliers',
                                                       'NOX').show()
bounded_df.filter(bounded_df.RM_outliers != 0).select('RM_outliers',
                                                      'RM').show()
'''
Spark does not support data visualization natively.
Here the Databricks display method is used to visualize the content.
'''
display(
    bounded_df.select('CRIM_outliers', 'ZN_outliers', 'INDUS_outliers',
                      'CHAS_outliers', 'NOX_outliers', 'RM_outliers',
                      'AGE_outliers', 'DIS_outliers', 'RAD_outliers',
                      'TAX_outliers', 'PTRATIO_outliers', 'B_outliers',
                      'LSTAT_outliers', 'PRICE_outliers'))

# Calculating Z-score
val = df.select(df.INDUS.cast("int"))
df_stats = df.select(
    _mean(df.INDUS.cast("double")).alias('mean'),
    _stddev(df.INDUS.cast("double")).alias('std'),
).collect()

# Add z score_INDUS to the dataframe
mean = df_stats[0]['mean']
std = df_stats[0]['std']
# note the parentheses: subtract the mean before dividing by the stddev
score_INDUS = df.withColumn("z score_INDUS",
                            (df.INDUS.cast("double") - mean) / std)
score_INDUS.show()
# Standardization: transform a feature so that the transformed array of values
# has mean = 0 and standard deviation = 1
def standardize(x, meanVal, stdVal):
    return (x - meanVal) / stdVal


def standardize_udf(meanVal, stdVal):
    return udf(lambda x: standardize(x, meanVal, stdVal), FloatType())


# extract mean and standard deviation for the column 'height_percentage'
df_stats_hp = df.select(
    _mean(col('height_percentage')).alias('mean'),
    _stddev(col('height_percentage')).alias('std')).collect()
mean_hp = df_stats_hp[0]['mean']
std_hp = df_stats_hp[0]['std']

# extract mean and standard deviation for the column 'age'
df_stats_age = df.select(
    _mean(col('age')).alias('mean'),
    _stddev(col('age')).alias('std')).collect()
mean_age = df_stats_age[0]['mean']
std_age = df_stats_age[0]['std']

# perform simple standardization on the 'age' and 'height_percentage' columns
df = df.withColumn('height_percentage',
                   standardize_udf(mean_hp, std_hp)(col('height_percentage')))
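# Likely continuation (an assumption, not shown in the original snippet): apply
# the same standardization to 'age', mirroring the 'height_percentage' call above.
df = df.withColumn('age', standardize_udf(mean_age, std_age)(col('age')))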
# 3. DATA PREPROCESSING
# Drop Name, PassengerID and Ticket (identifiers, not attributes)
df = df.select('Age', 'Fare', 'Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch',
               'Survived', 'Cabin')

# Dealing with missing values
# CABIN: Drop Cabin. 77.1% of its values are null.
df = df.select('Age', 'Fare', 'Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch',
               'Survived')

# AGE: Substitute by mean
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col

df_stats = df.select(
    _mean(col('Age')).alias('mean'),
    _stddev(col('Age')).alias('std')).collect()
mean = df_stats[0]['mean']
std = df_stats[0]['std']
# mean = 29.7, rounded to 30
df = df.fillna(30, subset=['Age'])

# FARE: Substitute by mean
df_stats = df.select(_mean(col('Fare')).alias('mean')).collect()
mean = df_stats[0]['mean']
# mean = 33.2955
df = df.fillna(33.2955, subset=['Fare'])

# EMBARKED: Substitute by the most common class
df.groupBy(df['Embarked']).count().show()
# The most common class is 'S'
df = df.fillna('S', subset=['Embarked'])
def drop_duplicate_cols(self):
    '''
    Needs optimization
    '''
    numerical_col = [
        i_val for (i_val, i_type) in self._data_frame.dtypes
        if i_type == 'int' or i_type == 'double' or i_type == 'float'
    ]
    categorical_col = [
        i_val for (i_val, i_type) in self._data_frame.dtypes
        if i_type == 'string'
    ]
    boolean_col = [
        i_val for (i_val, i_type) in self._data_frame.dtypes
        if i_type == 'boolean'
    ]
    categorical_list1 = []
    categorical_list2 = []
    numerical_list1 = []
    numerical_list2 = []
    boolean_list1 = []
    boolean_list2 = []
    remove_list = []
    # categorical columns: compare value-count profiles, then the full contents
    for i in range(len(categorical_col) - 1):
        for j in range(i + 1, len(categorical_col)):
            if self._data_frame.groupby(
                    self._data_frame[categorical_col[i]]).count().collect(
                    ) == self._data_frame.groupby(
                        categorical_col[j]).count().collect():
                if categorical_col[
                        j] == self._dataframe_context.get_result_column():
                    categorical_list1.append(categorical_col[i])
                    categorical_list2.append(categorical_col[j])
                else:
                    categorical_list1.append(categorical_col[j])
                    categorical_list2.append(categorical_col[i])
    count_dict1 = dict(list(zip(categorical_list1, categorical_list2)))
    elements_list1 = []
    elements_list2 = []
    for k, v in list(count_dict1.items()):
        elements_list1 = self._data_frame.select(k)
        elements_list2 = self._data_frame.select(v)
        if elements_list1.collect() == elements_list2.collect():
            remove_list.append(k)
    # numerical columns: compare standard deviations, then the full contents
    for i in range(len(numerical_col) - 1):
        for j in range(i + 1, len(numerical_col)):
            df_col1_std = self._data_frame.select(
                _stddev(col(numerical_col[i])).alias('std')).collect()[0][0]
            df_col2_std = self._data_frame.select(
                _stddev(col(numerical_col[j])).alias('std')).collect()[0][0]
            if (df_col1_std == df_col2_std):
                if numerical_col[
                        j] == self._dataframe_context.get_result_column():
                    numerical_list1.append(numerical_col[i])
                    numerical_list2.append(numerical_col[j])
                else:
                    numerical_list1.append(numerical_col[j])
                    numerical_list2.append(numerical_col[i])
    count_dict2 = dict(list(zip(numerical_list1, numerical_list2)))
    elements_list3 = []
    elements_list4 = []
    for k, v in list(count_dict2.items()):
        elements_list3 = self._data_frame.select(k)
        elements_list4 = self._data_frame.select(v)
        if elements_list3.collect() == elements_list4.collect():
            remove_list.append(k)
    # boolean columns: same approach as the categorical columns
    for i in range(len(boolean_col) - 1):
        for j in range(i + 1, len(boolean_col)):
            if self._data_frame.groupby(
                    self._data_frame[boolean_col[i]]).count().collect(
                    ) == self._data_frame.groupby(
                        boolean_col[j]).count().collect():
                if boolean_col[
                        j] == self._dataframe_context.get_result_column():
                    boolean_list1.append(boolean_col[i])
                    boolean_list2.append(boolean_col[j])
                else:
                    boolean_list1.append(boolean_col[j])
                    boolean_list2.append(boolean_col[i])
    count_dict3 = dict(list(zip(boolean_list1, boolean_list2)))
    elements_list5 = []
    elements_list6 = []
    for k, v in list(count_dict3.items()):
        elements_list5 = self._data_frame.select(k)
        elements_list6 = self._data_frame.select(v)
        if elements_list5.collect() == elements_list6.collect():
            remove_list.append(k)
    # print(remove_list)
    # return remove_list
    self.removed_col = remove_list
    self._data_frame = self._data_frame.drop(*remove_list)
    return self._data_frame