def test_bucketizer(self):
    values = [(0.1,), (0.4,), (1.2,), (1.5,)]
    data = self.spark.createDataFrame(values, ["features"])
    model = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")],
                       inputCol="features", outputCol="buckets")
    feature_count = len(data.select('features').first())
    model_onnx = convert_sparkml(
        model, 'Sparkml Bucketizer',
        [('features', FloatTensorType([1, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.setHandleInvalid("error").transform(data)
    expected = predicted.select("buckets").toPandas().values.astype(numpy.float32)
    data_np = [data.toPandas().values.astype(numpy.float32)]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlBucketizer")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['buckets'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def create_buckets(percentage_of_missing_ctus_per_partyid):
    """
    Divide party ids by percentage of missing ctus into a list of 5 buckets:
        > 0    < 0.25
        > 0.25 < 0.5
        > 0.5  < 0.75
        > 0.75 < 0.99
        > 0.99
    Output:
    +--------+-----------------------+-------+
    |party_id|percentage_missing_ctus|buckets|
    +--------+-----------------------+-------+
    |       1|                    0.2|    0.0|
    |       2|                   0.33|    1.0|
    |       3|                    1.0|    4.0|
    |       4|                   0.75|    3.0|
    |       5|                    0.6|    2.0|
    |       6|                    0.6|    2.0|
    +--------+-----------------------+-------+
    """
    bucketizer = Bucketizer(
        splits=[0, 0.25, 0.5, 0.75, 0.99, float('Inf')],
        inputCol="percentage_missing_ctus", outputCol="buckets")
    df_of_buckets_ratio_between_imputed_distinct_ctus = \
        bucketizer.setHandleInvalid("keep").transform(percentage_of_missing_ctus_per_partyid)
    return df_of_buckets_ratio_between_imputed_distinct_ctus
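# A minimal driver sketch for create_buckets, assuming an active SparkSession named
# `spark` and `from pyspark.ml.feature import Bucketizer` already in scope; the rows
# below are illustrative and chosen to reproduce the docstring's table.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Hypothetical input matching the docstring's schema.
rows = [(1, 0.2), (2, 0.33), (3, 1.0), (4, 0.75), (5, 0.6), (6, 0.6)]
df = spark.createDataFrame(rows, ["party_id", "percentage_missing_ctus"])

# Should reproduce the bucket assignments shown in the docstring above.
create_buckets(df).show()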
def get_column_hist(self, column, bins):
    """return a list of counts corresponding to bins"""
    bins = list(copy.deepcopy(bins))  # take a copy since we are inserting and popping
    if bins[0] == -np.inf or bins[0] == -float("inf"):
        added_min = False
        bins[0] = -float("inf")
    else:
        added_min = True
        bins.insert(0, -float("inf"))

    if bins[-1] == np.inf or bins[-1] == float("inf"):
        added_max = False
        bins[-1] = float("inf")
    else:
        added_max = True
        bins.append(float("inf"))

    temp_column = self.spark_df.select(column).where(col(column).isNotNull())
    bucketizer = Bucketizer(splits=bins, inputCol=column, outputCol="buckets")
    bucketed = bucketizer.setHandleInvalid("skip").transform(temp_column)

    # This is painful to do, but: bucketizer cannot handle values outside of a range
    # (hence adding -/+ infinity above).
    # Further, it *always* follows the numpy convention of lower_bound <= bin < upper_bound
    # for all but the last bin.
    # But, since the last bin in our case will often be +infinity, we need to
    # find the number of values exactly equal to the upper bound to add those.
    # We'll try for an optimization by asking for it at the same time.
    if added_max:
        upper_bound_count = temp_column.select(column).filter(col(column) == bins[-2]).count()
    else:
        upper_bound_count = 0

    hist_rows = bucketed.groupBy("buckets").count().collect()
    # Spark only returns buckets that have nonzero counts.
    hist = [0] * (len(bins) - 1)
    for row in hist_rows:
        hist[int(row["buckets"])] = row["count"]

    hist[-2] += upper_bound_count

    if added_min:
        below_bins = hist.pop(0)
        bins.pop(0)
        if below_bins > 0:
            logger.warning("Discarding histogram values below lowest bin.")

    if added_max:
        above_bins = hist.pop(-1)
        bins.pop(-1)
        if above_bins > 0:
            logger.warning("Discarding histogram values above highest bin.")

    return hist
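# The long comment above is the crux of get_column_hist: Spark's Bucketizer treats every
# bucket except the last as half-open (lower_bound <= x < upper_bound), so once +infinity
# is appended the caller's original top edge falls into the synthetic overflow bucket
# instead of the last real bin. A minimal sketch of that behavior, assuming a local
# SparkSession; the column name and values are illustrative only.
from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer

spark = SparkSession.builder.getOrCreate()

# Hypothetical column whose largest value equals the caller's top bin edge (10).
df = spark.createDataFrame([(1.0,), (5.0,), (10.0,)], ["x"])

# Caller's bins [0, 5, 10] extended with -/+ infinity, as get_column_hist does.
splits = [-float("inf"), 0.0, 5.0, 10.0, float("inf")]
bucketed = (Bucketizer(splits=splits, inputCol="x", outputCol="buckets")
            .setHandleInvalid("skip").transform(df))

# 10.0 lands in the [10, inf) overflow bucket (index 3), not in [5, 10) (index 2);
# get_column_hist counts such values separately and adds them back to the last real bin.
bucketed.show()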
def model_train(zipcode, complaint, day):
    print("Loading Data ...")
    data311 = spark.read.format("csv").option("header", "true").load("Data_Final/*.csv")
    infer_schema = "true"
    first_row_is_header = "true"
    delimiter = ","
    data311.registerTempTable("data311")
    data311 = data311.withColumn("ResTimeH", data311.Resolution_Time_Hours.cast('int'))
    data311 = data311.withColumn('day_of_week', dayofweek(data311['Created Date']))
    data311 = data311.withColumn("Zip", data311["Incident Zip"].cast('int'))
    data311 = data311.filter(data311.ResTimeH > 0)
    data311 = data311.filter(data311.ResTimeH < 99)
    bucketizer = Bucketizer(splits=[0, 2, 6, float('Inf')],
                            inputCol="ResTimeH", outputCol="categories")
    data311 = bucketizer.setHandleInvalid("keep").transform(data311)
    X = data311['Zip', 'Complaint_Type_Groups', 'day_of_week', 'categories']
    X = X.filter(X["Zip"].isNotNull())
    X = X.filter(X["Complaint_Type_Groups"].isNotNull())
    X = X.filter(X["day_of_week"].isNotNull())
    stage_1 = StringIndexer(inputCol="Complaint_Type_Groups", outputCol="categoryIndex")
    stage_2 = OneHotEncoderEstimator(inputCols=["categoryIndex"], outputCols=["categoryVec"])
    stage_3 = VectorAssembler(inputCols=['Zip', 'day_of_week', 'categoryVec'],
                              outputCol="features")
    stage_4 = StandardScaler().setInputCol("features").setOutputCol("Scaled_ip_features")
    stage_5 = LogisticRegression(labelCol="categories", featuresCol="Scaled_ip_features")
    # set up the pipeline
    pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, stage_4, stage_5])
    # fit the pipeline model and transform the data as defined
    pipeline_model = pipeline.fit(X)
    zipcode = int(zipcode)
    day = int(day)
    input_variables = pd.DataFrame(
        [[zipcode, complaint, day]],
        columns=['Zip', 'Complaint_Type_Groups', 'day_of_week'])
    input_variables = spark.createDataFrame(input_variables)
    transformed = pipeline_model.transform(input_variables)
    ans = transformed.select(collect_list('prediction')).first()[0]
    if ans[0] == 0.0:
        prediction = "Your complaint will be resolved within 2 hours."
    elif ans[0] == 1.0:
        prediction = "Your complaint will be resolved within 2-6 hours."
    else:
        prediction = "Your complaint will be resolved after 6 hours."
    return prediction
def age_recoder(spark_df, age_col):
    """
    Recode an age column into three alternative bucketed groupings.

    :param spark_df: input Spark DataFrame
    :param age_col: name of the numeric age column to bucketize
    :return: DataFrame with added bucket columns age1, age7 and age8
    """
    age1 = Bucketizer(splits=[0, 2, 6, 12, 18, 25, 35, 45, 55, 65, 150],
                      inputCol=age_col, outputCol="age1")
    age7 = Bucketizer(splits=[0, 18, 25, 35, 45, 55, 65, 150],
                      inputCol=age_col, outputCol="age7")
    age8 = Bucketizer(splits=[0, 12, 18, 25, 35, 45, 55, 65, 150],
                      inputCol=age_col, outputCol="age8")
    sdf_1 = age1.setHandleInvalid("keep").transform(spark_df)
    sdf_2 = age7.setHandleInvalid("keep").transform(sdf_1)
    res_sdf = age8.setHandleInvalid("keep").transform(sdf_2)
    return res_sdf
def _bucketize_age_column(
        self, dataframe: DataFrame, input_col: str,
        output_col: str) -> Tuple[DataFrame, int, List[str]]:
    bucketizer = Bucketizer(splits=self.age_groups,
                            inputCol=input_col, outputCol=output_col)
    output = bucketizer.setHandleInvalid("keep").transform(dataframe)
    splits = [s for s in bucketizer.getSplits()]
    mapping = [
        "[{}, {})".format(splits[i], splits[i + 1])
        for i in range(len(splits) - 1)
    ]
    n_age_groups = len(mapping)
    return output, n_age_groups, mapping
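# For reference, a small standalone sketch of the labels _bucketize_age_column returns;
# the splits below are hypothetical stand-ins for self.age_groups, which the snippet
# above does not show.
splits = [0.0, 18.0, 35.0, 65.0, float("inf")]

# Same label construction as in _bucketize_age_column.
mapping = ["[{}, {})".format(splits[i], splits[i + 1]) for i in range(len(splits) - 1)]
print(mapping)       # ['[0.0, 18.0)', '[18.0, 35.0)', '[35.0, 65.0)', '[65.0, inf)']
print(len(mapping))  # n_age_groups == 4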
def add_age_id(spark, df, logger):
    """Calculate the age_id by splitting the visitor age into buckets"""
    agebucketizer = Bucketizer(
        splits=[float('-Inf'), 0, 2, 11, 16, 21, 26, 36, 46, 56, 66, float('Inf')],
        inputCol="i94bir", outputCol="agebuckets")
    agebuck_df = agebucketizer.setHandleInvalid("keep").transform(df)
    age_id_df = agebuck_df.withColumn(
        "age_id",
        when(col("i94bir") == -1, 999)
        .otherwise(col("agebuckets").cast(IntegerType())))
    logger.info("Added age_id")
    age_id_df.persist()
    return age_id_df
def add_duration_id(spark, df, logger):
    """Calculate the visitduration_id by splitting the visit duration into buckets"""
    durdays_df = df.withColumn("duration_days", datediff("depdate", "arrdate"))
    ddbucketizer = Bucketizer(
        splits=[float('-Inf'), 0, 4, 8, 11, 15, 22, 29, float('Inf')],
        inputCol="duration_days", outputCol="ddbuckets")
    ddbuck_df = ddbucketizer.setHandleInvalid("keep").transform(durdays_df)
    dur_id_df = ddbuck_df.withColumn(
        "visitduration_id",
        when(isnull(col("arrdate")) | isnull(col("depdate")), 999)
        .otherwise(col("ddbuckets").cast(IntegerType())))
    logger.info("Added duration_id")
    return dur_id_df
df_rb_short = df_rb.drop('helpful').drop('reviewerName').drop('unixReviewTime')
df_join = df_rb_short.join(df_mb_short, df_rb_short.asin == df_mb_short.asin,
                           'left').drop(df_mb_short.asin)
cols_todrop = ['reviewText', 'summary', 'description', 'imUrl', 'title']
df_join_short = df_join.drop(*cols_todrop)
df_join_short = df_join_short.withColumn('reviewYear', F.year('reviewTime'))

max_rank = df_join_short.agg({
    'salesRank_Books': 'max'
}).collect()[0]["max(salesRank_Books)"]
bucketizer = Bucketizer(
    splits=[i * 5000 for i in list(range(int(max_rank / 5000) + 2))],
    inputCol="salesRank_Books", outputCol="salesRank_Books_buckets")
df_join_short = bucketizer.setHandleInvalid("keep").transform(df_join_short)
df_join_short = df_join_short.withColumn(
    'salesRank_ranged', df_join_short.salesRank_Books_buckets * 5000)
df_join_short = df_join_short.drop('salesRank_Books_buckets')

# Visualizations
# viz1: distribution of ratings
viz1_df = df_join_short.groupBy('overall').count().toPandas()
plt.bar(viz1_df['overall'], viz1_df['count'])
plt.xlabel('Rating')
plt.ylabel('Count')
plt.title('Rating Distribution')
plt.show()

# viz2
viz2_df = df_join_short.groupBy('reviewYear').agg(
data.describe('age').show()
data_group_month = data.groupBy('month').count()
data_group_weekday = data.groupBy('weekday').count()
data.createOrReplaceTempView('datas')
path = spark.sql(
    "select dpstation, rcstation, count(*) as num from datas group by dpstation, rcstation order by num desc"
)

# Grouped statistics should be computed from the passenger table
# age group
from pyspark.ml.feature import Bucketizer

splits = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
bucketizer = Bucketizer(splits=splits, inputCol='age', outputCol='agegroup')
cust_group_age = bucketizer.setHandleInvalid('keep').transform(cust)
cust_group_age = cust_group_age.groupBy('agegroup').count()
cust_group_age.coalesce(1).write.format('csv').option('header', 'true').save('custage')

# sex
cust_group_sex = cust.groupBy('sex').count()
cust_group_sex.coalesce(1).write.format('csv').option('header', 'true').save('custsex')

# freq
data = data.withColumn('year', lit('2020'))
data = data.withColumn(
    'dpdate', concat_ws('-', data['year'], data['month'], data['day']))
data = data.withColumn('dpdate', to_date(data['dpdate']))
data.createOrReplaceTempView('datas')
df_demo = df_demo.withColumn('islamic', islamic_udf(df_demo.DEMO_CAPTUREDATE))
df_demo = df_demo.withColumn('islamic_month', get_month_udf(df_demo.islamic))
# df_demo = df_demo.withColumn('day_week', day_of_week_udf(df_demo.DEMO_CAPTUREDATE))
df_demo = df_demo.withColumn('age', age_udf(df_demo.birth_Year))
df_demo = df_demo.withColumn('location', location_udf(df_demo.cityTel))

bucketizer = Bucketizer(splits=[0, 10, 20, 30, 40, 60, float('Inf')],
                        inputCol="age", outputCol="age_Category")
df_cat = bucketizer.setHandleInvalid("keep").transform(df_demo)
# df_cat = df_cat.withColumn("age_Category", df_cat["age_Category"].cast("integer"))  # cast the bucket index to integer
# df_cat = df_cat.withColumn("age_Category", df_cat["age_Category"].cast("string"))   # cast the bucket index to string

df = df_cat.filter(df_cat.islamic_month == '09')
dfCount = df.groupBy('ID', 'DEMO_HOUR').count().select(
    'ID', 'DEMO_HOUR', f.col('count').alias('count'))
dfMax = dfCount.groupBy('ID').agg(
    f.last("count").alias("maxCount"),
    f.last("DEMO_HOUR").alias("interest_HOUR"))
qc2 = sqlContext.sql("select StateCodes, TotalC2014, 2014_GDP_per_Capita, t1.State from Consumption as t1 join income as t2 where t1.State = t2.State")
qc2.registerTempTable("incomeConsumption")
qc2.coalesce(1).write.format('com.databricks.spark.csv').save('incomeConsumption', header='true')

# join the incomeConsumption with homelessness
qc3 = sqlContext.sql("select TotalC2014, 2014_GDP_per_Capita, Count, t1.StateCodes from incomeConsumption as t1 join states as t2 where t1.StateCodes = t2.StateCodes")
qc3.registerTempTable("homelessconsumption")
qc3.coalesce(1).write.format('com.databricks.spark.csv').save('consumptionHomeless', header='true')

# binning consumption to see if more homeless people are in high consumption areas
qc4 = sqlContext.sql("select TotalC2014, Count, StateCodes from homelessconsumption order by TotalC2014")
crdd = qc4.rdd.map(tuple)
crdd = crdd.map(lambda x: (int(x[0]), x[1], x[2]))
consumption_df = spark.createDataFrame(crdd, ["TotalC2014", "Count", "StateCodes"])

bucketizer = Bucketizer(splits=[0, 500000, 2000000, float('Inf')],
                        inputCol="TotalC2014", outputCol="buckets")
consumption_df_buck = bucketizer.setHandleInvalid("keep").transform(consumption_df)

t = {0.0: "low Consumption", 1.0: "Medium Consumption", 2.0: "High Consumption"}
udf_foo = udf(lambda x: t[x], StringType())
consumption_df_buck.withColumn("consumption_bucket", udf_foo("buckets")).show()
c_df_bucket = consumption_df_buck.withColumn("consumption_bucket", udf_foo("buckets"))
c_df_bucket.registerTempTable("consumptionHomelessBuck")

qc5 = sqlContext.sql("select consumption_bucket, Count, buckets from consumptionHomelessBuck order by consumption_bucket")
qc5.registerTempTable("binnedC")
qc5.coalesce(1).write.format('com.databricks.spark.csv').save('binnedconsumptionHomeless', header='true')

# analysis for average
qc6 = sqlContext.sql("select consumption_bucket, Count from binnedC")
crdd2 = qc6.rdd.map(tuple)
crdd2 = crdd2.mapValues(lambda x: int(x)).reduceByKey(lambda x, y: x + y)
mapped_binnedc_df = sqlc.createDataFrame(crdd2, ['consumption_bucket', 'Count'])
mapped_binnedc_df.registerTempTable("mappedbinnedc")
# calculate outliers: 3 sigma (mean + 3 * standard deviation)
# df.select(
#     [F.mean(c).alias(c + '_mean') for c in outlier_cols] +
#     [F.stddev(c).alias(c + '_std') for c in outlier_cols]
# ).collect()
df.select(
    [(F.mean(c) + 3 * F.stddev(c)).alias(c + '_ceil') for c in outlier_cols]
).show()

# label the data according to the outlier condition
df.select(*bounds, *[
    F.when(~F.col(c).between(bounds[c]['min'], bounds[c]['max']), "yes")
     .otherwise("no").alias(c + '_outlier')
    for c in bounds
])

# cut feature into bins: bucketizer, buckets, discretization
from pyspark.ml.feature import Bucketizer

bucketizer = Bucketizer(splits=[0, 6, 18, 60, float('Inf')],
                        inputCol="ages", outputCol="buckets")
df_buck = bucketizer.setHandleInvalid("keep").transform(df)

# bins, method 2
from functools import reduce

splits = [0, 5, 9, 10, 11]
splits = list(enumerate(splits))  # [(0, 0), (1, 5), (2, 9), (3, 10), (4, 11)]
bins = reduce(
    lambda c, i: c.when(F.col('Age') <= i[1], i[0]),
    splits,
    F.when(F.col('Age') < splits[0][0], None)
).otherwise(splits[-1][0] + 1).alias('bins')
df = df.select('age', bins)

# cluster-based feature processing: group a feature based on the structure of the data itself
# (remove outliers first, then cluster)
from pyspark.ml.evaluation import ClusteringEvaluator

data_len = new_df.count()
print("Number of rows: {}".format(data_len))
# num_tweets_each_hour.show()
# num_tweets_each_hour_list = list(map(lambda row: row.asDict(), num_tweets_each_hour.collect()))

# Now check sentiment-wise tweet count
logging.info(
    f"Calculating total sentiment wise tweets by each user at hour level...")

# First flatten the dataframe
df_flattened = flatten_df(df2)

# First, bucket the sentiment scores
bucketizer = Bucketizer(splits=[-1, -0.25, 0.25, 1],
                        inputCol="sentiment_score",
                        outputCol="sentiment_score_integer")
df_bucketized = bucketizer.setHandleInvalid("keep").transform(df_flattened)

sentiment_labels = ["negative", "neutral", "positive"]
label_array = array(*(lit(label) for label in sentiment_labels))
df_bucketized_with_sentiment_label = df_bucketized.withColumn(
    "sentiment_score_label",
    label_array.getItem(col("sentiment_score_integer").cast("integer")))
# df_bucketized_with_sentiment_label.toPandas()

num_tweets_each_hour_sentimental = df_bucketized_with_sentiment_label.groupBy("username", "author_id", window("created_at", "1 hour"), "sentiment_score_label") \
    .count() \
    .select(col("username").alias("username_b"),
            col("author_id").alias("author_id_b"),
            col("window").alias("window_b"),
############################
# Deriving/Transforming Variables with Bucketizer
############################
from pyspark.ml.feature import Bucketizer

spark_df.select('age').describe().toPandas().transpose()
"""
              0        1                   2    3    4
summary   count     mean              stddev  min  max
age       10000  38.9218  10.487806451704587   18   92
"""

bucketizer = Bucketizer(splits=[0, 35, 45, 65], inputCol="age", outputCol="age_cat")
spark_df = bucketizer.setHandleInvalid("keep").transform(spark_df)
spark_df.show(20)

spark_df = spark_df.withColumn('age_cat', spark_df.age_cat + 1)
spark_df.groupby("age_cat").count().show()
spark_df.groupby("age_cat").agg({'geography': "mean"}).show()
spark_df = spark_df.withColumn("age_cat", spark_df["age_cat"].cast("integer"))
# Out[89]: DataFrame[rownumber: int, customerid: int, surname: string, creditscore: int, geography: string, gender: string, age: int, tenure: int, balance: double, numofproducts: int, hascrcard: int, isactivemember: int, estimatedsalary: double, exited: int, age_cat: int]

############################
# Deriving a Variable with when (segment)
############################
spark_df = spark_df.withColumn('segment', when(spark_df['tenure'] < 5, "segment_b").otherwise("segment_a"))