#Encoding string columns in merged table
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, datediff, months_between, year, count, avg, sum, min, max

table1 = StringIndexer(inputCol="PaymentStatus", outputCol="PaymentStatus_index").fit(table1).transform(table1)

#Creating meaningful time variables
table1 = table1.withColumn("DaysSubscription", datediff(col("EndDate"), col("StartDate")))
table1 = table1.withColumn("MonthsSubscription", months_between(col("EndDate"), col("StartDate")))
table1 = table1.withColumn("Year", year("StartDate"))

# COMMAND ----------

#Feature engineering
#Aggregating variables by CustomerID
subs_totals = table1.groupBy("CustomerID").agg(
    count("SubscriptionID"), avg("DaysSubscription"), avg("MonthsSubscription"),
    sum("NbrMeals_REG"), sum("NbrMeals_EXCEP"),
    min("NbrMealsPrice"), max("NbrMealsPrice"), avg("NbrMealsPrice"),
    min("ProductDiscount"), max("ProductDiscount"), sum("ProductDiscount"),
    min("TotalDiscount"), max("TotalDiscount"), sum("TotalDiscount"),
    min("TotalPrice"), max("TotalPrice"), sum("TotalPrice"),
    min("TotalCredit"), max("TotalCredit"), sum("TotalCredit"))

#Aggregating variables by product type
subs_products = table1.groupBy("CustomerID").pivot("ProductName").agg(
    sum("NbrMeals_REG"), sum("NbrMeals_EXCEP"), sum("NbrMealsPrice"),
    sum("ProductDiscount"), sum("TotalDiscount"), sum("TotalPrice"),
    sum("TotalCredit")).withColumnRenamed("CustomerID", "cIDProduct")

#Aggregating variables by payment type
subs_payment_type = table1.groupBy("CustomerID").pivot("PaymentType").agg(
    sum("TotalPrice"), sum("TotalCredit")).withColumnRenamed("CustomerID", "cIDPayment")

#Aggregating variables by start year of subscription
subs_year = table1.groupBy("CustomerID").pivot("Year").agg(
    count("SubscriptionID")).withColumnRenamed("CustomerID", "cIDYear")
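# COMMAND ----------

# The pivoted frames above rename their key column (cIDProduct, cIDPayment,
# cIDYear) so it will not clash with CustomerID when joined. A minimal sketch
# of merging the aggregates into one customer-level feature table; the name
# customer_features is an assumption, not from the original notebook.
customer_features = (subs_totals
    .join(subs_products, subs_totals.CustomerID == subs_products.cIDProduct, "left")
    .join(subs_payment_type, subs_totals.CustomerID == subs_payment_type.cIDPayment, "left")
    .join(subs_year, subs_totals.CustomerID == subs_year.cIDYear, "left")
    .drop("cIDProduct", "cIDPayment", "cIDYear"))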
# Average minutes on ground at JFK
avg_ground_jfk = inter + regression.coefficients[3]
print(avg_ground_jfk)

# Average minutes on ground at LGA
avg_ground_lga = inter + regression.coefficients[4]
print(avg_ground_lga)

# RPM buckets
from pyspark.ml.feature import Bucketizer

bucketizer = Bucketizer(splits=[3500, 4500, 6000, 6500],
                        inputCol='rpm',
                        outputCol='rpm_bin')

# Apply buckets to the rpm column
cars = bucketizer.transform(cars)

# Inspect the RPM buckets
cars.select('rpm', 'rpm_bin').show(5)
cars.groupBy('rpm_bin').count().show()

# Engineering density
cars = cars.withColumn('density_line', cars.mass / cars.length)       # Linear density
cars = cars.withColumn('density_quad', cars.mass / cars.length ** 2)  # Area density
cars = cars.withColumn('density_cube', cars.mass / cars.length ** 3)  # Volume density

from pyspark.ml.feature import Bucketizer, OneHotEncoderEstimator

# Create buckets at 3-hour intervals through the day
buckets = Bucketizer(splits=[3 * x for x in range(9)],
                     inputCol='depart',
                     outputCol='depart_bucket')

# Bucket the departure times
bucketed = buckets.transform(flights)
bucketed.select('depart', 'depart_bucket').show(5)
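# OneHotEncoderEstimator is imported above but never applied. A minimal
# sketch of the likely next step, one-hot encoding the departure buckets;
# the output column name 'depart_dummy' is an assumption.
onehot = OneHotEncoderEstimator(inputCols=['depart_bucket'],
                                outputCols=['depart_dummy'])
flights_onehot = onehot.fit(bucketed).transform(bucketed)
flights_onehot.select('depart', 'depart_bucket', 'depart_dummy').show(5)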
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Index the Embarked column, then one-hot encode the indices
df3 = StringIndexer(inputCol='Embarked', outputCol='Embarked1').fit(df3).transform(df3)
df3.show()

df3 = OneHotEncoder(inputCol='Embarked1', outputCol='Embarked2', dropLast=False).transform(df3)
df3.show()

# --------------------------------------------

# Index the Sex column (in this data: male = 0, female = 1)
df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)

df3.groupBy('Embarked').agg({'Embarked': 'count', 'Embarked1': 'sum'}).show()

df3.show(5)
df3.show(10)

df3.schema
df3.printSchema()

# --------------------------------------------
df4.show()
df4.printSchema()
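# A natural follow-on step (an assumption, not in the original notes):
# assemble the indexed and encoded columns into a single feature vector
# for modelling; the column name 'features' is hypothetical.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['Gender', 'Embarked2'], outputCol='features')
df3 = assembler.transform(df3)
df3.select('Sex', 'Gender', 'Embarked', 'Embarked2', 'features').show(5)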