# Columns of the merged table that are cast to integer.
convert_int = [
    "NbrMeals_EXCEP",
    "GrossFormulaPrice",
    "NetFormulaPrice",
    "NbrMealsPrice",
    "ProductDiscount",
    "FormulaDiscount",
    "TotalDiscount",
    "TotalPrice",
    "TotalCredit",
]
for int_column in convert_int:
    table1 = table1.withColumn(int_column, table1[int_column].cast("integer"))

# Changing column types from 'Subscriptions' from string to timestamp
convert_date = ["StartDate", "EndDate", "RenewalDate", "PaymentDate"]
for date_column in convert_date:
    table1 = table1.withColumn(date_column, table1[date_column].cast("timestamp"))

# Encoding string columns in merged table: PaymentStatus gets a numeric
# companion column, PaymentStatus_index.
payment_status_indexer = StringIndexer(
    inputCol="PaymentStatus",
    outputCol="PaymentStatus_index",
)
table1 = payment_status_indexer.fit(table1).transform(table1)

# Creating meaningful time variables: subscription length in days and in
# months (EndDate relative to StartDate), plus the year the subscription started.
table1 = table1.withColumn(
    "DaysSubscription", datediff(col("EndDate"), col("StartDate"))
)
table1 = table1.withColumn(
    "MonthsSubscription", months_between(col("EndDate"), col("StartDate"))
)
table1 = table1.withColumn("Year", year("StartDate"))

# COMMAND ----------

# Feature engineering
# Aggregating variables by CustomerID. The expressions are listed in the order
# the resulting columns should appear; none is aliased, so Spark assigns its
# default column names.
# NOTE(review): `sum`, `min` and `max` are assumed to be the
# pyspark.sql.functions versions brought into scope earlier in the file, not
# the Python builtins -- confirm against the import section.
customer_aggregates = [
    count("SubscriptionID"),
    avg("DaysSubscription"),
    avg("MonthsSubscription"),
    sum("NbrMeals_REG"),
    sum("NbrMeals_EXCEP"),
    min("NbrMealsPrice"),
    max("NbrMealsPrice"),
    avg("NbrMealsPrice"),
    min("ProductDiscount"),
    max("ProductDiscount"),
    sum("ProductDiscount"),
    min("TotalDiscount"),
    max("TotalDiscount"),
    sum("TotalDiscount"),
    min("TotalPrice"),
    max("TotalPrice"),
    sum("TotalPrice"),
    min("TotalCredit"),
    max("TotalCredit"),
    sum("TotalCredit"),
]
subs_totals = table1.groupBy("CustomerID").agg(*customer_aggregates)
# Average minutes on ground at LGA: intercept plus the coefficient at index 4.
# NOTE(review): `regression.regression` looks like an accidentally doubled
# attribute (probably meant `regression.coefficients[4]`); left unchanged
# because `regression` is defined outside this section -- confirm.
avg_ground_lga = inter + regression.regression.coefficients[4]
print(avg_ground_lga)

# RPM buckets
from pyspark.ml.feature import Bucketizer

# Bucketizer's keyword arguments are `splits` and `outputCol`; the misspelled
# `split` / `outpuCol` are rejected when the transformer is constructed.
bucketizer = Bucketizer(
    splits=[3500, 4500, 6000, 6500],
    inputCol='rpm',
    outputCol='rpm_bin',
)

# Apply buckets to the rpm column
cars = bucketizer.transform(cars)

# Show the RPM buckets. The bucketed RPM data lives in `cars`; `bucketed` is
# only assigned further down (from `flights`), so it cannot be used here.
cars.select('rpm', 'rpm_bin').show(5)
cars.groupBy('rpm_bin').count().show()

# Engineering density
cars = cars.withColumn('density_line', cars.mass / cars.length)  # Linear density
cars = cars.withColumn('density_quad', cars.mass / cars.length ** 2)  # Area density
cars = cars.withColumn('density_cube', cars.mass / cars.length ** 3)  # Volume density

# Bucketizer is already imported above; only the encoder is new here.
# NOTE(review): OneHotEncoderEstimator exists only in some Spark releases
# (newer ones expose it as OneHotEncoder) -- confirm the target Spark version.
from pyspark.ml.feature import OneHotEncoderEstimator

# Create buckets at 3 hour intervals through the day (split points 0, 3, ..., 24)
buckets = Bucketizer(
    splits=[3 * x for x in range(9)],
    inputCol='depart',
    outputCol='depart_bucket',
)

# Bucket the departure times
bucketed = buckets.transform(flights)
bucketed.select('depart', 'depart_bucket').show(5)

# Create a one-hot encoder
onehot = OneHotEncoderEstimator(inputCols=['depart_bucket'], outputCols=['depart_dummy'])
## String-encode the categorical variables
cat_x_vars = ["term", "grade", "home_ownership", "pred_KM", "emp_length"]
# df2 = df  # backup in case of trouble

# Each categorical column is replaced in place by its index: fit an indexer
# into a temporary '<name>Idx' column, drop the original column, then rename
# the temporary column back to the original name.
for feature in cat_x_vars:
    indexed_name = feature + 'Idx'
    indexer = StringIndexer(inputCol=feature, outputCol=indexed_name)
    df = indexer.fit(df).transform(df).drop(feature)
    df = df.withColumnRenamed(indexed_name, feature)
# df.select(cat_x_vars).show(5)  # check

## Create y (target) variables for neural networks

# Probability/indicator for default: loan_status 1 means default, 0 means repaid.
defaulted = df['loan_status'] == 1
df = df.withColumn('probDef', F.when(defaulted, 1.0).otherwise(0.0))

# Indicator for early repayment: repaid (loan_status 0) with fracNumPmts below 1.
repaid_early = (df['loan_status'] == 0) & (df['fracNumPmts'] < 1)
df = df.withColumn('probER', F.when(repaid_early, 1.0).otherwise(0.0))

# On-schedule repayment needs no column of its own: it is the remaining case,
# probDef == probER == 0.

# Visual checks (kept disabled):
# Plot of timing of either default or eventual (not early) repayment.
# df.filter((df['loan_status']==1)|(df.fracNumPmts >=1)).select(df.fracNumPmts).toPandas().plot.hist()
# plt.show()
# Author's observation: this is bi-modal, mostly low over 0,1 and then a spike at 1.
# Plot of timing of repayment (whenever it happened).
# df.filter(df['loan_status']==0).select(df.fracNumPmts).toPandas().plot.hist()