def encode_using_indexer(df, column_name):
    """Replace a categorical column with its StringIndexer label index.

    A ``StringIndexer`` is fitted on ``df[column_name]`` and applied to the
    same DataFrame; the original column is then dropped and the index column
    takes over its name, so the returned schema keeps the same column names.

    Notes:
        * The index given to each category is decided by ``StringIndexer``.
          Per the PySpark documentation its default ordering is by label
          frequency (most frequent label gets 0.0), not by the order in
          which values are encountered.
        * ``handleInvalid="skip"`` means rows that the indexer treats as
          invalid (e.g. null values in the column, per the PySpark docs)
          are removed from the result rather than raising an error.

    Args:
        df: Spark DataFrame containing ``column_name``.
        column_name: Name of the categorical column to encode.

    Returns:
        A new DataFrame in which ``column_name`` holds the numeric index
        (good for binary categories).
    """
    # Temporary output column; renamed back to ``column_name`` below.
    tmp_col = 'index_' + column_name

    indexer = StringIndexer(
        inputCol=column_name,
        outputCol=tmp_col,
        handleInvalid="skip",
    )
    fitted = indexer.fit(df)

    return (
        fitted.transform(df)
        .drop(column_name)
        .withColumnRenamed(tmp_col, column_name)
    )
def encode_using_one_hot(df, column_name):
    """Replace a categorical column with a one-hot encoded vector column.

    The column is first indexed with ``StringIndexer`` and the resulting
    index is then encoded with ``OneHotEncoderEstimator`` (default
    settings). Both helper columns and the original column are removed, and
    the vector column is renamed to ``column_name``.

    Notes:
        * ``handleInvalid="skip"`` on the indexer means rows it treats as
          invalid (e.g. null values, per the PySpark docs) are removed from
          the result.
        * The exact vector layout (size, which category maps to which
          position) is determined by the two Spark estimators, not by the
          order in which values are encountered.

    Args:
        df: Spark DataFrame containing ``column_name``.
        column_name: Name of the categorical column to encode.

    Returns:
        A new DataFrame in which ``column_name`` holds the one-hot vector
        (good for non-ordinal categories).
    """
    # Temporary column names used only inside this function.
    idx_col = 'index_' + column_name
    vec_col = 'vec_' + column_name

    # Stage 1: category string -> numeric index.
    indexer = StringIndexer(
        inputCol=column_name,
        outputCol=idx_col,
        handleInvalid="skip",
    )
    indexed = indexer.fit(df).transform(df)

    # Stage 2: numeric index -> one-hot vector.
    one_hot = OneHotEncoderEstimator(inputCols=[idx_col], outputCols=[vec_col])
    encoded = one_hot.fit(indexed).transform(indexed)

    # Keep only the vector, under the original column name.
    return (
        encoded
        .drop(idx_col)
        .drop(column_name)
        .withColumnRenamed(vec_col, column_name)
    )
# Check schema and first rows
delivery.printSchema()  # Schema is ok
# Preview only: limit on the Spark side so just 5 rows are collected to the
# driver instead of converting the whole table to pandas first.
delivery.limit(5).toPandas()

# Find missings: one null count per column.
# NOTE(review): `sum` must be the Spark aggregate (pyspark.sql.functions.sum),
# not the Python builtin - presumably imported earlier in the file; verify.
delivery.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in delivery.columns))
# Found 780 missings in the DeliveryClass column

# Treating missing values: drop the rows without a DeliveryClass.
delivery = delivery.where(col("DeliveryClass").isNotNull())

# Encoding string columns in "Delivery" (index columns are added next to the
# original string columns, which are kept).
delivery = StringIndexer(inputCol="DeliveryClass", outputCol="DeliveryClass_index").fit(delivery).transform(delivery)
delivery = StringIndexer(inputCol="DeliveryTypeName", outputCol="DeliveryTypeName_index").fit(delivery).transform(delivery)

# Renaming the SubscriptionID column for future joins
delivery = delivery.withColumnRenamed("SubscriptionID", "sID_Delivery")

# FORMULA
# Check schema and first rows
formula.printSchema()  # Schema is ok
# Same bounded preview as above: collect 5 rows, not the whole table.
formula.limit(5).toPandas()

# Find missings
formula.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in formula.columns))
# No missings

# Renaming the FormulaID column for future joins
formula = formula.withColumnRenamed("FormulaID", "fID_Formula")

# SUBSCRIPTIONS
# Find missings
subscriptions.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in subscriptions.columns))
# Found 161 missings in the Price/Discount columns and 20 in the NbrMeans_EXCEP column

#Treating missing values
print('\n\n\nString indexation for categorical variables.. \n')

# Numerical input variables for the neural network.
float_x_vars = [
    "loan_amnt",
    "int_rate",
    "annual_inc",
    "dti",
    "revol_util",
    "installment",
    "inst2inc",
]

# Categorical input variables; each one is string-indexed in place below.
cat_x_vars = ["term", "grade", "home_ownership", "pred_KM", "emp_length"]

# df2 = df  # backup in case of trouble
for cat_var in cat_x_vars:
    # Index into a temporary '<name>Idx' column, drop the string column,
    # then give the index column the original name.
    indexed_col = cat_var + 'Idx'
    indexer = StringIndexer(inputCol=cat_var, outputCol=indexed_col)
    df = (
        indexer.fit(df)
        .transform(df)
        .drop(cat_var)
        .withColumnRenamed(indexed_col, cat_var)
    )
# df.select(cat_x_vars).show(5)  # check

# Create y (target) variables for the neural networks.

# probDef: indicator for default (loan_status 1 = default, 0 = repaid).
df = df.withColumn(
    'probDef',
    F.when(F.col('loan_status') == 1, 1.0).otherwise(0.0),
)

# probER: indicator for early repayment - repaid (loan_status 0) with
# fracNumPmts below 1.
df = df.withColumn(
    'probER',
    F.when(
        (F.col('loan_status') == 0) & (F.col('fracNumPmts') < 1),
        1.0,
    ).otherwise(0.0),
)
#indicator for on-schedule repayment can be inferred as probDef=probER=0,0, with