def encode_using_indexer(df, column_name):
    '''
    Replace the categorical column column_name of df with its StringIndexer code.

    Each distinct category is mapped to a float index (0.0, 1.0, ...). The
    indexer is created without an explicit ordering option, so Spark's default
    applies (most frequent label gets 0.0) rather than order of appearance.
    ie. values {a,b,b,c,d} => b becomes 0.0, the others 1.0..3.0 (Good for Binary)

    handleInvalid="skip" is passed, so rows the indexer considers invalid are
    dropped from the result instead of raising an error.

    Returns a new DataFrame in which column_name holds the index values; the
    original string column is removed.
    '''
    temp_col = 'index_' + column_name
    indexer = StringIndexer(inputCol=column_name, outputCol=temp_col,
                            handleInvalid="skip")
    indexed = indexer.fit(df).transform(df)
    # Swap the encoded column in under the original name.
    return indexed.drop(column_name).withColumnRenamed(temp_col, column_name)
def encode_using_one_hot(df, column_name):
    '''
    Replace the categorical column column_name of df with a one-hot vector.

    The column is first string-indexed (handleInvalid="skip", so rows the
    indexer considers invalid are dropped) and the index is then fed to a
    OneHotEncoderEstimator.
    ie. values {a,b,b,c,d} => {<1.0,0,0>, <0,1.0,0>, ... } (Good for non-ordinal categories)

    Returns a new DataFrame in which column_name holds the encoded vector; the
    original string column and the intermediate index column are removed.
    '''
    temp_index_col = 'index_' + column_name
    temp_vector_col = 'vec_' + column_name

    # Step 1: category string -> numeric index.
    indexer = StringIndexer(inputCol=column_name, outputCol=temp_index_col,
                            handleInvalid="skip")
    indexed = indexer.fit(df).transform(df)

    # Step 2: numeric index -> one-hot vector.
    one_hot = OneHotEncoderEstimator(
        inputCols=[temp_index_col], outputCols=[temp_vector_col])
    encoded = one_hot.fit(indexed).transform(indexed)

    # Step 3: discard the intermediates and put the vector under the original name.
    cleaned = encoded.drop(temp_index_col).drop(column_name)
    return cleaned.withColumnRenamed(temp_vector_col, column_name)
#Check schema and first rows
delivery.printSchema() #Schema is ok
# Limit on the Spark side first so only 5 rows are collected to the driver
# (toPandas() on the full DataFrame would materialise every row just to preview 5).
delivery.limit(5).toPandas()

#Find missings
# One null count per column. select() is lazy, so .show() is required to
# actually run the aggregation and print the counts.
# NOTE(review): `sum` and `col` are expected to be the pyspark.sql.functions
# versions (the builtin sum cannot aggregate a Column) - confirm the imports.
delivery.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in delivery.columns)).show() #Found 780 missings in the DeliveryClass column

#Treating missing values
# Rows without a DeliveryClass are dropped rather than imputed.
delivery = delivery.where(col("DeliveryClass").isNotNull())

#Encoding string columns in "Delivery"
# Each indexer adds a new "<column>_index" column; the original string columns are kept.
delivery = StringIndexer(inputCol="DeliveryClass", outputCol="DeliveryClass_index").fit(delivery).transform(delivery)
delivery = StringIndexer(inputCol="DeliveryTypeName", outputCol="DeliveryTypeName_index").fit(delivery).transform(delivery)

#Renaming the SubscriptionID column for future joins
delivery = delivery.withColumnRenamed("SubscriptionID","sID_Delivery")
#FORMULA
#Check schema and first rows
formula.printSchema() #Schema is ok
# Limit on the Spark side first so only 5 rows are collected to the driver
# (toPandas() on the full DataFrame would materialise every row just to preview 5).
formula.limit(5).toPandas()

#Find missings
# One null count per column. select() is lazy, so .show() is required to
# actually run the aggregation and print the counts.
# NOTE(review): `sum` and `col` are expected to be the pyspark.sql.functions
# versions (the builtin sum cannot aggregate a Column) - confirm the imports.
formula.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in formula.columns)).show() #No missings

#Renaming the FormulaID column for future joins
formula = formula.withColumnRenamed("FormulaID","fID_Formula")
#SUBSCRIPTIONS
#Find missings
# One null count per column. select() is lazy, so .show() is required to
# actually run the aggregation and print the counts.
# NOTE(review): `sum` and `col` are expected to be the pyspark.sql.functions
# versions (the builtin sum cannot aggregate a Column) - confirm the imports.
subscriptions.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in subscriptions.columns)).show() #Found 161 missings in the Price/Discount columns and 20 in the NbrMeans_EXCEP column

#Treating missing values
# Example no. 4
# 0
print('\n\n\nString indexation for categorical variables.. \n')

# Numerical input variables for the neural network.
float_x_vars = [
    "loan_amnt",
    "int_rate",
    "annual_inc",
    "dti",
    "revol_util",
    "installment",
    "inst2inc",
]

# Categorical input variables; each one is string-indexed below.
cat_x_vars = ["term", "grade", "home_ownership", "pred_KM", "emp_length"]

# Replace every categorical column, in place, by its StringIndexer code:
# index into a temporary "<name>Idx" column, drop the original, rename back.
for cat_var in cat_x_vars:
    idx_col = cat_var + 'Idx'
    indexer = StringIndexer(inputCol=cat_var, outputCol=idx_col)
    df = indexer.fit(df).transform(df)
    df = df.drop(cat_var).withColumnRenamed(idx_col, cat_var)

# Quick check, if needed: df.select(cat_x_vars).show(5)

# Target (y) variables for the neural networks.

# Default indicator: 1.0 when loan_status == 1 (default), 0.0 otherwise (repaid).
is_default = df['loan_status'] == 1
df = df.withColumn('probDef', F.when(is_default, 1.0).otherwise(0.0))

# Early-repayment indicator: 1.0 when loan_status == 0 and fracNumPmts < 1.
is_early_repayment = (df['loan_status'] == 0) & (df['fracNumPmts'] < 1)
df = df.withColumn('probER', F.when(is_early_repayment, 1.0).otherwise(0.0))
#indicator for on-schedule repayment can be inferred as probDef=probER=0,0, with