Python StringIndexer.withColumnRenamed示例

编程语言: Python

命名空间/包名称: pyspark.ml.feature

类/类型: StringIndexer

方法/功能: withColumnRenamed

hotexamples.com的示例: 4

Python StringIndexer.withColumnRenamed - 共找到4个示例。以下是从开源项目中提取的、评分最高的pyspark.ml.feature.StringIndexer.withColumnRenamed真实Python代码示例。您可以对示例进行评价，以帮助我们提高示例质量。

常用方法

显示隐藏

StringIndexer(30)

fit(30)

transform(30)

getOutputCol(22)

show(19)

select(15)

setHandleInvalid(14)

write(10)

drop(9)

randomSplit(8)

toPandas(4)

withColumnRenamed(4)

getInputCol(3)

withColumn(3)

groupBy(3)

where(3)

printSchema(3)

save(2)

setInputCol(2)

count(2)

take(1)

describe(1)

setOutputCol(1)

filter(1)

dropna(1)

fitAsync(1)

orderBy(1)

_call_java(1)

labels(1)

groupby(1)

getOutputCols(1)

fillna(1)

load(1)

示例#1

显示文件

文件： step2_feature_engineering.py 项目： ribal-aladeeb/big-data

def encode_using_indexer(df, column_name):
    """Replace a categorical column with its StringIndexer label index.

    A ``StringIndexer`` is fitted on ``df`` and applied to it, writing the
    numeric index into a temporary ``index_<column_name>`` column. The
    original column is then dropped and the temporary column is renamed to
    ``column_name``, so callers keep addressing the data by the same name.

    Which index each category receives is decided by the fitted model
    (Spark's documented default orders labels by descending frequency, so
    the most common label gets 0.0). For a column holding {a,b,b,c,d} the
    result is one double per row, with both ``b`` rows sharing one index.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Name of the column to encode.

    Returns:
        The transformed DataFrame, with ``column_name`` now holding the
        index values. Rows the indexer considers invalid are skipped
        because of ``handleInvalid="skip"``.
    """
    tmp_col = 'index_' + column_name

    indexer = StringIndexer(
        inputCol=column_name,
        outputCol=tmp_col,
        handleInvalid="skip",
    )
    fitted = indexer.fit(df)
    encoded = fitted.transform(df)

    # Swap the raw column for its encoded counterpart under the same name.
    return encoded.drop(column_name).withColumnRenamed(tmp_col, column_name)

示例#2

显示文件

文件： step2_feature_engineering.py 项目： ribal-aladeeb/big-data

def encode_using_one_hot(df, column_name):
    """Replace a categorical column with a one-hot encoded vector column.

    The column is first turned into a numeric label index with a
    ``StringIndexer`` (temporary ``index_<column_name>`` column), which is
    then fed to a ``OneHotEncoderEstimator`` producing a temporary
    ``vec_<column_name>`` column. Both the index column and the original
    column are dropped, and the vector column is renamed to ``column_name``.

    This suits non-ordinal categories, where a plain numeric index would
    suggest an ordering that does not exist.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Name of the column to encode.

    Returns:
        The transformed DataFrame, with ``column_name`` now holding the
        encoder's output vectors. Rows the indexer considers invalid are
        skipped because of ``handleInvalid="skip"``.
    """
    idx_col = 'index_' + column_name
    vec_col = 'vec_' + column_name

    indexer = StringIndexer(
        inputCol=column_name,
        outputCol=idx_col,
        handleInvalid="skip",
    )
    indexed = indexer.fit(df).transform(df)

    # NOTE(review): OneHotEncoderEstimator was renamed to OneHotEncoder in
    # Spark 3.0 - confirm the Spark version this file targets.
    one_hot = OneHotEncoderEstimator(
        inputCols=[idx_col],
        outputCols=[vec_col],
    )
    vectorized = one_hot.fit(indexed).transform(indexed)

    # Remove the helper and raw columns, then expose the vector under the
    # original column name.
    cleaned = vectorized.drop(idx_col).drop(column_name)
    return cleaned.withColumnRenamed(vec_col, column_name)

示例#3

显示文件

文件： Databricks_Churn Prediction.py 项目： Betsy-Varghese/Predictive-Modeling-Python

# DELIVERY
# Print the schema and preview the first rows of `delivery`.
delivery.printSchema() # Author's note: schema is ok
# NOTE(review): toPandas() materializes the whole DataFrame on the driver
# before head(5) trims it; limit(5).toPandas() may be cheaper - confirm size.
delivery.toPandas().head(5)

# Build a per-column count of null values: each column is mapped to a 0/1
# null flag, summed, and aliased back to the column's own name. The resulting
# DataFrame is neither assigned nor shown on this line.
# NOTE(review): `sum` is presumably pyspark.sql.functions.sum rather than the
# builtin - confirm against the file's imports.
delivery.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in delivery.columns)) # Author's note: 780 missing values found in the DeliveryClass column

# Handle missing values: keep only rows where DeliveryClass is not null.
delivery = delivery.where(col("DeliveryClass").isNotNull())

# Encode the string columns of `delivery`: each StringIndexer is fitted on the
# current frame and adds a "<column>_index" column; the source columns are kept.
delivery = StringIndexer(inputCol="DeliveryClass", outputCol="DeliveryClass_index").fit(delivery).transform(delivery)
delivery = StringIndexer(inputCol="DeliveryTypeName", outputCol="DeliveryTypeName_index").fit(delivery).transform(delivery)

# Rename the SubscriptionID column for use in later joins.
delivery = delivery.withColumnRenamed("SubscriptionID","sID_Delivery")
# FORMULA
# Print the schema and preview the first rows of `formula`.
formula.printSchema() # Author's note: schema is ok
# NOTE(review): same full-collect concern as the `delivery` preview above.
formula.toPandas().head(5)

# Per-column null counts for `formula` (same expression as for `delivery`).
formula.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in formula.columns)) # Author's note: no missing values

# Rename the FormulaID column for use in later joins.
formula = formula.withColumnRenamed("FormulaID","fID_Formula")
# SUBSCRIPTIONS
# Per-column null counts for `subscriptions`.
subscriptions.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in subscriptions.columns)) # Author's note: 161 missing values in the Price/Discount columns and 20 in the NbrMeans_EXCEP column

#Treating missing values

示例#4

显示文件

# Progress message for the categorical-encoding step.
print('\n\n\nString indexation for categorical variables.. \n')

# Numerical variables for the neural network.
float_x_vars = [
    "loan_amnt", "int_rate", "annual_inc", "dti", "revol_util", "installment",
    "inst2inc"
]
# Categorical variables that get string-indexed below.
cat_x_vars = ["term", "grade", "home_ownership", "pred_KM", "emp_length"]

# df2 = df  # backup in case of trouble

# Replace each categorical column in place: index it into a temporary
# "<name>Idx" column, drop the original string column, then rename the index
# column back to the original name.
for cat_var in cat_x_vars:
    df = StringIndexer(inputCol=cat_var, outputCol=cat_var +
                       'Idx').fit(df).transform(df).drop(cat_var)
    df = df.withColumnRenamed(cat_var + 'Idx', cat_var)

# df.select(cat_x_vars).show(5)  # sanity check

# Create the y / target variables for the neural networks.
# Default indicator: 1.0 when loan_status == 1, otherwise 0.0.
df = df.withColumn('probDef',
                   F.when(df['loan_status'] == 1,
                          1.0).otherwise(0.0))  # default is 1, repaid is 0
# Early-repayment indicator: 1.0 when the loan was repaid (loan_status == 0)
# and fracNumPmts < 1, otherwise 0.0.
# NOTE(review): fracNumPmts is presumably the fraction of scheduled payments
# made - confirm where the column is created.
df = df.withColumn(
    'probER',
    F.when((df['loan_status'] == 0) & (df['fracNumPmts'] < 1),
           1.0).otherwise(0.0))
#indicator for on-schedule repayment can be inferred as probDef=probER=0,0, with