#Imports for the type conversions, encoding and feature engineering below
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import (col, count, avg, sum, min, max,
                                   datediff, months_between, year)

#Changing column types in 'Subscriptions' from string to integer
convert_int = ["NbrMeals_EXCEP", "GrossFormulaPrice", "NetFormulaPrice", "NbrMealsPrice", "ProductDiscount", "FormulaDiscount", "TotalDiscount", "TotalPrice", "TotalCredit"]

for i in convert_int:
    table1 = table1.withColumn(i, table1[i].cast("integer"))

#Changing column types in 'Subscriptions' from string to timestamp
convert_date = ["StartDate","EndDate","RenewalDate","PaymentDate"]
    
for i in convert_date:
    table1 = table1.withColumn(i, table1[i].cast("timestamp"))
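#Alternative (a sketch): the same casts in a single select, avoiding repeated
#withColumn calls:
#table1 = table1.select([col(c).cast("integer") if c in convert_int
#                        else col(c).cast("timestamp") if c in convert_date
#                        else col(c) for c in table1.columns])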
  
#Encoding string columns in merged table
table1 = StringIndexer(inputCol = "PaymentStatus", outputCol = "PaymentStatus_index").fit(table1).transform(table1)
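
#Check the encoding (a sketch): each PaymentStatus value maps to one index
table1.groupBy("PaymentStatus", "PaymentStatus_index").count().show()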

#Creating meaningful time variables
table1 = table1.withColumn("DaysSubscription", datediff(col("EndDate"), col("StartDate")))
table1 = table1.withColumn("MonthsSubscription", months_between(col("EndDate"), col("StartDate")))
table1 = table1.withColumn("Year", year("StartDate"))

# COMMAND ----------

#Feature engineering
#Aggregating variables by CustomerID
subs_totals = table1.groupBy("CustomerID").agg(count("SubscriptionID"), avg("DaysSubscription"), 
                                                  avg("MonthsSubscription"), sum("NbrMeals_REG"), sum("NbrMeals_EXCEP"), 
                                                  min("NbrMealsPrice"), max("NbrMealsPrice"), avg("NbrMealsPrice"), 
                                                  min("ProductDiscount"), max("ProductDiscount"), sum("ProductDiscount"), 
                                                  min("TotalDiscount"), max("TotalDiscount"), sum("TotalDiscount"),
                                                  min("TotalPrice"), max("TotalPrice"), sum("TotalPrice"), 
                                                  min("TotalCredit"), max("TotalCredit"),
                                                  sum("TotalCredit"))
# Average minutes on ground at LGA: intercept plus the LGA dummy's coefficient
# ('inter' is assumed to be the fitted model's intercept, regression.intercept;
# index 4 is the position of the LGA indicator in the feature vector)
inter = regression.intercept
avg_ground_lga = inter + regression.coefficients[4]
print(avg_ground_lga)

# RPM buckets
from pyspark.ml.feature import Bucketizer
bucketizer = Bucketizer(splits=[3500, 4500, 6000, 6500],
                        inputCol='rpm', outputCol='rpm_bin')
# Apply the bucketizer to the rpm column
cars = bucketizer.transform(cars)
# Inspect the RPM buckets and their counts
cars.select('rpm', 'rpm_bin').show(5)
cars.groupBy('rpm_bin').count().show()

# Engineering density
cars = cars.withColumn('density_line', cars.mass / cars.length)  # Linear density
cars = cars.withColumn('density_quad', cars.mass / cars.length ** 2)  # Area density
cars = cars.withColumn('density_cube', cars.mass / cars.length ** 3)  # Volume density
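
# Peek at the engineered density features (a sketch)
cars.select('mass', 'length', 'density_line', 'density_quad', 'density_cube').show(5)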

from pyspark.ml.feature import Bucketizer, OneHotEncoderEstimator

# Create buckets at 3 hour intervals through the day
buckets = Bucketizer(splits=[3 * x for x in range(9)], inputCol='depart', outputCol='depart_bucket')

# Bucket the departure times
bucketed = buckets.transform(flights)
bucketed.select('depart', 'depart_bucket').show(5)

# Create a one-hot encoder
onehot = OneHotEncoderEstimator(inputCols=['depart_bucket'], outputCols=['depart_dummy'])
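
# Fit and apply the encoder (a sketch; 'flights_onehot' is an illustrative name).
# OneHotEncoderEstimator is the Spark 2.x name; Spark 3 renames it OneHotEncoder.
# With the default dropLast=True, 'depart_dummy' has one slot per bucket minus one.
flights_onehot = onehot.fit(bucketed).transform(bucketed)
flights_onehot.select('depart', 'depart_bucket', 'depart_dummy').show(5)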

##String encoding of categorical variables
from pyspark.ml.feature import StringIndexer
import pyspark.sql.functions as F

cat_x_vars = ["term", "grade", "home_ownership", "pred_KM", "emp_length"]

#df2 = df #backup in case of trouble

for cat_var in cat_x_vars:
    df = StringIndexer(inputCol=cat_var, outputCol=cat_var +
                       'Idx').fit(df).transform(df).drop(cat_var)
    df = df.withColumnRenamed(cat_var + 'Idx', cat_var)

#df.select(cat_x_vars).show(5) #check

##Create y or target variables for neural networks
#probability/indicator for default
df = df.withColumn('probDef',
                   F.when(df['loan_status'] == 1,
                          1.0).otherwise(0.0))  #default is 1, repaid is 0
#indicator for early repayment
df = df.withColumn(
    'probER',
    F.when((df['loan_status'] == 0) & (df['fracNumPmts'] < 1),
           1.0).otherwise(0.0))
#on-schedule repayment needs no third indicator: it can be inferred as probDef = probER = 0
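
#Sanity check (a sketch): probDef and probER should never both equal 1
#df.groupBy('probDef', 'probER').count().show()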

#visually:
#plot of timing of either default or eventual (not early) repayment
#df.filter((df['loan_status']==1)|(df.fracNumPmts >=1)).select(df.fracNumPmts).toPandas().plot.hist()
#plt.show()  #bi-modal: mostly low over (0,1), with a spike at 1

#plot of timing of repayment (whenever it occurred)
#df.filter(df['loan_status']==0).select(df.fracNumPmts).toPandas().plot.hist()
#plt.show()