# --- Score the held-out test set (df5) with the trained model (model2) ---
# NOTE(review): df5 and model2 are defined earlier in the file; this block
# assumes df5 holds the raw test rows with Embarked/Sex/Pclass/PassengerId.

# Encode the categorical 'Embarked' column: string -> index -> one-hot vector.
df5 = StringIndexer(inputCol='Embarked', outputCol='Embarked1').fit(df5).transform(df5)
df5.show()
# NOTE(review): in Spark >= 3.0 OneHotEncoder is an Estimator and needs
# .fit(df5).transform(df5); the bare .transform() form only works on Spark 2.x.
df5 = OneHotEncoder(inputCol='Embarked1', outputCol='Embarked2', dropLast=False).transform(df5)
df5.show()

# --------------------------------------------
# Encode 'Sex' the same way.
df5 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df5).transform(df5)
df5 = OneHotEncoder(inputCol='Gender', outputCol='Gender1', dropLast=False).transform(df5)
df5.show()

# Keep only the model inputs plus the id needed for the submission file.
df5 = df5.select(df5.Pclass.cast('double'), df5.Gender1, df5.Embarked2, df5.PassengerId)
df5.printSchema()

# Vector assembler: combine the feature columns into a single vector column.
df5 = VectorAssembler(inputCols=['Pclass', 'Gender1', 'Embarked2'], outputCol='Features').transform(df5)
df5.show(truncate=False)

# Predict and export PassengerId + prediction as one CSV file.
df5_1 = model2.transform(df5)
df5_1.show()
# Fix: write a header row and overwrite an existing output directory instead
# of failing with "path already exists" on a re-run.
df5_1.select('PassengerId', 'prediction').coalesce(1) \
    .write.csv('c:/test5.csv', header=True, mode='overwrite')
# df5_1.select('PassengerId','prediction').toPandas().to_csv('c:/test5.csv')
# --- Load the Titanic training data and prepare features for training ---
df2 = spark.read.csv('/users/jyothsnap/Kaggle/titanic/train.csv', header=True)
df2.count()

# ---------------------------------------
# Fix: the original projection kept only Sex/Pclass/Survived/Embarked, but the
# cast below references df3.SibSp and df3.Fare, which raised an
# AnalysisException (unresolved column). Keep those two columns here.
df3 = df2.select('Sex', 'Pclass', 'Survived', 'Embarked', 'SibSp', 'Fare')
df3.show()
df3.printSchema()

from pyspark.ml.feature import StringIndexer
# Index the categorical string columns into numeric index columns.
df3 = StringIndexer(inputCol='Sex', outputCol='Gender').fit(df3).transform(df3)
df3.groupby(df3.Embarked, 'Embarked').agg({'Embarked': 'count'}).show()
df3 = StringIndexer(inputCol='Embarked', outputCol='Embarked_Transformed').fit(df3).transform(df3)
# df3.groupby(df3.Embarked,'Embarked').agg({'Embarked':'count'}).show()
df3.show()
df3.printSchema()

# CSV columns are read as strings; cast the numeric features to double.
df3 = df3.select(df3.Pclass.cast('double'), df3.SibSp.cast('double'),
                 df3.Survived.cast('double'), df3.Fare.cast('double'))
df3.show()
df3.printSchema()

# Vector assembler: combine the feature columns into one vector column.
from pyspark.ml.feature import VectorAssembler
df3 = VectorAssembler(inputCols=['Pclass', 'SibSp', 'Fare'], outputCol='Features').transform(df3)
df3.show()

# 1. choose approach: decision-tree classifier on the assembled features.
from pyspark.ml.classification import DecisionTreeClassifier
dt1 = DecisionTreeClassifier(featuresCol='Features', labelCol='Survived',
                             maxDepth=10, impurity='entropy')
# --- Clean the complaints / customers / delivery DataFrames ---
# NOTE(review): assumes `col` and `sum` are the pyspark.sql.functions versions
# (imported earlier in the file), not the Python builtins — verify that import.

# Convert nulls into 0's
complaints_final = complaints_final.na.fill(0)

# CUSTOMERS
# Check schema and first rows
customers.printSchema()  # Schema is ok
customers.toPandas().head(5)
# Find missings. Fix: .show() forces the lazy select to actually execute and
# display the per-column null counts (the bare select was a no-op in a script).
customers.select(*(sum(col(c).isNull().cast("int")).alias(c)
                   for c in customers.columns)).show()  # No missings
# Renaming the CustomerID column for future joins
customers = customers.withColumnRenamed("CustomerID", "cIDCustomer")

# DELIVERY
# Check schema and first rows
delivery.printSchema()  # Schema is ok
delivery.toPandas().head(5)
# Find missings (same fix: display the counts).
delivery.select(*(sum(col(c).isNull().cast("int")).alias(c)
                  for c in delivery.columns)).show()
# Found 780 missings in the DeliveryClass column
# Treating missing values: drop rows where DeliveryClass is null.
delivery = delivery.where(col("DeliveryClass").isNotNull())
# Encoding string columns in "Delivery"
delivery = StringIndexer(inputCol="DeliveryClass", outputCol="DeliveryClass_index").fit(delivery).transform(delivery)
delivery = StringIndexer(inputCol="DeliveryTypeName", outputCol="DeliveryTypeName_index").fit(delivery).transform(delivery)
# Renaming the SubscriptionID column for future joins
delivery = delivery.withColumnRenamed("SubscriptionID", "sID_Delivery")
# FORMULA