# Mushroom classification: predict the Lab column (EDIBLE / POISONOUS)
# from the other columns with logistic regression.

# Load the mushroom data into a Spark DataFrame.
mushroom_df = (
    spark.read.option("inferschema", "true")
    .option("header", "true")
    .csv("mushrooms.csv")
)
mushroom_df.show(5)
mushroom_df.printSchema()

# Discard every row that contains a null value.
mushroom_df = mushroom_df.na.drop()

# No extra label column is needed: Lab is already binary, holding either
# EDIBLE or POISONOUS.
# NOTE(review): the reason for excluding VeilType is not stated here --
# presumably it carries no useful signal; confirm against the data.
mushroom_df = mushroom_df.drop("VeilType")

# Preprocess: "Lab ~ ." takes Lab as the label and every remaining column
# as a feature. The transformed frame carries the "label" and "features"
# columns that the classifier below reads.
mushroom_formula = RFormula(formula="Lab ~ .")
mushroom_formula_model = mushroom_formula.fit(mushroom_df)
pre_process_data = mushroom_formula_model.transform(mushroom_df)
pre_process_data.show(5)

# Split roughly 70% / 30% into train and test sets. No seed is passed, so
# the split (and therefore the fitted model) can differ between runs.
train, test = pre_process_data.randomSplit([0.7, 0.3])

# Initialise the logistic regression classifier.
lr = LogisticRegression(labelCol="label", featuresCol="features")

# Train on the training split.
fittedLr = lr.fit(train)

# Classify the held-out test split and preview the predictions.
result = fittedLr.transform(test)
result.show(5)
# Zoo classification: predict whether an animal is a mammal with logistic
# regression.

# Load the zoo data into a Spark DataFrame.
zoo_df = (
    spark.read.option("inferschema", "true")
    .option("header", "true")
    .csv("zoo.csv")
)
zoo_df.show(5)
zoo_df.printSchema()

# Add the binary target column Is_Mammal: 1 when Type equals 1, else 0.
zoo_df = zoo_df.withColumn(
    "Is_Mammal", expr("CASE WHEN Type = 1 THEN 1 ELSE 0 END")
)

# Preprocess: Is_Mammal is the label and the listed columns are the
# features. The transformed frame carries the "label" and "features"
# columns that the classifier below reads.
zoo_formula = RFormula(
    formula=(
        "Is_Mammal ~ Hair + Feathers + Eggs + Milk + Airborne + Aquatic + "
        "Predator + Toothed + Backbone + Breathes + Venomous + Fins + Legs + "
        "Tail + Domestic + Catsize"
    )
)
zoo_formula_model = zoo_formula.fit(zoo_df)
pre_process_data = zoo_formula_model.transform(zoo_df)
pre_process_data.show(5)

# Split roughly 70% / 30% into train and test sets. No seed is passed, so
# the split (and therefore the fitted model) can differ between runs.
train, test = pre_process_data.randomSplit([0.7, 0.3])

# Initialise the logistic regression classifier.
lr = LogisticRegression(labelCol="label", featuresCol="features")

# Train on the training split.
fittedLr = lr.fit(train)

# Classify the held-out test split and print the predictions.
result = fittedLr.transform(test)
result.show()