Exemplo n.º 1
0
    # Load the mushroom dataset from CSV: the first row is the header and
    # column types are inferred by Spark.
    csv_reader = spark.read.option("inferschema", "true").option("header", "true")
    mushroom_df = csv_reader.csv("mushrooms.csv")
    mushroom_df.show(5)
    mushroom_df.printSchema()

    # Discard rows containing nulls, then remove the VeilType column.
    # Original author's note: no derived label column is needed because the
    # Lab column already holds only EDIBLE or POISONOUS values.
    # NOTE(review): the reason for dropping VeilType is not stated here — confirm.
    mushroom_df = mushroom_df.na.drop().drop("VeilType")

    # Preprocess with an R-style formula: Lab is the target, "." stands for
    # the remaining columns. The transformed frame is consumed below through
    # its "label" and "features" columns.
    rformula = RFormula(formula="Lab ~ .")
    rformula_model = rformula.fit(mushroom_df)
    pre_process_data = rformula_model.transform(mushroom_df)
    pre_process_data.show(5)

    # 70/30 train/test split. No seed is passed, so the partition is not
    # pinned between runs.
    split_weights = [0.7, 0.3]
    train, test = pre_process_data.randomSplit(split_weights)

    # Fit a logistic regression classifier on the training partition.
    lr = LogisticRegression(labelCol="label", featuresCol="features")
    fittedLr = lr.fit(train)

    # Score the held-out partition and preview the first predictions.
    result = fittedLr.transform(test)
    result.show(5)
Exemplo n.º 2
0
    # Load the zoo dataset from CSV: the first row is the header and column
    # types are inferred by Spark.
    csv_reader = spark.read.option("inferschema", "true").option("header", "true")
    zoo_df = csv_reader.csv("zoo.csv")
    zoo_df.show(5)
    zoo_df.printSchema()

    # Derive the binary target: Is_Mammal is 1 where Type equals 1, else 0.
    is_mammal_flag = expr("CASE WHEN Type = 1 THEN 1 ELSE 0 END")
    zoo_df = zoo_df.withColumn("Is_Mammal", is_mammal_flag)

    # Preprocess with an R-style formula listing the predictor columns
    # explicitly. The transformed frame is consumed below through its
    # "label" and "features" columns.
    formula_text = "Is_Mammal ~ Hair + Feathers + Eggs + Milk + Airborne + Aquatic + Predator + Toothed + Backbone + Breathes + Venomous + Fins + Legs + Tail + Domestic + Catsize"
    rformula = RFormula(formula=formula_text)
    rformula_model = rformula.fit(zoo_df)
    pre_process_data = rformula_model.transform(zoo_df)
    pre_process_data.show(5)

    # 70/30 train/test split. No seed is passed, so the partition is not
    # pinned between runs.
    split_weights = [0.7, 0.3]
    train, test = pre_process_data.randomSplit(split_weights)

    # Fit a logistic regression classifier on the training partition.
    lr = LogisticRegression(labelCol="label", featuresCol="features")
    fittedLr = lr.fit(train)

    # Score the held-out partition and display the predictions.
    result = fittedLr.transform(test)
    result.show()