Example #1
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import (RandomForestClassifier,
                                       RandomForestClassificationModel)


def random_forest_classifier():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                (0.0, Vectors.sparse(1, [], []))],
                               ["label", "features"])
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    rf = RandomForestClassifier(numTrees=3,
                                maxDepth=2,
                                labelCol="indexed",
                                seed=42)
    model = rf.fit(td)
    # model.featureImportances
    # # SparseVector(1, {0: 1.0})
    # allclose(model.treeWeights, [1.0, 1.0, 1.0])
    # # True
    test0 = spark.createDataFrame([(Vectors.dense(-1.0), )], ["features"])
    result = model.transform(test0).head()
    # result.prediction
    # # 0.0
    # numpy.argmax(result.probability)
    # # 0
    # numpy.argmax(result.rawPrediction)
    # # 0
    # test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
    # model.transform(test1).head().prediction
    # # 1.0
    # model.trees
    # # [DecisionTreeClassificationModel (uid=...) of depth..., DecisionTreeClassificationModel...]
    temp_path = "."
    rfc_path = temp_path + "/rfc"
    rf.write().overwrite().save(rfc_path)
    rf2 = RandomForestClassifier.load(rfc_path)
    # rf2.getNumTrees()
    # # 3
    model_path = temp_path + "/rfc_model"
    model.write().overwrite().save(model_path)
    model2 = RandomForestClassificationModel.load(model_path)
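A possible continuation of Example #1, assuming it is appended inside random_forest_classifier(): it checks that the model reloaded from model_path reproduces the original model's behaviour on test0. The assertions are a sketch, not part of the original example.

    # Hypothetical check (assumes model, model2 and test0 above are in scope):
    # the reloaded model should give the same prediction and feature importances.
    original_pred = model.transform(test0).head().prediction
    reloaded_pred = model2.transform(test0).head().prediction
    assert original_pred == reloaded_pred  # both should be 0.0
    assert model.featureImportances == model2.featureImportances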
Example #2
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
import sys
import pyspark.sql.functions as func
import pyspark

if len(sys.argv) < 3:
    print("Please provide the model path and the test file path")
    sys.exit(-1)

modelpath = sys.argv[1]
testfilepath = sys.argv[2]

conf = SparkConf().setAppName("Wine Quality Testing").setMaster("local[1]")
sc = SparkContext(conf=conf)

spark = SparkSession.builder.getOrCreate()

rf = RandomForestClassifier.load(modelpath)

defTest = spark.read.format('csv').options(header='true',
                                           inferSchema='true',
                                           delimiter=';').csv(testfilepath)

defTest.printSchema()

featureColumns = [
    col for col in defTest.columns if (col != '""""quality"""""')
]

assembler = VectorAssembler(inputCols=featureColumns, outputCol='features')

rfPipeline = Pipeline(stages=[assembler, rf])
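A hedged sketch of what typically follows Example #2, mirroring the later examples: fit the pipeline on the test DataFrame and inspect the predictions. This continuation is not part of the original snippet; column names are taken from the code above.

# Possible continuation: run the assembled pipeline on the test data and
# look at the predicted values.
fittedPipeline = rfPipeline.fit(defTest)
predictions = fittedPipeline.transform(defTest)
predictions.select('features', 'prediction').show(10, truncate=False)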
Example #3
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
import sys
import pyspark.sql.functions as func
import pyspark
# import findspark
# findspark.init()

conf = SparkConf().setAppName("Wine Quality Testing").setMaster("local[1]")
sc = SparkContext(conf=conf)

spark = SparkSession.builder.getOrCreate()

rf = RandomForestClassifier.load("s3://myprogrambucket/rfwine_model.model")

defTest = spark.read.format('csv').options(
    header='true', inferSchema='true',
    delimiter=';').csv("s3://myprogrambucket/ValidationDataset.csv")
defTest.printSchema()

featureColumns = [
    col for col in defTest.columns if (col != '""""quality"""""')
]

assembler = VectorAssembler(inputCols=featureColumns, outputCol='features')

rfPipeline = Pipeline(stages=[assembler, rf])

fit = rfPipeline.fit(defTest)
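A possible continuation of Example #3, not in the original snippet: score the validation data with the fitted pipeline and report accuracy. MulticlassClassificationEvaluator comes from pyspark.ml.evaluation; the plain 'quality' label name is an assumption and should be adjusted to whatever defTest.printSchema() actually shows.

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Hypothetical evaluation step: cast the assumed label column to double
# (the evaluator requires a numeric label) and compute accuracy.
predictions = fit.transform(defTest)
predictions = predictions.withColumn('quality',
                                     func.col('quality').cast('double'))
evaluator = MulticlassClassificationEvaluator(labelCol='quality',
                                              predictionCol='prediction',
                                              metricName='accuracy')
print("Accuracy:", evaluator.evaluate(predictions))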
Example #4
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
import sys
import pyspark.sql.functions as func
import pyspark

spark = SparkSession.builder.master("local").appName("assignment2").config(
    "spark.some.config.option", "some-value").getOrCreate()

rf = RandomForestClassifier.load("s3://winedataset/rfwine_model.model")

defTest = spark.read.format('csv').options(
    header='true', inferSchema='true',
    delimiter=';').csv("s3://winedataset/ValidationDataset.csv")
defTest.printSchema()

featureColumns = [
    col for col in defTest.columns if (col != '""""quality"""""')
]

assembler = VectorAssembler(inputCols=featureColumns, outputCol='features')

rfPipeline = Pipeline(stages=[assembler, rf])

fit = rfPipeline.fit(defTest)
transformed = fit.transform(defTest)
transformed = transformed.withColumn("prediction",
                                     func.round(transformed['prediction']))

Example #5
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
import sys
import pyspark.sql.functions as func
import pyspark

conf = SparkConf().setAppName("Wine Quality Prediction").setMaster("local[1]")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

#loading trained model
rf = RandomForestClassifier.load("wine_model.model")

#Read data from csv
#data = spark.read.format('csv').options(header='true', inferSchema='true', delimiter=';').csv("s3://pa2smit/ValidationDataset.csv")
data = spark.read.csv('ValidationDataset.csv',
                      header='true',
                      inferSchema='true',
                      sep=';')

featureColumns = [col for col in data.columns if col != '""""quality"""""']
assembler = VectorAssembler(inputCols=featureColumns, outputCol='values')

rfPipe = Pipeline(stages=[assembler, rf])

fitData = rfPipe.fit(data)
transformedData = fitData.transform(data)
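A possible continuation of Example #5, not in the original snippet: round the predictions to the nearest class and measure accuracy by direct comparison with the label column. The quoted column name is copied from the filter above and may need adjusting for your CSV header.

# Hypothetical accuracy check on the transformed data.
labelCol = '""""quality"""""'
rounded = transformedData.withColumn(
    'prediction', func.round(transformedData['prediction']))
correct = rounded.filter(rounded['prediction'] == rounded[labelCol]).count()
print("Accuracy: %.3f" % (correct / float(rounded.count())))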
Example #6
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
import sys
import pyspark.sql.functions as func
import pyspark

conf = SparkConf().setAppName("Wine Quality Prediction").setMaster("local[1]")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

#loading trained model
rf = RandomForestClassifier.load("s3://pa2smit/wine_model.model")

#Read data from csv
data = spark.read.format('csv').options(
    header='true', inferSchema='true',
    delimiter=';').csv("s3://pa2smit/ValidationDataset.csv")

featureColumns = [col for col in data.columns if col != '""""quality"""""']
assembler = VectorAssembler(inputCols=featureColumns, outputCol='values')

rfPipe = Pipeline(stages=[assembler, rf])

fitData = rfPipe.fit(data)
transformedData = fitData.transform(data)
transformedData = transformedData.withColumn(
    "prediction", func.round(transformedData['prediction']))