def random_forest_classifier(temp_path="."):
    """Train, persist, and reload a small demo RandomForestClassifier.

    Builds a two-row toy DataFrame, string-indexes the label, fits a
    3-tree forest, spot-checks predictions on dense/sparse vectors, then
    round-trips both the estimator and the fitted model through disk.

    Args:
        temp_path: Directory used for the save/load round trip
            (previously hard-coded to ".").

    Returns:
        The RandomForestClassificationModel reloaded from ``temp_path``.
    """
    # Local imports keep the demo self-contained; the original body used
    # these names without importing them anywhere visible in this chunk.
    from pyspark.sql import SparkSession
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.feature import StringIndexer
    from pyspark.ml.classification import (
        RandomForestClassifier,
        RandomForestClassificationModel,
    )

    spark = (
        SparkSession.builder
        .appName("Python Spark SQL basic example")
        .config("spark.some.config.option", "some-value")
        .getOrCreate()
    )
    df = spark.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
        ["label", "features"],
    )
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=42)
    model = rf.fit(td)
    # model.featureImportances                       -> SparseVector(1, {0: 1.0})
    # allclose(model.treeWeights, [1.0, 1.0, 1.0])   -> True
    test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
    result = model.transform(test0).head()
    # result.prediction                    -> 0.0
    # numpy.argmax(result.probability)     -> 0
    # numpy.argmax(result.rawPrediction)   -> 0
    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
    # model.transform(test1).head().prediction -> 1.0
    # model.trees -> [DecisionTreeClassificationModel (uid=...) of depth ..., ...]
    rfc_path = temp_path + "/rfc"
    rf.write().overwrite().save(rfc_path)
    rf2 = RandomForestClassifier.load(rfc_path)
    # rf2.getNumTrees() -> 3
    model_path = temp_path + "/rfc_model"
    model.write().overwrite().save(model_path)
    model2 = RandomForestClassificationModel.load(model_path)
    return model2
# Build a scoring pipeline for a wine-quality model and test CSV supplied
# on the command line.
# Usage: spark-submit <script> <modelpath> <testfilepath>
#
# Fix: the original chunk used sys, SparkConf, SparkContext, SparkSession,
# RandomForestClassifier, VectorAssembler and Pipeline without importing
# any of them (NameError at runtime), and its usage message was a typo'd
# fragment naming only one of the two required arguments.
import sys

import pyspark
import pyspark.sql.functions as func
from pyspark import SparkConf, SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

if len(sys.argv) < 3:
    # Both positional arguments are mandatory.
    print("Please provide modelpath and testfilepath")
    sys.exit(-1)
modelpath = sys.argv[1]
testfilepath = sys.argv[2]

conf = SparkConf().setAppName("Wine Quality Testing").setMaster("local[1]")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

# NOTE(review): this loads the *estimator* class (RandomForestClassifier),
# not a fitted RandomForestClassificationModel — a downstream fit() would
# retrain rather than reuse saved weights. Confirm what `modelpath` holds.
rf = RandomForestClassifier.load(modelpath)

defTest = spark.read.format('csv').options(
    header='true', inferSchema='true', delimiter=';').csv(testfilepath)
defTest.printSchema()

# The label column name arrives wrapped in literal quote characters;
# everything else is treated as a feature.
featureColumns = [col for col in defTest.columns if col != '""""quality"""""']
assembler = VectorAssembler(inputCols=featureColumns, outputCol='features')
rfPipeline = Pipeline(stages=[assembler, rf])
# Score the S3-hosted wine-quality validation set with a persisted model.
#
# Fix: the original chunk used SparkConf, SparkContext and SparkSession
# without importing them (NameError at runtime); the missing imports are
# added below.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
import sys
import pyspark.sql.functions as func
import pyspark
# import findspark
# findspark.init()

conf = SparkConf().setAppName("Wine Quality Testing").setMaster("local[1]")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

# NOTE(review): RandomForestClassifier.load() returns the *estimator*, so
# the fit() below retrains on the validation data instead of reusing the
# saved weights — verify the saved artifact's type (it may be a fitted
# RandomForestClassificationModel).
rf = RandomForestClassifier.load("s3://myprogrambucket/rfwine_model.model")

defTest = spark.read.format('csv').options(
    header='true', inferSchema='true',
    delimiter=';').csv("s3://myprogrambucket/ValidationDataset.csv")
defTest.printSchema()

# The quality column name arrives wrapped in literal quote characters.
featureColumns = [col for col in defTest.columns if col != '""""quality"""""']
assembler = VectorAssembler(inputCols=featureColumns, outputCol='features')
rfPipeline = Pipeline(stages=[assembler, rf])
fit = rfPipeline.fit(defTest)
# Score the S3-hosted wine-quality validation set (assignment2 variant).
#
# Fix: the original chunk's final statement was truncated mid-call
# (`transformed.withColumn("prediction",`); it is completed here with the
# func.round(...) rounding step that the sibling prediction script in this
# same file applies at the identical point.
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
import sys
import pyspark.sql.functions as func
import pyspark

spark = SparkSession.builder.master("local").appName("assignment2").config(
    "spark.some.config.option", "some-value").getOrCreate()

# NOTE(review): this loads the *estimator* class, so the fit() below
# retrains on the validation data rather than reusing the saved model —
# confirm the type of the persisted artifact.
rf = RandomForestClassifier.load("s3://winedataset/rfwine_model.model")

defTest = spark.read.format('csv').options(
    header='true', inferSchema='true',
    delimiter=';').csv("s3://winedataset/ValidationDataset.csv")
defTest.printSchema()

# The quality column name arrives wrapped in literal quote characters.
featureColumns = [col for col in defTest.columns if col != '""""quality"""""']
assembler = VectorAssembler(inputCols=featureColumns, outputCol='features')
rfPipeline = Pipeline(stages=[assembler, rf])
fit = rfPipeline.fit(defTest)
transformed = fit.transform(defTest)
# Round fractional forest outputs to integer quality scores.
transformed = transformed.withColumn("prediction",
                                     func.round(transformed['prediction']))
"""Score the local wine-quality validation CSV with a persisted model."""
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
import sys
import pyspark.sql.functions as func
import pyspark

# Spark bootstrap: explicit local context first, then a session on top.
conf = SparkConf()
conf = conf.setAppName("Wine Quality Prediction")
conf = conf.setMaster("local[1]")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Load the persisted random-forest artifact.
# NOTE(review): this is the estimator class; confirm the saved artifact is
# not a fitted RandomForestClassificationModel.
rf = RandomForestClassifier.load("wine_model.model")

# Validation data: semicolon-delimited CSV with a header row.
# data = spark.read.format('csv').options(header='true', inferSchema='true', delimiter=';').csv("s3://pa2smit/ValidationDataset.csv")
data = spark.read.csv(
    'ValidationDataset.csv',
    sep=';',
    header='true',
    inferSchema='true',
)

# Everything except the (quote-wrapped) quality column is a feature.
featureColumns = [c for c in data.columns if c != '""""quality"""""']

# NOTE(review): the assembler writes 'values' while a default
# RandomForestClassifier reads 'features' — verify the persisted model's
# featuresCol really is 'values'.
assembler = VectorAssembler(inputCols=featureColumns, outputCol='values')
rfPipe = Pipeline(stages=[assembler, rf])
fitData = rfPipe.fit(data)
transformedData = fitData.transform(data)
"""Score the S3-hosted wine-quality validation CSV and round predictions."""
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
import sys
import pyspark.sql.functions as func
import pyspark

# Spark bootstrap: explicit local context first, then a session on top.
conf = SparkConf()
conf = conf.setAppName("Wine Quality Prediction")
conf = conf.setMaster("local[1]")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Load the persisted random-forest artifact from S3.
# NOTE(review): this is the estimator class; confirm the saved artifact is
# not a fitted RandomForestClassificationModel.
rf = RandomForestClassifier.load("s3://pa2smit/wine_model.model")

# Validation data: semicolon-delimited CSV with a header row.
reader = spark.read.format('csv')
reader = reader.options(header='true', inferSchema='true', delimiter=';')
data = reader.csv("s3://pa2smit/ValidationDataset.csv")

# Everything except the (quote-wrapped) quality column is a feature.
featureColumns = [c for c in data.columns if c != '""""quality"""""']

# NOTE(review): the assembler writes 'values' while a default
# RandomForestClassifier reads 'features' — verify the persisted model's
# featuresCol really is 'values'.
assembler = VectorAssembler(inputCols=featureColumns, outputCol='values')
rfPipe = Pipeline(stages=[assembler, rf])
fitData = rfPipe.fit(data)
transformedData = fitData.transform(data)

# Round fractional forest outputs to integer quality scores.
transformedData = transformedData.withColumn(
    "prediction", func.round(transformedData['prediction']))