def RandomForestRegressor():
    """Smoke-test / demo of Spark ML random-forest regression.

    Builds (or reuses) a SparkSession, fits a 2-tree forest on a toy
    two-row DataFrame, exercises the prediction/introspection API, and
    round-trips both the estimator and the fitted model through
    save()/load().

    Returns:
        bool: True when the reloaded model's featureImportances equal
        the original model's.
    """
    # BUG FIX: this function shadows pyspark's RandomForestRegressor class,
    # so the original `RandomForestRegressor(numTrees=2, ...)` call recursed
    # into this zero-argument function, and `RandomForestRegressor.load(...)`
    # hit the function object. Import the estimator under a local alias.
    from pyspark.ml.regression import RandomForestRegressor as RFRegressor

    spark = (
        SparkSession.builder
        .appName("Python Spark SQL basic example")
        .config("spark.some.config.option", "some-value")
        .getOrCreate()
    )

    df = spark.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
        ["label", "features"],
    )

    rf = RFRegressor(numTrees=2, maxDepth=2, seed=42)
    model = rf.fit(df)

    model.featureImportances                 # SparseVector(1, {0: 1.0})
    allclose(model.treeWeights, [1.0, 1.0])  # True

    test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
    model.transform(test0).head().prediction  # 0.0
    model.numFeatures                         # 1
    model.trees                               # [DecisionTreeRegressionModel ..., ...]
    model.getNumTrees                         # 2

    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
    model.transform(test1).head().prediction  # 0.5

    # Round-trip the (unfitted) estimator through disk.
    temp_path = "./"
    rfr_path = temp_path + "/rfr"
    rf.save(rfr_path)
    rf2 = RFRegressor.load(rfr_path)
    rf2.getNumTrees()  # 2

    # Round-trip the fitted model through disk and compare importances.
    model_path = temp_path + "/rfr_model"
    model.save(model_path)
    model2 = RandomForestRegressionModel.load(model_path)
    return model.featureImportances == model2.featureImportances
def _set_dataFrameModel(self, _type, _SLA, data, vecAssembler):
    """Fit ``self._dataFrameModel``: a cross-validated random-forest pipeline.

    Parameters
    ----------
    _type : str
        ``'regression'``; any other value is treated as classification.
    _SLA : str
        Learning-algorithm name; only ``'randomForest'`` is handled.
    data : pyspark.sql.DataFrame
        Training data (must contain ``self.targetVariable`` and the columns
        consumed by ``vecAssembler``).
    vecAssembler : pyspark.ml.feature.VectorAssembler
        Feature-assembly stage prepended to the pipeline.

    Side effects: sets ``self._regEval``, ``self._pipeline``,
    ``self._paramGrid``, ``self._crossval`` and ``self._dataFrameModel``,
    and saves the estimator to a hard-coded path on disk.
    """
    if _type == 'regression':
        if _SLA == 'randomForest':
            rf = RandomForestRegressor()
            # FIX: dropped setProbabilityCol("proba") — regression models
            # have no probability column (AttributeError in pyspark) — and
            # replaced the Python-2 long literal 100088121L (SyntaxError
            # on Python 3) with a plain int.
            rf.setLabelCol(self.targetVariable) \
              .setPredictionCol("prediction") \
              .setFeaturesCol("features") \
              .setSeed(100088121) \
              .setMaxDepth(int(self.sparkOptions[1])) \
              .setMaxMemoryInMB(10000) \
              .setFeatureSubsetStrategy(self.sparkOptions[5])
            self._regEval = RegressionEvaluator(
                predictionCol="prediction",
                labelCol=self.targetVariable,
                metricName="rmse")
    else:  # classification
        if _SLA == 'randomForest':
            rf = RandomForestClassifier(
                labelCol=self.targetVariable,
                featuresCol="features",
                maxDepth=int(self.sparkOptions[1]),
                featureSubsetStrategy=self.sparkOptions[5],
                impurity=self.sparkOptions[2],
                probabilityCol="proba")
            # NOTE(review): `goodClass` is not defined in this scope —
            # presumably an attribute or module global (self.goodClass?);
            # confirm against the rest of the class.
            # FIX: these evaluators were assigned to self.regEval while the
            # CrossValidator below reads self._regEval, so the classification
            # evaluator was never used; unified on self._regEval.
            if goodClass != '':
                self._regEval = BinaryClassificationEvaluator(
                    labelCol=self.targetVariable,
                    metricName="areaUnderROC")
            else:
                self._regEval = MulticlassClassificationEvaluator(
                    labelCol=self.targetVariable,
                    predictionCol="prediction",
                    metricName="accuracy")

    # Pipeline: assemble the feature vector, then fit the forest.
    self._pipeline = Pipeline()
    self._pipeline.setStages([vecAssembler, rf])

    # Grid-search over the number of trees (comma-separated in sparkOptions[4]).
    self._paramGrid = (
        ParamGridBuilder()
        .addGrid(rf.numTrees,
                 [int(num) for num in self.sparkOptions[4].split(',')])
        .build())

    self._crossval = CrossValidator(
        estimator=self._pipeline,
        estimatorParamMaps=self._paramGrid,
        evaluator=self._regEval,
        numFolds=self.nbSamples)

    # Keep only the best model found by cross-validation.
    self._dataFrameModel = self._crossval.fit(data).bestModel

    rf.save("/home/t752887/python/myModelPath/SPARK_RF_R_"
            + str(self.sparkModelsId[0]))