def random_forest_classifier(temp_path="."):
    """Train a tiny random forest with Spark ML and round-trip it through disk.

    The function builds a two-row DataFrame, indexes its ``label`` column,
    fits a ``RandomForestClassifier`` (3 trees, depth 2, seed 42), scores a
    one-row test frame, and then saves and reloads both the unfitted
    estimator and the fitted model.

    Args:
        temp_path: Directory prefix under which the estimator is written to
            ``<temp_path>/rfc`` and the fitted model to
            ``<temp_path>/rfc_model``. Existing data at those locations is
            overwritten. Defaults to ``"."``, the location that was
            previously hard-coded.

    Returns:
        None. The function is run for its side effects (Spark jobs and the
        two saved artifacts).
    """
    # getOrCreate() may hand back an already-active session; it is
    # deliberately not stopped here.
    spark = (
        SparkSession.builder
        .appName("Python Spark SQL basic example")
        .config("spark.some.config.option", "some-value")
        .getOrCreate()
    )

    # Two training rows: one dense and one (empty) sparse feature vector.
    df = spark.createDataFrame(
        [
            (1.0, Vectors.dense(1.0)),
            (0.0, Vectors.sparse(1, [], [])),
        ],
        ["label", "features"],
    )

    # Map the raw labels to the "indexed" column the classifier trains on.
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)

    rf = RandomForestClassifier(
        numTrees=3, maxDepth=2, labelCol="indexed", seed=42
    )
    model = rf.fit(td)
    # Expected per the original inline notes (not asserted here):
    #   model.featureImportances -> SparseVector(1, {0: 1.0})
    #   model.treeWeights        -> close to [1.0, 1.0, 1.0]

    # Score a single row. The row is fetched so the transform actually runs,
    # but nothing is done with it.
    test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
    model.transform(test0).head()
    # Expected per the original inline notes (not asserted here):
    #   prediction 0.0; argmax of probability and rawPrediction both 0.
    #   A row with feature value 1.0 is noted as predicting 1.0.

    # Persist the unfitted estimator and load it back.
    rfc_path = "{}/rfc".format(temp_path)
    rf.write().overwrite().save(rfc_path)
    RandomForestClassifier.load(rfc_path)

    # Persist the fitted model and load it back.
    model_path = "{}/rfc_model".format(temp_path)
    model.write().overwrite().save(model_path)
    RandomForestClassificationModel.load(model_path)
# Training / evaluation script fragment. `now`, `train`, `test` and
# `base_path` are bound outside this view.

# Print the date/time fields of `now`
# (presumably a timestamp taken just before training -- TODO confirm).
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

# Configure the random forest: entropy impurity, fixed seed, and every
# feature considered at each split (featureSubsetStrategy='all').
rf = RandomForestClassifier(labelCol='attack_cat_index', featuresCol='features', impurity='entropy', seed=1234, maxBins=136, maxDepth=25, featureSubsetStrategy='all', predictionCol='prediction')

# NOTE: `rf` is rebound here from the estimator to the result of fit();
# everything below uses the fitted object.
rf = rf.fit(train)

# Print a second timestamp, taken after fit() returns.
now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

# Save the fitted model under base_path, overwriting any existing copy.
model_output_path = "{}/data/RandomForest_extended.bin".format(base_path)
rf.write().overwrite().save(model_output_path)

# Score `test` and pull the (attack_cat_index, prediction) columns into a
# pandas DataFrame, then into a plain list of two-element rows.
result = rf.transform(test)
prediction_df = result.select("attack_cat_index", "prediction").toPandas()
prediction_list = prediction_df[["attack_cat_index", "prediction"]].values.tolist()

# Define a function for the TPR (true positive rate).
# NOTE(review): this definition continues past the end of the visible
# chunk, and the nesting below is assumed because the original indentation
# was lost -- confirm against the full file.
def truePositiveRate(list, label):
    tot_count = 0
    true_count = 0
    for a in list:
        if a[0] == label:
            tot_count = tot_count + 1
            if a[1] == label: