def balanced_random_forest_tuning(train_samples):
    """Tune a class-balanced random forest with 4-fold cross-validation.

    Pipeline stages: RandomUnderSampler (keyed on the 'id' column),
    ClassWeighter (presumably produces the 'weight' column consumed by the
    forest's weightCol — TODO confirm against ClassWeighter), then a
    weighted RandomForestClassifier.  Candidates are scored by area under
    the precision-recall curve.

    Args:
        train_samples: training DataFrame with 'id', 'label' and
            'features' columns.

    Returns:
        The fitted CrossValidatorModel (sub-models are retained because
        collectSubModels=True).
    """
    rf = RandomForestClassifier(
        labelCol="label",
        featuresCol="features",
        cacheNodeIds=True,
        weightCol="weight",
    )
    ru = RandomUnderSampler().setIndexCol("id")
    cw = ClassWeighter()
    pipeline = Pipeline().setStages([ru, cw, rf])

    paramGrid = (
        ParamGridBuilder()
        .addGrid(rf.numTrees, [50, 75, 100])
        .addGrid(rf.featureSubsetStrategy, ["sqrt"])
        .addGrid(rf.impurity, ["gini", "entropy"])
        .addGrid(rf.maxDepth, [5, 15, 30])
        .addGrid(rf.minInstancesPerNode, [1])
        .addGrid(rf.subsamplingRate, [1.0, 0.66, 0.4])
        # Consistent float literals (same values as before: 1/36 and 1/9).
        .addGrid(cw.classWeight, [[1 / 36.0, 1.0], [1 / 9.0, 1.0]])
        .addGrid(ru.targetImbalanceRatio, [9.0, 36.0])
        .build()
    )

    pr_evaluator = BinaryClassificationEvaluator(
        labelCol="label",
        rawPredictionCol="rawPrediction",
        metricName="areaUnderPR",
    )

    # Fix: this is a CrossValidator, so name it `cv` — the original `tvs`
    # name wrongly suggested a TrainValidationSplit (cf. random_forest_tuning).
    cv = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=pr_evaluator,
        numFolds=4,
        collectSubModels=True,
    )
    model = cv.fit(train_samples)
    return model
def random_forest_tuning(train_samples):
    """Tune an undersampled random forest via a single train/validation split.

    Pipeline stages: RandomUnderSampler (keyed on the 'id' column) followed
    by a RandomForestClassifier.  Candidates are scored by area under the
    precision-recall curve on an 80/20 train/validation split.

    Args:
        train_samples: training DataFrame with 'id', 'label' and
            'features' columns.

    Returns:
        The fitted TrainValidationSplitModel (sub-models are retained
        because collectSubModels=True).
    """
    forest = RandomForestClassifier(
        labelCol="label", featuresCol="features", cacheNodeIds=True
    )
    sampler = RandomUnderSampler().setIndexCol("id")
    pipeline = Pipeline().setStages([sampler, forest])

    # Build the search grid one axis at a time (addGrid returns the builder).
    builder = ParamGridBuilder()
    builder.addGrid(forest.numTrees, [50, 75, 100])
    builder.addGrid(forest.featureSubsetStrategy, ["sqrt"])
    builder.addGrid(forest.impurity, ["gini", "entropy"])
    builder.addGrid(forest.maxDepth, [5, 15, 30])
    builder.addGrid(forest.minInstancesPerNode, [1])
    builder.addGrid(forest.subsamplingRate, [1.0, 0.6, 0.4])
    builder.addGrid(sampler.targetImbalanceRatio, [1.0, 1.5, 2.0])
    grid = builder.build()

    evaluator = BinaryClassificationEvaluator(
        labelCol="label",
        rawPredictionCol="rawPrediction",
        metricName="areaUnderPR",
    )
    selector = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=grid,
        evaluator=evaluator,
        trainRatio=0.8,
        collectSubModels=True,
    )
    return selector.fit(train_samples)
# Ratio of negative to positive examples in the raw data.
# NOTE(review): not used in this chunk — presumably consumed further down
# the file; kept as-is.
imbalance_ratio = neg_samples.count() / pos_samples.count()

# Build the train/test split and cache both sets: each is traversed more
# than once below (fit + two transforms).
train_set, test_set = get_dataset_df(spark, pos_samples, neg_samples)
train_set, test_set = train_set.persist(), test_set.persist()

# Final forest configuration (fixed hyper-parameters, no search here).
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    cacheNodeIds=True,
    maxDepth=17,
    impurity='entropy',
    featureSubsetStrategy='sqrt',
    minInstancesPerNode=10,
    numTrees=100,
    subsamplingRate=1.0,
    maxMemoryInMB=768,
)

# Undersample the majority class down to a 1:1 target ratio, keyed on the
# 'sample_id' column.
ru = (
    RandomUnderSampler()
    .setIndexCol('sample_id')
    .setTargetImbalanceRatio(1.0)
)

pipeline = Pipeline().setStages([ru, rf])
model = pipeline.fit(train_set)

# Score both splits; cached because the write helpers below consume them.
predictions = model.transform(test_set).persist()
train_predictions = model.transform(train_set).persist()

write_params(model, result_dir)
write_results(predictions, train_predictions, result_dir)

# Write feature importances — stages[1] is the fitted forest (stage 0 is
# the undersampler).
feature_importances = get_feature_importances(model.stages[1])
feature_importances.to_csv(result_dir + '/feature_importances.csv')