# Imports needed by grid_search. XGBoostEstimator is provided by the project's
# Spark XGBoost wrapper; its exact import path is not shown here.
import numpy as np
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from preprocess import get_negative_samples, get_positive_samples, get_dataset_df
from evaluate import evaluate_binary_classifier
from workdir import workdir


def grid_search(spark):
    sample_ratio = 0.05
    neg_samples = get_negative_samples(spark).sample(sample_ratio).na.fill(0)
    pos_samples = get_positive_samples(spark).sample(sample_ratio).na.fill(0)
    df = get_dataset_df(spark, pos_samples, neg_samples).na.fill(0)
    trainDF, testDF = df.randomSplit([0.8, 0.2], seed=0)

    xgboost = XGBoostEstimator(featuresCol="features",
                               labelCol="label",
                               predictionCol="prediction")
    pipeline = Pipeline().setStages([xgboost])

    paramGrid = (ParamGridBuilder()
                 .addGrid(xgboost.max_depth, list(range(3, 20, 6)))
                 .addGrid(xgboost.eta, list(np.linspace(0.2, 0.6, 4)))
                 .addGrid(xgboost.scale_pos_weight, list(np.linspace(0.03, 1.0, 3)))
                 .build())
    evaluator = BinaryClassificationEvaluator(labelCol="label",
                                              rawPredictionCol="probabilities",
                                              metricName="areaUnderPR")
    cv = (CrossValidator()
          .setEstimator(pipeline)
          .setEvaluator(evaluator)
          .setEstimatorParamMaps(paramGrid)
          .setNumFolds(3)
          .setCollectSubModels(True))  # keep sub-models so they can be reported below
    cvModel = cv.fit(trainDF)

    # The pipeline has a single stage, so the fitted XGBoost model is stages[0].
    bestModel = cvModel.bestModel.stages[0]

    with open(workdir + 'data/xgboost_tuning_results_1.txt', 'w') as file:
        for sub_model, result in zip(cvModel.subModels[0], cvModel.avgMetrics):
            file.write('==================================\n')
            for stage in sub_model.stages:
                params = stage.extractParamMap()
                for k in params:
                    file.write(f'{k.name}: {params[k]}\n')
            file.write(f"Area under PR: {result}\n")

    prediction = bestModel.transform(testDF)
    prediction = prediction.withColumn("rawPrediction",
                                       prediction['probabilities'])
    area_under_PR, f1_score = evaluate_binary_classifier(prediction)
    with open(workdir + 'data/xgboost_tuning_perf_1.txt', 'w') as file:
        file.write(f"Area Under PR = {area_under_PR}\nF1 score = {f1_score}")
    return
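# A minimal sketch (not part of the original script) of another way to inspect the
# tuning results: CrossValidatorModel exposes the parameter maps and their average
# metrics directly, so the best combination can be recovered without sub-models.
# The output file name is illustrative only.
def report_best_params(cvModel, path=workdir + 'data/xgboost_best_params_1.txt'):
    param_maps = cvModel.getEstimatorParamMaps()
    metrics = cvModel.avgMetrics
    # areaUnderPR: higher is better, so take the index of the largest metric
    best_idx = max(range(len(metrics)), key=lambda i: metrics[i])
    with open(path, 'w') as file:
        for param, value in param_maps[best_idx].items():
            file.write(f'{param.name}: {value}\n')
        file.write(f'Best area under PR: {metrics[best_idx]}\n')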
#!/usr/bin/env python
import accident_prediction_montreal
from pyspark.sql.functions import udf, min, max, col
from pyspark.sql.types import FloatType
from pyspark.ml.feature import VectorAssembler
from preprocess import get_negative_samples, get_positive_samples
from utils import init_spark
from preprocess import get_dataset_df
from export_results import *

result_dir = create_result_dir('base')
spark = init_spark()

neg_samples = get_negative_samples(spark).sample(0.5)
pos_samples = get_positive_samples(spark)
imbalance_ratio = neg_samples.count() / pos_samples.count()

train_set, test_set = get_dataset_df(spark, pos_samples, neg_samples)
train_set, test_set = train_set.persist(), test_set.persist()

# Extract the historical accident count (feature index 7) from the feature vector.
get_accidents_count = udf(lambda v: float(v[7]), FloatType())


def fit(train_set):
    accidents_count = train_set.select(
        get_accidents_count('features').alias('accidents_count'),
        'label')
    accidents_count_to_proba = []
    for i in range(377):
        accidents_count_higher = \
            accidents_count.filter(col('accidents_count') >= i)
        # Fraction of positive samples among rows with accidents_count >= i
        proba = (accidents_count_higher.filter(col('label') == 1.0).count()
                 / accidents_count_higher.count())
        accidents_count_to_proba.append(proba)
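# Hypothetical usage sketch (not from the original script): once fit() has produced
# the accidents_count -> probability table, it could be applied to a dataset with a
# UDF to obtain a baseline probability column. predict_baseline is an illustrative
# helper, not an existing project function.
def predict_baseline(dataset, accidents_count_to_proba):
    max_count = len(accidents_count_to_proba) - 1
    lookup = udf(
        lambda v: float(accidents_count_to_proba[min(int(v[7]), max_count)]),
        FloatType())
    return dataset.withColumn('probability', lookup('features'))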
from random_forest import balanced_random_forest_tuning, \
                          compute_precision_recall, \
                          compute_precision_recall_graph
from preprocess import get_positive_samples, \
                       get_negative_samples, \
                       get_dataset_df
from evaluate import evaluate_binary_classifier
from utils import init_spark
from workdir import workdir

spark = init_spark()
i = 1
sampleFraction = 0.01

neg_samples = get_negative_samples(spark).sample(sampleFraction)
pos_samples = get_positive_samples(spark).sample(sampleFraction)
df = get_dataset_df(spark, pos_samples, neg_samples)
(train_set, test_set) = df.randomSplit([0.8, 0.2])
(train_set, test_set) = (train_set.persist(), test_set.persist())

model = balanced_random_forest_tuning(train_set)

with open(workdir + f'data/brf_tuning_results_{i}.txt', 'w') as file:
    for sub_model, result in zip(model.subModels[0], model.avgMetrics):
        file.write('==================================\n')
        for stage in sub_model.stages:
            params = stage.extractParamMap()
            for k in params:
                file.write(f'{k.name}: {params[k]}\n')
        file.write(f"Area under PR: {result}\n")
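# A minimal follow-up sketch, assuming balanced_random_forest_tuning returns a fitted
# CrossValidatorModel: score the held-out test set with the best pipeline and record
# the metrics, mirroring the XGBoost script above. The output file name is illustrative.
prediction = model.bestModel.transform(test_set)
area_under_PR, f1_score = evaluate_binary_classifier(prediction)
with open(workdir + f'data/brf_tuning_perf_{i}.txt', 'w') as file:
    file.write(f"Area Under PR = {area_under_PR}\nF1 score = {f1_score}")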
#!/usr/bin/env python
import accident_prediction_montreal
from preprocess import get_negative_samples, get_positive_samples
from utils import init_spark
from workdir import workdir

spark = init_spark()

neg_samples = \
    get_negative_samples(spark,
                         save_to='data/negative-sample-new.parquet',
                         sample_ratio=1e-3)
print(neg_samples.count())
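# Optional sanity check (illustrative, not in the original script): read the sampled
# parquet back and confirm the row count. Whether save_to is resolved relative to
# workdir depends on get_negative_samples, so the path below is an assumption.
reloaded = spark.read.parquet('data/negative-sample-new.parquet')
print(reloaded.count())
reloaded.printSchema()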