import mlflow
import mlflow.spark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


def mlflow_rf(file_path, num_trees, max_depth):
    with mlflow.start_run(run_name="random-forest") as run:
        # Create train/test split
        spark = SparkSession.builder.appName("App").getOrCreate()
        airbnbDF = spark.read.parquet(file_path)
        (trainDF, testDF) = airbnbDF.randomSplit([.8, .2], seed=42)

        # Prepare the StringIndexer and VectorAssembler
        categoricalCols = [field for (field, dataType) in trainDF.dtypes
                           if dataType == "string"]
        indexOutputCols = [x + "Index" for x in categoricalCols]
        stringIndexer = StringIndexer(inputCols=categoricalCols,
                                      outputCols=indexOutputCols,
                                      handleInvalid="skip")
        numericCols = [field for (field, dataType) in trainDF.dtypes
                       if ((dataType == "double") & (field != "price"))]
        assemblerInputs = indexOutputCols + numericCols
        vecAssembler = VectorAssembler(inputCols=assemblerInputs,
                                       outputCol="features")

        # Log params: num_trees and max_depth
        mlflow.log_param("num_trees", num_trees)
        mlflow.log_param("max_depth", max_depth)

        rf = RandomForestRegressor(labelCol="price", maxBins=40,
                                   maxDepth=max_depth, numTrees=num_trees,
                                   seed=42)
        pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])

        # Log model
        pipelineModel = pipeline.fit(trainDF)
        mlflow.spark.log_model(pipelineModel, "model")

        # Log metrics: RMSE and R2
        predDF = pipelineModel.transform(testDF)
        regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                                  labelCol="price")
        rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
        r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
        mlflow.log_metrics({"rmse": rmse, "r2": r2})

        # Log artifact: feature importance scores
        rfModel = pipelineModel.stages[-1]
        pandasDF = (pd.DataFrame(list(zip(vecAssembler.getInputCols(),
                                          rfModel.featureImportances)),
                                 columns=["feature", "importance"])
                    .sort_values(by="importance", ascending=False))

        # First write to local filesystem, then tell MLflow where to find that file
        pandasDF.to_csv("/tmp/feature-importance.csv", index=False)
        mlflow.log_artifact("/tmp/feature-importance.csv")
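# A hypothetical usage sketch: the parquet path and hyperparameter values below
# are placeholders, not taken from the original snippet.
mlflow_rf("/path/to/airbnb-clean.parquet", num_trees=100, max_depth=5)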
import random

import pandas as pd
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier


def VarSelection(Data, Tgt='Target'):
    # Use the double-typed columns (except the target) as candidate features
    Cor = [i[0] for i in Data.dtypes if 'double' in i[1]]
    vectorassembler = VectorAssembler(
        inputCols=[c for c in Cor if c != Tgt],
        outputCol='assembled_features')
    DataM = vectorassembler.transform(Data)

    random_seed = 4
    num_iter = 10
    random.seed(random_seed)
    random_seeds = set([random.randint(0, 10000) for _ in range(num_iter)])

    features_random_seed = {}
    for random_seed in random_seeds:
        rf = RandomForestClassifier(featuresCol=vectorassembler.getOutputCol(),
                                    labelCol=Tgt,
                                    seed=random_seed)
        rf_model = rf.fit(DataM)
        importances = [(index, value) for index, value in enumerate(
            rf_model.featureImportances.toArray().tolist())]
        importances = sorted(importances, key=lambda value: value[1],
                             reverse=True)

        # Keep importances until 95% of cumulative importance is reached
        imp = 0
        vector_assembler_cols = vectorassembler.getInputCols()
        for element in importances:
            feature = vector_assembler_cols[element[0]]
            importance = element[1]
            if imp < 0.95:
                features_random_seed[feature] = features_random_seed.get(
                    feature, []) + [importance]
            else:
                features_random_seed[feature] = features_random_seed.get(
                    feature, []) + [None]
            imp += element[1]

    # Average the importances collected across the random seeds
    features_random_seed = pd.DataFrame(features_random_seed).T
    feature_importances = features_random_seed.dropna(how='all').mean(axis=1)
    list_of_feature_importance = sorted(zip(feature_importances.index,
                                            feature_importances),
                                        key=lambda x: x[1], reverse=True)
    print(list_of_feature_importance)
    return list_of_feature_importance
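# A hypothetical usage sketch: `trainDF` stands in for any DataFrame with
# double-typed feature columns and a numeric 'Target' label column.
selected_features = VarSelection(trainDF, Tgt='Target')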
# MAGIC 3. `dis`: weighted distances to five Boston employment centers
# MAGIC
# MAGIC Save the results to `bostonFeaturizedDF2`

# COMMAND ----------

# TODO
from pyspark.ml.feature import VectorAssembler

assembler = # FILL_IN
bostonFeaturizedDF2 = # FILL_IN

# COMMAND ----------

# TEST - Run this cell to test your solution
dbTest("ML1-P-02-01-01", True, set(assembler.getInputCols()) == {'indus', 'age', 'dis'})
dbTest("ML1-P-02-01-02", True, bool(bostonFeaturizedDF2.schema['newFeatures'].dataType))

print("Tests passed!")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Step 2: Train the Model
# MAGIC
# MAGIC Instantiate a linear regression model `lrNewFeatures`. Save the trained model to `lrModelNew`.

# COMMAND ----------

# TODO
from pyspark.ml.regression import LinearRegression
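# COMMAND ----------

# MAGIC %md
# MAGIC A possible completion for the two `TODO` cells above, inferred from the test
# MAGIC assertions (the source DataFrame name `bostonDF` and the label column `medv`
# MAGIC are assumptions not shown in this excerpt):

# COMMAND ----------

assembler = VectorAssembler(inputCols=["indus", "age", "dis"], outputCol="newFeatures")
bostonFeaturizedDF2 = assembler.transform(bostonDF)  # bostonDF assumed to hold the raw Boston data

lrNewFeatures = LinearRegression(featuresCol="newFeatures", labelCol="medv")  # label column assumed
lrModelNew = lrNewFeatures.fit(bostonFeaturizedDF2)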
from tabulate import tabulate


def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
        base_path
    )
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn(
        "CRSDepHourOfDay", hour(features.CRSDepTime)
    )
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime)
    )
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay",
                              "CRSArrTime", "CRSArrHourOfDay").show()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [
        (column, features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(
        splits=splits,
        inputCol="ArrDelay",
        outputCol="ArrDelayBucket"
    )

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Feature extraction tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(
            inputCol=column,
            outputCol=column + "_index"
        )
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column
        )
        string_indexer_model.write().overwrite().save(string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfYear",
        "CRSDepHourOfDay", "CRSArrHourOfDay"]
    index_columns = [column + "_index" for column in string_columns]
    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns,
        outputCol="Features_vec"
    )
    final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train and evaluate classifier: loop over the test/train splits for 4 metrics
    #
    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".format(
            i,
            split_count,
        ))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])

        # Instantiate and fit random forest classifier on all the data
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
            base_path
        )
        model.write().overwrite().save(model_output_path)

        # Evaluate model using test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name
            )
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect feature importances
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names, feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Evaluate average and STD of each metric and print a table
    #
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []  # ha
    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)

        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the score to a score log that exists between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the existing score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name] for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the existing average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report feature importance changes
    #

    # Compute averages for each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print
    import operator
    sorted_feature_importances = sorted(
        feature_importance_entry.items(),
        key=operator.itemgetter(1),
        reverse=True
    )

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with the previous run's
    #

    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in score for each feature
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort feature deltas, biggest change first
    import operator
    sorted_feature_deltas = sorted(
        feature_deltas.items(),
        key=operator.itemgetter(1),
        reverse=True
    )

    # Display sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append the existing average deltas to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
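# A minimal entry-point sketch (an assumption, not part of the original excerpt):
# the script presumably receives the project base path as its first CLI argument.
if __name__ == "__main__":
    import sys
    main(sys.argv[1])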
from tabulate import tabulate


def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
        StructField("FlightTime", IntegerType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_flight_times.json".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn("CRSDepHourOfDay",
                                             hour(features.CRSDepTime))
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime))
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay",
                              "CRSArrTime", "CRSArrHourOfDay").show()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [
        (column, features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import the feature tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with the indexes of the nominal ones
    # into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfYear",
        "CRSDepHourOfDay", "CRSArrHourOfDay", "FlightTime"
    ]
    index_columns = [column + "_index" for column in string_columns]
    vector_assembler = VectorAssembler(inputCols=numeric_columns + index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_6.0.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train and evaluate the classifier: loop over the test/train splits for 4 metrics
    #
    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".
              format(
                  i,
                  split_count,
              ))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit a random forest classifier
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.flight_time.bin".format(
            base_path)
        model.write().overwrite().save(model_output_path)

        # Evaluate the model using the test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect feature importances
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names,
                                                    feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Evaluate the average and STD of each metric and print a table
    #
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []  # ha
    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)

        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the score to a score log that exists between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the existing score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name] for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the existing average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for the next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report feature importance changes
    #

    # Compute averages for each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print
    import operator
    sorted_feature_importances = sorted(feature_importance_entry.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with the previous run's
    #

    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in score for each feature
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[
            feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort the feature deltas, biggest change first
    import operator
    sorted_feature_deltas = sorted(feature_deltas.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)

    # Display the sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append the existing average deltas to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for the next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
# Log model
pipelineModel = pipeline.fit(trainDF)
mlflow.spark.log_model(pipelineModel, "model")

# Log metrics: RMSE and R2
predDF = pipelineModel.transform(testDF)
regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                          labelCol="price")
rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
mlflow.log_metrics({"rmse": rmse, "r2": r2})

# Log artifact: Feature Importance Scores
rfModel = pipelineModel.stages[-1]
pandasDF = (pd.DataFrame(list(zip(vecAssembler.getInputCols(),
                                  rfModel.featureImportances)),
                         columns=["feature", "importance"])
            .sort_values(by="importance", ascending=False))

# First write to local filesystem, then tell MLflow where to find that file
pandasDF.to_csv("feature-importance.csv", index=False)
mlflow.log_artifact("feature-importance.csv")

# COMMAND ----------

# MAGIC %md
# MAGIC ## MLflowClient

# COMMAND ----------

from mlflow.tracking import MlflowClient
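# COMMAND ----------

# A minimal sketch of querying the run just logged via the tracking client. It
# assumes the `run` object from the surrounding `with mlflow.start_run(...) as run:`
# block is still in scope; the print is illustrative only.
client = MlflowClient()
logged_run = client.get_run(run.info.run_id)
print(logged_run.data.metrics)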
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler

# Load the data and drop rows missing coordinates, P1, or timestamp.
df = spark.read.option("header", "true").csv("/user/root/data/*.csv")
df_notnull = df.filter(
    F.col("lon").isNotNull() & F.col("lat").isNotNull() &
    F.col('P1').isNotNull() & F.col('timestamp').isNotNull())
df = df_notnull

# Keep only the year-month prefix of the timestamp
df_timestamp = df.withColumn('timestamp', df['timestamp'].substr(1, 7))
df = df_timestamp
timestamp = df.collect()[0][5]

features = ['P1', 'lon', 'lat']
vector_assembler = VectorAssembler(inputCols=features, outputCol="features")

# Cast feature columns to double
expression = [
    F.col(c).cast("Double").alias(c) for c in vector_assembler.getInputCols()
]
dataframe_v = df.select(*expression)
dataframe_t = vector_assembler.transform(dataframe_v)

# Add an id so the feature vectors can be joined back to the original rows
dataframe_t = dataframe_t.withColumn("id", F.monotonically_increasing_id())
df = df.withColumn(
    "id", F.monotonically_increasing_id()).drop("P1").drop("lon").drop("lat")
df_joined = df.join(dataframe_t, "id", "inner").drop("id")
df_joined.cache()

# Average P1 per (lat, lon) location
min_max_avg_df = df_joined.groupBy('lat', 'lon').agg(F.avg(
    df_joined.P1)).withColumnRenamed('avg(P1)', 'avg').orderBy('lat')
df_cloned = spark.createDataFrame(min_max_avg_df.rdd, min_max_avg_df.schema)
from pyspark.sql import functions as sf
from pyspark.sql.functions import col, monotonically_increasing_id
from pyspark.ml.feature import VectorAssembler

# Remove null values in lat/long
df_notnull = df.filter(sf.col("Latitude").isNotNull() & sf.col("Longitude").isNotNull())

if limitInput:
    df_limit = df_notnull.limit(observations)
else:
    df_limit = df_notnull

featureColumns = ["Latitude", "Longitude"]
vectorAssembler = VectorAssembler(inputCols=featureColumns, outputCol="Features")

# For the case where the columns are strings instead of doubles, cast them first.
expr = [col(c).cast("Double").alias(c) for c in vectorAssembler.getInputCols()]

# Apply the above expression
df_vector = df_limit.select(*expr)

# Transform the DataFrame based on the vector assembler
df_trans = vectorAssembler.transform(df_vector)

# Create an id that can be used to correlate each observation to its feature vector
df_trans = df_trans.withColumn("id", monotonically_increasing_id())
df_limit = df_limit.withColumn("id", monotonically_increasing_id()).drop("Latitude").drop("Longitude")

# Drop the id column after joining
df_joined = df_limit.join(df_trans, "id", "inner").drop("id")
df_joined.cache()
from pyspark.ml.feature import SQLTransformer, VectorAssembler


def add_features_maker(stages):
    '''
    INPUT:
        stages - (list) list of transformers to be used as the 'stages' argument
                 of the pyspark Pipeline() constructor.
                 It must be an output of the 'create_label_maker()' function.

    OUTPUT:
        stages - (list) list of transformers to be used as the 'stages' argument
                 of the pyspark Pipeline() constructor
        feature_labels - (list) list of feature column names for utility

    DESCRIPTION:
        This is a subroutine of the create_preprocess_pipeline() function.
        Stages added by this function will make feature columns in the target
        pyspark dataframe. A usage sketch follows the function definition.
    '''
    # 'event_name'
    # replace whitespace of page column with underscore and put into a new column
    sqlTrans = SQLTransformer(statement=" \
        SELECT userId, Churn AS label, ts, registration, level, event_name \
        FROM ( \
            SELECT *, REPLACE(page, ' ', '_') AS event_name \
            FROM __THIS__)")
    stages.append(sqlTrans)

    # 'event_name' elements
    event_names = [
        'About',
        'Add_Friend',
        'Add_to_Playlist',
        # 'Cancel',
        # 'Cancellation_Confirmation',
        'Downgrade',
        'Error',
        'Help',
        'Home',
        'Logout',
        'NextSong',
        'Roll_Advert',
        'Save_Settings',
        'Settings',
        'Submit_Downgrade',
        'Submit_Upgrade',
        'Thumbs_Down',
        'Thumbs_Up',
        'Upgrade']

    # 'eventInterval'
    # add a column to store event intervals (in seconds)
    sqlTrans = SQLTransformer(statement=" \
        SELECT *, \
        ((FIRST_VALUE(ts) OVER ( \
            PARTITION BY userId, event_name \
            ORDER BY ts DESC \
            ROWS BETWEEN 1 PRECEDING AND CURRENT ROW \
        ) / 1000) - (LAST_VALUE(ts) OVER ( \
            PARTITION BY userId, event_name \
            ORDER BY ts DESC \
            ROWS BETWEEN 1 PRECEDING AND CURRENT ROW \
        ) / 1000)) AS eventInterval \
        FROM __THIS__")
    stages.append(sqlTrans)

    # 'lastTS'
    # add a column to store the last TS for each user
    sqlTrans = SQLTransformer(statement=" \
        SELECT *, \
        (FIRST_VALUE(ts) OVER ( \
            PARTITION BY userId, event_name \
            ORDER BY ts DESC \
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW \
        )) AS lastTS \
        FROM __THIS__")
    stages.append(sqlTrans)

    # 'trueInterval'
    # set the last TS row's interval value to Null
    sqlTrans = SQLTransformer(statement=" \
        SELECT *, \
        CASE WHEN ts == lastTS \
        THEN NULL ELSE eventInterval END AS trueInterval \
        FROM __THIS__")
    stages.append(sqlTrans)

    # 'trueInterval'(update), 'pageCount', 'paidCount', 'songCount'
    # group by userId and page:
    # average the interval for NextSong, count the other events,
    # and also count paid songs and total songs
    sqlTrans = SQLTransformer(statement=" \
        SELECT label, userId, event_name, \
        AVG(trueInterval) AS trueInterval, \
        COUNT(event_name) AS pageCount, \
        COUNT(CASE WHEN event_name = 'NextSong' AND level = 'paid' \
            THEN event_name END) AS paidCount, \
        COUNT(CASE WHEN event_name = 'NextSong' \
            THEN event_name END) AS songCount \
        FROM __THIS__ \
        GROUP BY label, userId, event_name")
    stages.append(sqlTrans)

    # 'songInterval'
    # add a column to store the interval when page is NextSong
    sqlTrans = SQLTransformer(statement=" \
        SELECT *, \
        CASE WHEN event_name == 'NextSong' \
        THEN trueInterval END AS songInterval \
        FROM __THIS__")
    stages.append(sqlTrans)

    # 'songInterval'(update), 'paidRatio',
    # elements of the event_names list as new columns
    # group by userId, average song intervals, count other events and
    # divide the count by songCount
    # loop over event names to create the SQL expressions and join them
    sql_line = ', '.join([
        '(COUNT(CASE WHEN event_name == "{}" THEN pageCount END) '
        '/ SUM(songCount)) AS {}'.format(name, name)
        for name in event_names])
    sqlTrans = SQLTransformer(statement=" \
        SELECT label, userId, \
        MAX(songInterval) AS songInterval, \
        (MAX(paidCount) / MAX(songCount)) AS paidRatio, \
        {} \
        FROM __THIS__ \
        GROUP BY label, userId".format(sql_line))
    stages.append(sqlTrans)

    # 'featureVec'
    # assemble feature columns into a vector column
    event_names.remove('NextSong')
    feature_columns = ['songInterval', 'paidRatio'] + event_names
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='featureVec')
    stages.append(assembler)

    # store feature labels for utility
    feature_labels = assembler.getInputCols()

    return stages, feature_labels
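# A hypothetical usage sketch: `label_stages` stands in for the output of
# create_label_maker() mentioned in the docstring; only the Pipeline wiring is new here.
from pyspark.ml import Pipeline

stages, feature_labels = add_features_maker(label_stages)
preprocess_pipeline = Pipeline(stages=stages)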
                     header=True, sep=',')

### Select features and label
data = csv.select(*(csv.columns[:-1] + [((col("y")).cast("Int").alias("label"))]))
# print(data)

### Split the data and rename the label column
splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1].withColumnRenamed("label", "trueLabel")

### Define the pipeline
assembler = VectorAssembler(inputCols=data.columns[:-1], outputCol="features")
print("Input Columns: ", assembler.getInputCols())
print("Output Column: ", assembler.getOutputCol())
algorithm = LogisticRegression(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[assembler, algorithm])

### Tune Parameters
lr_reg_params = [0.01, 0.5, 2.0]
lr_elasticnet_param = [0.0, 0.5, 1.0]
lr_max_iter = [1, 5, 10]

### CrossValidation
folds = 2
parallelism = 3
evaluator = BinaryClassificationEvaluator()
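### A sketch of how the settings above could be wired into a cross-validated fit.
### The ParamGridBuilder/CrossValidator construction is an assumption; only the
### parameter lists, folds, parallelism, evaluator and pipeline come from the code above.
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

paramGrid = (ParamGridBuilder()
             .addGrid(algorithm.regParam, lr_reg_params)
             .addGrid(algorithm.elasticNetParam, lr_elasticnet_param)
             .addGrid(algorithm.maxIter, lr_max_iter)
             .build())

cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=folds,
                    parallelism=parallelism)
cvModel = cv.fit(train)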
data_test = vecAssembler.transform(data_test)

# ================== RFC ===================
RFC_start = time.time()
rf = RandomForestClassifier(labelCol='labels', featuresCol='features',
                            maxDepth=best_RFC_maxDepth, numTrees=best_RFC_numTrees,
                            maxBins=best_RFC_maxBins, impurity='entropy', seed=myseed)
RFC_model = rf.fit(data_training)
pred_RFC_all = RFC_model.transform(data_test)
accuracy_RFC_all = evaluator.evaluate(pred_RFC_all)
auc_RFC_all = evaluator_auc.evaluate(pred_RFC_all)
RFC_end = time.time()

import pandas as pd
featureImpRF = pd.DataFrame(list(
    zip(vecAssembler.getInputCols(), RFC_model.featureImportances)),
    columns=["feature", "importance"])
featureImpRF = featureImpRF.sort_values(by="importance", ascending=False)

RFC_time = RFC_end - RFC_start
print("Random Forest Classifier: {} s".format(RFC_time))
print("The accuracy of RFC with the larger dataset = %g " % accuracy_RFC_all)
print("The AUC of RFC is %g" % auc_RFC_all)
print("The most important features are:", featureImpRF)

# ================ Gradient Boosting Classifier ====================
GBC_start = time.time()
gbc = GBTClassifier(labelCol='labels', featuresCol='features',
                    maxIter=best_GBT_maxIter, maxDepth=best_GBT_maxDepth,
                    maxBins=best_GBT_maxBins)
                                                  accuracy))

# COMMAND ----------

model = finalModel.stages[-1]
display(model)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Evaluate Feature Importance

# COMMAND ----------

# zip the list of features with their scores
scores = zip(assembler.getInputCols(), model.featureImportances)

# and pretty-print them
for x in scores:
    print("%-15s = %s" % x)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Register Model
# MAGIC
# MAGIC #### Create a new registered model using the API
# MAGIC
# MAGIC The following cells use the `mlflow.register_model()` function to create a new registered model named `IrisModel`. This also creates a new model version (e.g., `Version 1` of `IrisModel`).

# COMMAND ----------
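# A minimal sketch of the registration call described above. `run_id` is assumed
# to hold the id of the MLflow run that logged the model under artifact path "model";
# neither name is shown in this excerpt.
model_uri = "runs:/{}/model".format(run_id)
model_details = mlflow.register_model(model_uri=model_uri, name="IrisModel")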
    multinomialRegression
])

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorAssembler

dbTest("ML1-P-07-02-01", True, type(indexer) == type(StringIndexer()))
dbTest("ML1-P-07-02-02", True, indexer.getInputCol() == 'species')
dbTest("ML1-P-07-02-03", True, indexer.getOutputCol() == 'speciesClass')

dbTest("ML1-P-07-02-04", True, type(assembler) == type(VectorAssembler()))
dbTest("ML1-P-07-02-05", True, assembler.getInputCols() == irisDF.columns[:-1])
dbTest("ML1-P-07-02-06", True, assembler.getOutputCol() == 'features')

dbTest("ML1-P-07-02-07", True, type(multinomialRegression) == type(LogisticRegression()))
dbTest("ML1-P-07-02-08", True, multinomialRegression.getLabelCol() == "speciesClass")
dbTest("ML1-P-07-02-09", True, multinomialRegression.getFeaturesCol() == 'features')

dbTest("ML1-P-07-02-10", True, type(pipeline) == type(Pipeline()))

print("Tests passed!")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Step 3: Train the Model and Transform the Dataset
# MAGIC
# ### Exercise 8 (b) Inspect the model

# The learner has now been trained. Let's inspect the weights of the
# (trained) linear regression model, which is stored as the second element of
# our pipeline.
#
# Run the next cell. Ensure that you understand what's going on. Ask for help if
# you have questions.

# In[ ]:

# The coefficients (i.e., weights) are as follows:
weights = lrModel.stages[1].coefficients

# The corresponding features for these weights are:
featuresNoLabel = vectorizer.getInputCols()

# Print coefficients
list(zip(featuresNoLabel, weights))

# Print the intercept
print(lrModel.stages[1].intercept)

# **Exercises**
#
# - Write down the linear regression equation that your model learned.
# - Recall when we visualized each predictor against Power Output using a Scatter
#   Plot; does the final equation seem logical given
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

rf = RandomForestRegressor(labelCol="price", maxBins=40, maxDepth=5,
                           numTrees=100, seed=42)
pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])

with mlflow.start_run(run_name="random-forest") as run:
    # Log params: num_trees and max_depth
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())

    # Log model
    pipelineModel = pipeline.fit(trainDF)
    mlflow.spark.log_model(pipelineModel, "model")

    # Log metrics: RMSE and R2
    predDF = pipelineModel.transform(testDF)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                              labelCol="price")
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})

    # Log artifact: feature importance scores
    rfModel = pipelineModel.stages[-1]
    pandasDF = (pd.DataFrame(
        list(zip(vecAssembler.getInputCols(), rfModel.featureImportances)),
        columns=["feature", "importance"])
        .sort_values(by="importance", ascending=False))

    # First write to local filesystem, then tell MLflow where to find that file
    pandasDF.to_csv("feature-importance.csv", index=False)
    mlflow.log_artifact("feature-importance.csv")
def TransformDataframe(self, vectors):
    assembler = VectorAssembler(inputCols=vectors, outputCol="features")
    # Cast the feature columns to float before assembling them
    expr = [
        col(c).cast("float").alias(c) for c in assembler.getInputCols()
    ]
    self.dataframe = assembler.transform(self.dataframe.select(*expr))
print(dtcModel.toDebugString)

# COMMAND ----------

# Visualize the decision tree
display(dtcModel)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Evaluate Feature Importance

# COMMAND ----------

list(zip(assembler.getInputCols(), dtcModel.featureImportances))

# COMMAND ----------

# MAGIC %md
# MAGIC ### Evaluate Model Performance

# COMMAND ----------

# Vectorize the features of the test set
assembledTestDF = assembler.transform(testDF)

# Make predictions using the vectorized test set
testPredictionDF = dtcModel.transform(assembledTestDF)

display(testPredictionDF)
indexer = StringIndexer(inputCol="dest_ip", outputCol="dest_ipIndex")
model = indexer.fit(indexed)
indexed = model.transform(indexed)

indexer = StringIndexer(inputCol="tcp_type", outputCol="tcp_typeIndex")
model = indexer.fit(indexed)
indexed = model.transform(indexed)

indexer = StringIndexer(inputCol="tcp_portno", outputCol="tcp_portnoIndex")
model = indexer.fit(indexed)
indexed = model.transform(indexed)

req_test_dataDF = indexed.select("label", "ttl", "offset", "tcp_typeIndex",
                                 "p_length", "source_ipIndex", "dest_ipIndex",
                                 "tcp_portnoIndex")

assembler = VectorAssembler(
    inputCols=["ttl", "offset", "tcp_typeIndex", "p_length",
               "source_ipIndex", "dest_ipIndex", "tcp_portnoIndex"],
    outputCol="features"
)

expr = [col(c).cast("Double").alias(c) for c in assembler.getInputCols()]

df2 = req_dataDF.select("label", *expr)
df = assembler.transform(df2.na.drop())
training = df.select("label", "features")

test_df2 = req_test_dataDF.select("label", *expr)
test_df = assembler.transform(test_df2.na.drop())
testing = test_df.select("label", "features")

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

lrModel = lr.fit(training)
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

prediction = lrModel.transform(testing)
result = prediction.select("features", "label", "probability", "prediction") \
    .collect()
indexer = StringIndexer(inputCol="tcp_type", outputCol="tcp_typeIndex")
model = indexer.fit(indexed)
indexed = model.transform(indexed)

indexer = StringIndexer(inputCol="tcp_portno", outputCol="tcp_portnoIndex")
model = indexer.fit(indexed)
indexed = model.transform(indexed)

req_test_dataDF = indexed.select("label", "ttl", "offset", "tcp_typeIndex",
                                 "p_length", "source_ipIndex", "dest_ipIndex",
                                 "tcp_portnoIndex")

assembler = VectorAssembler(inputCols=[
    "ttl", "offset", "tcp_typeIndex", "p_length", "source_ipIndex",
    "dest_ipIndex", "tcp_portnoIndex"
], outputCol="features")

expr = [col(c).cast("Double").alias(c) for c in assembler.getInputCols()]

df2 = req_dataDF.select("label", *expr)
df = assembler.transform(df2.na.drop())
training = df.select("label", "features")

test_df2 = req_test_dataDF.select("label", *expr)
test_df = assembler.transform(test_df2.na.drop())
testing = test_df.select("label", "features")

nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
model = nb.fit(training)

predictions = model.transform(testing)
predictions.show()

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)