Example #1
import mlflow
import mlflow.spark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


def mlflow_rf(file_path, num_trees, max_depth):
  with mlflow.start_run(run_name="random-forest") as run:
    # Create train/test split
    spark = SparkSession.builder.appName("App").getOrCreate()
    airbnbDF = spark.read.parquet(file_path)
    (trainDF, testDF) = airbnbDF.randomSplit([.8, .2], seed=42)

    # Prepare the StringIndexer and VectorAssembler
    categoricalCols = [field for (field, dataType) in trainDF.dtypes if dataType == "string"]
    indexOutputCols = [x + "Index" for x in categoricalCols]

    stringIndexer = StringIndexer(inputCols=categoricalCols, outputCols=indexOutputCols, handleInvalid="skip")

    numericCols = [field for (field, dataType) in trainDF.dtypes if ((dataType == "double") & (field != "price"))]
    assemblerInputs = indexOutputCols + numericCols
    vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    
    # Log params: Num Trees and Max Depth
    mlflow.log_param("num_trees", num_trees)
    mlflow.log_param("max_depth", max_depth)

    rf = RandomForestRegressor(labelCol="price",
                               maxBins=40,
                               maxDepth=max_depth,
                               numTrees=num_trees,
                               seed=42)

    pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])

    # Log model
    pipelineModel = pipeline.fit(trainDF)
    mlflow.spark.log_model(pipelineModel, "model")

    # Log metrics: RMSE and R2
    predDF = pipelineModel.transform(testDF)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                            labelCol="price")
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})

    # Log artifact: Feature Importance Scores
    rfModel = pipelineModel.stages[-1]
    pandasDF = (pd.DataFrame(list(zip(vecAssembler.getInputCols(),
                                    rfModel.featureImportances)),
                          columns=["feature", "importance"])
              .sort_values(by="importance", ascending=False))
    # First write to local filesystem, then tell MLflow where to find that file
    pandasDF.to_csv("/tmp/feature-importance.csv", index=False)
    mlflow.log_artifact("/tmp/feature-importance.csv")
Example #2
import random

import pandas as pd
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler


def VarSelection(Data, Tgt='Target'):
    # Collect the double-typed columns, excluding the target
    Cor = [i[0] for i in Data.dtypes if 'double' in i[1]]
    vectorassembler = VectorAssembler(
        inputCols=[c for c in Cor if c != Tgt],
        outputCol='assembled_features')
    DataM = vectorassembler.transform(Data)
    random_seed = 4
    num_iter = 10
    random.seed(random_seed)
    random_seeds = set([random.randint(0, 10000) for _ in range(num_iter)])
    features_random_seed = {}
    for random_seed in random_seeds:
        rf = RandomForestClassifier(featuresCol=vectorassembler.getOutputCol(),
                                    labelCol=Tgt,
                                    seed=random_seed)
        rf_model = rf.fit(DataM)

        importances = [(index, value) for index, value in enumerate(
            rf_model.featureImportances.toArray().tolist())]
        importances = sorted(importances,
                             key=lambda value: value[1],
                             reverse=True)
        imp = 0
        vector_assembler_cols = vectorassembler.getInputCols()
        for element in importances:
            feature = vector_assembler_cols[element[0]]
            importance = element[1]

            if imp < 0.95:
                features_random_seed[feature] = features_random_seed.get(
                    feature, []) + [importance]
            else:
                features_random_seed[feature] = features_random_seed.get(
                    feature, []) + [None]
            imp += element[1]
    features_random_seed = pd.DataFrame(features_random_seed).T
    feature_importances = features_random_seed.dropna(how='all').mean(axis=1)
    list_of_feature_importance = sorted(zip(feature_importances.index,
                                            feature_importances),
                                        key=lambda x: x[1],
                                        reverse=True)
    print(list_of_feature_importance)
    return list_of_feature_importance
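
# A hedged usage sketch: `df` is an assumed Spark DataFrame whose feature
# columns are doubles and whose numeric label column is named 'Target'.
ranked = VarSelection(df, Tgt='Target')
top_features = [name for name, _ in ranked[:10]]  # keep the ten strongest features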
Example #3

# MAGIC 3. `dis`: weighted distances to five Boston employment centers
# MAGIC 
# MAGIC Save the results to `bostonFeaturizedDF2`

# COMMAND ----------

# TODO
from pyspark.ml.feature import VectorAssembler

assembler = # FILL_IN
bostonFeaturizedDF2 = # FILL_IN
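
# A possible solution, commented out to keep the exercise open; the source
# DataFrame name (bostonDF) is an assumption from the notebook's context:
# assembler = VectorAssembler(inputCols=["indus", "age", "dis"], outputCol="newFeatures")
# bostonFeaturizedDF2 = assembler.transform(bostonDF)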

# COMMAND ----------

# TEST - Run this cell to test your solution
dbTest("ML1-P-02-01-01", True, set(assembler.getInputCols()) == {'indus', 'age', 'dis'})
dbTest("ML1-P-02-01-02", True, bool(bostonFeaturizedDF2.schema['newFeatures'].dataType))

print("Tests passed!")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Step 2: Train the Model
# MAGIC 
# MAGIC Instantiate a linear regression model `lrNewFeatures`.  Save the trained model to `lrModelNew`.

# COMMAND ----------

# TODO
from pyspark.ml.regression import LinearRegression
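
# A possible solution, commented out to keep the exercise open; the label
# column name ("medv") is an assumption from the Boston housing context:
# lrNewFeatures = LinearRegression(labelCol="medv", featuresCol="newFeatures")
# lrModelNew = lrNewFeatures.fit(bostonFeaturizedDF2)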
Example #4

def main(base_path):
  APP_NAME = "train_spark_mllib_model.py"
  from tabulate import tabulate  # used by the report tables below
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf
  
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),
    StructField("CRSArrTime", TimestampType(), True),
    StructField("CRSDepTime", TimestampType(), True),
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Route", StringType(), True),
    StructField("TailNum", StringType(), True),
    StructField("EngineManufacturer", StringType(), True),
    StructField("EngineModel", StringType(), True),
    StructField("Manufacturer", StringType(), True),
    StructField("ManufacturerYear", StringType(), True),
    StructField("OwnerState", StringType(), True),
  ])
  
  input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()
  
  #
  # Add the hour of day of scheduled arrival/departure
  #
  from pyspark.sql.functions import hour
  features_with_hour = features.withColumn(
    "CRSDepHourOfDay",
    hour(features.CRSDepTime)
  )
  features_with_hour = features_with_hour.withColumn(
    "CRSArrHourOfDay",
    hour(features.CRSArrTime)
  )
  features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()
  
  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
  cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
  print("\nNull Value Report")
  print("-----------------")
  print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))
  
  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into very early, on time, slightly late, very late (0, 1, 2, 3)
  #
  from pyspark.ml.feature import Bucketizer
  
  # Setup the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )
  
  # Save the model
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
  
  # Apply the model
  ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()
  
  #
  # Feature extraction tools from pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler
  
  # Turn category fields into indexes
  string_columns = ["Carrier", "Origin", "Dest", "Route",
                    "TailNum"]
  for column in string_columns:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )
    
    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
    
    # Save the pipeline model
    string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)
  
  # Combine continuous, numeric fields with indexes of nominal ones
  # ...into one feature vector
  numeric_columns = [
    "DepDelay", "Distance",
    "DayOfYear",
    "CRSDepHourOfDay",
    "CRSArrHourOfDay"]
  index_columns = [column + "_index" for column in string_columns]
  
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)
  
  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)
  
  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)
  
  # Inspect the finalized features
  final_vectorized_features.show()
  
  #
  # Cross validate, train and evaluate the classifier: loop over the test/train splits for 4 metrics
  #
  
  from collections import defaultdict
  scores = defaultdict(list)
  feature_importances = defaultdict(list)
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3
  
  for i in range(1, split_count + 1):
    print("\nRun {} out of {} of test/train splits in cross validation...".format(
      i,
      split_count,
    ))
    
    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])
    
    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4896,
    )
    model = rfc.fit(training_data)
    
    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)
    
    # Evaluate model using test data
    predictions = model.transform(test_data)
    
    # Evaluate this split's results for each metric
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    for metric_name in metric_names:
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)
      
      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))
    
    #
    # Collect feature importances
    #
    feature_names = vector_assembler.getInputCols()
    feature_importance_list = model.featureImportances
    for feature_name, feature_importance in zip(feature_names, feature_importance_list):
      feature_importances[feature_name].append(feature_importance)
  
  #
  # Evaluate average and STD of each metric and print a table
  #
  import numpy as np
  score_averages = defaultdict(float)
  
  # Compute the table data
  average_stds = []  # ha
  for metric_name in metric_names:
    metric_scores = scores[metric_name]
    
    average_accuracy = sum(metric_scores) / len(metric_scores)
    score_averages[metric_name] = average_accuracy
    
    std_accuracy = np.std(metric_scores)
    
    average_stds.append((metric_name, average_accuracy, std_accuracy))
  
  # Print the table
  print("\nExperiment Log")
  print("--------------")
  print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))
  
  #
  # Persist the score to a score log that exists between runs
  #
  import pickle
  
  # Load the score log or initialize an empty one
  try:
    score_log_filename = "{}/models/score_log.pickle".format(base_path)
    score_log = pickle.load(open(score_log_filename, "rb"))
    if not isinstance(score_log, list):
      score_log = []
  except IOError:
    score_log = []
  
  # Compute the existing score log entry
  score_log_entry = {
    metric_name: score_averages[metric_name] for metric_name in metric_names
  }
  
  # Compute and display the change in score for each metric
  try:
    last_log = score_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_log = score_log_entry
  
  experiment_report = []
  for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    experiment_report.append((metric_name, run_delta))
  
  print("\nExperiment Report")
  print("-----------------")
  print(tabulate(experiment_report, headers=["Metric", "Score"]))
  
  # Append the existing average scores to the log
  score_log.append(score_log_entry)
  
  # Persist the log for next run
  pickle.dump(score_log, open(score_log_filename, "wb"))
  
  #
  # Analyze and report feature importance changes
  #
  
  # Compute averages for each feature
  feature_importance_entry = defaultdict(float)
  for feature_name, value_list in feature_importances.items():
    average_importance = sum(value_list) / len(value_list)
    feature_importance_entry[feature_name] = average_importance
  
  # Sort the feature importances in descending order and print
  import operator
  sorted_feature_importances = sorted(
    feature_importance_entry.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  print("\nFeature Importances")
  print("-------------------")
  print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))
  
  #
  # Compare this run's feature importances with the previous run's
  #
  
  # Load the feature importance log or initialize an empty one
  try:
    feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
    feature_log = pickle.load(open(feature_log_filename, "rb"))
    if not isinstance(feature_log, list):
      feature_log = []
  except IOError:
    feature_log = []
  
  # Compute and display the change in score for each feature
  try:
    last_feature_log = feature_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_feature_log = defaultdict(float)
    for feature_name, importance in feature_importance_entry.items():
      last_feature_log[feature_name] = importance
  
  # Compute the deltas
  feature_deltas = {}
  for feature_name in feature_importances.keys():
    run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
    feature_deltas[feature_name] = run_delta
  
  # Sort feature deltas, biggest change first
  import operator
  sorted_feature_deltas = sorted(
    feature_deltas.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  # Display sorted feature deltas
  print("\nFeature Importance Delta Report")
  print("-------------------------------")
  print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))
  
  # Append the existing average deltas to the log
  feature_log.append(feature_importance_entry)
  
  # Persist the log for next run
  pickle.dump(feature_log, open(feature_log_filename, "wb"))
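
# A hedged sketch of a typical entry point; the original fragment ends inside
# main(), so this argv handling is an assumption.
if __name__ == "__main__":
  import sys
  main(sys.argv[1])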
Example #5
def main(base_path):
    APP_NAME = "train_spark_mllib_model.py"
    from tabulate import tabulate  # used by the report tables below

    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    #
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
        StructField("FlightTime", IntegerType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_flight_times.json".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn("CRSDepHourOfDay",
                                             hour(features.CRSDepTime))
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime))
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime",
                              "CRSArrHourOfDay").show()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [
        (column,
         features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into very early, on time, slightly late, very late (0, 1, 2, 3)
    #
    from pyspark.ml.feature import Bucketizer

    # Set up the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import the feature tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfYear", "CRSDepHourOfDay",
        "CRSArrHourOfDay", "FlightTime"
    ]
    index_columns = [column + "_index" for column in string_columns]

    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_6.0.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train and evaluate the classifier: loop over the test/train splits for 4 metrics
    #

    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".
              format(
                  i,
                  split_count,
              ))

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit(
            [0.8, 0.2])

        # Instantiate and fit a random forest classifier
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.flight_time.bin".format(
            base_path)
        model.write().overwrite().save(model_output_path)

        # Evaluate the model using the test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name)
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect feature importances
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names,
                                                    feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Compute the average and standard deviation of each metric and print a table
    #
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []  # ha
    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)

        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the score to a score log that exists between runs
    #
    import pickle

    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the existing score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name]
        for metric_name in metric_names
    }

    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the existing average scores to the log
    score_log.append(score_log_entry)

    # Persist the log for the next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report feature importance changes
    #

    # Compute averages for each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print
    import operator
    sorted_feature_importances = sorted(feature_importance_entry.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with the previous run's
    #

    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in score for each feature
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[
            feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort the feature deltas, biggest change first
    import operator
    sorted_feature_deltas = sorted(feature_deltas.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)

    # Display the sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append the existing average deltas to the log
    feature_log.append(feature_importance_entry)

    # Persist the log for the next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
Example #6
  # Log model
  pipelineModel = pipeline.fit(trainDF)
  mlflow.spark.log_model(pipelineModel, "model")

  # Log metrics: RMSE and R2
  predDF = pipelineModel.transform(testDF)
  regressionEvaluator = RegressionEvaluator(predictionCol="prediction", 
                                            labelCol="price")
  rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
  r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
  mlflow.log_metrics({"rmse": rmse, "r2": r2})

  # Log artifact: Feature Importance Scores
  rfModel = pipelineModel.stages[-1]
  pandasDF = (pd.DataFrame(list(zip(vecAssembler.getInputCols(), 
                                    rfModel.featureImportances)), 
                          columns=["feature", "importance"])
              .sort_values(by="importance", ascending=False))
  # First write to local filesystem, then tell MLflow where to find that file
  pandasDF.to_csv("feature-importance.csv", index=False)
  mlflow.log_artifact("feature-importance.csv")

# COMMAND ----------

# MAGIC %md
# MAGIC ## MLflowClient

# COMMAND ----------

from mlflow.tracking import MlflowClient
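
# A hedged sketch of a typical next step: query the tracking server for the
# run logged above; `run` is assumed to be the object from mlflow.start_run.
client = MlflowClient()
finished_run = client.get_run(run.info.run_id)
print(finished_run.data.metrics)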
Example #7

from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler

# Load the data.
df = spark.read.option("header", "true").csv("/user/root/data/*.csv")
df_notnull = df.filter(
    F.col("lon").isNotNull() & F.col("lat").isNotNull()
    & F.col('P1').isNotNull() & F.col('timestamp').isNotNull())
df = df_notnull
df_timestamp = df.withColumn('timestamp', df['timestamp'].substr(1, 7))
df = df_timestamp
timestamp = df.first()[5]  # read one value without collecting the whole DataFrame

features = ['P1', 'lon', 'lat']
vector_assembler = VectorAssembler(inputCols=features, outputCol="features")

# Cast feature columns to double
expression = [
    F.col(c).cast("Double").alias(c) for c in vector_assembler.getInputCols()
]

dataframe_v = df.select(*expression)
dataframe_t = vector_assembler.transform(dataframe_v)

dataframe_t = dataframe_t.withColumn("id", F.monotonically_increasing_id())
df = df.withColumn(
    "id", F.monotonically_increasing_id()).drop("P1").drop("lon").drop("lat")

df_joined = df.join(dataframe_t, "id", "inner").drop("id")
df_joined.cache()

min_max_avg_df = df_joined.groupBy('lat', 'lon').agg(F.avg(
    df_joined.P1)).withColumnRenamed('avg(P1)', 'avg').orderBy('lat')
df_cloned = spark.createDataFrame(min_max_avg_df.rdd, min_max_avg_df.schema)
Example #9
from pyspark.sql import functions as sf
from pyspark.sql.functions import col, monotonically_increasing_id
from pyspark.ml.feature import VectorAssembler

# Remove null values in lat/long
df_notnull = df.filter(sf.col("Latitude").isNotNull() & sf.col("Longitude").isNotNull())

if limitInput:
    df_limit = df_notnull.limit(observations)
else:
    df_limit = df_notnull

featureColumns = ["Latitude", "Longitude"]
vectorAssembler = VectorAssembler(inputCols=featureColumns,
                                  outputCol="Features")


# If the columns arrive as strings instead of doubles, cast them first.
expr = [col(c).cast("Double").alias(c)
        for c in vectorAssembler.getInputCols()]

#Apply the above expression
df_vector = df_limit.select(*expr)

#Transform the dataFrame based on the vector assembler
df_trans = vectorAssembler.transform(df_vector)

#Create an id that can be used to correlate each observation with its feature vector
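#Caveat: monotonically_increasing_id() only lines the two frames up because
#df_trans was derived from df_limit without a shuffle, so both keep the same
#partitioning and row order; joining on a real key would be safer.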
df_trans = df_trans.withColumn("id", monotonically_increasing_id())
df_limit = df_limit.withColumn("id", monotonically_increasing_id()).drop("Latitude").drop("Longitude")

#Drop one of the id columns after joining
df_joined = df_limit.join(df_trans, "id", "inner").drop("id")
df_joined.cache()
Example #10
from pyspark.ml.feature import SQLTransformer, VectorAssembler


def add_features_maker(stages):
    '''
    INPUT:
    stages - (list) list of transformers to be used as the 'stages' argument
        of the pyspark Pipeline() constructor.
        It must be the output of the 'create_label_maker()' function.

    OUTPUT:
    stages - (list) list of transformers to be used as the 'stages' argument
        of the pyspark Pipeline() constructor
    feature_labels - (list) list of feature column names for utility

    DESCRIPTION:
    This is a subroutine of create_preprocess_pipeline() function.
    Stages added by this function will make feature columns in target pyspark
        dataframe.
    '''
    # 'event_name'
    # replace whitespace in the page column with an underscore and put it into a new column
    sqlTrans = SQLTransformer(statement=" \
        SELECT userId, Churn AS label, ts, registration, level, event_name \
        FROM ( \
            SELECT *, REPLACE(page, ' ', '_') AS event_name \
            FROM __THIS__)")

    stages.append(sqlTrans)

    # 'event_name' elements
    event_names = [
         'About',
         'Add_Friend',
         'Add_to_Playlist',
         # 'Cancel',
         # 'Cancellation_Confirmation',
         'Downgrade',
         'Error',
         'Help',
         'Home',
         'Logout',
         'NextSong',
         'Roll_Advert',
         'Save_Settings',
         'Settings',
         'Submit_Downgrade',
         'Submit_Upgrade',
         'Thumbs_Down',
         'Thumbs_Up',
         'Upgrade']

    # 'eventInterval'
    # add a column to store event intervals (in seconds)
    sqlTrans = SQLTransformer(statement=" \
        SELECT *, \
            ((FIRST_VALUE(ts) OVER ( \
                PARTITION BY userId, event_name \
                ORDER BY ts DESC \
                ROWS BETWEEN 1 PRECEDING AND CURRENT ROW \
            ) / 1000) - (LAST_VALUE(ts) OVER ( \
                PARTITION BY userId, event_name \
                ORDER BY ts DESC \
                ROWS BETWEEN 1 PRECEDING AND CURRENT ROW \
            ) / 1000)) AS eventInterval \
        FROM __THIS__")

    stages.append(sqlTrans)

    # 'lastTS'
    # add a column to store the last TS for each user
    sqlTrans = SQLTransformer(statement=" \
        SELECT *, \
            (FIRST_VALUE(ts) OVER ( \
                PARTITION BY userId, event_name \
                ORDER BY ts DESC \
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW \
            )) AS lastTS \
        FROM __THIS__")

    stages.append(sqlTrans)

    # 'trueInterval'
    # set the last TS row's interval value to Null
    sqlTrans = SQLTransformer(statement=" \
        SELECT *, \
            CASE WHEN ts == lastTS\
            THEN NULL ELSE eventInterval END AS trueInterval \
        FROM __THIS__")

    stages.append(sqlTrans)

    # 'trueInterval'(update), 'pageCount', 'paidCount', 'songCount'
    # group by userId and page
    # we get average of interval for NextSong, and count for other events
    # we also count paid songs, and total songs
    sqlTrans = SQLTransformer(statement=" \
        SELECT label, userId, event_name, \
            AVG(trueInterval) AS trueInterval, \
            COUNT(event_name) AS pageCount, \
            COUNT(CASE WHEN event_name = 'NextSong' AND level = 'paid'\
                  THEN event_name END) AS paidCount, \
            COUNT(CASE WHEN event_name = 'NextSong'\
                  THEN event_name END) AS songCount \
        FROM __THIS__ \
        GROUP BY label, userId, event_name")

    stages.append(sqlTrans)

    # 'songInterval'
    # add a column to store interval when page is NextSong
    sqlTrans = SQLTransformer(statement=" \
        SELECT *, \
            CASE WHEN event_name == 'NextSong'\
            THEN trueInterval END AS songInterval \
        FROM __THIS__")

    stages.append(sqlTrans)

    # 'songInterval'(update), 'paidRatio',
    #   elements of the event_names list as new columns
    # group by userId, average the song intervals, count the other events, and
    #   divide each count by songCount

    # loop event names to create sql lines and concatenate them
    sql_line = ''.join([
            '(COUNT(CASE WHEN event_name == "{}" \
            THEN pageCount END) / SUM(songCount)) AS {},\
            '.format(name, name) for name in event_names])[:-1]

    sqlTrans = SQLTransformer(statement=" \
        SELECT label, userId, \
            MAX(songInterval) AS songInterval, \
            (MAX(paidCount) / MAX(songCount)) AS paidRatio, \
            {} \
        FROM __THIS__ \
        GROUP BY label, userId".format(sql_line))

    stages.append(sqlTrans)

    # 'featureVec'
    # assemble feature columns into a vector column
    event_names.remove('NextSong')
    feature_columns = ['songInterval', 'paidRatio'] + event_names

    assembler = VectorAssembler(inputCols=feature_columns,
                                outputCol='featureVec')

    stages.append(assembler)

    # store feature labels for utility
    feature_labels = assembler.getInputCols()

    return stages, feature_labels
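
# A hedged usage sketch; create_label_maker() and the input DataFrame `df`
# are assumed from the surrounding project and are not shown in this fragment.
from pyspark.ml import Pipeline

stages, feature_labels = add_features_maker(create_label_maker())
preprocess_pipeline = Pipeline(stages=stages)
features_df = preprocess_pipeline.fit(df).transform(df)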
Example #11
    header=True,
    sep=',')

### Select features and label
data = csv.select(*(csv.columns[:-1] +
                    [((col("y")).cast("Int").alias("label"))]))
# print(data)

### Split the data and rename Y column
splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1].withColumnRenamed("label", "trueLabel")

### Define the pipeline
assembler = VectorAssembler(inputCols=data.columns[:-1], outputCol="features")
print("Input Columns: ", assembler.getInputCols())
print("Output Column: ", assembler.getOutputCol())

algorithm = LogisticRegression(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[assembler, algorithm])

### Tune Parameters
lr_reg_params = [0.01, 0.5, 2.0]
lr_elasticnet_param = [0.0, 0.5, 1.0]
lr_max_iter = [1, 5, 10]

### CrossValidation
folds = 2
parallelism = 3

evaluator = BinaryClassificationEvaluator()
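
# The fragment declares its tuning grids, fold count, and evaluator but stops
# before wiring them together; a minimal sketch of the usual next step:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

paramGrid = (ParamGridBuilder()
             .addGrid(algorithm.regParam, lr_reg_params)
             .addGrid(algorithm.elasticNetParam, lr_elasticnet_param)
             .addGrid(algorithm.maxIter, lr_max_iter)
             .build())
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=folds, parallelism=parallelism)
cvModel = cv.fit(train)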
Example #12
import time

from pyspark.ml.classification import RandomForestClassifier, GBTClassifier

data_test = vecAssembler.transform(data_test)
#==================RFC===================
RFC_start = time.time()
rf = RandomForestClassifier(labelCol='labels', featuresCol='features', maxDepth=best_RFC_maxDepth, numTrees=best_RFC_numTrees, \
                            maxBins=best_RFC_maxBins, impurity='entropy', seed=myseed)

RFC_model = rf.fit(data_training)
pred_RFC_all = RFC_model.transform(data_test)
accuracy_RFC_all = evaluator.evaluate(pred_RFC_all)
auc_RFC_all = evaluator_auc.evaluate(pred_RFC_all)

RFC_end = time.time()

import pandas as pd
featureImpRF = pd.DataFrame(list(
    zip(vecAssembler.getInputCols(), RFC_model.featureImportances)),
                            columns=["feature", "importance"])
featureImpRF = featureImpRF.sort_values(by="importance", ascending=False)

RFC_time = RFC_end - RFC_start
print("Gradient boosting Classifier:{} s".format(RFC_time))
print("The accuracy of RFC with the larger dataset= %g " % accuracy_RFC_all)
print("The AUC of RFC is %g" % auc_RFC_all)
print("The most import feature is:", featureImpRF)
#================Gradient boosting Classifier====================
GBC_start = time.time()
gbc = GBTClassifier(labelCol='labels',
                    featuresCol='features',
                    maxIter=best_GBT_maxIter,
                    maxDepth=best_GBT_maxDepth,
                    maxBins=best_GBT_maxBins)
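
# The GBT block is truncated here; a hedged completion mirroring the RFC flow
# above (the best_GBT_* values are assumed from an earlier tuning step):
GBC_model = gbc.fit(data_training)
pred_GBC_all = GBC_model.transform(data_test)
accuracy_GBC_all = evaluator.evaluate(pred_GBC_all)
auc_GBC_all = evaluator_auc.evaluate(pred_GBC_all)
GBC_end = time.time()
print("Gradient Boosting Classifier: {} s".format(GBC_end - GBC_start))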
Example #13
        accuracy))

# COMMAND ----------

model = finalModel.stages[-1]
display(model)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Evaluate Feature Importance

# COMMAND ----------

# zip the list of features with their scores
scores = zip(assembler.getInputCols(), model.featureImportances)

# and pretty-print them
for x in scores:
    print("%-15s = %s" % x)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Register Model
# MAGIC
# MAGIC #### Create a new registered model using the API
# MAGIC
# MAGIC The following cells use the `mlflow.register_model()` function to create a new registered model named `IrisModel`. This also creates a new model version (e.g., `Version 1` of `IrisModel`).

# COMMAND ----------
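
# A hedged sketch of the registration call described above; the "runs:/..."
# model URI follows the usual MLflow run layout, and `run` is assumed to be
# the active MLflow run object from an earlier cell.
import mlflow

model_uri = "runs:/{}/model".format(run.info.run_id)
mlflow.register_model(model_uri, "IrisModel")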
Example #14
  multinomialRegression
])

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorAssembler

dbTest("ML1-P-07-02-01", True, type(indexer) == type(StringIndexer()))
dbTest("ML1-P-07-02-02", True, indexer.getInputCol() == 'species')
dbTest("ML1-P-07-02-03", True, indexer.getOutputCol() == 'speciesClass')

dbTest("ML1-P-07-02-04", True, type(assembler) == type(VectorAssembler()))
dbTest("ML1-P-07-02-05", True, assembler.getInputCols() == irisDF.columns[:-1])
dbTest("ML1-P-07-02-06", True, assembler.getOutputCol() == 'features')

dbTest("ML1-P-07-02-07", True, type(multinomialRegression) == type(LogisticRegression()))
dbTest("ML1-P-07-02-08", True, multinomialRegression.getLabelCol() == "speciesClass")
dbTest("ML1-P-07-02-09", True, multinomialRegression.getFeaturesCol() == 'features')

dbTest("ML1-P-07-02-10", True, type(pipeline) == type(Pipeline()))

print("Tests passed!")

# COMMAND ----------

# MAGIC %md
# MAGIC ### Step 3: Train the Model and Transform the Dataset
# MAGIC 
Example #15

# ### Exercise 8 (b) Inspect the model
# The learner has now been trained. Let's inspect the weights of the (trained)
# linear regression model, which is now stored as the second element of
# our pipeline.
# 
# Run the next cell. Ensure that you understand what's going on. Ask for help if
# you have questions.

# In[ ]:


# The coefficients (i.e., weights) are as follows:
weights = lrModel.stages[1].coefficients

# The corresponding features for these weights are:
featuresNoLabel = vectorizer.getInputCols()


# Print the coefficients
print(list(zip(featuresNoLabel, weights)))

# Print the intercept
print(lrModel.stages[1].intercept)
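
# A hedged sketch assembling the learned equation as a readable string; the
# label name "Power Output" is taken from the exercise text below.
terms = " + ".join("({:.4f} * {})".format(w, f)
                   for f, w in zip(featuresNoLabel, weights))
print("Power Output = {:.4f} + {}".format(lrModel.stages[1].intercept, terms))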


# **Exercises**
# 
# - Write down the linear regression equation that your model learned.
# - Recall when we visualized each predictor against Power Output using a
#   scatter plot; does the final equation seem logical given
Example #16
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
rf = RandomForestRegressor(labelCol="price",
                           maxBins=40,
                           maxDepth=5,
                           numTrees=100,
                           seed=42)
pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])

with mlflow.start_run(run_name="random-forest") as run:
    # Log params: num_trees and max_depth
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())
    # Log model
    pipelineModel = pipeline.fit(trainDF)
    mlflow.spark.log_model(pipelineModel, "model")
    # Log metrics: RMSE and R2
    predDF = pipelineModel.transform(testDF)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                              labelCol="price")
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})
    # Log artifact: feature importance scores
    rfModel = pipelineModel.stages[-1]
    pandasDF = (pd.DataFrame(
        list(zip(vecAssembler.getInputCols(), rfModel.featureImportances)),
        columns=["feature", "importance"]).sort_values(by="importance",
                                                       ascending=False))
    # First write to local filesystem, then tell MLflow where to find that file
    pandasDF.to_csv("feature-importance.csv", index=False)
    mlflow.log_artifact("feature-importance.csv")
Example #17
def TransformDataframe(self, vectors):
    assembler = VectorAssembler(inputCols=vectors, outputCol="features")
    # Cast every input column to float so the assembler gets numeric inputs
    expr = [
        col(c).cast("float").alias(c) for c in assembler.getInputCols()
    ]
    # Apply the casts, then assemble the feature vector
    self.dataframe = assembler.transform(self.dataframe.select(*expr))
Example #18
print(dtcModel.toDebugString)

# COMMAND ----------

# Visualize the decision tree

display(dtcModel)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Evaluate Feature Importance

# COMMAND ----------

list(zip(assembler.getInputCols(), dtcModel.featureImportances))

# COMMAND ----------

# MAGIC %md
# MAGIC ### Evaluate Model Performance

# COMMAND ----------

# Vectorize the features of test set
assembledTestDF = assembler.transform(testDF)

# Make predictions using vectorized test set
testPredictionDF = dtcModel.transform(assembledTestDF)

display(testPredictionDF)
indexer = StringIndexer(inputCol="dest_ip", outputCol="dest_ipIndex")
model = indexer.fit(indexed)
indexed = model.transform(indexed)
indexer = StringIndexer(inputCol="tcp_type",outputCol="tcp_typeIndex")
model = indexer.fit(indexed)
indexed = model.transform(indexed)
indexer = StringIndexer(inputCol="tcp_portno",outputCol="tcp_portnoIndex")
model = indexer.fit(indexed)
indexed = model.transform(indexed)
req_test_dataDF = indexed.select("label","ttl","offset","tcp_typeIndex","p_length","source_ipIndex","dest_ipIndex","tcp_portnoIndex")

assembler = VectorAssembler(
  inputCols=["ttl","offset","tcp_typeIndex","p_length","source_ipIndex","dest_ipIndex","tcp_portnoIndex"], outputCol="features"
)
expr = [col(c).cast("Double").alias(c) 
        for c in assembler.getInputCols()]

df2 = req_dataDF.select("label",*expr)
df = assembler.transform(df2.na.drop())
training = df.select("label","features")
test_df2 = req_test_dataDF.select("label",*expr)
test_df = assembler.transform(test_df2.na.drop())
testing = test_df.select("label","features")
lr = LogisticRegression(maxIter=10, regParam=0.3,elasticNetParam=0.8)
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
lrModel = lr.fit(training)
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))
prediction = lrModel.transform(testing)
result = prediction.select("features", "label", "probability", "prediction") \
    .collect()
indexer = StringIndexer(inputCol="tcp_type", outputCol="tcp_typeIndex")
model = indexer.fit(indexed)
indexed = model.transform(indexed)
indexer = StringIndexer(inputCol="tcp_portno", outputCol="tcp_portnoIndex")
model = indexer.fit(indexed)
indexed = model.transform(indexed)
req_test_dataDF = indexed.select("label", "ttl", "offset", "tcp_typeIndex",
                                 "p_length", "source_ipIndex", "dest_ipIndex",
                                 "tcp_portnoIndex")

assembler = VectorAssembler(inputCols=[
    "ttl", "offset", "tcp_typeIndex", "p_length", "source_ipIndex",
    "dest_ipIndex", "tcp_portnoIndex"
],
                            outputCol="features")
expr = [col(c).cast("Double").alias(c) for c in assembler.getInputCols()]

df2 = req_dataDF.select("label", *expr)
df = assembler.transform(df2.na.drop())
training = df.select("label", "features")
test_df2 = req_test_dataDF.select("label", *expr)
test_df = assembler.transform(test_df2.na.drop())
testing = test_df.select("label", "features")
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
model = nb.fit(training)
predictions = model.transform(testing)
predictions.show()
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
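
# Hedged completion: report the accuracy computed above.
print("NaiveBayes test accuracy = %g" % accuracy)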