예제 #1
0
def trainAndEvalModelByDecisionTreeRegressor(stages, train_df, test_df,
                                             evaluator):
    '''
    Train a DecisionTreeRegressor inside an ML Pipeline and evaluate it.

    :param stages: sequence of preprocessing pipeline stages; the regressor
        is appended after them
    :param train_df: DataFrame the pipeline is fitted on
    :param test_df: DataFrame the fitted pipeline is applied to
    :param evaluator: object whose ``evaluate(predictions)`` returns the
        metric (printed and returned as ``rmse``)
    :return: tuple ``(bestModel, predicts, rmse)`` where ``bestModel`` is the
        fitted regressor stage of the pipeline model
    '''
    print(
        '======================= 使用 DecisionTreeRegressor 建立 ML Pipeline 流程进行模型训练 ======================='
    )
    dt = DecisionTreeRegressor(labelCol='cnt', featuresCol='features')
    dtPipeline = Pipeline(stages=list(stages) + [dt])
    dtPipelineModel = dtPipeline.fit(train_df)
    # The regressor is always the last stage; indexing from the end keeps
    # this correct for any number of preprocessing stages.
    bestModel = dtPipelineModel.stages[-1]
    print(
        '======================= 使用 DecisionTreeRegressor 建立 ML Pipeline 流程进行模型训练后,使用模型进行预测 ======================='
    )
    predicts = dtPipelineModel.transform(test_df)
    predicts.select('season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
                    'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt',
                    'prediction').show(10)
    rmse = evaluator.evaluate(predicts)
    print(
        '======================= 使用 DecisionTreeRegressor 建立 ML Pipeline 流程进行模型训练后,评估模型准确率(rmse='
        + str(rmse) + ') =======================')
    return (bestModel, predicts, rmse)
예제 #2
0
def prepare_spark_pipeline_for_DT():
    """Build a cross-validated decision-tree regression pipeline.

    The pipeline indexes the ``price`` column into ``label``, assembles the
    columns named by the outer-scope ``features`` into a vector, scales that
    vector, and feeds it to a ``DecisionTreeRegressor``.

    Returns:
        A ``(CrossValidator, RegressionEvaluator)`` pair; the validator uses
        3 folds and the evaluator reports MSE.
    """
    print('----------Preparing spark pipeline for DT----------')
    # NOTE(review): StringIndexer is applied to a column named "price";
    # confirm that an indexed label is intended for a regression target.
    price_indexer = StringIndexer(inputCol="price",
                                  outputCol="label",
                                  handleInvalid="keep")
    assembler = VectorAssembler(inputCols=features,
                                outputCol="unscaled_features")
    scaler = StandardScaler(inputCol="unscaled_features",
                            outputCol="features")
    tree = DecisionTreeRegressor(maxDepth=8)

    pipeline = Pipeline(stages=[price_indexer, assembler, scaler, tree])

    grid_builder = ParamGridBuilder()
    grid_builder.addGrid(tree.maxDepth, [8, 16])
    grid_builder.addGrid(tree.impurity, ["variance"])
    param_maps = grid_builder.build()

    mse_evaluator = RegressionEvaluator(labelCol="label",
                                        predictionCol="prediction",
                                        metricName="mse")
    cross_validator = CrossValidator(estimator=pipeline,
                                     estimatorParamMaps=param_maps,
                                     evaluator=mse_evaluator,
                                     numFolds=3)
    return cross_validator, mse_evaluator
예제 #3
0
def dtRegression(df, conf):
    """
    Fit a decision-tree regression pipeline, optionally with ML tuning.

    input :
        df   [spark.dataframe] data the pipeline (or tuner) is fitted on
        conf [dict] configuration:
            conf["params"] -- DecisionTreeRegressor settings (featuresCol,
                impurity, maxDepth, maxBins, minInstancesPerNode,
                minInfoGain, maxMemoryInMB, cacheNodeIds,
                checkpointInterval, seed, varianceCol); missing keys fall
                back to the defaults listed below
            conf["tuning"] -- falsy for a plain fit, otherwise a dict with
                "method" ("crossval" or "trainvalsplit"), "paramGrids"
                (regressor param name or Param -> list of values) and
                "methodParam" (numFolds or trainRatio respectively)
    output : the fitted model (PipelineModel, or the tuner's fitted model)
    raises : ValueError when conf["tuning"]["method"] is not recognised
    """
    params = conf["params"]
    defaults = (
        ("featuresCol", None),
        ("impurity", "variance"),
        ("maxDepth", 5),
        ("maxBins", 32),
        ("minInstancesPerNode", 1),
        ("minInfoGain", 0.0),
        ("maxMemoryInMB", 256),
        ("cacheNodeIds", False),
        ("checkpointInterval", 10),
        ("seed", None),
        ("varianceCol", None),
    )
    # Only forward values that are actually set, so unset options keep the
    # estimator's own defaults instead of being overwritten with None.
    settings = {}
    for name, default in defaults:
        value = params.get(name, default)
        if value is not None:
            settings[name] = value

    dt = DecisionTreeRegressor(**settings)
    # NOTE(review): featureIndexer is not defined in this function; it is
    # presumably a module-level object -- confirm it exists at call time.
    pipeline = Pipeline(stages=[featureIndexer, dt])

    print ("maxDepth : " , dt.getMaxDepth())

    tuning = conf.get("tuning")

    # No ML tuning requested: fit the pipeline directly.
    if not tuning:
        print ("test")
        return pipeline.fit(df)

    method = (tuning.get("method") or "").lower()
    if method not in ("crossval", "trainvalsplit"):
        raise ValueError(
            "Unsupported tuning method: %r (expected 'crossval' or "
            "'trainvalsplit')" % tuning.get("method"))

    # Build the parameter grid; string keys are resolved to the regressor's
    # Param objects because ParamGridBuilder works with Param instances.
    grid_builder = ParamGridBuilder()
    for key, values in (tuning.get("paramGrids") or {}).items():
        param = dt.getParam(key) if isinstance(key, str) else key
        grid_builder.addGrid(param, values)
    grid = grid_builder.build()

    evaluator = RegressionEvaluator()
    method_param = tuning.get("methodParam")
    tuner_kwargs = dict(estimator=pipeline,
                        estimatorParamMaps=grid,
                        evaluator=evaluator)

    if method == "crossval":
        # ML tuning via k-fold cross validation.
        if method_param is not None:
            tuner_kwargs["numFolds"] = method_param
        tuner = CrossValidator(**tuner_kwargs)
    else:
        # ML tuning via a single train/validation split.
        if method_param is not None:
            tuner_kwargs["trainRatio"] = method_param
        tuner = TrainValidationSplit(**tuner_kwargs)

    return tuner.fit(df)
예제 #4
0
    def dtr(self):
        """Fit a decision-tree regression pipeline on the sample libsvm data.

        Loads the libsvm file under ``self.dataDir``, indexes categorical
        features, trains on a random 70% split, shows five predictions for
        the remaining 30%, and prints the test RMSE and the fitted tree.
        """
        libsvm_path = self.dataDir + "/data/mllib/sample_libsvm_data.txt"
        dataset = self.session.read.format("libsvm").load(libsvm_path)

        # Features with more than 4 distinct values are treated as continuous.
        indexer = VectorIndexer(inputCol="features",
                                outputCol="indexedFeatures",
                                maxCategories=4)
        indexer_model = indexer.fit(dataset)

        # 70% for training, 30% held out for testing.
        train_part, test_part = dataset.randomSplit([0.7, 0.3])

        tree = DecisionTreeRegressor(featuresCol="indexedFeatures")

        # Fitting the pipeline also runs the (already fitted) indexer.
        fitted = Pipeline(stages=[indexer_model, tree]).fit(train_part)

        scored = fitted.transform(test_part)
        scored.select("prediction", "label", "features").show(5)

        rmse_evaluator = RegressionEvaluator(labelCol="label",
                                             predictionCol="prediction",
                                             metricName="rmse")
        rmse = rmse_evaluator.evaluate(scored)
        print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

        # Second pipeline stage is the fitted decision-tree model.
        tree_model = fitted.stages[1]
        print(tree_model)  # summary only
def build_decision_tree_regression(observation_df, feature_columns):
    """Train a decision-tree regressor for ``duration_sec`` and report metrics.

    The feature vector column is produced by ``create_feature_column``; the
    data is split 70/30, five test predictions are shown, and RMSE and R2 on
    the test split are printed.

    :param observation_df: input DataFrame
    :param feature_columns: columns passed to ``create_feature_column``
    :return: the fitted decision-tree model
    """
    assembled_df = create_feature_column(observation_df, feature_columns,
                                         ['features', 'duration_sec'])

    splits = assembled_df.randomSplit([0.7, 0.3])
    train_df, test_df = splits

    tree = DecisionTreeRegressor(featuresCol="features",
                                 labelCol="duration_sec")
    model = tree.fit(train_df)

    test_predictions = model.transform(test_df)
    test_predictions.select("prediction", "duration_sec", "features").show(5)

    # One evaluator per metric, reported in the same order as before.
    reports = (
        ("rmse", "RMSE on test data = %g"),
        ("r2", "R2 on test data = %g"),
    )
    for metric, template in reports:
        evaluator = RegressionEvaluator(predictionCol='prediction',
                                        labelCol="duration_sec",
                                        metricName=metric)
        print(template % evaluator.evaluate(test_predictions))

    return model
예제 #6
0
def task_7(data_io, train_data, test_data):
    """Fit a depth-5 decision-tree regressor and record its test RMSE.

    :param data_io: object whose ``save(res, name)`` stores the result dict
    :param train_data: DataFrame the regressor is fitted on
    :param test_data: DataFrame the RMSE is computed on
    :return: dict with the single key ``'test_rmse'``
    """

    # ---------------------- Your implementation begins------------------------
    regressor = DecisionTreeRegressor(maxDepth=5,
                                      featuresCol="features",
                                      labelCol="overall")
    fitted = regressor.fit(train_data)
    scored = fitted.transform(test_data)
    rmse_evaluator = RegressionEvaluator(metricName="rmse",
                                         predictionCol="prediction",
                                         labelCol="overall")
    test_rmse = rmse_evaluator.evaluate(scored)

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {'test_rmse': test_rmse}

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_7')
    return res
예제 #7
0
 def test_decision_tree_regressor(self):
     """Convert a fitted DecisionTreeRegressor to ONNX and compare outputs.

     Fits the regressor on three hand-written rows, converts it with
     ``convert_sparkml``, and checks the ONNX predictions against the Spark
     predictions to 5 decimals.
     """
     feature_rows = numpy.array([[0, 1], [1, 1], [2, 0]],
                                dtype=numpy.float32)
     targets = [100, -10, 50]
     rows = [(target, Vectors.dense(row))
             for target, row in zip(targets, feature_rows)]
     rdd = self.spark.sparkContext.parallelize(rows)
     data = self.spark.createDataFrame(rdd, schema=["label", "features"])

     regressor = DecisionTreeRegressor(labelCol="label",
                                       featuresCol="features")
     model = regressor.fit(data)

     n_features = data.select('features').first()[0].size
     input_types = [('features', FloatTensorType([None, n_features]))]
     model_onnx = convert_sparkml(model,
                                  'Sparkml Decision Tree Regressor',
                                  input_types,
                                  spark_session=self.spark)
     self.assertTrue(model_onnx is not None)

     # run the model
     feature_frame = data.toPandas().features.apply(
         lambda vec: pandas.Series(vec.toArray()))
     data_np = feature_frame.values.astype(numpy.float32)
     spark_predictions = model.transform(data).toPandas().prediction
     expected = [spark_predictions.values.astype(numpy.float32)]

     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlDecisionTreeRegressor")
     # The ONNX model path is the last entry returned by save_data_models.
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['prediction'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
예제 #8
0
def trainAndEvalModelByDecisionTreeRegressorAndCrossValidator(
        stages, train_df, test_df, evaluator):
    '''
    Train a DecisionTreeRegressor with CrossValidator inside an ML Pipeline,
    evaluate it, and return the best model found.

    :param stages: sequence of preprocessing pipeline stages; the
        CrossValidator is appended after them
    :param train_df: DataFrame the pipeline is fitted on
    :param test_df: DataFrame the fitted pipeline is applied to
    :param evaluator: evaluator used both for cross-validation and for the
        final metric (printed and returned as ``rmse``)
    :return: tuple ``(bestModel, predicts, rmse)``
    '''
    print(
        '======================= 使用 DecisionTreeRegressor、CrossValidator 建立 ML Pipeline 流程进行模型训练 ======================='
    )
    dt = DecisionTreeRegressor(labelCol='cnt', featuresCol='features')
    # 4 x 4 = 16 parameter combinations; impurity is left at its default.
    # NOTE(review): the original author states the VectorIndexer stage uses
    # maxCategories=24, which is why every maxBins value exceeds 24 --
    # confirm against the caller that builds `stages`.
    paramGrid = (ParamGridBuilder()
                 .addGrid(dt.maxDepth, [5, 10, 15, 25])
                 .addGrid(dt.maxBins, [25, 35, 45, 50])
                 .build())
    cv = CrossValidator(estimator=dt,
                        evaluator=evaluator,
                        estimatorParamMaps=paramGrid,
                        numFolds=3)
    cvPipeline = Pipeline(stages=list(stages) + [cv])
    cvPipelineModel = cvPipeline.fit(train_df)
    # The CrossValidator model is always the last stage; indexing from the
    # end keeps this correct for any number of preprocessing stages.
    bestModel = cvPipelineModel.stages[-1].bestModel
    print(
        '======================= 使用 DecisionTreeRegressor、CrossValidator 建立 ML Pipeline 流程进行模型训练后,使用模型进行预测 ======================='
    )
    predicts = cvPipelineModel.transform(test_df)
    rmse = evaluator.evaluate(predicts)
    print(
        '======================= 使用 DecisionTreeRegressor、CrossValidator 建立 ML Pipeline 流程进行模型训练后,评估模型准确率(rmse='
        + str(rmse) + ') =======================')
    return (bestModel, predicts, rmse)
예제 #9
0
def score_dt(split_input_train_df, split_input_validation_df, model_evaluator):
    """Cross-validate a decision-tree regressor and record its validation score.

    Appends the validation metric to the global ``model_rmse``, increments
    the global ``model_count``, and stores the fitted cross-validation model
    in the global ``model_dict`` under ``model_dict[model_count]["DT"]``.

    :param split_input_train_df: DataFrame used for cross-validated fitting
    :param split_input_validation_df: DataFrame the metric is computed on
    :param model_evaluator: evaluator used for tuning and final scoring
    """
    global model_rmse, model_dict, model_count

    print(
        "###################### Decision Tree Regression #########################"
    )
    regressor = DecisionTreeRegressor(labelCol='total_delivery_duration',
                                      featuresCol='features')

    print("CrossValidation...")
    grid_builder = ParamGridBuilder()
    grid_builder.addGrid(regressor.maxBins, [5700, 6000])
    grid_builder.addGrid(regressor.maxMemoryInMB, [256, 512])
    param_maps = grid_builder.build()
    cross_validator = CrossValidator(estimator=regressor,
                                     estimatorParamMaps=param_maps,
                                     evaluator=model_evaluator,
                                     numFolds=3)
    print("Done")

    print("Fitting training data...")
    cv_model = cross_validator.fit(split_input_train_df)
    print("Done")

    print("Evaluating on validation data...")
    validation_predictions = cv_model.transform(split_input_validation_df)
    rmse = model_evaluator.evaluate(validation_predictions)

    model_rmse.append(rmse)
    model_count += 1
    model_dict[model_count] = {"DT": cv_model}
    print("RMSE on validation data: %f" % rmse)
예제 #10
0
def create_model(training_data, features_col, label_col, max_bins=32):
    """
    Fit a cross-validated decision-tree regression model.

    :param training_data: -- dataframe: training dataset
    :param features_col: -- col: name of the features column
    :param label_col: -- col: name of the label column
    :param max_bins: -- integer: requested bin count; 32 is used when this
        is at most 32, otherwise ``max_bins + 1``
    :return: the fitted cross-validation model and its evaluator
    """
    tree = DecisionTreeRegressor()

    if max_bins <= 32:
        bins = 32
    else:
        bins = max_bins + 1

    # Fixed column settings plus a grid over tree depth (single bin value).
    grid_builder = ParamGridBuilder()
    grid_builder.baseOn({tree.featuresCol: features_col})
    grid_builder.baseOn({tree.labelCol: label_col})
    grid_builder.addGrid(tree.maxDepth, [3, 5, 7])
    grid_builder.addGrid(tree.maxBins, [bins])
    param_maps = grid_builder.build()

    dt_evaluator = RegressionEvaluator(labelCol=label_col)

    cross_validator = CrossValidator(estimator=tree,
                                     estimatorParamMaps=param_maps,
                                     evaluator=dt_evaluator)
    fitted_cv_model = cross_validator.fit(training_data)

    return fitted_cv_model, dt_evaluator
예제 #11
0
def model_train(input, model_path):
    """Train, save and score a decision-tree model predicting ``tmax``.

    Reads CSV data from ``input`` with a fixed schema, splits it 75/25,
    fits a pipeline (SQL self-join for yesterday's tmax, feature assembly,
    decision-tree regressor) on the training part, overwrites ``model_path``
    with the fitted model, and prints the RMSE on the validation part.
    Uses the outer-scope ``spark`` session.
    """
    columns = (
        ('station', types.StringType()),
        ('date', types.DateType()),
        ('latitude', types.FloatType()),
        ('longitude', types.FloatType()),
        ('elevation', types.FloatType()),
        ('tmax', types.FloatType()),
    )
    tmax_schema = types.StructType(
        [types.StructField(name, dtype) for name, dtype in columns])

    data = spark.read.csv(input, schema=tmax_schema)
    splits = data.randomSplit([0.75, 0.25])
    train = splits[0].cache()
    validation = splits[1].cache()

    # Joins each row with the same station's row from the previous day.
    sql_query = """SELECT today.latitude, today.longitude, today.elevation, dayofyear(today.date) AS dy,yesterday.tmax AS yesterday_tmax, today.tmax
                     FROM __THIS__ as today
               INNER JOIN __THIS__ as yesterday
                       ON date_sub(today.date, 1) = yesterday.date
                      AND today.station = yesterday.station"""
    feature_names = ['latitude', 'longitude', 'elevation', 'dy',
                     'yesterday_tmax']
    stages = [
        SQLTransformer(statement=sql_query),
        VectorAssembler(inputCols=feature_names, outputCol='features'),
        DecisionTreeRegressor(featuresCol='features', labelCol='tmax'),
    ]
    model = Pipeline(stages=stages).fit(train)
    model.write().overwrite().save(model_path)

    # Score the model on the held-out split.
    prediction = model.transform(validation)
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='tmax',
                                         metricName='rmse')
    score = rmse_evaluator.evaluate(prediction)
    print("Score of the weather model is",score)
예제 #12
0
def predict_price_of_unit_area_by_decision_tree(
        real_estate_dataset_df: DataFrame):
    """
    Predict price per unit area with decision-tree regression.

    The dataset is first converted by
    ``transform_dataset_to_label_feature_form``, split 50/50 into train and
    test parts, and a regressor with label column ``actual_price`` is fitted
    on the training part.

    :param real_estate_dataset_df: raw real-estate DataFrame
    :return: cached DataFrame with ``actual_price`` and ``predicted_price``
        (prediction rounded to 2 decimals), ordered by descending actual price
    """
    prepared_df = transform_dataset_to_label_feature_form(
        real_estate_dataset_df)

    train_dataset, test_dataset = prepared_df.randomSplit([0.5, 0.5])

    # The label column is renamed; the features column keeps its default.
    regressor = DecisionTreeRegressor()
    regressor.setLabelCol('actual_price')
    model = regressor.fit(train_dataset)

    predicted_price = func.round(func.col('prediction'),
                                 2).alias('predicted_price')
    scored = model.transform(test_dataset)
    selected = scored.select('actual_price', predicted_price)
    ordered = selected.orderBy(func.desc('actual_price'))

    return ordered.cache()
예제 #13
0
def train(data, max_depth, max_bins, model_name, log_as_mleap, log_as_onnx):
    """Train a decision-tree pipeline and log it, with metrics, to MLflow.

    Relies on outer-scope names ``colLabel``, ``colFeatures``,
    ``colPrediction`` and (for ONNX logging) ``spark``.

    :param data: DataFrame; all columns except the last are used as features
    :param max_depth: ``maxDepth`` for the regressor (also logged)
    :param max_bins: ``maxBins`` for the regressor (also logged)
    :param model_name: registered model name; falsy means do not register
    :param log_as_mleap: when truthy, also log the model in MLeap format
        together with a ``schema.json`` artifact
    :param log_as_onnx: when truthy, also log the model via ``onnx_utils``
    """
    (trainingData, testData) = data.randomSplit([0.7, 0.3], 42)
    print("testData.schema:")
    testData.printSchema()

    # MLflow - log parameters
    print("Parameters:")
    print("  max_depth:", max_depth)
    print("  max_bins:", max_bins)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("max_bins", max_bins)

    # Create pipeline
    dt = DecisionTreeRegressor(labelCol=colLabel,
                               featuresCol=colFeatures,
                               maxDepth=max_depth,
                               maxBins=max_bins)
    assembler = VectorAssembler(inputCols=data.columns[:-1],
                                outputCol=colFeatures)
    pipeline = Pipeline(stages=[assembler, dt])

    # Fit model and predict (predictions are computed once and reused for
    # every metric below)
    model = pipeline.fit(trainingData)
    predictions = model.transform(testData)

    # MLflow - log metrics
    print("Metrics:")
    metrics = ["rmse", "r2", "mae"]
    for metric_name in metrics:
        evaluator = RegressionEvaluator(labelCol=colLabel,
                                        predictionCol=colPrediction,
                                        metricName=metric_name)
        metric_value = evaluator.evaluate(predictions)
        print(f"  {metric_name}: {metric_value}")
        mlflow.log_metric(metric_name, metric_value)

    # MLflow - log spark model
    mlflow.spark.log_model(model, "spark-model", \
        registered_model_name=None if not model_name else f"{model_name}")

    # MLflow - log as MLeap model
    if log_as_mleap:
        # NOTE(review): "quality" is hard-coded here while the label column
        # elsewhere comes from colLabel -- confirm they are the same column.
        scoreData = testData.drop("quality")
        mlflow.mleap.log_model(spark_model=model, sample_input=scoreData, artifact_path="mleap-model", \
            registered_model_name=None if not model_name else f"{model_name}_mleap")

        # Log MLeap schema file for MLeap runtime deserialization
        schema_path = "schema.json"
        with open(schema_path, 'w') as f:
            f.write(scoreData.schema.json())
        print("schema_path:", schema_path)
        mlflow.log_artifact(schema_path, "mleap-model")

    # MLflow - log as ONNX model
    if log_as_onnx:
        import onnx_utils
        scoreData = testData.drop("quality")
        onnx_utils.log_model(spark, model, "onnx-model", model_name, scoreData)
예제 #14
0
파일: trees.py 프로젝트: dsp-uga/einstein
 def model_define(self):
     """Create the decision tree regression estimator.

     The estimator is constructed without arguments here, so it carries
     the library's default settings; no instance state is read.

     Returns:
         (pyspark.ml.regression.DecisionTreeRegressor):
             Decision Tree Regression model
     """
     estimator = DecisionTreeRegressor()
     return estimator
예제 #15
0
def DT_Algorithm(tr, te, featureIndexer):
    """Run the shared ``predict`` routine with a decision-tree pipeline.

    Builds a pipeline of ``featureIndexer`` followed by a
    DecisionTreeRegressor (label ``positive_rating_ratio``), plus a grid
    over ``maxDepth`` and ``minInstancesPerNode``, and hands both to
    ``predict`` together with ``tr`` and ``te``.
    """
    tree = DecisionTreeRegressor(labelCol='positive_rating_ratio',
                                 featuresCol="indexedFeatures")
    tree_pipeline = Pipeline(stages=[featureIndexer, tree])

    grid_builder = ParamGridBuilder()
    grid_builder.addGrid(tree.maxDepth, [5, 10, 15])
    grid_builder.addGrid(tree.minInstancesPerNode, [1, 5, 10])
    param_maps = grid_builder.build()

    print("---------------------Decision Tree Regression---------------------")
    predict(tr, te, tree_pipeline, param_maps, False)
def estimator_pipeline(train_dataframe, test_dataframe):
    """Fit four regressors on ``ctr``, score them on test data, and save them.

    Each estimator (linear regression, decision tree, random forest,
    gradient-boosted trees) is wrapped in a two-stage pipeline with a shared
    VectorAssembler built from every training column except the last.

    :param train_dataframe: DataFrame the pipelines are fitted on
    :param test_dataframe: DataFrame RMSE is computed on
    :return: ``(models_, RMSE_result)`` -- fitted pipeline models and their
        test RMSE rounded to 4 decimals, in the order LR, DT, RF, GB
    """
    random.seed(0)

    # Feature vector built from all columns except the last one.
    vector = VectorAssembler(inputCols=train_dataframe.columns[:-1],
                             outputCol='features')

    estimators = [
        # Linear regression with the parameters given by the assignment.
        LinearRegression(featuresCol='features',
                         labelCol='ctr',
                         maxIter=40,
                         regParam=0.4,
                         elasticNetParam=0.8),
        # The remaining estimators use default parameters.
        DecisionTreeRegressor(featuresCol='features', labelCol='ctr'),
        RandomForestRegressor(featuresCol='features', labelCol='ctr'),
        GBTRegressor(featuresCol='features', labelCol='ctr'),
    ]

    RMSE_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='ctr',
                                         metricName='rmse')

    models_ = []
    RMSE_result = []

    for est_r in estimators:
        # Two-stage pipeline: assemble the feature vector, then fit the
        # estimator on the training data.
        model = Pipeline(stages=[vector, est_r]).fit(train_dataframe)
        models_.append(model)

        # Score the SAME fitted model that is returned and saved; refitting
        # here would double the training cost and could report the RMSE of
        # a different fit.
        prediction = model.transform(test_dataframe)
        RMSE_result.append(round(RMSE_evaluator.evaluate(prediction), 4))

    # Models are saved under fixed names rather than the estimator uid: the
    # uid changes between runs, which would make it unclear what path to
    # pass to PipelineModel.load later.
    model_names = ['LR_model', 'DT_model', 'RF_model', 'GB_model']
    for model, name in zip(models_, model_names):
        model.save(name)

    return models_, RMSE_result
예제 #17
0
def decision_tree_regression(trainingDataFrame, maxCategories=4):
    """Fit an indexer + decision-tree pipeline on ``trainingDataFrame``.

    :param trainingDataFrame: DataFrame with a ``features`` column
    :param maxCategories: passed to VectorIndexer as ``maxCategories``
    :return: dict with ``"model"`` (the fitted pipeline model) and
        ``"summary"`` (its second stage, the fitted tree model)
    """
    indexer = VectorIndexer(inputCol="features",
                            outputCol="indexedFeatures",
                            maxCategories=maxCategories)
    indexer_model = indexer.fit(trainingDataFrame)
    tree = DecisionTreeRegressor(featuresCol="indexedFeatures")
    fitted = Pipeline(stages=[indexer_model, tree]).fit(trainingDataFrame)
    return {"model": fitted, "summary": fitted.stages[1]}
예제 #18
0
def get_best_weather_model(data):
    """Fit several regressors and return the one scoring best on test data.

    The data is split 75/25 and cached. Each estimator/grid pair is turned
    into a trainer by ``make_weather_trainers``; every trainer is fitted on
    the training part and scored on the test part with the first trainer's
    evaluator. Scores are sign-adjusted so that larger is always better.

    :param data: input DataFrame
    :return: the fitted model with the best adjusted test score
    """
    splits = data.randomSplit([0.75, 0.25])
    train = splits[0].cache()
    test = splits[1].cache()

    # e.g., use print(LinearRegression().explainParams()) to see what can be tuned
    tree_grid = {
        "maxDepth": [10],
        "minInstancesPerNode": [2],
        "minInfoGain": [0.5],
    }
    linear_grid = {
        "regParam": [0.2],  # [0.1, 0.01]
        "elasticNetParam": [.8],  # 0-L2, 1-L1
        "aggregationDepth": [5],
        "tol": [0.000005],
        "maxIter": [100],
    }
    forest_grid = {
        "featureSubsetStrategy": ["onethird"],
        "maxDepth": [10],
        "numTrees": [40],
    }
    gbt_grid = {
        "maxIter": [20],
        "maxDepth": [10],
        "lossType": ['squared'],
    }
    estimator_gridbuilders = [
        estimator_gridbuilder(DecisionTreeRegressor(), tree_grid),
        estimator_gridbuilder(LinearRegression(), linear_grid),
        estimator_gridbuilder(RandomForestRegressor(), forest_grid),
        estimator_gridbuilder(GBTRegressor(), gbt_grid),
        # TODO: find better estimators
    ]

    metricName = 'r2'
    # First argument: per the original note, the fraction of data for training.
    tvs_list = make_weather_trainers(.2, estimator_gridbuilders, metricName)

    ev = tvs_list[0].getEvaluator()
    # Flip the sign for metrics where smaller is better, so max() works.
    if ev.isLargerBetter():
        scorescale = 1
    else:
        scorescale = -1

    candidates = []
    for trainer in tvs_list:
        fitted = trainer.fit(train)
        adjusted_score = ev.evaluate(fitted.transform(test)) * scorescale
        estimator_name = get_estimator_name(trainer.getEstimator())
        candidates.append((fitted, estimator_name, adjusted_score))

    best_model, best_name, best_score = max(candidates,
                                            key=lambda entry: entry[2])
    # Multiply by scorescale again to print the metric with its real sign.
    print("Best model is %s with validation data %s score %f" %
          (best_name, ev.getMetricName(), best_score * scorescale))
    return best_model
예제 #19
0
def test_pyspark_regression_decision_tree():
    """Check SHAP TreeExplainer against PySpark tree regressors on iris.

    For GBT, random-forest and decision-tree regressors predicting sepal
    length, asserts that expected value + summed SHAP values matches the
    model prediction, and that the expected value is close to the mean
    prediction. The test is skipped (prints a message and returns) when the
    optional dependencies or a Spark session are unavailable.
    """
    try:
        # pyspark first: it is the dependency most likely to be missing, so
        # the skip path stays cheap.
        import pyspark  # noqa: F401  (availability check)
        import numpy as np
        import pandas as pd
        import sklearn.datasets
        from pyspark import SparkConf
        from pyspark.ml.feature import VectorAssembler
        from pyspark.ml.regression import (DecisionTreeRegressor,
                                           GBTRegressor,
                                           RandomForestRegressor)
        from pyspark.sql import SparkSession

        iris_sk = sklearn.datasets.load_iris()
        iris = pd.DataFrame(data=np.c_[iris_sk['data'], iris_sk['target']],
                            columns=iris_sk['feature_names'] +
                            ['target'])[:100]
        spark = SparkSession.builder.config(
            conf=SparkConf().set("spark.master", "local[*]")).getOrCreate()
    except Exception:
        # Deliberate best-effort skip: missing packages or no usable Spark.
        print("Skipping test_pyspark_regression_decision_tree!")
        return
    import shap

    try:
        # Simple regressor: try to predict sepal length based on the other
        # features
        col = [
            "sepal_length", "sepal_width", "petal_length", "petal_width",
            "type"
        ]
        iris = spark.createDataFrame(iris, col).drop("type")
        iris = VectorAssembler(inputCols=col[1:-1],
                               outputCol="features").transform(iris)

        # Explainer input: same 100 rows without the target column. It does
        # not depend on the regressor, so it is built once.
        X = pd.DataFrame(data=iris_sk.data,
                         columns=iris_sk.feature_names).drop(
                             columns='sepal length (cm)')[:100]  # pylint: disable=E1101

        regressors = [
            GBTRegressor(labelCol="sepal_length", featuresCol="features"),
            RandomForestRegressor(labelCol="sepal_length",
                                  featuresCol="features"),
            DecisionTreeRegressor(labelCol="sepal_length",
                                  featuresCol="features")
        ]
        for regressor in regressors:
            model = regressor.fit(iris)
            explainer = shap.TreeExplainer(model)

            shap_values = explainer.shap_values(X)
            expected_values = explainer.expected_value

            # validate values sum to the margin prediction of the model plus
            # expected_value
            predictions = model.transform(iris).select(
                "prediction").toPandas()
            diffs = expected_values + shap_values.sum(
                1) - predictions["prediction"]
            assert np.max(np.abs(
                diffs)) < 1e-4, "SHAP values don't sum to model output for class0!"
            assert (np.abs(expected_values - predictions.mean()) <
                    1e-1).all(), "Bad expected_value!"
    finally:
        # Always release the Spark session, even when an assertion fails.
        spark.stop()
예제 #20
0
def decision_tree_regressor(spark, original_label_col, feature_col_names):
    """Run ``run_model`` with a decision-tree regressor and an MAE evaluator.

    The regressor and evaluator both read the label from ``indexedLabel``;
    the regressor reads features from ``indexedFeatures``. The column names
    ``"label"`` and ``"features"`` are passed to ``run_model``.
    NOTE(review): presumably ``run_model`` derives the indexed columns from
    those two names -- confirm against its implementation.

    :return: whatever ``run_model`` returns
    """
    tree = DecisionTreeRegressor(featuresCol="indexedFeatures",
                                 labelCol="indexedLabel")
    mae_evaluator = RegressionEvaluator(metricName="mae",
                                        predictionCol="prediction",
                                        labelCol="indexedLabel")
    evaluators = [mae_evaluator]
    return run_model(spark, original_label_col, "label", "features",
                     feature_col_names, tree, evaluators)
예제 #21
0
def func2():
    """
    Train a decision-tree regressor on hour.csv, then tune it with 3-fold
    cross-validation and print the test RMSE of the tuned pipeline.

    Reads ``Path + "hour.csv"`` through the outer-scope ``sqlContext``.

    :return: None
    """
    hour_df = sqlContext.read.format("csv").option(
        "header", "true").load(Path + "hour.csv")
    # Drop the columns that are not needed.
    for unused in ("instant", "dteday", "yr", "casual", "registered"):
        hour_df = hour_df.drop(unused)
    # Cast every remaining column to double.
    hour_df = hour_df.select([
        col(column).cast("double").alias(column) for column in hour_df.columns
    ])
    # Split into train_df and test_df with a 0.7:0.3 ratio.
    train_df, test_df = hour_df.randomSplit([0.7, 0.3])
    train_df.cache()
    test_df.cache()
    # Feature columns: everything except the last column.
    featureCols = hour_df.columns[:-1]
    # Build the pipeline.
    vectorAssembler = VectorAssembler(inputCols=featureCols,
                                      outputCol="aFeatures")
    vectorIndexer = VectorIndexer(inputCol="aFeatures",
                                  outputCol="features",
                                  maxCategories=24)
    dt = DecisionTreeRegressor(labelCol="cnt", featuresCol="features")
    dt_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, dt])
    # Train the baseline pipeline.
    dt_pipelineModel = dt_pipeline.fit(dataset=train_df)
    # Evaluate the baseline model (computed once; the value is not reported).
    evaluator = RegressionEvaluator(labelCol="cnt",
                                    predictionCol="prediction",
                                    metricName="rmse")
    predicted_df = dt_pipelineModel.transform(test_df)
    rmse = evaluator.evaluate(predicted_df)
    # Use CrossValidator to find the best model.
    # impurity is not part of the grid: DecisionTreeRegressor only supports
    # "variance". maxBins must be at least the largest categorical arity,
    # and VectorIndexer above allows up to 24 categories, so every maxBins
    # candidate is above 24.
    paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [5, 10, 15]).addGrid(
        dt.maxBins, [25, 35, 45]).build()
    cv = CrossValidator(estimator=dt,
                        evaluator=evaluator,
                        estimatorParamMaps=paramGrid,
                        numFolds=3)
    cv_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])
    cv_pipelineModel = cv_pipeline.fit(dataset=train_df)
    # Predict with the best model and report its RMSE.
    predictions = cv_pipelineModel.transform(test_df)
    rmse2 = evaluator.evaluate(predictions)
    print(rmse2)
예제 #22
0
def DTR(df_data):
    """Fit an indexer + decision-tree pipeline on ``df_data`` and time it.

    The indexer stage comes from the outer-scope ``data.feature_indexer``.

    :param df_data: DataFrame used both to build the indexer and to fit
    :return: the fitted pipeline model
    """
    print("Train a DecisionTree model...")
    started = time.time()
    tree = DecisionTreeRegressor(featuresCol="indexedFeatures")
    indexer = data.feature_indexer(df_data)
    # Fitting the pipeline also runs the indexer stage.
    dtr_model = Pipeline(stages=[indexer, tree]).fit(df_data)
    elapsed = time.time() - started
    print("dt_model using time: %.2fs\n" % elapsed)
    return dtr_model
def decision_tree_regression(train_data, test_data):
    """Fit a decision-tree regressor for ``MEDV`` and print its test RMSE.

    Also prints the fitted model's ``featureImportances``.

    :param train_data: DataFrame the regressor is fitted on
    :param test_data: DataFrame the RMSE is computed on
    """
    regressor = DecisionTreeRegressor(labelCol='MEDV', featuresCol='features')
    fitted = regressor.fit(train_data)
    scored = fitted.transform(test_data)
    rmse_evaluator = RegressionEvaluator(metricName='rmse',
                                         predictionCol='prediction',
                                         labelCol='MEDV')
    rmse = rmse_evaluator.evaluate(scored)
    print('Root Mean Squared Error (RMSE) on test data = %g' % rmse)
    print(fitted.featureImportances)
예제 #24
0
def task_8(data_io, train_data, test_data):
    """Select a tree depth on a validation split and score it on ``test_data``.

    ``train_data`` is split 75/25 into a fitting part and a validation part.
    One ``DecisionTreeRegressor`` is fitted per depth in ``[5, 7, 9, 12]``;
    the model with the lowest validation RMSE (ties go to the later, deeper
    model) is then evaluated on ``test_data``.

    :param data_io: object whose ``save(res, 'task_8')`` persists the result.
    :param train_data: dataset with ``features`` and ``overall`` columns.
    :param test_data: held-out dataset scored with the selected model.
    :return: dict with ``test_rmse`` and one ``valid_rmse_depth_<d>`` per depth.
    :raises ValueError: if no depth produced a comparable validation RMSE.
    """

    # ---------------------- Your implementation begins------------------------
    trainingData, validationData = train_data.randomSplit([0.75, 0.25])
    # The evaluator carries only column/metric settings, so one instance
    # serves every validation run and the final test evaluation.
    evaluator = RegressionEvaluator(labelCol="overall",
                                    predictionCol="prediction",
                                    metricName="rmse")
    all_rmse = []
    # Start from +inf instead of a fixed 100: with the old sentinel, labels on
    # a large scale left ``best_model`` unbound whenever every RMSE was > 100.
    lowest_rmse = float("inf")
    best_model = None
    for depth in [5, 7, 9, 12]:
        dt = DecisionTreeRegressor(labelCol="overall",
                                   featuresCol="features",
                                   maxDepth=depth)
        model = dt.fit(trainingData)
        valid_rmse = evaluator.evaluate(model.transform(validationData))
        all_rmse.append(valid_rmse)
        # ``<=`` keeps the original tie-break: the later depth wins.
        if valid_rmse <= lowest_rmse:
            lowest_rmse = valid_rmse
            best_model = model

    if best_model is None:
        # Only reachable when every validation RMSE is NaN.
        raise ValueError("no depth produced a comparable validation RMSE")

    rmse = evaluator.evaluate(best_model.transform(test_data))

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'test_rmse': None,
        'valid_rmse_depth_5': None,
        'valid_rmse_depth_7': None,
        'valid_rmse_depth_9': None,
        'valid_rmse_depth_12': None,
    }
    # Modify res:
    res['test_rmse'] = rmse
    res['valid_rmse_depth_5'] = all_rmse[0]
    res['valid_rmse_depth_7'] = all_rmse[1]
    res['valid_rmse_depth_9'] = all_rmse[2]
    res['valid_rmse_depth_12'] = all_rmse[3]

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_8')
    return res
def decisionTreeRegressor(data, ncolumns, schemaNames):
    """Fit a decision-tree regressor on binarized features and summarize it.

    The ``features`` column of ``data`` is binarized at threshold ``0.00001``,
    the result is split 90/10 (seed 50), and a depth-10 / 10-bin tree is
    fitted on the larger part and scored on the smaller one.

    :param data: dataset with ``features`` and ``label`` columns.
    :param ncolumns: column count; ``ncolumns - 1`` is used as the length of
        the feature-importance array.
    :param schemaNames: sequence indexed by feature position to name features.
    :return: tuple ``(feat, rmse, areaUC, timer)`` where ``feat`` holds the
        names of the three features with the highest importance, ``rmse`` is
        the test RMSE, ``areaUC`` is the value of a
        ``BinaryClassificationEvaluator`` (metric left at its default) applied
        to the ``prediction`` column, and ``timer`` is the fit time in minutes.
    """
    import time

    import numpy as np
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.feature import Binarizer
    from pyspark.ml.regression import DecisionTreeRegressor

    binarizer = Binarizer(
        threshold=0.00001,
        inputCol="features",
        outputCol="binarized_features",
    )
    binarizedDataFrame = binarizer.transform(data)

    (trainingData, testData) = binarizedDataFrame.randomSplit([0.9, 0.1], 50)
    dtr = DecisionTreeRegressor(labelCol="label",
                                featuresCol="binarized_features",
                                maxDepth=10,
                                maxBins=10,
                                impurity='Variance')

    start = time.time()
    model = dtr.fit(trainingData)
    end = time.time()
    # time.time() is in seconds; report minutes.
    timer = (end - start) / 60

    prediction = model.transform(testData)
    rmse_evaluator = RegressionEvaluator(labelCol="label",
                                         predictionCol="prediction",
                                         metricName="rmse")
    rmse = rmse_evaluator.evaluate(prediction)

    # The regression output itself is fed in as the raw score column.
    auc_evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    areaUC = auc_evaluator.evaluate(prediction)

    # Expand the importances into a dense array, then take the three largest.
    fi = model.featureImportances
    imp_feat = np.zeros(ncolumns - 1)
    imp_feat[fi.indices] = fi.values
    top_three = (-imp_feat).argsort()[:3]
    feat = [schemaNames[i] for i in top_three]

    return feat, rmse, areaUC, timer
예제 #26
0
def que1():
    """Run the areaUnderROC and accuracy searches for the candidate estimators.

    Three estimators are instantiated up front (decision-tree classifier,
    decision-tree regressor, logistic regression).  The classifier's grid is
    built but that candidate is then skipped.  The regressor writes to
    ``prediction_c`` and is followed by a ``Binarizer`` (threshold 0.5) that
    produces ``prediction``.  Each remaining pipeline and its parameter grid
    are handed to ``run_metric`` twice, once per evaluator, together with the
    module-level ``s_train`` / ``s_test``; the returned pairs are printed.
    """
    candidates = [
        DecisionTreeClassifier(seed=9008),
        DecisionTreeRegressor(predictionCol="prediction_c", seed=9008),
        LogisticRegression(),
    ]
    for position, estimator in enumerate(candidates):
        if position == 0:
            print("[*] DecisionTree Classifier")
            # The grid is assembled here, but the classifier is not evaluated.
            builder = ParamGridBuilder()
            builder = builder.addGrid(estimator.maxDepth, [5, 10, 20])
            builder = builder.addGrid(estimator.maxBins, [16, 32])
            builder = builder.addGrid(estimator.impurity, ["gini", "entropy"])
            param_maps = builder.build()
            continue

        stages = [estimator]
        if position == 1:
            print("[*] DecisionTree Regressor")
            builder = ParamGridBuilder()
            builder = builder.addGrid(estimator.maxDepth, [5, 10, 20])
            builder = builder.addGrid(estimator.maxBins, [16, 32])
            builder = builder.addGrid(estimator.minInfoGain, [0.0, 0.25, 0.3])
            param_maps = builder.build()
            # Threshold the regressor's output into the "prediction" column.
            stages.append(
                Binarizer(threshold=0.5,
                          inputCol="prediction_c",
                          outputCol="prediction"))
        else:
            print("[*] Logistic Regression")
            builder = ParamGridBuilder()
            builder = builder.addGrid(estimator.maxIter, [5, 10, 15])
            builder = builder.addGrid(estimator.regParam, [0.05, 0.1, 0.5])
            param_maps = builder.build()

        pipeline = Pipeline(stages=stages)

        print("[*] Running for areaUnderROC")
        roc_evaluator = BinaryClassificationEvaluator(
            rawPredictionCol="prediction", metricName="areaUnderROC")
        bp, metric_roc = run_metric(s_train, s_test, pipeline, param_maps,
                                    roc_evaluator)
        print("[*] Done for areaUnderROC")
        print("[*] Best Params: %s, AreaUnderROC value: %f" % (bp, metric_roc))

        print("[*] Running for accuracy")
        acc_evaluator = MulticlassClassificationEvaluator(
            predictionCol="prediction", metricName="accuracy")
        mp, metric_acc = run_metric(s_train, s_test, pipeline, param_maps,
                                    acc_evaluator)
        print("[*] Done for accuracy")
        print("[*] Best Params: %s, Accuracy value: %f" % (mp, metric_acc))
예제 #27
0
def main():
    """Cross-validate a decision-tree regressor on the CCPP CSV sheets.

    Reads ``../data/CCPP/sheet*.csv``, splits it 20/80 (seed 100) into test
    and training sets, selects a model with 3-fold cross-validation on the
    training set, and prints RMSE and a second metric for the test set.

    NOTE(review): the body contains three literal ``<CODE>`` placeholders, so
    this function is not valid Python until they are filled in.
    """
    spark = (SparkSession
             .builder
             .appName("PowerPlant")
             .getOrCreate()
             )

    # Header row supplies column names; column types are inferred.
    powerPlantDF = spark.read.csv("../data/CCPP/sheet*.csv",header=True,inferSchema=True)

    # Packs the four input columns into the single "features" vector column.
    vectorizer = VectorAssembler(inputCols = ["AT","V","AP","RH"],outputCol="features")

    # The 20% part is the test set, the 80% part the training set; both cached.
    split20DF,split80DF = powerPlantDF.randomSplit([0.20,0.80],seed=100)
    testSetDF = split20DF.cache()
    trainingSetDF = split80DF.cache()

    # Regressor predicting "PE" from "features" into "Predicted_PE".
    dt = (DecisionTreeRegressor()
          .setLabelCol("PE")
          .setPredictionCol("Predicted_PE")
          .setFeaturesCol("features")
          .setMaxBins(100)
         )

    # NOTE(review): placeholder - presumably a Pipeline chaining `vectorizer`
    # and `dt`; confirm against the intended solution.
    dtPipeline = ( <CODE>
                 )

    # RMSE between "Predicted_PE" and "PE" drives the 3-fold cross-validation.
    regEval = RegressionEvaluator(predictionCol="Predicted_PE",labelCol="PE",metricName="rmse")
    crossval = CrossValidator(estimator=dtPipeline,evaluator=regEval,numFolds=3)

    # NOTE(review): placeholder - presumably a ParamGridBuilder grid over
    # parameters of `dt`; confirm.
    paramGrid = ( <CODE>
                )

    crossval.setEstimatorParamMaps(paramGrid)

    # Keep only the best model found by cross-validation on the training set.
    dtModel = crossval.fit(trainingSetDF).bestModel

    predictionsAndLabelsDF = (dtModel
                              .transform(testSetDF)
                              .select("AT","V","AP","RH","PE","Predicted_PE")
                             )
    rmseDT = regEval.evaluate(predictionsAndLabelsDF)
    # NOTE(review): placeholder - presumably a param override switching the
    # evaluator's metric to r2 for this call; confirm.
    r2DT = regEval.evaluate(predictionsAndLabelsDF,{<CODE>})

    print("DT RMSE: {0:.2f}".format(rmseDT))
    print("DT R2: {0:.2f}".format(r2DT))

    spark.stop()
예제 #28
0
 def test_decisiontree_regressor(self):
     """Round-trip an unfitted DecisionTreeRegressor through save/load.

     Checks that the loaded instance's ``maxDepth`` param is parented by the
     loaded instance's uid and that the default ``maxDepth`` survives the
     round trip.  The temporary directory is removed even when an assertion
     fails.
     """
     dt = DecisionTreeRegressor(maxDepth=1)
     path = tempfile.mkdtemp()
     try:
         dtr_path = path + "/dtr"
         dt.save(dtr_path)
         # Reload through the regressor's own loader; the original went
         # through DecisionTreeClassifier.load, which does not match what
         # was saved or what this test is named for.
         dt2 = DecisionTreeRegressor.load(dtr_path)
         self.assertEqual(dt2.uid, dt2.maxDepth.parent,
                          "Loaded DecisionTreeRegressor instance uid (%s) "
                          "did not match Param's uid (%s)"
                          % (dt2.uid, dt2.maxDepth.parent))
         self.assertEqual(dt._defaultParamMap[dt.maxDepth], dt2._defaultParamMap[dt2.maxDepth],
                          "Loaded DecisionTreeRegressor instance default params did not match " +
                          "original defaults")
     finally:
         # Best-effort cleanup: a failure to delete must not fail the test.
         try:
             rmtree(path)
         except OSError:
             pass
예제 #29
0
def get_predictor(ensemble_type, label_column, features_column,
                  prediction_column):
    """Build the regressor selected by ``ensemble_type``.

    Only the requested estimator is instantiated; previously all three were
    constructed on every call and two were thrown away.

    :param ensemble_type: ``'dct'`` (decision tree), ``'gbt'`` (gradient
        boosted trees) or ``'rf'`` (random forest).
    :param label_column: name passed as ``labelCol``.
    :param features_column: name passed as ``featuresCol``.
    :param prediction_column: name passed as ``predictionCol``.
    :return: the configured estimator, or ``None`` for an unknown
        ``ensemble_type``.
    """
    estimator_classes = {
        'dct': DecisionTreeRegressor,
        'gbt': GBTRegressor,
        'rf': RandomForestRegressor,
    }
    estimator_cls = estimator_classes.get(ensemble_type)
    if estimator_cls is None:
        # Unknown key: keep the original "return None" contract.
        return None
    return estimator_cls(labelCol=label_column,
                         featuresCol=features_column,
                         predictionCol=prediction_column)
예제 #30
0
    def test_decision_tree_regressor_pipeline(self):
        """Convert a fitted indexer + decision-tree pipeline to ONNX and compare.

        Loads ``data/sample_libsvm_data.txt`` (relative to this file), keeps a
        five-value window of each feature vector, fits a ``VectorIndexer`` +
        ``DecisionTreeRegressor`` pipeline on a 70/30 split, converts it with
        ``convert_sparkml``, and checks that the ONNX model's ``prediction``
        output matches the pipeline's own predictions to 5 decimals.
        """
        import os
        here = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        libsvm_file = os.path.join(here, "data", "sample_libsvm_data.txt")
        raw_frame = self.spark.read.format("libsvm").load(libsvm_file)

        feature_count = 5

        def keep_feature_window(vector):
            # Re-pack array positions 125..129 as a feature_count-wide vector.
            return SparseVector(feature_count, range(0, feature_count),
                                vector.toArray()[125:130])

        self.spark.udf.register("truncateFeatures", keep_feature_window,
                                VectorUDT())
        data = raw_frame.selectExpr(
            "label", "truncateFeatures(features) as features")

        indexer = VectorIndexer(inputCol="features",
                                outputCol="indexedFeatures",
                                maxCategories=4,
                                handleInvalid='error')
        train_part, test_part = data.randomSplit([0.7, 0.3])
        tree = DecisionTreeRegressor(featuresCol="indexedFeatures")
        model = Pipeline(stages=[indexer, tree]).fit(train_part)

        initial_types = [('features', FloatTensorType([None, feature_count]))]
        model_onnx = convert_sparkml(
            model,
            'Sparkml Decision Tree Regressor Pipeline',
            initial_types,
            spark_session=self.spark)
        self.assertTrue(model_onnx is not None)

        # Reference predictions from the Spark pipeline itself.
        predicted = model.transform(test_part)
        feature_rows = test_part.toPandas().features.apply(
            lambda vec: pandas.Series(vec.toArray()))
        data_np = feature_rows.values.astype(numpy.float32)
        spark_predictions = predicted.toPandas().prediction.values
        expected = [spark_predictions.astype(numpy.float32)]

        paths = save_data_models(
            data_np,
            expected,
            model,
            model_onnx,
            basename="SparkmlDecisionTreeRegressorPipeline")
        # The last saved path is the ONNX model file to run.
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['prediction'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)