def build_decision_tree_regression(observation_df, feature_columns):
    # Assemble all of the feature columns into a single 'features' vector column
    vector_observation_df = create_feature_column(observation_df,
                                                  feature_columns,
                                                  ['features', 'duration_sec'])

    train_df, test_df = vector_observation_df.randomSplit([0.7, 0.3])
    dt = DecisionTreeRegressor(featuresCol="features", labelCol="duration_sec")

    model = dt.fit(train_df)

    test_predictions = model.transform(test_df)

    test_predictions.select("prediction", "duration_sec", "features").show(5)

    evaluator = RegressionEvaluator(predictionCol='prediction',
                                    labelCol="duration_sec",
                                    metricName="rmse")
    print("RMSE on test data = %g" % evaluator.evaluate(test_predictions))

    evaluator = RegressionEvaluator(predictionCol='prediction',
                                    labelCol="duration_sec",
                                    metricName="r2")

    print("R2 on test data = %g" % evaluator.evaluate(test_predictions))

    return model
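
The `create_feature_column` helper used above is not shown in this example; a minimal sketch of what it might look like, assuming it simply wraps a VectorAssembler and keeps the requested columns:

from pyspark.ml.feature import VectorAssembler


def create_feature_column(observation_df, feature_columns, output_columns):
    # Hypothetical helper: assemble the feature columns into the vector column named by
    # output_columns[0] and keep only the requested output columns.
    assembler = VectorAssembler(inputCols=feature_columns, outputCol=output_columns[0])
    return assembler.transform(observation_df).select(*output_columns)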
Example #2
def predict_price_of_unit_area_by_decision_tree(
        real_estate_dataset_df: DataFrame):
    """
    Predict the price per unit area based on house age, distance to MRT (public transportation) and number of convenience stores,
    using decision tree regression.
    :param real_estate_dataset_df:
    :return:
    """

    real_estate_dataset_df = transform_dataset_to_label_feature_form(
        real_estate_dataset_df)

    train_test_datasets = real_estate_dataset_df.randomSplit([0.5, 0.5])
    train_dataset = train_test_datasets[0]
    test_dataset = train_test_datasets[1]

    # setLabelCol, setFeatureCol: Change column name for "label" and "features" columns.
    decision_tree_regressor = DecisionTreeRegressor().setLabelCol(
        'actual_price')
    model = decision_tree_regressor.fit(train_dataset)

    # Create predictions for testing dataset.
    predictions = model.transform(test_dataset).\
        select('actual_price', func.round(func.col('prediction'), 2).alias('predicted_price')).\
        orderBy(func.desc('actual_price')).cache()

    return predictions
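
The `transform_dataset_to_label_feature_form` helper is not shown; a minimal sketch under the assumption that it assembles the three predictors named in the docstring into a 'features' vector and renames the target to 'actual_price' (the input column names below are placeholders):

from pyspark.ml.feature import VectorAssembler
from pyspark.sql import DataFrame


def transform_dataset_to_label_feature_form(real_estate_dataset_df: DataFrame) -> DataFrame:
    # Hypothetical implementation; the input column names are placeholders.
    assembler = VectorAssembler(
        inputCols=['house_age', 'distance_to_mrt', 'convenience_stores'],
        outputCol='features')
    return (assembler.transform(real_estate_dataset_df)
            .withColumnRenamed('price_of_unit_area', 'actual_price')
            .select('actual_price', 'features'))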
Example #3
def dtRegression(df, conf):
    """ 
        input : df [spark.dataframe], conf [configuration params]
        output : decisiontree_regression model [model]
    """
    featuresCol = conf["params"].get("featuresCol")
    impurity = conf["params"].get("impurity", "variance")

    maxDepth = conf["params"].get("maxDepth", 5)
    maxBins = conf["params"].get("maxBins", 32)
    minInstancesPerNode = conf["params"].get("minInstancesPerNode", 1)
    minInfoGain = conf["params"].get("minInfoGain", 0.0)
    maxMemoryInMB = conf["params"].get("maxMemoryInMB", 256)
    cacheNodeIds = conf["params"].get("cacheNodeIds", False)
    checkpointInterval = conf["params"].get("checkpointInterval", 10)
    seed = conf["params"].get("seed", None)
    varianceCol = conf["params"].get("varianceCol", None)

    dt = DecisionTreeRegressor(featuresCol=featuresCol, impurity=impurity,
                               maxDepth=maxDepth, maxBins=maxBins,
                               minInstancesPerNode=minInstancesPerNode,
                               minInfoGain=minInfoGain,
                               maxMemoryInMB=maxMemoryInMB,
                               cacheNodeIds=cacheNodeIds,
                               checkpointInterval=checkpointInterval)
    if seed is not None:
        dt = dt.setSeed(seed)
    if varianceCol is not None:
        dt = dt.setVarianceCol(varianceCol)

    # featureIndexer is assumed to be defined elsewhere in the module
    # (e.g. a fitted VectorIndexer); it is not created inside this function.
    pipeline = Pipeline(stages=[featureIndexer, dt])

    print("maxDepth : ", dt.getMaxDepth())
    
    # if ml-tuning is requested
    if conf["tuning"]:

        # ml-tuning with cross-validation
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])

            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)

        # ml-tuning with train-validation split
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])

            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)

    # no ml-tuning
    else:
        model = pipeline.fit(df)

    return model
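
An illustrative `conf` dictionary for `dtRegression`; the keys mirror the `.get()` calls above and the values are placeholders rather than recommendations:

conf = {
    "params": {
        "featuresCol": "features",
        "maxDepth": 5,
        "maxBins": 32,
    },
    # set to None to skip tuning, or e.g.
    # {"method": "crossval", "methodParam": 3, "paramGrids": {...}}
    "tuning": None,
}
model = dtRegression(df, conf)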
Example #4
def task_7(data_io, train_data, test_data):

    # ---------------------- Your implementation begins------------------------
    dt = DecisionTreeRegressor(labelCol="overall",
                               featuresCol="features",
                               maxDepth=5)
    model = dt.fit(train_data)
    predictions = model.transform(test_data)
    evaluator = RegressionEvaluator(labelCol="overall",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {'test_rmse': None}
    # Modify res:
    res['test_rmse'] = rmse

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_7')
    return res
Example #5
 def test_decision_tree_regressor(self):
     features = [[0, 1], [1, 1], [2, 0]]
     features = numpy.array(features, dtype=numpy.float32)
     labels = [100, -10, 50]
     dd = [(labels[i], Vectors.dense(features[i]))
           for i in range(len(labels))]
     data = self.spark.createDataFrame(
         self.spark.sparkContext.parallelize(dd),
         schema=["label", "features"])
     dt = DecisionTreeRegressor(labelCol="label", featuresCol="features")
     model = dt.fit(data)
     feature_count = data.select('features').first()[0].size
     model_onnx = convert_sparkml(
         model,
         'Sparkml Decision Tree Regressor',
         [('features', FloatTensorType([None, feature_count]))],
         spark_session=self.spark)
     self.assertTrue(model_onnx is not None)
     # run the model
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     predicted = model.transform(data)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32)
     ]
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlDecisionTreeRegressor")
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['prediction'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Example #6
def decision_tree_regression(train_data, test_data):
    dt = DecisionTreeRegressor(featuresCol='features', labelCol='MEDV')
    dt_model = dt.fit(train_data)
    dt_predictions = dt_model.transform(test_data)
    dt_evaluator = RegressionEvaluator(
        labelCol='MEDV',
        predictionCol='prediction',
        metricName='rmse',
    )
    rmse = dt_evaluator.evaluate(dt_predictions)
    print('Root Mean Squared Error (RMSE) on test data = %g' % rmse)
    print(dt_model.featureImportances)
Example #7
def task_8(data_io, train_data, test_data):

    # ---------------------- Your implementation begins------------------------
    trainingData, testData = train_data.randomSplit([0.75, 0.25])
    best = 0
    all_rmse = []
    lowest_rmse = 100
    for i in [5, 7, 9, 12]:
        dt = DecisionTreeRegressor(labelCol="overall",
                                   featuresCol="features",
                                   maxDepth=i)
        model = dt.fit(trainingData)
        predictions = model.transform(testData)
        evaluator = RegressionEvaluator(labelCol="overall",
                                        predictionCol="prediction",
                                        metricName="rmse")
        rmse = evaluator.evaluate(predictions)
        all_rmse = all_rmse + [rmse]
        if rmse <= lowest_rmse:
            lowest_rmse = rmse
            best = i
            best_model = model

    predictions = best_model.transform(test_data)
    evaluator = RegressionEvaluator(labelCol="overall",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'test_rmse': None,
        'valid_rmse_depth_5': None,
        'valid_rmse_depth_7': None,
        'valid_rmse_depth_9': None,
        'valid_rmse_depth_12': None,
    }
    # Modify res:
    res['test_rmse'] = rmse
    res['valid_rmse_depth_5'] = all_rmse[0]
    res['valid_rmse_depth_7'] = all_rmse[1]
    res['valid_rmse_depth_9'] = all_rmse[2]
    res['valid_rmse_depth_12'] = all_rmse[3]

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_8')
    return res
Example #8
def decisionTreeRegressor(data, ncolumns, schemaNames):
    from pyspark.ml import Pipeline
    from pyspark.ml.regression import DecisionTreeRegressor
    from pyspark.ml.tuning import ParamGridBuilder
    from pyspark.ml.feature import StringIndexer, VectorIndexer
    from pyspark.ml.tuning import CrossValidator
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.feature import Binarizer
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    import numpy as np
    import time

    binarizer = Binarizer(
        threshold=0.00001,
        inputCol="features",
        outputCol="binarized_features",
    )
    binarizedDataFrame = binarizer.transform(data)

    (trainingData, testData) = binarizedDataFrame.randomSplit([0.9, 0.1], 50)
    dtr = DecisionTreeRegressor(labelCol="label",
                                featuresCol="binarized_features",
                                maxDepth=10,
                                maxBins=10,
                                impurity='variance')

    start = time.time()
    dtrModel = dtr.fit(trainingData)
    end = time.time()
    timer = (end - start) / 60  # training time in minutes

    prediction = dtrModel.transform(testData)
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(prediction)

    # AUC here treats the regression prediction as a score against the binary label
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    areaUC = evaluator.evaluate(prediction)

    fi = dtrModel.featureImportances
    imp_feat = np.zeros(ncolumns - 1)
    imp_feat[fi.indices] = fi.values
    x = np.arange(ncolumns - 1)
    idx = (-imp_feat).argsort()[:3]
    feat = []
    for i in idx:
        feat.append(schemaNames[i])

    return feat, rmse, areaUC, timer
Example #9
def score_dt(split_input_train_df, split_input_validation_df, model_evaluator):
    global model_rmse, model_dict, model_count

    print(
        "###################### Decision Tree Regression #########################"
    )
    dt_regressor = DecisionTreeRegressor(featuresCol='features',
                                         labelCol='total_delivery_duration')

    print("CrossValidation...")
    dt_paramGrid = ParamGridBuilder() \
        .addGrid(dt_regressor.maxBins, [5700, 6000]) \
        .addGrid(dt_regressor.maxMemoryInMB, [256, 512]) \
        .build()
    dt_cross_val = CrossValidator(estimator=dt_regressor,
                                  estimatorParamMaps=dt_paramGrid,
                                  evaluator=model_evaluator,
                                  numFolds=3)
    print("Done")
    print("Fitting training data...")
    dt_cv_model = dt_cross_val.fit(split_input_train_df)
    print("Done")
    print("Evaluating on validation data...")
    rmse = model_evaluator.evaluate(
        dt_cv_model.transform(split_input_validation_df))
    model_rmse.append(rmse)
    model_count += 1
    model_dict[model_count] = {}
    model_dict[model_count]["DT"] = dt_cv_model
    print("RMSE on validation data: %f" % rmse)
Example #10
def model_train(input,model_path):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(input,schema= tmax_schema)
    train, validation = data.randomSplit([0.75,0.25])
    train = train.cache()
    validation = validation.cache()

    sql_query = """SELECT today.latitude, today.longitude, today.elevation, dayofyear(today.date) AS dy,yesterday.tmax AS yesterday_tmax, today.tmax
                     FROM __THIS__ as today
               INNER JOIN __THIS__ as yesterday
                       ON date_sub(today.date, 1) = yesterday.date
                      AND today.station = yesterday.station"""
    transformer = SQLTransformer(statement=sql_query)
    assemble_features = VectorAssembler(inputCols=['latitude','longitude','elevation','dy','yesterday_tmax'],outputCol='features')
    regressor = DecisionTreeRegressor(featuresCol='features',labelCol='tmax')
    weather_pipeline = Pipeline(stages=[transformer,assemble_features,regressor])
    model = weather_pipeline.fit(train)
    model.write().overwrite().save(model_path)

    prediction = model.transform(validation)
    #Scoring the model
    evaluator = RegressionEvaluator(predictionCol='prediction',labelCol='tmax',metricName='rmse')
    score = evaluator.evaluate(prediction)
    print("Score of the weather model is",score)
Example #11
def trainAndEvalModelByDecisionTreeRegressor(stages, train_df, test_df,
                                             evaluator):
    '''
    Build an ML Pipeline with DecisionTreeRegressor (decision tree regression) to train and evaluate a model
    :param stages:
    :param train_df:
    :param test_df:
    :param evaluator:
    :return:
    '''
    print(
        '======================= Training with a DecisionTreeRegressor ML Pipeline ======================='
    )
    dt = DecisionTreeRegressor(labelCol='cnt', featuresCol='features')
    dtPipeline = Pipeline(stages=stages +
                          [dt])  # print(str(dtPipeline.getStages()))
    dtPipelineModel = dtPipeline.fit(train_df)
    bestModel = dtPipelineModel.stages[2]  # print(bestModel.toDebugString)
    print(
        '======================= Predicting with the trained DecisionTreeRegressor pipeline ======================='
    )
    predicts = dtPipelineModel.transform(test_df)
    # print(str(predicts.columns))  # columns added after prediction: 'aFeatures', 'features', 'prediction'
    predicts.select('season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
                    'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'cnt',
                    'prediction').show(10)
    rmse = evaluator.evaluate(predicts)
    print(
        '======================= Evaluation of the DecisionTreeRegressor pipeline (rmse='
        + str(rmse) + ') =======================')
    return (bestModel, predicts, rmse)
Example #12
def prepare_spark_pipeline_for_DT():
    print('----------Preparing spark pipeline for DT----------')
    label_indexer = StringIndexer(inputCol="price",
                                  outputCol="label",
                                  handleInvalid="keep")
    vector_assembler = VectorAssembler(inputCols=features,
                                       outputCol="unscaled_features")
    standard_scaler = StandardScaler(inputCol="unscaled_features",
                                     outputCol="features")
    DT_model = DecisionTreeRegressor(maxDepth=8)

    stages = [label_indexer, vector_assembler, standard_scaler, DT_model]
    pipeline = Pipeline(stages=stages)

    estimator_param = ParamGridBuilder().addGrid(DT_model.maxDepth,
                                                 [8, 16]).addGrid(
                                                     DT_model.impurity,
                                                     ["variance"]).build()
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="mse")
    return CrossValidator(estimator=pipeline,
                          estimatorParamMaps=estimator_param,
                          evaluator=evaluator,
                          numFolds=3), evaluator
Example #13
def trainAndEvalModelByDecisionTreeRegressorAndCrossValidator(
        stages, train_df, test_df, evaluator):
    '''
    Build an ML Pipeline with DecisionTreeRegressor and CrossValidator to train, validate, and select the best model
    :param stages:
    :param train_df:
    :param test_df:
    :param evaluator:
    :return:
    '''
    print(
        '======================= Training with a DecisionTreeRegressor + CrossValidator ML Pipeline ======================='
    )
    dt = DecisionTreeRegressor(labelCol='cnt', featuresCol='features')
    paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [
        5, 10, 15, 25
    ]).addGrid(dt.maxBins, [25, 35, 45, 50]).build(
    )  # trains 4*4=16 parameter combinations; impurity="variance" stays fixed, so it is not tuned. Because the vectorIndexer created earlier uses maxCategories=24, maxBins must be greater than 24
    cv = CrossValidator(estimator=dt,
                        evaluator=evaluator,
                        estimatorParamMaps=paramGrid,
                        numFolds=3)
    cvPipeline = Pipeline(stages=stages + [cv])
    cvPipelineModel = cvPipeline.fit(train_df)
    bestModel = cvPipelineModel.stages[2].bestModel
    print(
        '======================= Predicting with the best model found by CrossValidator ======================='
    )
    predicts = cvPipelineModel.transform(test_df)
    rmse = evaluator.evaluate(predicts)
    print(
        '======================= Evaluation of the DecisionTreeRegressor + CrossValidator pipeline (rmse='
        + str(rmse) + ') =======================')
    return (bestModel, predicts, rmse)
Example #14
def create_model(training_data, features_col, label_col, max_bins=32):
    """
    Create machine learning model
    :param training_data: -- dataframe: training dataset
    :param features_col: -- col: containing all the features needed.
    :param label_col: -- col: label
    :param max_bins: -- integer: maximum number of bins used when discretizing continuous features
    :return: model created and its evaluator
    """

    # Create Decision Tree Model
    dt = DecisionTreeRegressor()

    # Create params for the model
    params = ParamGridBuilder().baseOn({
        dt.featuresCol: features_col
    }).baseOn({
        dt.labelCol: label_col
    }).addGrid(dt.maxDepth, [3, 5, 7]).addGrid(
        dt.maxBins, [32 if max_bins <= 32 else max_bins + 1]).build()

    # Model Evaluator
    dt_evaluator = RegressionEvaluator(labelCol=label_col)

    # Create model with Cross Validation to get the best results
    dt_cv = CrossValidator(estimator=dt,
                           estimatorParamMaps=params,
                           evaluator=dt_evaluator)

    dt_cv_model = dt_cv.fit(training_data)

    return dt_cv_model, dt_evaluator
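
Illustrative usage of `create_model`; `training_df`, `test_df`, and the column names are placeholders:

cv_model, cv_evaluator = create_model(training_df, features_col="features",
                                      label_col="label", max_bins=40)
# RegressionEvaluator defaults to RMSE
test_rmse = cv_evaluator.evaluate(cv_model.transform(test_df))
print("Cross-validated decision tree RMSE: %g" % test_rmse)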
Example #15
    def dtr(self):
        # Load and parse the data file, converting it to a DataFrame.
        data = self.session.read.format("libsvm").load(self.dataDir + "/data/mllib/sample_libsvm_data.txt")

        # Automatically identify categorical features, and index them.
        # Set maxCategories so features with > 4 distinct values are treated as continuous.
        featureIndexer = \
            VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

        # Split the data into training and test sets (30% held out for testing)
        (trainingData, testData) = data.randomSplit([0.7, 0.3])

        # Train a decision tree regression model.
        drg = DecisionTreeRegressor(featuresCol="indexedFeatures")

        # Chain indexer and decision tree in a Pipeline
        pipeline = Pipeline(stages=[featureIndexer, drg])

        # Train model.  This also runs the indexer.
        model = pipeline.fit(trainingData)

        # Make predictions.
        predictions = model.transform(testData)

        # Select example rows to display.
        predictions.select("prediction", "label", "features").show(5)

        # Select (prediction, true label) and compute test error
        evaluator = RegressionEvaluator(
            labelCol="label", predictionCol="prediction", metricName="rmse")
        rmse = evaluator.evaluate(predictions)
        print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

        treeModel = model.stages[1]
        print(treeModel)  # summary only
Example #16
def train(data, max_depth, max_bins, model_name, log_as_mleap, log_as_onnx):
    (trainingData, testData) = data.randomSplit([0.7, 0.3], 42)
    print("testData.schema:")
    testData.printSchema()

    # MLflow - log parameters
    print("Parameters:")
    print("  max_depth:", max_depth)
    print("  max_bins:", max_bins)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("max_bins", max_bins)

    # Create pipeline
    dt = DecisionTreeRegressor(labelCol=colLabel,
                               featuresCol=colFeatures,
                               maxDepth=max_depth,
                               maxBins=max_bins)
    assembler = VectorAssembler(inputCols=data.columns[:-1],
                                outputCol=colFeatures)
    pipeline = Pipeline(stages=[assembler, dt])

    # Fit model and predict
    model = pipeline.fit(trainingData)
    predictions = model.transform(testData)

    # MLflow - log metrics
    print("Metrics:")
    metrics = ["rmse", "r2", "mae"]
    for metric_name in metrics:
        evaluator = RegressionEvaluator(labelCol=colLabel,
                                        predictionCol=colPrediction,
                                        metricName=metric_name)
        metric_value = evaluator.evaluate(predictions)
        print(f"  {metric_name}: {metric_value}")
        mlflow.log_metric(metric_name, metric_value)

    # MLflow - log spark model
    mlflow.spark.log_model(model, "spark-model", \
        registered_model_name=None if not model_name else f"{model_name}")

    # MLflow - log as MLeap model
    if log_as_mleap:
        scoreData = testData.drop("quality")
        mlflow.mleap.log_model(spark_model=model, sample_input=scoreData, artifact_path="mleap-model", \
            registered_model_name=None if not model_name else f"{model_name}_mleap")

        # Log MLeap schema file for MLeap runtime deserialization
        schema_path = "schema.json"
        with open(schema_path, 'w') as f:
            f.write(scoreData.schema.json())
        print("schema_path:", schema_path)
        mlflow.log_artifact(schema_path, "mleap-model")

    # MLflow - log as ONNX model
    if log_as_onnx:
        import onnx_utils
        scoreData = testData.drop("quality")
        onnx_utils.log_model(spark, model, "onnx-model", model_name, scoreData)
Example #17
 def model_define(self):
     """Returns a model with the hyperparameters inputted in :func:
     `get_parameters`
     Returns:
         (pyspark.ml.regression.DecisionTreeRegressor):
             Decision Tree Regression model
     """
     return DecisionTreeRegressor()
Example #18
 def test_decisiontree_regressor(self):
     dt = DecisionTreeRegressor(maxDepth=1)
     path = tempfile.mkdtemp()
     dtr_path = path + "/dtr"
     dt.save(dtr_path)
     dt2 = DecisionTreeRegressor.load(dtr_path)
     self.assertEqual(dt2.uid, dt2.maxDepth.parent,
                      "Loaded DecisionTreeRegressor instance uid (%s) "
                      "did not match Param's uid (%s)"
                      % (dt2.uid, dt2.maxDepth.parent))
     self.assertEqual(dt._defaultParamMap[dt.maxDepth], dt2._defaultParamMap[dt2.maxDepth],
                      "Loaded DecisionTreeRegressor instance default params did not match " +
                      "original defaults")
     try:
         rmtree(path)
     except OSError:
         pass
Example #20
def DT_Algorithm(tr,te,featureIndexer):
    # Train a DecisionTree model.
    dt = DecisionTreeRegressor(featuresCol="indexedFeatures",labelCol='positive_rating_ratio')
    # Chain indexer and tree in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, dt])
    paramGrid=ParamGridBuilder().addGrid(dt.maxDepth, [5, 10, 15]) \
        .addGrid(dt.minInstancesPerNode, [1, 5, 10]).build()
    print("---------------------Decision Tree Regression---------------------")
    predict(tr, te, pipeline, paramGrid, False)
Example #21
def estimator_pipeline(train_dataframe, test_dataframe):

    random.seed(0)

    # assemble the feature vector
    vector = VectorAssembler(inputCols=train_dataframe.columns[:-1],
                             outputCol='features')

    # LR estimator with the parameters from the assignment
    estimator_LR = LinearRegression(featuresCol='features',
                                    labelCol='ctr',
                                    maxIter=40,
                                    regParam=0.4,
                                    elasticNetParam=0.8)
    # the other estimators use default parameters
    estimator_DT = DecisionTreeRegressor(featuresCol='features',
                                         labelCol='ctr')
    estimator_RF = RandomForestRegressor(featuresCol='features',
                                         labelCol='ctr')
    estimator_GB = GBTRegressor(featuresCol='features', labelCol='ctr')

    # evaluator
    RMSE_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='ctr',
                                         metricName='rmse')

    # fitted models and their scores are collected in these lists
    models_ = []
    RMSE_result = []

    # train every estimator
    for est_r in [estimator_LR, estimator_DT, estimator_RF, estimator_GB]:

        # training pipeline (2 stages here; a real project would add extra cleaning/preprocessing stages)
        pipeline = Pipeline(stages=[vector, est_r])
        # fit the pipeline on the training dataset (assemble the vector, train the estimator)
        model = pipeline.fit(train_dataframe)
        # keep the fitted model
        models_.append(model)

        # saving each model inside the loop under est_r.uid would make it unclear how to
        # load it later via PipelineModel.load (the uid changes on every run), so all
        # models are collected in a list and saved under fixed names after the loop
        # model.save(est_r.uid)

        # predict on the test dataset (reuse the model fitted above instead of refitting)
        prediction = model.transform(test_dataframe)
        # compute RMSE on the test dataset
        RMSE = round(RMSE_evaluator.evaluate(prediction), 4)
        # record the result for display in the console
        RMSE_result.append(RMSE)

    # save the models
    for pair in zip(models_, ['LR_model', 'DT_model', 'RF_model', 'GB_model']):
        pair[0].save(pair[1])

    return models_, RMSE_result
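
An illustrative reload of one of the models saved above, using the fixed names from the save loop; `test_dataframe` is a placeholder:

from pyspark.ml import PipelineModel

dt_pipeline_model = PipelineModel.load('DT_model')
dt_pipeline_model.transform(test_dataframe).select('prediction').show(5)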
Example #22
def TrainDT(trainingData, testData):
    # Train a DecisionTree model.
    dt = DecisionTreeRegressor()

    # Train model.
    start = time.time()
    model = dt.fit(trainingData)
    end = time.time()
    print('Training DT model took', end - start)

    # Make predictions.
    predictions = model.transform(testData)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="r2")
    r2 = evaluator.evaluate(predictions)
    print("R2 on test data = %g" % r2)

    # Make predictions for train
    predictions = model.transform(trainingData)

    # Select (prediction, true label) and compute train error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on train data = %g" % rmse)

    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="r2")
    r2 = evaluator.evaluate(predictions)
    print("R2 on train data = %g" % r2)

    return model
Example #23
def decision_tree_regression(trainingDataFrame, maxCategories=4):
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                   maxCategories=maxCategories).fit(trainingDataFrame)
    dt = DecisionTreeRegressor(featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[featureIndexer, dt])
    dtModel = pipeline.fit(trainingDataFrame)
    result = {}
    result["model"] = dtModel
    result["summary"] = dtModel.stages[1]
    return result
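
Illustrative usage of the returned dict; `training_df` and `test_df` are placeholders:

result = decision_tree_regression(training_df)
predictions = result["model"].transform(test_df)
print(result["summary"].featureImportances)  # importances from the DecisionTreeRegressionModel stage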
Example #24
def get_best_weather_model(data):
    train, test = data.randomSplit([0.75, 0.25])
    train = train.cache()
    test = test.cache()

    # e.g., use print(LinearRegression().explainParams()) to see what can be tuned
    estimator_gridbuilders = [
        estimator_gridbuilder(
            DecisionTreeRegressor(),
            dict(
                maxDepth=[10],
                minInstancesPerNode=[2],
                minInfoGain=[0.5],
            )),
        estimator_gridbuilder(
            LinearRegression(),
            dict(
                regParam=[0.2],  # [0.1, 0.01]
                elasticNetParam=[.8],  # 0-L2, 1-L1
                aggregationDepth=[5],
                tol=[0.000005],
                maxIter=[100])),
        estimator_gridbuilder(
            RandomForestRegressor(),
            dict(featureSubsetStrategy=["onethird"],
                 maxDepth=[10],
                 numTrees=[40])),
        estimator_gridbuilder(
            GBTRegressor(),
            dict(
                maxIter=[20],
                maxDepth=[10],
                lossType=['squared'],
            )),
        # TODO: find better estimators
    ]
    metricName = 'r2'
    tvs_list = make_weather_trainers(
        .2,  # fraction of data for training
        estimator_gridbuilders,
        metricName)
    ev = tvs_list[0].getEvaluator()
    scorescale = 1 if ev.isLargerBetter() else -1
    model_name_scores = []
    for tvs in tvs_list:
        model = tvs.fit(train)
        test_pred = model.transform(test)
        score = ev.evaluate(test_pred) * scorescale
        model_name_scores.append(
            (model, get_estimator_name(tvs.getEstimator()), score))
    best_model, best_name, best_score = max(model_name_scores,
                                            key=lambda triplet: triplet[2])
    print("Best model is %s with validation data %s score %f" %
          (best_name, ev.getMetricName(), best_score * scorescale))
    return best_model
Example #25
    def tree_builder(self, data, algorithm):
        print(algorithm)
        if algorithm == 2:
            regressor = DecisionTreeRegressor(featuresCol='features',
                                              labelCol=TARGET_VARIABLE,
                                              impurity='variance')
            model_path = "decision_tree"
        elif algorithm == 3:
            regressor = RandomForestRegressor(featuresCol='features',
                                              labelCol=TARGET_VARIABLE,
                                              numTrees=4)
            model_path = "random_forest"
            print("[REG] number of trees: ", regressor.getNumTrees())

        param_grid = ParamGridBuilder().addGrid(regressor.maxDepth, MAX_DEPTH_OPTIONS) \
            .addGrid(regressor.maxBins, MAX_BINS_OPTIONS) \
            .build()

        # learning on training data
        self.learn_from_training_data(data, regressor, model_path, param_grid)
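
The module-level names referenced above (`TARGET_VARIABLE`, `MAX_DEPTH_OPTIONS`, `MAX_BINS_OPTIONS`, `learn_from_training_data`) are assumed to be defined elsewhere; for example, the constants might look like this (values are placeholders):

TARGET_VARIABLE = 'label'
MAX_DEPTH_OPTIONS = [5, 10, 15]
MAX_BINS_OPTIONS = [32, 64]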
Example #26
def test_pyspark_regression_decision_tree():
    try:
        import pyspark
        import sklearn.datasets
        from pyspark.sql import SparkSession
        from pyspark import SparkContext, SparkConf
        from pyspark.ml.feature import VectorAssembler, StringIndexer
        from pyspark.ml.regression import DecisionTreeRegressor, GBTRegressor, RandomForestRegressor
        import pandas as pd
        import numpy as np

        iris_sk = sklearn.datasets.load_iris()
        iris = pd.DataFrame(data=np.c_[iris_sk['data'], iris_sk['target']],
                            columns=iris_sk['feature_names'] +
                            ['target'])[:100]
        spark = SparkSession.builder.config(
            conf=SparkConf().set("spark.master", "local[*]")).getOrCreate()
    except:
        print("Skipping test_pyspark_regression_decision_tree!")
        return
    import shap

    # Simple regressor: try to predict sepal length based on the other features
    col = [
        "sepal_length", "sepal_width", "petal_length", "petal_width", "type"
    ]
    iris = spark.createDataFrame(iris, col).drop("type")
    iris = VectorAssembler(inputCols=col[1:-1],
                           outputCol="features").transform(iris)

    regressors = [
        GBTRegressor(labelCol="sepal_length", featuresCol="features"),
        RandomForestRegressor(labelCol="sepal_length", featuresCol="features"),
        DecisionTreeRegressor(labelCol="sepal_length", featuresCol="features")
    ]
    for regressor in regressors:
        model = regressor.fit(iris)
        explainer = shap.TreeExplainer(model)
        X = pd.DataFrame(data=iris_sk.data,
                         columns=iris_sk.feature_names).drop(
                             'sepal length (cm)', 1)[:100]  # pylint: disable=E1101

        shap_values = explainer.shap_values(X)
        expected_values = explainer.expected_value

        # validate values sum to the margin prediction of the model plus expected_value
        predictions = model.transform(iris).select("prediction").toPandas()
        diffs = expected_values + shap_values.sum(
            1) - predictions["prediction"]
        assert np.max(np.abs(
            diffs)) < 1e-4, "SHAP values don't sum to model output for class0!"
        assert (np.abs(expected_values - predictions.mean()) <
                1e-1).all(), "Bad expected_value!"
    spark.stop()
Example #27
def func2():
    """
    使用K折交叉验证
    :return:
    """
    hour_df = sqlContext.read.format("csv").option(
        "header", "true").load(Path + "hour.csv")
    # Drop columns we don't need
    hour_df = hour_df.drop("instant").drop("dteday").drop("yr").drop(
        "casual").drop("registered")
    # Cast all columns to double
    hour_df = hour_df.select([
        col(column).cast("double").alias(column) for column in hour_df.columns
    ])
    # Split the data into train_df and test_df with a 0.7:0.3 ratio
    train_df, test_df = hour_df.randomSplit([0.7, 0.3])
    train_df.cache()
    test_df.cache()
    # Build the list of feature columns
    featureCols = hour_df.columns[:-1]
    # Build the pipeline
    vectorAssembler = VectorAssembler(inputCols=featureCols,
                                      outputCol="aFeatures")
    vectorIndexer = VectorIndexer(inputCol="aFeatures",
                                  outputCol="features",
                                  maxCategories=24)
    dt = DecisionTreeRegressor(labelCol="cnt", featuresCol="features")
    dt_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, dt])
    # Train
    dt_pipelineModel = dt_pipeline.fit(dataset=train_df)
    # Predict with transform
    predicted = dt_pipelineModel.transform(test_df)
    # Evaluate the model
    evaluator = RegressionEvaluator(labelCol="cnt",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predicted)
    # Search for the best model with CrossValidator
    # (a regression tree only supports impurity="variance", so only maxDepth and maxBins are tuned)
    paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [5, 10, 15]).addGrid(
        dt.maxBins, [10, 15, 20]).build()
    cv = CrossValidator(estimator=dt,
                        evaluator=evaluator,
                        estimatorParamMaps=paramGrid,
                        numFolds=3)
    cv_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])
    cv_pipelineModel = cv_pipeline.fit(dataset=train_df)
    ## Predict with the best model
    predictions = cv_pipelineModel.transform(test_df)
    rmse2 = evaluator.evaluate(predictions)
    print(rmse2)
Example #28
def decision_tree_regressor(spark, original_label_col, feature_col_names):
    # Create two columns, 'label' and 'features'. Label is true or false, features is a vector of values.
    label_col = "label"
    vector_col = "features"

    dt = DecisionTreeRegressor(labelCol="indexedLabel",
                               featuresCol="indexedFeatures")
    evaluator = RegressionEvaluator(labelCol="indexedLabel",
                                    predictionCol="prediction",
                                    metricName="mae")
    return run_model(spark, original_label_col, label_col, vector_col,
                     feature_col_names, dt, [evaluator])
Example #29
def DTR(df_data):
    # Train a DecisionTree model.
    print("Train a DecisionTree model...")
    t1 = time.time()
    dt = DecisionTreeRegressor(featuresCol="indexedFeatures")
    # Chain indexer and tree in a Pipeline
    pipeline = Pipeline(stages=[data.feature_indexer(df_data), dt])
    # Train model.  This also runs the indexer.
    dtr_model = pipeline.fit(df_data)
    t2 = time.time() - t1
    print("dt_model using time: %.2fs\n" % t2)
    return dtr_model
Example #30
def decisionTreeRegression(df,arguments):
	from pyspark.ml.regression import DecisionTreeRegressor
	maxDepth = 5
	minInstancesPerNode = 1
	impurity = "variance"

	if arguments.maxDepth is not None:
		maxDepth = int(arguments.maxDepth)

	if arguments.minInstancesPerNode is not None:
		minInstancesPerNode = int(arguments.minInstancesPerNode)

	if arguments.impurity is not None:
		impurity = arguments.impurity

	dt = DecisionTreeRegressor(maxDepth=maxDepth,
							   minInstancesPerNode=minInstancesPerNode,
							   impurity=impurity)
	model = dt.fit(df)

	return model
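
One hypothetical way to build the `arguments` object this function expects, using argparse; `train_df` is a placeholder DataFrame with 'label' and 'features' columns:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--maxDepth", default=None)
parser.add_argument("--minInstancesPerNode", default=None)
parser.add_argument("--impurity", default=None)
arguments = parser.parse_args()

model = decisionTreeRegression(train_df, arguments)  # train_df is a placeholder DataFrame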
Example #31
def que1():
    for i, ct in enumerate([
            DecisionTreeClassifier(seed=9008),
            DecisionTreeRegressor(predictionCol="prediction_c", seed=9008),
            LogisticRegression()
    ]):
        binarizer = None
        if i == 0:
            print("[*] DecisionTree Classifier")
            paramB = ParamGridBuilder().addGrid(
                ct.maxDepth,
                [5, 10, 20]).addGrid(ct.maxBins, [16, 32]).addGrid(
                    ct.impurity, ["gini", "entropy"]).build()
            continue
        elif i == 1:
            print("[*] DecisionTree Regressor")
            paramB = ParamGridBuilder().addGrid(
                ct.maxDepth,
                [5, 10, 20]).addGrid(ct.maxBins, [16, 32]).addGrid(
                    ct.minInfoGain, [0.0, 0.25, 0.3]).build()
            binarizer = Binarizer(threshold=0.5,
                                  inputCol="prediction_c",
                                  outputCol="prediction")
        else:
            print("[*] Logistic Regression")
            paramB = ParamGridBuilder().addGrid(ct.maxIter,
                                                [5, 10, 15]).addGrid(
                                                    ct.regParam,
                                                    [0.05, 0.1, 0.5]).build()

        if binarizer is not None: pipeline = Pipeline(stages=[ct, binarizer])
        else: pipeline = Pipeline(stages=[ct])

        print("[*] Running for areaUnderROC")
        bp, metric_roc = run_metric(
            s_train, s_test, pipeline, paramB,
            BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          metricName="areaUnderROC"))
        print("[*] Done for areaUnderROC")
        print("[*] Best Params: %s, AreaUnderROC value: %f" % (bp, metric_roc))

        print("[*] Running for accuracy")
        mp, metric_acc = run_metric(
            s_train, s_test, pipeline, paramB,
            MulticlassClassificationEvaluator(predictionCol="prediction",
                                              metricName="accuracy"))
        print("[*] Done for accuracy")
        print("[*] Best Params: %s, Accuracy value: %f" % (mp, metric_acc))
Example #32
sepalPredictions = sepalModels[0].transform(irisSepal)
print(regEval.evaluate(sepalPredictions,
                       {regEval.metricName: 'r2', regEval.labelCol: 'sepalWidth'}))
print(regEval.evaluate(sepalPredictions,
                       {regEval.metricName: 'rmse', regEval.labelCol: 'sepalWidth'}))

# COMMAND ----------

# MAGIC %md
# MAGIC #### Regression with decision trees

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor

dtr = DecisionTreeRegressor().setLabelCol('petalWidth')
print(dtr.explainParams())

# COMMAND ----------

dtrModel = dtr.fit(irisPetal)
dtrPredictions = dtrModel.transform(irisPetal)
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'r2'}))
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'rmse'}))

# COMMAND ----------

# MAGIC %md
# MAGIC Let's also build a gradient boosted tree.

# COMMAND ----------
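
The listing breaks off before the gradient-boosted tree cell; a minimal sketch of that step, with an illustrative maxIter value:

from pyspark.ml.regression import GBTRegressor

gbt = GBTRegressor(maxIter=10).setLabelCol('petalWidth')
gbtModel = gbt.fit(irisPetal)
gbtPredictions = gbtModel.transform(irisPetal)
print(regEval.evaluate(gbtPredictions, {regEval.metricName: 'rmse'}))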
Example #33
def spark_process(sqlContext, sc, validate, path_to_file):

	######################
	#
	# HDFS to DataFrame 
	#
	######################

	
	## all fields:
	#  ['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance', 
	#   'pickup_longitude', 'pickup_latitude', 'rate_code', 'store_and_fwd_flag', 'dropoff_longitude', 
	#   'dropoff_latitude', 'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount', 
	#   'tolls_amount', 'total_amount']

	# columns to select
	feature_columns = [1,2,3,5,6,9,10]

	# read file and convert to DataFrame
	# dataframe = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(path_to_file).cache()
	customSchema = StructType([
    							StructField("vendor_id", StringType(), True),
							    StructField("pickup_datetime", TimestampType(), True),
							    StructField("dropoff_datetime", TimestampType(), True),
							    StructField("passenger_count", StringType(), True),
							    StructField("trip_distance", StringType(), True),
							    StructField("pickup_longitude", DoubleType(), True),
							    StructField("pickup_latitude", DoubleType(), True),
							    StructField("rate_code", StringType(), True),
							    StructField("store_and_fwd_flag", StringType(), True),
							    StructField("dropoff_longitude", DoubleType(), True),
							    StructField("dropoff_latitude", DoubleType(), True),
							    StructField("payment_type", StringType(), True),
							    StructField("fare_amount", StringType(), True),
							    StructField("surcharge", StringType(), True),
							    StructField("mta_tax", StringType(), True),
							    StructField("tip_amount", StringType(), True),
							    StructField("tolls_amount", StringType(), True),
							    StructField("total_amount", StringType(), True)
							    ])

	dataframe = sqlContext.read.format('com.databricks.spark.csv').options(header='true').schema(customSchema).load(path_to_file)
	# create dataframe with selected columns
	dataframe = dataframe.select(*(dataframe.columns[n] for n in feature_columns))
	
	# this number does not include the header
	# number_of_trips = dataframe.count()

	sqlContext.clearCache()
	######################
	#
	# Preprocess data 
	#
	######################

	# filter rows with null fields
	# if passenger count is missing assign it a value of 1
	# filter invalid location: keep only areas near NYC
	dataframe = dataframe.na.drop(how='any',subset=['pickup_datetime','dropoff_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']) \
						.fillna(1,subset=["passenger_count"])     \
						.filter(dataframe.pickup_latitude>40.0)   \
						.filter(dataframe.pickup_latitude<41.0)   \
						.filter(dataframe.pickup_longitude<-73.0) \
						.filter(dataframe.pickup_longitude>-74.0) \
						.filter(dataframe.dropoff_latitude>40.0)  \
						.filter(dataframe.dropoff_latitude<41.0)  \
						.filter(dataframe.dropoff_longitude<-73.0)\
						.filter(dataframe.dropoff_longitude>-74.0)


	######################
	#
	# features engineering
	#
	######################

	# create new column based on time-delta (minutes)
	# convert pickup-datetime column to hour
		
	time_delta_udf = udf(time_delta_minutes,FloatType())

	dataframe = dataframe.withColumn('time_delta', time_delta_udf(dataframe.pickup_datetime,dataframe.dropoff_datetime)) \
						 .withColumn('pick_up_hour', hour(dataframe.pickup_datetime))

	dataframe = dataframe.select(dataframe.pick_up_hour,
								dataframe.passenger_count.cast("integer"),
								dataframe.pickup_longitude.cast("double"),
								dataframe.pickup_latitude.cast("double"),
								dataframe.dropoff_longitude.cast("double"),
								dataframe.dropoff_latitude.cast("double"),
								dataframe.time_delta.cast("double"))

	dataframe = dataframe.filter(dataframe.time_delta > 1.0).cache()

	# split dataframe into feature and label vectors
	# create feature vectors and labels for model training
	feature_assembler = VectorAssembler(inputCols = ['pick_up_hour','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],outputCol = 'features')

	transformed = feature_assembler.transform(dataframe)
	vector_dataframe = transformed.select(col("time_delta").alias("label"),col("features")).cache()

	######################
	#
	# train model
	#
	######################

	if validate:

		################################
		#
		# validate model on 60/40 split
		#
		################################

		# split 
		training, test = vector_dataframe.randomSplit([0.6, 0.4], seed=0)

		decision_tree_reg = DecisionTreeRegressor(maxDepth=12,maxBins=25)
		model = decision_tree_reg.fit(training)

		train_pred = model.transform(training)
		test_pred = model.transform(test)

		evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
		r2_train = evaluator.evaluate(train_pred)

		evaluator_test = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
		r2_test = evaluator_test.evaluate(test_pred)

		output = test_pred.select("prediction", "label", "features")

		return output, r2_test, r2_train
	
	else:

		###################
		#
		# train on all data
		#
		###################

		decision_tree_reg = DecisionTreeRegressor(maxDepth=12,maxBins=25)
		model = decision_tree_reg.fit(vector_dataframe)

		predictions = model.transform(vector_dataframe)

		output = predictions.select("prediction", "label", "features")

		###########################
		#
		# process to send to Kafka
		#
		###########################

		schema = StructType([StructField("prediction_mins", FloatType(), True),
							StructField("pick_up_hour", IntegerType(), True),
							StructField("pickup_longitude", DoubleType(), True),
							StructField("pickup_latitude", DoubleType(), True),
							StructField("dropoff_longitude", DoubleType(), True),
							StructField("dropoff_latitude", DoubleType(), True)])

		features_from_predictions = output.map(lambda row: (float(row.prediction),int(row.features[0]),float(row.features[1]),float(row.features[2]),float(row.features[3]),float(row.features[4]) ) ).collect()
		sqlContext.clearCache()
		dataframe_from_prediction_vector = sqlContext.createDataFrame(features_from_predictions,schema).cache()

		return dataframe_from_prediction_vector
Example #34
categoricalColumns = ['store_and_fwd_flag']
stages = [] # stages in our Pipeline
for categoricalCol in categoricalColumns:
  stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
  encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
  # Add stages.  These are not run here, but will run all at once later on.
  stages += [stringIndexer, encoder]

#encColumns = ['VendorID','RatecodeID','PULocationID','DOLocationID','payment_type','Peak_Time','weekend']
encColumns = ['VendorID','RatecodeID','PULocationID','DOLocationID','payment_type']
for eCol in encColumns:
  encoder = OneHotEncoder(inputCol=eCol, outputCol=eCol+"classVec")
  stages += [encoder]
#label_stringIdx = StringIndexer(inputCol = "verified_purchase", outputCol = "label")
#stages += [label_stringIdx]

numericCols = ["trip_distance", "passenger_count", "fare_amount","tip_amount"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + [c + "classVec" for c in encColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(train_X4)
dataset = pipelineModel.transform(train_X4)
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(labelCol="total_amount", featuresCol="features", maxBins=32)
model = dt.fit(dataset)
model.write().overwrite().save("./nyc-01020304-6vm-18-DT-model")


from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression()\
  .setFamily("gaussian")\
  .setLink("identity")\
  .setMaxIter(10)\
  .setRegParam(0.3)\
  .setLinkPredictionCol("linkOut")
print(glr.explainParams())
glrModel = glr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
print(dtr.explainParams())
dtrModel = dtr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
rf =  RandomForestRegressor()
print(rf.explainParams())
rfModel = rf.fit(df)
gbt = GBTRegressor()
print(gbt.explainParams())
gbtModel = gbt.fit(df)