Example #1
def task_7(data_io, train_data, test_data):

    # ---------------------- Your implementation begins------------------------
    dt = DecisionTreeRegressor(labelCol="overall",
                               featuresCol="features",
                               maxDepth=5)
    model = dt.fit(train_data)
    predictions = model.transform(test_data)
    evaluator = RegressionEvaluator(labelCol="overall",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {'test_rmse': None}
    # Modify res:
    res['test_rmse'] = rmse

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_7')
    return res
Example #2
def build_decision_tree_regression(observation_df, feature_columns):
    # Create new column with all of the features
    vector_observation_df = create_feature_column(observation_df,
                                                  feature_columns,
                                                  ['features', 'duration_sec'])

    train_df, test_df = vector_observation_df.randomSplit([0.7, 0.3])
    lr = DecisionTreeRegressor(featuresCol="features", labelCol="duration_sec")

    model = lr.fit(train_df)

    test_predictions = model.transform(test_df)

    test_predictions.select("prediction", "duration_sec", "features").show(5)

    evaluator = RegressionEvaluator(predictionCol='prediction',
                                    labelCol="duration_sec",
                                    metricName="rmse")
    print("RMSE on test data = %g" % evaluator.evaluate(test_predictions))

    evaluator = RegressionEvaluator(predictionCol='prediction',
                                    labelCol="duration_sec",
                                    metricName="r2")

    print("R2 on test data = %g" % evaluator.evaluate(test_predictions))

    return model
Example #3
def test_decision_tree_regressor(self):
    features = [[0, 1], [1, 1], [2, 0]]
    features = numpy.array(features, dtype=numpy.float32)
    labels = [100, -10, 50]
    dd = [(labels[i], Vectors.dense(features[i]))
          for i in range(len(labels))]
    data = self.spark.createDataFrame(
        self.spark.sparkContext.parallelize(dd),
        schema=["label", "features"])
    dt = DecisionTreeRegressor(labelCol="label", featuresCol="features")
    model = dt.fit(data)
    feature_count = data.select('features').first()[0].size
    model_onnx = convert_sparkml(
        model,
        'Sparkml Decision Tree Regressor',
        [('features', FloatTensorType([None, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np,
                             expected,
                             model,
                             model_onnx,
                             basename="SparkmlDecisionTreeRegressor")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #4
def predict_price_of_unit_area_by_decision_tree(
        real_estate_dataset_df: DataFrame):
    """
    Predict the price per unit area based on house age, distance to MRT (public transportation) and number of convenience stores,
    using decision tree regression.
    :param real_estate_dataset_df:
    :return:
    """

    real_estate_dataset_df = transform_dataset_to_label_feature_form(
        real_estate_dataset_df)

    train_test_datasets = real_estate_dataset_df.randomSplit([0.5, 0.5])
    train_dataset = train_test_datasets[0]
    test_dataset = train_test_datasets[1]

    # setLabelCol: use 'actual_price' instead of the default 'label' column name.
    decision_tree_regressor = DecisionTreeRegressor().setLabelCol(
        'actual_price')
    model = decision_tree_regressor.fit(train_dataset)

    # Create predictions for testing dataset.
    predictions = model.transform(test_dataset).\
        select('actual_price', func.round(func.col('prediction'), 2).alias('predicted_price')).\
        orderBy(func.desc('actual_price')).cache()

    return predictions
Example #5
def decision_tree_regression(train_data, test_data):
    dt = DecisionTreeRegressor(featuresCol='features', labelCol='MEDV')
    dt_model = dt.fit(train_data)
    dt_predictions = dt_model.transform(test_data)
    dt_evaluator = RegressionEvaluator(
        labelCol='MEDV',
        predictionCol='prediction',
        metricName='rmse',
    )
    rmse = dt_evaluator.evaluate(dt_predictions)
    print('Root Mean Squared Error (RMSE) on test data = %g' % rmse)
    print(dt_model.featureImportances)
Example #6
def task_8(data_io, train_data, test_data):

    # ---------------------- Your implementation begins------------------------
    trainingData, testData = train_data.randomSplit([0.75, 0.25])
    best = 0
    all_rmse = []
    lowest_rmse = float('inf')  # lowest validation RMSE seen so far
    for i in [5, 7, 9, 12]:
        dt = DecisionTreeRegressor(labelCol="overall",
                                   featuresCol="features",
                                   maxDepth=i)
        model = dt.fit(trainingData)
        predictions = model.transform(testData)
        evaluator = RegressionEvaluator(labelCol="overall",
                                        predictionCol="prediction",
                                        metricName="rmse")
        rmse = evaluator.evaluate(predictions)
        all_rmse = all_rmse + [rmse]
        if rmse <= lowest_rmse:
            lowest_rmse = rmse
            best = i
            best_model = model

    predictions = best_model.transform(test_data)
    evaluator = RegressionEvaluator(labelCol="overall",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'test_rmse': None,
        'valid_rmse_depth_5': None,
        'valid_rmse_depth_7': None,
        'valid_rmse_depth_9': None,
        'valid_rmse_depth_12': None,
    }
    # Modify res:
    res['test_rmse'] = rmse
    res['valid_rmse_depth_5'] = all_rmse[0]
    res['valid_rmse_depth_7'] = all_rmse[1]
    res['valid_rmse_depth_9'] = all_rmse[2]
    res['valid_rmse_depth_12'] = all_rmse[3]

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_8')
    return res
Example #7
def decisionTreeRegressor(data, ncolumns, schemaNames):
    from pyspark.ml.regression import DecisionTreeRegressor
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.feature import Binarizer
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    import numpy as np
    import time

    binarizer = Binarizer(
        threshold=0.00001,
        inputCol="features",
        outputCol="binarized_features",
    )
    binarizedDataFrame = binarizer.transform(data)

    (trainingData, testData) = binarizedDataFrame.randomSplit([0.9, 0.1], 50)
    dtr = DecisionTreeRegressor(labelCol="label",
                                featuresCol="binarized_features",
                                maxDepth=10,
                                maxBins=10,
                                impurity='variance')  # Spark expects lowercase 'variance'

    start = time.time()
    model = dtr.fit(trainingData)
    end = time.time()
    timer = (end - start) / 60  # training time in minutes

    prediction = model.transform(testData)
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(prediction)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    areaUC = evaluator.evaluate(prediction)

    fi = model.featureImportances
    imp_feat = np.zeros(ncolumns - 1)
    imp_feat[fi.indices] = fi.values
    x = np.arange(ncolumns - 1)
    idx = (-imp_feat).argsort()[:3]
    feat = []
    for i in idx:
        feat.append(schemaNames[i])

    return feat, rmse, areaUC, timer
Example #8
def TrainDT(trainingData, testData):
    # Train a DecisionTree model.
    dt = DecisionTreeRegressor()

    # Train model.  This also runs the indexer.
    start = time.time()
    model = dt.fit(trainingData)
    end = time.time()
    print('Training DT model took', end - start)

    # Make predictions.
    predictions = model.transform(testData)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="r2")
    r2 = evaluator.evaluate(predictions)
    print("R2 on test data = %g" % r2)

    # Make predictions for train
    predictions = model.transform(trainingData)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on train data = %g" % rmse)

    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="r2")
    r2 = evaluator.evaluate(predictions)
    print("R2 on train data = %g" % r2)

    return model
Example #9
def decisionTreeRegression(df, arguments):
    from pyspark.ml.regression import DecisionTreeRegressor
    maxDepth = 5
    minInstancesPerNode = 1
    impurity = "variance"

    if arguments.maxDepth is not None:
        maxDepth = int(arguments.maxDepth)  # Spark expects an int, not a float

    if arguments.minInstancesPerNode is not None:
        minInstancesPerNode = int(arguments.minInstancesPerNode)

    if arguments.impurity is not None:
        impurity = arguments.impurity

    dt = DecisionTreeRegressor(maxDepth=maxDepth,
                               minInstancesPerNode=minInstancesPerNode,
                               impurity=impurity)
    model = dt.fit(df)

    return model
Example #10
def decision_tree_regressor():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                (0.0, Vectors.sparse(1, [], []))],
                               ["label", "features"])
    dt = DecisionTreeRegressor(maxDepth=2, varianceCol="variance")
    model = dt.fit(df)
    model.depth
    # 1
    model.numNodes
    # 3
    model.featureImportances
    # SparseVector(1, {0: 1.0})
    model.numFeatures
    # 1
    test0 = spark.createDataFrame([(Vectors.dense(-1.0), )], ["features"])
    model.transform(test0).head().prediction
    # 0.0
    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]), )],
                                  ["features"])
    model.transform(test1).head().prediction
    # 1.0
    temp_path = "./"
    dtr_path = temp_path + "/dtr"
    dt.save(dtr_path)
    dt2 = DecisionTreeRegressor.load(dtr_path)
    dt2.getMaxDepth()
    # 2
    model_path = temp_path + "/dtr_model"
    model.save(model_path)
    model2 = DecisionTreeRegressionModel.load(model_path)
    model.numNodes == model2.numNodes
    # True
    model.depth == model2.depth
    # True
    model.transform(test1).head().variance
Example #11
# (the first lines of this example were truncated; the LinearRegression
# constructor is reconstructed from the surviving argument and the 'PE'
# label column used below)
lr = LinearRegression(featuresCol='features', labelCol='PE',
                      elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

train_df.describe().show()

lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction", "PE", "features").show(5)
lr_evaluator = RegressionEvaluator(predictionCol="prediction",
                                   labelCol="PE",
                                   metricName="r2")
print("R Squared (R2) on test data = %g" %
      lr_evaluator.evaluate(lr_predictions))

## DecisionTreeRegressor portion
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(featuresCol='features', labelCol='PE')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(labelCol="PE",
                                   predictionCol="prediction",
                                   metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print(
    "DecisionTreeRegressor Root Mean Squared Error (RMSE) on test data = %g" %
    rmse)
Example #12
    lr_evaluator = RegressionEvaluator(predictionCol="prediction",
                                       labelCol="MPG",
                                       metricName="r2")
    print("R Squared (R2) for Linear Regression on test data = %g" %
          lr_evaluator.evaluate(lr_predictions))

    # RMSE on test data
    test_result = lr_model.evaluate(test_df)
    print(
        "Root Mean Squared Error (RMSE) for Linear Regression on test data = %g\n"
        % test_result.rootMeanSquaredError)

    #############################---DECISION TREE REGRESSION---##################################

    dt = DecisionTreeRegressor(featuresCol='features', labelCol='MPG')
    decisionTree_model = dt.fit(train_df)
    decisionTree_model_predictions = decisionTree_model.transform(test_df)
    decisionTree_model_evaluator = RegressionEvaluator(
        labelCol="MPG", predictionCol="prediction", metricName="rmse")
    rmse = decisionTree_model_evaluator.evaluate(
        decisionTree_model_predictions)
    print(
        "Root Mean Squared Error (RMSE) for Decision Tree on test data = %g" %
        rmse)
    r2_dt_evaluator = RegressionEvaluator(
        labelCol="MPG", predictionCol="prediction", metricName="r2")
    print("R Squared (R2) for Decision Tree on test data = %g" %
          r2_dt_evaluator.evaluate(decisionTree_model_predictions))

    ############################---RANDOM FOREST REGRESSION---##################################
Example #13
def main():
    errorsRMSE_LR = []
    errorsR2_LR = []
    errorsR2_DT = []
    errorsR2_DT5 = []
    errorsRMSE_DT = []
    errorsRMSE_DT5 = []
    rows_training = []
    rows_testing = [[] for i in range(N_OF_CLUSTERS)]

    for week_nb in range(FIRST_WEEK, LAST_WEEK + 1):
        print('week nb : ', week_nb)
        for day_of_week in range(DAY_IN_WEEK):
            for time_of_day_code in range(TIME_SLOTS_WITHIN_DAY):
                for cid in range(N_OF_CLUSTERS):
                    curFeature = demandCache.get_demand(
                        week_nb, day_of_week, time_of_day_code, cid)
                    if curFeature != []:
                        time_of_day_code, origin, day_of_week, day, week, hour, minute, is_manhattan, is_airport, amount = extract_feature(
                            curFeature)

                        if (week_nb < WEEK_NB_TEST):
                            rows_training.append(
                                (time_of_day_code, origin, day_of_week, day,
                                 week, hour, minute, is_manhattan, is_airport,
                                 amount))
                        else:
                            rows_testing[cid].append(
                                (time_of_day_code, origin, day_of_week, day,
                                 week, hour, minute, is_manhattan, is_airport,
                                 amount))

    df_training = spark.createDataFrame(rows_training, [
        "time_of_day_code", "origin", "day_of_week", "day", "week", "hour",
        "minute", "is_manhattan", "is_airport", "amount"
    ])

    assembler = VectorAssembler(inputCols=[
        "time_of_day_code", "origin", "day_of_week", "day", "week", "hour",
        "minute", "is_manhattan", "is_airport"
    ],
                                outputCol='features')
    output_training = assembler.transform(df_training)

    final_data_training = output_training.select('features', 'amount')

    decisionTree = DecisionTreeRegressor(labelCol='amount', maxDepth=3)
    dt_model = decisionTree.fit(final_data_training)

    #print(dt_model.toDebugString)

    decisionTree5 = DecisionTreeRegressor(labelCol='amount', maxDepth=5)
    dt_model5 = decisionTree5.fit(final_data_training)

    #print(dt_model5.toDebugString)

    with open("DT_final_features_one_model_INFO.txt", "w") as file:
        file.write("DT maxDepth 3 : \n" + dt_model.toDebugString)
        file.write("DT maxDepth 5 : \n" + dt_model5.toDebugString)

    linearRegression = LinearRegression(labelCol='amount')
    lr_model = linearRegression.fit(final_data_training)

    for cid in range(N_OF_CLUSTERS):
        print('cluster: ', cid)
        df_testing = spark.createDataFrame(rows_testing[cid], [
            "time_of_day_code", "origin", "day_of_week", "day", "week", "hour",
            "minute", "is_manhattan", "is_airport", "amount"
        ])
        #df_testing.show()
        output_testing = assembler.transform(df_testing)
        final_data_testing = output_testing.select('features', 'amount')
        predictionsDT = dt_model.transform(final_data_testing)
        predictionsDT5 = dt_model5.transform(final_data_testing)
        predictionsLR = lr_model.evaluate(final_data_testing)
        """ Evaluation rmse : """
        rmse = predictionsLR.rootMeanSquaredError
        errorsRMSE_LR.append(rmse)
        #print("Root Mean Squared Error (RMSE) for LR on test data = %g" % rmse)

        r2 = predictionsLR.r2
        errorsR2_LR.append(r2)
        #print("R Squared Error (R2) for LR on test data = %g" % r2)
        """ Evaluation rmse : """
        evaluatorRMSE = RegressionEvaluator(labelCol="amount",
                                            predictionCol="prediction",
                                            metricName="rmse")
        rmse = evaluatorRMSE.evaluate(predictionsDT)
        rmse5 = evaluatorRMSE.evaluate(predictionsDT5)
        errorsRMSE_DT.append(rmse)
        errorsRMSE_DT5.append(rmse5)
        #print("Root Mean Squared Error (RMSE) for DT on test data = %g" % rmse)

        evaluatorR2 = RegressionEvaluator(labelCol="amount",
                                          predictionCol="prediction",
                                          metricName="r2")
        r2 = evaluatorR2.evaluate(predictionsDT)
        r25 = evaluatorR2.evaluate(predictionsDT5)
        errorsR2_DT.append(r2)
        errorsR2_DT5.append(r25)
        #print("R Squared Error (R2) for DT on test data = %g" % r2)
    return errorsRMSE_LR, errorsR2_LR, errorsRMSE_DT, errorsR2_DT, errorsRMSE_DT5, errorsR2_DT5
Example #14
#testLFDF.take(10)

# COMMAND ----------

#Creating an evaluator measuring our label vs our prediction using RMSE evaluation.
evaluator = RegressionEvaluator(metricName="rmse")\
  .setLabelCol("price_doc")\
  .setPredictionCol("prediction")

# COMMAND ----------

#Decision tree regression, testing on both train and test dataset.
dt = DecisionTreeRegressor(labelCol='price_doc')

#This builds the dt model using the train dataset
model = dt.fit(trainLFDF)
#This predicts dt model outcomes on train and test dataset
trainPredictions = model.transform(trainLFDF)
testPredictions = model.transform(testLFDF)

trainscore = evaluator.evaluate(trainPredictions)
testscore = evaluator.evaluate(testPredictions)
print(trainscore, testscore)

#DT 8 Vars RMSE 3493522, 3901961

# COMMAND ----------

#Gradient boosted tree regression
gbt = GBTRegressor(labelCol='price_doc')
model = gbt.fit(trainLFDF)
Example #15
# COMMAND ----------

# MAGIC %md
# MAGIC #### Regression with decision trees

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor

dtr = DecisionTreeRegressor().setLabelCol('petalWidth')
print(dtr.explainParams())

# COMMAND ----------

dtrModel = dtr.fit(irisPetal)
dtrPredictions = dtrModel.transform(irisPetal)
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'r2'}))
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'rmse'}))

# COMMAND ----------

# MAGIC %md
# MAGIC Let's also build a gradient boosted tree.

# COMMAND ----------

from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor().setLabelCol('petalWidth')
print(gbt.explainParams())
Example #17
df = df.selectExpr("fare_amount as label", 'pickup_longitude',
                   'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
                   'passenger_count')

new_df = vecAssembler.setHandleInvalid("skip").transform(df)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = new_df.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
dt = DecisionTreeRegressor()

start_time = datetime.now()

# Train model.  This also runs the indexer.
model = dt.fit(trainingData)

time_elapsed = datetime.now() - start_time
print('TIME OF DECISION TREE REGRESSION TRAINING (hh:mm:ss.ms) {}'.format(
    time_elapsed))

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")
Example #18
def spark_process(sqlContext, sc, validate, path_to_file):

    ######################
    #
    # HDFS to DataFrame
    #
    ######################

    ## all fields:
    #  ['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance',
    #   'pickup_longitude', 'pickup_latitude', 'rate_code', 'store_and_fwd_flag', 'dropoff_longitude',
    #   'dropoff_latitude', 'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount',
    #   'tolls_amount', 'total_amount']

    # columns to select
    feature_columns = [1, 2, 3, 5, 6, 9, 10]

    # read file and convert to DataFrame
    # dataframe = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(path_to_file).cache()
    customSchema = StructType([
        StructField("vendor_id", StringType(), True),
        StructField("pickup_datetime", TimestampType(), True),
        StructField("dropoff_datetime", TimestampType(), True),
        StructField("passenger_count", StringType(), True),
        StructField("trip_distance", StringType(), True),
        StructField("pickup_longitude", DoubleType(), True),
        StructField("pickup_latitude", DoubleType(), True),
        StructField("rate_code", StringType(), True),
        StructField("store_and_fwd_flag", StringType(), True),
        StructField("dropoff_longitude", DoubleType(), True),
        StructField("dropoff_latitude", DoubleType(), True),
        StructField("payment_type", StringType(), True),
        StructField("fare_amount", StringType(), True),
        StructField("surcharge", StringType(), True),
        StructField("mta_tax", StringType(), True),
        StructField("tip_amount", StringType(), True),
        StructField("tolls_amount", StringType(), True),
        StructField("total_amount", StringType(), True)
    ])

    # pass the schema through .schema(); as a reader option it would be ignored
    dataframe = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true').schema(customSchema).load(path_to_file)
    # create dataframe with selected columns
    dataframe = dataframe.select(*(dataframe.columns[n]
                                   for n in feature_columns))

    # this number does not include the header
    # number_of_trips = dataframe.count()

    sqlContext.clearCache()
    ######################
    #
    # Preprocess data
    #
    ######################

    # filter rows with null fields
    # if passenger count is missing assign it a value of 1
    # filter invalid location: keep only areas near NYC
    dataframe = dataframe.na.drop(how='any',subset=['pickup_datetime','dropoff_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']) \
         .fillna(1,subset=["passenger_count"])     \
         .filter(dataframe.pickup_latitude>40.0)   \
         .filter(dataframe.pickup_latitude<41.0)   \
         .filter(dataframe.pickup_longitude<-73.0) \
         .filter(dataframe.pickup_longitude>-74.0) \
         .filter(dataframe.dropoff_latitude>40.0)  \
         .filter(dataframe.dropoff_latitude<41.0)  \
         .filter(dataframe.dropoff_longitude<-73.0)\
         .filter(dataframe.dropoff_longitude>-74.0)

    ######################
    #
    # features engineering
    #
    ######################

    # create new column based on time-delta (minutes)
    # convert pickup-datetime column to hour

    time_delta_udf = udf(time_delta_minutes, FloatType())

    dataframe = dataframe.withColumn('time_delta', time_delta_udf(dataframe.pickup_datetime,dataframe.dropoff_datetime)) \
          .withColumn('pick_up_hour', hour(dataframe.pickup_datetime))

    dataframe = dataframe.select(dataframe.pick_up_hour,    \
           dataframe.passenger_count.cast("integer"),  \
          dataframe.pickup_longitude.cast("double"), \
          dataframe.pickup_latitude.cast("double"),  \
          dataframe.dropoff_longitude.cast("double"),\
          dataframe.dropoff_latitude.cast("double"), \
          dataframe.time_delta.cast("double"))

    dataframe = dataframe.filter(dataframe.time_delta > 1.0).cache()

    # split dataframe into feature and label vector
    # create feature vectors and labels for model training
    feature_assembler = VectorAssembler(inputCols=[
        'pick_up_hour', 'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude', 'dropoff_latitude'
    ],
                                        outputCol='features')

    transformed = feature_assembler.transform(dataframe)
    vector_dataframe = transformed.select(
        col("time_delta").alias("label"), col("features")).cache()

    ######################
    #
    # train model
    #
    ######################

    if validate:

        ################################
        #
        # validate model on 60/40 split
        #
        ################################

        # split
        training, test = vector_dataframe.randomSplit([0.6, 0.4], seed=0)

        decision_tree_reg = DecisionTreeRegressor(maxDepth=12, maxBins=25)
        model = decision_tree_reg.fit(training)

        train_pred = model.transform(training)
        test_pred = model.transform(test)

        evaluator = RegressionEvaluator(labelCol="label",
                                        predictionCol="prediction",
                                        metricName="r2")
        r2_train = evaluator.evaluate(train_pred)

        evaluator_test = RegressionEvaluator(labelCol="label",
                                             predictionCol="prediction",
                                             metricName="r2")
        r2_test = evaluator_test.evaluate(test_pred)

        output = test_pred.select("prediction", "label", "features")

        return output, r2_test, r2_train

    else:

        ###################
        #
        # train on all data
        #
        ###################

        decision_tree_reg = DecisionTreeRegressor(maxDepth=12, maxBins=25)
        model = decision_tree_reg.fit(vector_dataframe)

        predictions = model.transform(vector_dataframe)

        output = predictions.select("prediction", "label", "features")

        ###########################
        #
        # process to send to Kafka
        #
        ###########################

        schema = StructType([
            StructField("prediction_mins", FloatType(), True),
            StructField("pick_up_hour", IntegerType(), True),
            StructField("pickup_longitude", DoubleType(), True),
            StructField("pickup_latitude", DoubleType(), True),
            StructField("dropoff_longitude", DoubleType(), True),
            StructField("dropoff_latitude", DoubleType(), True)
        ])

        # DataFrame has no .map in Spark 2+; go through the underlying RDD
        features_from_predictions = output.rdd.map(lambda row: (
            float(row.prediction), int(row.features[0]),
            float(row.features[1]), float(row.features[2]),
            float(row.features[3]), float(row.features[4]))).collect()
        sqlContext.clearCache()
        dataframe_from_prediction_vector = sqlContext.createDataFrame(
            features_from_predictions, schema).cache()

        return dataframe_from_prediction_vector
Example #20
def doGrid_one():
    grid_data = getGridData(sqlCtx, '_ngrid2500')
    errorsRMSE_LR = []
    errorsR2_LR = []
    errorsRMSE_DT = []
    errorsR2_DT = []
    hor = grid_data['horizontal_slots']
    vert = grid_data['vertical_slots']
    print(hor, vert)
    hor = 24
    vert = 24
    for x in range(hor):
        print('grid hor:', x)
        for y in range(vert):
            train, test = get_features_for_grid(spark, x, y)
            assembler = VectorAssembler(inputCols=[
                "day", "day_of_week", "hour", "is_airport", "is_manhattan",
                "minute", 'pickup_lat_slot', 'pickup_long_slot',
                "time_of_day_code", "week"
            ],
                                        outputCol='features')
            output_training = assembler.transform(train)
            output_testing = assembler.transform(test)

            final_data_training = output_training.select('features', 'amount')
            final_data_testing = output_testing.select('features', 'amount')

            final_data_training.describe().show()
            final_data_testing.describe().show()

            decisionTree = DecisionTreeRegressor(labelCol='amount', maxDepth=3)
            dt_model = decisionTree.fit(final_data_training)
            predictionsDT = dt_model.transform(final_data_testing)
            print(dt_model.toDebugString)

            linearRegression = LinearRegression(labelCol='amount')
            lr_model = linearRegression.fit(final_data_training)
            predictionsLR = lr_model.evaluate(final_data_testing)
            """ Evaluation LR : """
            rmse = predictionsLR.rootMeanSquaredError
            errorsRMSE_LR.append(rmse)
            #print("Root Mean Squared Error (RMSE) for LR on test data = ", rmse)

            r2 = predictionsLR.r2
            errorsR2_LR.append(r2)
            #print("R Squared Error (R2) for LR on test data = ", r2)
            """ Evaluation DT : """
            evaluatorRMSE = RegressionEvaluator(labelCol="amount",
                                                predictionCol="prediction",
                                                metricName="rmse")
            rmse = evaluatorRMSE.evaluate(predictionsDT)
            errorsRMSE_DT.append(rmse)
            #print("Root Mean Squared Error (RMSE) for DT on test data = ", rmse)

            evaluatorR2 = RegressionEvaluator(labelCol="amount",
                                              predictionCol="prediction",
                                              metricName="r2")
            r2 = evaluatorR2.evaluate(predictionsDT)
            errorsR2_DT.append(r2)
            #print("R Squared Error (R2) for DT on test data = ", r2)

    return hor, vert, errorsR2_DT, errorsRMSE_DT, errorsRMSE_LR, errorsR2_LR
Example #21
print("\n")

print("For the whole dataset, the DecisionTreeRegressor is starting...")
evaluator_reg = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
print("\n")
print(
    "Fetching the best values of parameters from 25% dataset and using them..."
)
rtime = time.time()
dtr = DecisionTreeRegressor(labelCol="label",
                            featuresCol="features",
                            maxDepth=maxDepth_dtr,
                            maxBins=maxBins_dtr)
model_dtr = dtr.fit(trainingData)
predictions_dtr = model_dtr.transform(testData)
binarizer = Binarizer(threshold=0.5,
                      inputCol="prediction",
                      outputCol="binarized_prediction")
binarizedDataFrame = binarizer.transform(predictions_dtr)
binarized = binarizedDataFrame.drop('prediction')
bdf_dtr = binarized.withColumnRenamed('binarized_prediction', 'prediction')
r = time.time() - rtime
evaluator_reg = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy_reg = evaluator_reg.evaluate(bdf_dtr)
print("\n")
print("Accuracy for DecisionTreeRegressor on the whole dataset = %g " %
      accuracy_reg)
evaluate_area_dtr = BinaryClassificationEvaluator(
Example #22
def Forecast(df, forecast_days, nLags, \
             timeSeriesColumn, regressor, sparksession):
    
    # this performs model training
    # this calls the machine-learning algorithms of Spark ML library
    
    #labels for machine-learning
    LeadWindow = window.Window.rowsBetween(0,forecast_days)   
    df = df.withColumn("label",func.last(df[timeSeriesColumn]).over(LeadWindow))
    
    features = [timeSeriesColumn]
    
    #Auto-regression feature
    LagTransformer = LagGather()\
                     .setLagLength(nLags)\
                     .setInputCol(timeSeriesColumn)
    df = LagTransformer.transform(df)
    featuresGenerated = LagTransformer.getFeatureNames()
    features.extend(featuresGenerated)
    
    #Other feature generators here:
    #Moving Average Smoothing
    #TrendGather

#******************************************************************************
# VECTOR ASSEMBLER
    # this assembles the all the features 
    df = df.dropna()
    vA = VectorAssembler().setInputCols(features)\
                          .setOutputCol("features")
    df_m = vA.transform(df)
#******************************************************************************
# Splitting data into train, test
    splitratio = 0.7
    df_train, df_test = TimeSeriesSplit(df_m, splitratio, sparksession)
#******************************************************************************
# DECISION-TREE REGRESSOR
    if(regressor == "DecisionTreeRegression"):
           
        dr = DecisionTreeRegressor(featuresCol = "features",\
                                   labelCol = "label", maxDepth = 5)
        model = dr.fit(df_train)
        predictions_dr_test = model.transform(df_test)
        predictions_dr_train = model.transform(df_train)
        
        # note: metricName is "r2" here, although the results are stored
        # in RMSE_*-named variables
        evaluator = RegressionEvaluator(predictionCol="prediction",
                                        labelCol="label",
                                        metricName="r2")
        
        RMSE_dr_test = evaluator.evaluate(predictions_dr_test)
        RMSE_dr_train = evaluator.evaluate(predictions_dr_train)
        return (df_test, df_train, \
                predictions_dr_test, predictions_dr_train,\
                RMSE_dr_test, RMSE_dr_train)
#******************************************************************************
# LINEAR REGRESSOR
    if(regressor == 'LinearRegression'):
        lr = LinearRegression(featuresCol = "features", labelCol="label", \
                              maxIter = 100, regParam = 0.4, \
                              elasticNetParam = 0.1)
        model = lr.fit(df_train)
        predictions_lr_test = model.transform(df_test)
        predictions_lr_train = model.transform(df_train)
        
        # note: metricName is "r2" here as well, despite the RMSE_* names
        evaluator = RegressionEvaluator(predictionCol="prediction",
                                        labelCol="label",
                                        metricName="r2")
        RMSE_lr_test= evaluator.evaluate(predictions_lr_test)
        RMSE_lr_train = evaluator.evaluate(predictions_lr_train)
        return (df_test, df_train, \
                predictions_lr_test, predictions_lr_train,\
                RMSE_lr_test, RMSE_lr_train)
    

#*****************************************************************************
# RANDOM FOREST REGRESSOR
    if(regressor == 'RandomForestRegression'):
        rfr = RandomForestRegressor(featuresCol="features",\
                                    labelCol="label",\
                                    maxDepth = 5,\
                                    subsamplingRate = 0.8,\
                                    )
        model = rfr.fit(df_train)
        predictions_rfr_test = model.transform(df_test)
        predictions_rfr_train = model.transform(df_train)
        
        # RMSE is used as evaluation metric
        evaluator = RegressionEvaluator(predictionCol="prediction",\
                                        labelCol="label",\
                                        metricName ="rmse")
        RMSE_rfr_test= evaluator.evaluate(predictions_rfr_test)
        RMSE_rfr_train = evaluator.evaluate(predictions_rfr_train)
        return (df_test, df_train, \
                predictions_rfr_test, predictions_rfr_train,\
                RMSE_rfr_test, RMSE_rfr_train)
    

#*****************************************************************************
# GRADIENT BOOSTING TREE REGRESSOR
    if(regressor == 'GBTRegression'):
        gbt = GBTRegressor(featuresCol="features",\
                           labelCol="label",\
                           maxDepth=5,\
                           subsamplingRate=0.8)
        
        model = gbt.fit(df_train)
        predictions_gbt_test = model.transform(df_test)
        predictions_gbt_train = model.transform(df_train)
        
        # RMSE is used as evaluation metric
        evaluator = RegressionEvaluator(predictionCol="prediction",\
                                        labelCol="label",\
                                        metricName ="rmse")
        
        RMSE_gbt_test= evaluator.evaluate(predictions_gbt_test)
        RMSE_gbt_train = evaluator.evaluate(predictions_gbt_train)
        return (df_test, df_train, \
                predictions_gbt_test, predictions_gbt_train,\
                RMSE_gbt_test, RMSE_gbt_train)
Example #23
trainDF.cache()
testDF.cache()


# - ##### Train a regression tree to predict the minutos variable.

# In[57]:


dt = DecisionTreeRegressor(labelCol='minutos')  # uses "features" as the input column by default


# In[58]:


model = dt.fit(trainDF)


# - ##### Evaluate the resulting model with RMSE on both the training and the test sample, and comment on the result.

# In[59]:


predictionDF = model.transform(testDF)


# In[23]:


evaluator = RegressionEvaluator(labelCol="minutos")
Example #25
def main():
    for cid in range(N_OF_CLUSTERS):
        rows_training = []
        rows_testing = []
        for week_nb in range(FIRST_WEEK, LAST_WEEK + 1):
            print('week nb : ', week_nb)
            for day_of_week in range(DAY_IN_WEEK):
                for time_of_day_code in range(TIME_SLOTS_WITHIN_DAY):
                    # retrieving the feature values:
                    curFeature = demandCache.get_demand(
                        week_nb, day_of_week, time_of_day_code, cid)
                    if curFeature != []:
                        time_of_day_code, origin, day_of_week, day, week, hour, minute, is_manhattan, is_airport, amount = extract_feature(
                            curFeature)

                        # Checking whether the current row should be added to the training or testing set:
                        if (week_nb < WEEK_NB_TEST):
                            rows_training.append(
                                (time_of_day_code, origin, day_of_week, day,
                                 week, hour, minute, amount))
                        else:
                            rows_testing.append(
                                (time_of_day_code, origin, day_of_week, day,
                                 week, hour, minute, amount))

        # Creating the dataframes for the model containing all the rows:
        df_training = spark.createDataFrame(rows_training, [
            "time_of_day_code", "origin", "day_of_week", "day", "week", "hour",
            "minute", "amount"
        ])
        df_testing = spark.createDataFrame(rows_testing, [
            "time_of_day_code", "origin", "day_of_week", "day", "week", "hour",
            "minute", "amount"
        ])

        assembler = VectorAssembler(inputCols=[
            "time_of_day_code", "origin", "day_of_week", "day", "week", "hour",
            "minute"
        ],
                                    outputCol='features')
        output_training = assembler.transform(df_training)
        output_testing = assembler.transform(df_testing)

        final_data_training = output_training.select('features', 'amount')
        final_data_testing = output_testing.select('features', 'amount')

        # Training the Decision Tree:
        decisionTree = DecisionTreeRegressor(labelCol='amount', maxDepth=3)
        dt_model = decisionTree.fit(final_data_training)
        predictionsDT = dt_model.transform(final_data_testing)
        # print(dt_model.toDebugString) # showing the decision tree

        # Training the linear regression:
        linearRegression = LinearRegression(labelCol='amount')
        lr_model = linearRegression.fit(final_data_training)
        predictionsLR = lr_model.evaluate(final_data_testing)
        """ Evaluation rmse : """
        rmse = predictionsLR.rootMeanSquaredError
        errorsRMSE_LR.append(rmse)
        print("Root Mean Squared Error (RMSE) for LR on test data = %g" % rmse)

        r2 = predictionsLR.r2
        errorsR2_LR.append(r2)
        print("R Squared Error (R2) for LR on test data = %g" % r2)
        """ Evaluation rmse : """
        evaluatorRMSE = RegressionEvaluator(labelCol="amount",
                                            predictionCol="prediction",
                                            metricName="rmse")
        rmse = evaluatorRMSE.evaluate(predictionsDT)
        errorsRMSE_DT.append(rmse)
        print("Root Mean Squared Error (RMSE) for DT on test data = %g" % rmse)

        evaluatorR2 = RegressionEvaluator(labelCol="amount",
                                          predictionCol="prediction",
                                          metricName="r2")
        r2 = evaluatorR2.evaluate(predictionsDT)
        errorsR2_DT.append(r2)
        print("R Squared Error (R2) for DT on test data = %g" % r2)
Example #26
glr = GeneralizedLinearRegression()\
  .setFamily("gaussian")\
  .setLink("identity")\
  .setMaxIter(10)\
  .setRegParam(0.3)\
  .setLinkPredictionCol("linkOut")
print(glr.explainParams())
glrModel = glr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
print(dtr.explainParams())
dtrModel = dtr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
rf = RandomForestRegressor()
print(rf.explainParams())
rfModel = rf.fit(df)
gbt = GBTRegressor()
print(gbt.explainParams())
gbtModel = gbt.fit(df)


# COMMAND ----------
Example #27
# Split the data 80-20
train_test_data = model_data.randomSplit([0.8, 0.2], 16430212)
train_data = train_test_data[0]
test_data = train_test_data[1]

print("Train DT")
rmseEvaluator = myRmseEvaluator(
    RegressionEvaluator(predictionCol="prediction",
                        labelCol="trip_duration",
                        metricName="rmse"))
maeEvaluator = RegressionEvaluator(predictionCol="prediction",
                                   labelCol="trip_duration",
                                   metricName="mae")
dtr = DecisionTreeRegressor(
    maxDepth=3).setFeaturesCol("features").setLabelCol("trip_duration")
trained_model = dtr.fit(train_data)
predictions = trained_model.transform(test_data)

# final_result = predictions.select("prediction", "trip_duration").rdd
print(trained_model)
print("RMSE for Regression Tree:", rmseEvaluator.evaluate(predictions))
print("MAE for Regression Tree:", maeEvaluator.evaluate(predictions))
"""
DecisionTreeRegressionModel: uid=DecisionTreeRegressor_826b1c042824, depth=3, numNodes=15, numFeatures=7
  If (feature 6 <= 2.7002733639393384)
   If (feature 6 <= 1.3071311166631614)
    If (feature 6 <= 0.825910208978972)
     Predict: 481.0347939172201
    Else (feature 6 > 0.825910208978972)
     Predict: 704.5021037177617
   Else (feature 6 > 1.3071311166631614)
    ...
"""
Exemplo n.º 28
0
categoricalColumns = ['store_and_fwd_flag']
stages = [] # stages in our Pipeline
for categoricalCol in categoricalColumns:
  stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
  encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
  # Add stages.  These are not run here, but will run all at once later on.
  stages += [stringIndexer, encoder]

#encColumns = ['VendorID','RatecodeID','PULocationID','DOLocationID','payment_type','Peak_Time','weekend']
encColumns = ['VendorID','RatecodeID','PULocationID','DOLocationID','payment_type']
for eCol in encColumns:
  encoder = OneHotEncoder(inputCol=eCol, outputCol=eCol+"classVec")
  stages += [encoder]
#label_stringIdx = StringIndexer(inputCol = "verified_purchase", outputCol = "label")
#stages += [label_stringIdx]

numericCols = ["trip_distance", "passenger_count", "fare_amount","tip_amount"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + \
                  [c + "classVec" for c in encColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(train_X4)
dataset = pipelineModel.transform(train_X4)
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(labelCol="total_amount", featuresCol="features", maxBins=32)
model = dt.fit(dataset)
model.write().overwrite().save("./nyc-01020304-6vm-18-DT-model")
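# A minimal follow-up sketch for reloading the persisted model later, using
# the same path as the save above:
from pyspark.ml.regression import DecisionTreeRegressionModel
reloaded = DecisionTreeRegressionModel.load("./nyc-01020304-6vm-18-DT-model")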


Exemplo n.º 29
0
from pyspark.ml.regression import DecisionTreeRegressor
dt_models = {}
dt_predictions = {}

compute_again = False
if not compute_again:
  dt_models = loadModels("TreeModel_","tree")
  for park in park_data_with_date_dict:
    dt_predictions[park] = dt_models[park].transform(test_ds[park])
else:
  for park in park_data_with_date_dict:
    #vectorAssembler = VectorAssembler(inputCols=features, outputCol="features")
    #data = vectorAssembler.transform(all_tables[park])
    #train, test = data.randomSplit([0.8,0.2], seed = 12345)
    dt = DecisionTreeRegressor()
    dt_models[park] = dt.fit(train_ds[park])
    dt_predictions[park] = dt_models[park].transform(test_ds[park])
  saveModels(dt_models,"TreeModel_","tree")

# COMMAND ----------

# NOTE: to visualize the actual trees you can call display(dt_models[park])

# COMMAND ----------

def printEvaluateModel(park,modelsCollection, predictionsCollection):
  print("EVALUATE MODEL FOR PARKING "+str(park))
  print("OVER TEST SET")
  print("Features importance:" + str(modelsCollection[park].featureImportances))
  evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
  # Root Mean Square Error on the test predictions
  rmse = evaluator.evaluate(predictionsCollection[park])
  print("RMSE: " + str(rmse))
import numpy as np
import matplotlib.pyplot as plt

# In[106]:

### model building process
#create a small sample for model testing (randomSplit normalizes the weights,
#so this keeps roughly 1/9 of the data)
sample, rest = final_data.randomSplit([0.1, 0.8])

# In[107]:

# decision trees
r2_dtr = np.zeros(10)
for i in np.arange(10):
    dtr = DecisionTreeRegressor(labelCol='mean_temp', maxDepth=(i + 1) * 3)
    dtrModel = dtr.fit(sample)
    prediction_dtr = dtrModel.transform(sample)
    r2_dtr[i] = evaluator.evaluate(prediction_dtr)
plt.plot(np.arange(3, 33, 3), r2_dtr)
# so choose 10 as the maxDepth

# In[108]:

# Random Forest
r2_rfr = np.zeros(10)
for i in np.arange(10):
    rfr = RandomForestRegressor(labelCol='mean_temp', maxDepth=(i + 1) * 3)
    rfrModel = rfr.fit(sample)
    prediction_rfr = rfrModel.transform(sample)
    r2_rfr[i] = evaluator.evaluate(prediction_rfr)
plt.plot(np.arange(3, 33, 3), r2_rfr)
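
# Note: both sweeps above fit and score on the same `sample`, so R2 is
# measured on training data. A minimal held-out variant (hypothetical,
# reusing the evaluator defined earlier for these snippets):
train, test = sample.randomSplit([0.8, 0.2], seed=42)
dtrModel = DecisionTreeRegressor(labelCol='mean_temp', maxDepth=10).fit(train)
print(evaluator.evaluate(dtrModel.transform(test)))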
    def Train(self):
        st_global = time.time()

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"initialization","info",display=True,emptyBin=False,customMsg=None,weightKey="total")

        appType = self._dataframe_context.get_app_type()
        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [x for x in algosToRun if x.get_algorithm_slug() == self._slug][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns)-set(allDateCols))
        print(categorical_columns)
        result_column = self._dataframe_context.get_result_column()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        numerical_columns = [x for x in numerical_columns if x != result_column]

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print "model_path",model_path
        pipeline_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/pipeline/"
        model_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/model"
        pmml_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/modelPmml"

        df = self._data_frame
        if self._mlEnv == "spark":
            pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,categorical_columns,result_column,algoType="regression")

            pipelineModel = pipeline.fit(df)
            indexed = pipelineModel.transform(df)
            featureMapping = sorted((attr["idx"], attr["name"]) for attr in (chain(*indexed.schema["features"].metadata["ml_attr"]["attrs"].values())))

            # print indexed.select([result_column,"features"]).show(5)
            MLUtils.save_pipeline_or_model(pipelineModel,pipeline_filepath)
            # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")
            dtreer = DecisionTreeRegressor(labelCol=result_column, featuresCol='features',predictionCol="prediction")
            if validationDict["name"] == "kFold":
                defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
                numFold = int(validationDict["value"])
                if numFold == 0:
                    numFold = 3
                trainingData,validationData = indexed.randomSplit([defaultSplit,1-defaultSplit], seed=12345)
                # Grid over tree hyperparameters (regParam/fitIntercept/
                # elasticNetParam belong to LinearRegression, not to trees)
                paramGrid = ParamGridBuilder()\
                    .addGrid(dtreer.maxDepth, [3, 5, 7])\
                    .addGrid(dtreer.minInstancesPerNode, [1, 10])\
                    .build()
                crossval = CrossValidator(estimator=dtreer,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(predictionCol="prediction", labelCol=result_column),
                              numFolds=numFold)
                st = time.time()
                cvModel = crossval.fit(indexed)
                trainingTime = time.time()-st
                print "cvModel training takes",trainingTime
                bestModel = cvModel.bestModel
            elif validationDict["name"] == "trainAndtest":
                trainingData,validationData = indexed.randomSplit([float(validationDict["value"]),1-float(validationDict["value"])], seed=12345)
                st = time.time()
                fit = dtreer.fit(trainingData)
                trainingTime = time.time()-st
                print "time to train",trainingTime
                bestModel = fit

            featureImportance = bestModel.featureImportances
            print(featureImportance, type(featureImportance))
            # print featureImportance[0],len(featureImportance[1],len(featureImportance[2]))
            print(len(featureMapping))
            featuresArray = [(name, featureImportance[idx]) for idx, name in featureMapping]
            print(featuresArray)
            MLUtils.save_pipeline_or_model(bestModel,model_filepath)
            transformed = bestModel.transform(validationData)
            transformed = transformed.withColumn(result_column,transformed[result_column].cast(DoubleType()))
            transformed = transformed.select([result_column,"prediction",transformed[result_column]-transformed["prediction"]])
            transformed = transformed.withColumnRenamed(transformed.columns[-1],"difference")
            transformed = transformed.select([result_column,"prediction","difference",FN.abs(transformed["difference"])*100/transformed[result_column]])
            transformed = transformed.withColumnRenamed(transformed.columns[-1],"mape")
            sampleData = None
            nrows = transformed.count()
            if nrows > 100:
                sampleData = transformed.sample(False, float(100)/nrows, seed=420)
            else:
                sampleData = transformed
            sampleData.show()  # show() prints the rows itself and returns None
            evaluator = RegressionEvaluator(predictionCol="prediction",labelCol=result_column)
            metrics = {}
            metrics["r2"] = evaluator.evaluate(transformed,{evaluator.metricName: "r2"})
            metrics["rmse"] = evaluator.evaluate(transformed,{evaluator.metricName: "rmse"})
            metrics["mse"] = evaluator.evaluate(transformed,{evaluator.metricName: "mse"})
            metrics["mae"] = evaluator.evaluate(transformed,{evaluator.metricName: "mae"})
            runtime = round((time.time() - st_global),2)
            # print transformed.count()
            mapeDf = transformed.select("mape")
            # print mapeDf.show()
            mapeStats = MLUtils.get_mape_stats(mapeDf,"mape")
            mapeStatsArr = mapeStats.items()
            mapeStatsArr = sorted(mapeStatsArr,key=lambda x:int(x[0]))
            # print mapeStatsArr
            quantileDf = transformed.select("prediction")
            # print quantileDf.show()
            quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf,"prediction")
            quantileSummaryArr = quantileSummaryDict.items()
            quantileSummaryArr = sorted(quantileSummaryArr,key=lambda x:int(x[0]))
            # print quantileSummaryArr
            self._model_summary.set_model_type("regression")
            self._model_summary.set_algorithm_name("dtree Regression")
            self._model_summary.set_algorithm_display_name("Decision Tree Regression")
            self._model_summary.set_slug(self._slug)
            self._model_summary.set_training_time(runtime)
            self._model_summary.set_training_time(trainingTime)
            self._model_summary.set_target_variable(result_column)
            self._model_summary.set_validation_method(validationDict["displayName"])
            self._model_summary.set_model_evaluation_metrics(metrics)
            self._model_summary.set_model_params({p.name: v for p, v in bestModel.extractParamMap().items()})
            self._model_summary.set_quantile_summary(quantileSummaryArr)
            self._model_summary.set_mape_stats(mapeStatsArr)
            self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
            self._model_summary.set_feature_importance(featureImportance)
            # print CommonUtils.convert_python_object_to_json(self._model_summary)
        elif self._mlEnv == "sklearn":
            model_filepath = model_path+"/"+self._slug+"/model.pkl"
            x_train,x_test,y_train,y_test = self._dataframe_helper.get_train_test_data()
            x_train = MLUtils.create_dummy_columns(x_train,[x for x in categorical_columns if x != result_column])
            x_test = MLUtils.create_dummy_columns(x_test,[x for x in categorical_columns if x != result_column])
            x_test = MLUtils.fill_missing_columns(x_test,x_train.columns,result_column)

            st = time.time()
            est = DecisionTreeRegressor()

            CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"training","info",display=True,emptyBin=False,customMsg=None,weightKey="total")

            if algoSetting.is_hyperparameter_tuning_enabled():
                hyperParamInitParam = algoSetting.get_hyperparameter_params()
                evaluationMetricDict = {"name":hyperParamInitParam["evaluationMetric"]}
                evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
                hyperParamAlgoName = algoSetting.get_hyperparameter_algo_name()
                params_grid = algoSetting.get_params_dict_hyperparameter()
                params_grid = {k:v for k,v in params_grid.items() if k in est.get_params()}
                print(params_grid)
                if hyperParamAlgoName == "gridsearchcv":
                    estGrid = GridSearchCV(est,params_grid)
                    gridParams = estGrid.get_params()
                    hyperParamInitParam = {k:v for k,v in hyperParamInitParam.items() if k in gridParams}
                    estGrid.set_params(**hyperParamInitParam)
                    estGrid.fit(x_train,y_train)
                    bestEstimator = estGrid.best_estimator_
                    modelFilepath = "/".join(model_filepath.split("/")[:-1])
                    sklearnHyperParameterResultObj = SklearnGridSearchResult(estGrid.cv_results_,est,x_train,x_test,y_train,y_test,appType,modelFilepath,evaluationMetricDict=evaluationMetricDict)
                    resultArray = sklearnHyperParameterResultObj.train_and_save_models()
                    self._result_setter.set_hyper_parameter_results(self._slug,resultArray)
                    self._result_setter.set_metadata_parallel_coordinates(self._slug,{"ignoreList":sklearnHyperParameterResultObj.get_ignore_list(),"hideColumns":sklearnHyperParameterResultObj.get_hide_columns(),"metricColName":sklearnHyperParameterResultObj.get_comparison_metric_colname(),"columnOrder":sklearnHyperParameterResultObj.get_keep_columns()})

                elif hyperParamAlgoName == "randomsearchcv":
                    estRand = RandomizedSearchCV(est,params_grid)
                    estRand.set_params(**hyperParamInitParam)
                    bestEstimator = None
            else:
                evaluationMetricDict = {"name":GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC}
                evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
                algoParams = algoSetting.get_params_dict()
                algoParams = {k:v for k,v in algoParams.items() if k in est.get_params().keys()}
                est.set_params(**algoParams)
                self._result_setter.set_hyper_parameter_results(self._slug,None)
                if validationDict["name"] == "kFold":
                    defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
                    numFold = int(validationDict["value"])
                    if numFold == 0:
                        numFold = 3
                    kFoldClass = SkleanrKFoldResult(numFold,est,x_train,x_test,y_train,y_test,appType,evaluationMetricDict=evaluationMetricDict)
                    kFoldClass.train_and_save_result()
                    kFoldOutput = kFoldClass.get_kfold_result()
                    bestEstimator = kFoldClass.get_best_estimator()
                elif validationDict["name"] == "trainAndtest":
                    est.fit(x_train, y_train)
                    bestEstimator = est
            trainingTime = time.time()-st
            y_score = bestEstimator.predict(x_test)
            try:
                y_prob = bestEstimator.predict_proba(x_test)
            except AttributeError:
                # regressors do not expose predict_proba
                y_prob = [0]*len(y_score)
            featureImportance={}

            objs = {"trained_model":bestEstimator,"actual":y_test,"predicted":y_score,"probability":y_prob,"feature_importance":featureImportance,"featureList":list(x_train.columns),"labelMapping":{}}
            featureImportance = objs["trained_model"].feature_importances_
            featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)]

            if not algoSetting.is_hyperparameter_tuning_enabled():
                modelName = "M"+"0"*(GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH-1)+"1"
                modelFilepathArr = model_filepath.split("/")[:-1]
                modelFilepathArr.append(modelName+".pkl")
                joblib.dump(objs["trained_model"],"/".join(modelFilepathArr))
            metrics = {}
            metrics["r2"] = r2_score(y_test, y_score)
            metrics["mse"] = mean_squared_error(y_test, y_score)
            metrics["mae"] = mean_absolute_error(y_test, y_score)
            metrics["rmse"] = sqrt(metrics["mse"])
            transformed = pd.DataFrame({"prediction":y_score,result_column:y_test})
            transformed["difference"] = transformed[result_column] - transformed["prediction"]
            transformed["mape"] = np.abs(transformed["difference"])*100/transformed[result_column]

            sampleData = None
            nrows = transformed.shape[0]
            if nrows > 100:
                sampleData = transformed.sample(n=100,random_state=420)
            else:
                sampleData = transformed
            print(sampleData.head())

            mapeCountArr = pd.cut(transformed["mape"],GLOBALSETTINGS.MAPEBINS).value_counts().to_dict().items()
            mapeStatsArr = [(str(idx),dictObj) for idx,dictObj in enumerate(sorted([{"count":x[1],"splitRange":(x[0].left,x[0].right)} for x in mapeCountArr],key = lambda x:x["splitRange"][0]))]

            predictionColSummary = transformed["prediction"].describe().to_dict()
            quantileBins = [predictionColSummary["min"],predictionColSummary["25%"],predictionColSummary["50%"],predictionColSummary["75%"],predictionColSummary["max"]]
            print(quantileBins)
            quantileBins = sorted(list(set(quantileBins)))
            transformed["quantileBinId"] = pd.cut(transformed["prediction"],quantileBins)
            quantileDf = transformed.groupby("quantileBinId").agg({"prediction":[np.sum,np.mean,np.size]}).reset_index()
            quantileDf.columns = ["prediction","sum","mean","count"]
            print(quantileDf)
            quantileArr = quantileDf.T.to_dict().items()
            quantileSummaryArr = [(obj[0],{"splitRange":(obj[1]["prediction"].left,obj[1]["prediction"].right),"count":obj[1]["count"],"mean":obj[1]["mean"],"sum":obj[1]["sum"]}) for obj in quantileArr]
            print(quantileSummaryArr)
            runtime = round((time.time() - st_global),2)

            self._model_summary.set_model_type("regression")
            self._model_summary.set_algorithm_name("DTREE Regression")
            self._model_summary.set_algorithm_display_name("Decision Tree Regression")
            self._model_summary.set_slug(self._slug)
            self._model_summary.set_training_time(runtime)
            self._model_summary.set_training_time(trainingTime)
            self._model_summary.set_target_variable(result_column)
            self._model_summary.set_validation_method(validationDict["displayName"])
            self._model_summary.set_model_evaluation_metrics(metrics)
            self._model_summary.set_model_params(bestEstimator.get_params())
            self._model_summary.set_quantile_summary(quantileSummaryArr)
            self._model_summary.set_mape_stats(mapeStatsArr)
            self._model_summary.set_sample_data(sampleData.to_dict())
            self._model_summary.set_feature_importance(featuresArray)
            self._model_summary.set_feature_list(list(x_train.columns))


            try:
                pmml_filepath = str(model_path)+"/"+str(self._slug)+"/traindeModel.pmml"
                modelPmmlPipeline = PMMLPipeline([
                  ("pretrained-estimator", objs["trained_model"])
                ])
                modelPmmlPipeline.target_field = result_column
                modelPmmlPipeline.active_fields = np.array([col for col in x_train.columns if col != result_column])
                sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr = True)
                pmmlfile = open(pmml_filepath,"r")
                pmmlText = pmmlfile.read()
                pmmlfile.close()
                self._result_setter.update_pmml_object({self._slug:pmmlText})
            except Exception:
                # PMML export is best-effort; ignore failures
                pass
        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                        "name":self._model_summary.get_algorithm_name(),
                        "evaluationMetricValue":self._model_summary.get_model_accuracy(),
                        "evaluationMetricName":"r2",
                        "slug":self._model_summary.get_slug(),
                        "Model Id":modelName
                        }

            modelSummaryJson = {
                "dropdown":modelDropDownObj,
                "levelcount":self._model_summary.get_level_counts(),
                "modelFeatureList":self._model_summary.get_feature_list(),
                "levelMapping":self._model_summary.get_level_map_dict(),
                "slug":self._model_summary.get_slug(),
                "name":self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                        "name":self._model_summary.get_algorithm_name(),
                        "evaluationMetricValue":resultArray[0]["R-Squared"],
                        "evaluationMetricName":"r2",
                        "slug":self._model_summary.get_slug(),
                        "Model Id":resultArray[0]["Model Id"]
                        }
            modelSummaryJson = {
                "dropdown":modelDropDownObj,
                "levelcount":self._model_summary.get_level_counts(),
                "modelFeatureList":self._model_summary.get_feature_list(),
                "levelMapping":self._model_summary.get_level_map_dict(),
                "slug":self._model_summary.get_slug(),
                "name":self._model_summary.get_algorithm_name()
            }

        dtreerCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]

        for card in dtreerCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary({"dtreeregression":json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
        self._result_setter.set_dtree_regression_model_summart(modelSummaryJson)
        self._result_setter.set_dtreer_cards(dtreerCards)

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"completion","info",display=True,emptyBin=False,customMsg=None,weightKey="total")
Exemplo n.º 32
0
    assembler = VectorAssembler(inputCols=[
        "slot_id", "day_of_week", "day_of_month", "week_nb", "hour", "minute"
    ],
                                outputCol='features')
    output_training = assembler.transform(df_training)
    output_testing = assembler.transform(df_testing)

    final_data_training = output_training.select('features', 'demand')
    final_data_testing = output_testing.select('features', 'demand')

    #final_data_training.describe().show()
    #final_data_testing.describe().show()
    """  Model and predictions : """
    decisionTree = DecisionTreeRegressor(labelCol='demand', maxDepth=3)
    dt_model = decisionTree.fit(final_data_training)
    predictions = dt_model.transform(final_data_testing)
    #print("Decision tree model max depth = %g" % decisionTree.getMaxDepth())
    #print(dt_model.toDebugString)
    """ Evaluation rmse : """
    evaluatorRMSE = RegressionEvaluator(labelCol="demand",
                                        predictionCol="prediction",
                                        metricName="rmse")
    rmse = evaluatorRMSE.evaluate(predictions)
    errorsRMSE.append(rmse)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    evaluatorR2 = RegressionEvaluator(labelCol="demand",
                                      predictionCol="prediction",
                                      metricName="r2")
    r2 = evaluatorR2.evaluate(predictions)
    errorsR2.append(r2)  # mirrors the RMSE bookkeeping above
    print("R squared (R2) on test data = %g" % r2)
Exemplo n.º 33
0
    assembler = VectorAssembler().setInputCols(
        ['HouseAge', 'DistanceToMRT',
         'NumberConvenienceStores']).setOutputCol('features')
    df = assembler.transform(data).select('PriceOfUnitArea', 'features')

    # Let's split our data into training data and testing data
    trainTest = df.randomSplit([0.5, 0.5])
    trainingDF = trainTest[0]
    testDF = trainTest[1]

    # Now create our decision tree regression model
    dtr = DecisionTreeRegressor().setFeaturesCol('features').setLabelCol(
        'PriceOfUnitArea')

    # Train the model using our training data
    model = dtr.fit(trainingDF)

    # Now see if we can predict values in our test data.
    # Generate predictions using our decision tree model for all features in our
    # test dataframe:
    fullPredictions = model.transform(testDF).cache()

    # Extract the predictions and the "known" correct labels.
    predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
    labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

    # Zip them together
    predictionAndLabel = predictions.zip(labels).collect()

    # Print out the predicted and actual values for each point
    for prediction in predictionAndLabel:
        print(prediction)
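
    # A follow-up sketch: the same predictions can also be scored with
    # RegressionEvaluator (assuming the pyspark.ml.evaluation import used
    # elsewhere in these examples):
    evaluator = RegressionEvaluator(labelCol="PriceOfUnitArea",
                                    predictionCol="prediction",
                                    metricName="rmse")
    print("RMSE = %g" % evaluator.evaluate(fullPredictions))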
# COMMAND ----------

# MAGIC %md
# MAGIC #### Regression with decision trees

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor

dtr = DecisionTreeRegressor().setLabelCol('petalWidth')
print(dtr.explainParams())

# COMMAND ----------

dtrModel = dtr.fit(irisPetal)
dtrPredictions = dtrModel.transform(irisPetal)
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'r2'}))
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'rmse'}))

# COMMAND ----------

# MAGIC %md
# MAGIC Let's also build a gradient boosted tree.

# COMMAND ----------

from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor().setLabelCol('petalWidth')
print(gbt.explainParams())
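
# COMMAND ----------

# A short follow-up sketch mirroring the decision-tree cell above: fit the
# GBT on irisPetal and score it with the same regEval evaluator.
gbtModel = gbt.fit(irisPetal)
gbtPredictions = gbtModel.transform(irisPetal)
print(regEval.evaluate(gbtPredictions, {regEval.metricName: 'r2'}))
print(regEval.evaluate(gbtPredictions, {regEval.metricName: 'rmse'}))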