Example #1
def task_7(data_io, train_data, test_data):

    # ---------------------- Your implementation begins------------------------
    dt = DecisionTreeRegressor(labelCol="overall",
                               featuresCol="features",
                               maxDepth=5)
    model = dt.fit(train_data)
    predictions = model.transform(test_data)
    evaluator = RegressionEvaluator(labelCol="overall",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {'test_rmse': None}
    # Modify res:
    res['test_rmse'] = rmse

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_7')
    return res
Example #2
def build_decision_tree_regression(observation_df, feature_columns):
    # Create new column with all of the features
    vector_observation_df = create_feature_column(observation_df,
                                                  feature_columns,
                                                  ['features', 'duration_sec'])

    train_df, test_df = vector_observation_df.randomSplit([0.7, 0.3])
    lr = DecisionTreeRegressor(featuresCol="features", labelCol="duration_sec")

    model = lr.fit(train_df)

    test_predictions = model.transform(test_df)

    test_predictions.select("prediction", "duration_sec", "features").show(5)

    evaluator = RegressionEvaluator(predictionCol='prediction',
                                    labelCol="duration_sec",
                                    metricName="rmse")
    print("RMSE on test data = %g" % evaluator.evaluate(test_predictions))

    evaluator = RegressionEvaluator(predictionCol='prediction',
                                    labelCol="duration_sec",
                                    metricName="r2")

    print("R2 on test data = %g" % evaluator.evaluate(test_predictions))

    return model
Example #3
def test_decision_tree_regressor(self):
    features = [[0, 1], [1, 1], [2, 0]]
    features = numpy.array(features, dtype=numpy.float32)
    labels = [100, -10, 50]
    dd = [(labels[i], Vectors.dense(features[i]))
          for i in range(len(labels))]
    data = self.spark.createDataFrame(
        self.spark.sparkContext.parallelize(dd),
        schema=["label", "features"])
    dt = DecisionTreeRegressor(labelCol="label", featuresCol="features")
    model = dt.fit(data)
    feature_count = data.select('features').first()[0].size
    model_onnx = convert_sparkml(
        model,
        'Sparkml Decision Tree Regressor',
        [('features', FloatTensorType([None, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np,
                             expected,
                             model,
                             model_onnx,
                             basename="SparkmlDecisionTreeRegressor")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #4
def predict_price_of_unit_area_by_decision_tree(
        real_estate_dataset_df: DataFrame):
    """
    Predict the price per unit area based on house age, distance to MRT (public transportation) and number of convenience stores,
    using decision tree regression.
    :param real_estate_dataset_df:
    :return:
    """

    real_estate_dataset_df = transform_dataset_to_label_feature_form(
        real_estate_dataset_df)

    train_test_datasets = real_estate_dataset_df.randomSplit([0.5, 0.5])
    train_dataset = train_test_datasets[0]
    test_dataset = train_test_datasets[1]

    # setLabelCol: use 'actual_price' instead of the default 'label' column name.
    decision_tree_regressor = DecisionTreeRegressor().setLabelCol(
        'actual_price')
    model = decision_tree_regressor.fit(train_dataset)

    # Create predictions for testing dataset.
    predictions = model.transform(test_dataset).\
        select('actual_price', func.round(func.col('prediction'), 2).alias('predicted_price')).\
        orderBy(func.desc('actual_price')).cache()

    return predictions
Example #5
def decision_tree_regression(train_data, test_data):
    dt = DecisionTreeRegressor(featuresCol='features', labelCol='MEDV')
    dt_model = dt.fit(train_data)
    dt_predictions = dt_model.transform(test_data)
    dt_evaluator = RegressionEvaluator(
        labelCol='MEDV',
        predictionCol='prediction',
        metricName='rmse',
    )
    rmse = dt_evaluator.evaluate(dt_predictions)
    print('Root Mean Squared Error (RMSE) on test data = %g' % rmse)
    print(dt_model.featureImportances)
Example #6
def task_8(data_io, train_data, test_data):

    # ---------------------- Your implementation begins------------------------
    trainingData, testData = train_data.randomSplit([0.75, 0.25])
    best = 0
    all_rmse = []
    lowest_rmse = float('inf')  # lowest validation RMSE seen so far
    for i in [5, 7, 9, 12]:
        dt = DecisionTreeRegressor(labelCol="overall",
                                   featuresCol="features",
                                   maxDepth=i)
        model = dt.fit(trainingData)
        predictions = model.transform(testData)
        evaluator = RegressionEvaluator(labelCol="overall",
                                        predictionCol="prediction",
                                        metricName="rmse")
        rmse = evaluator.evaluate(predictions)
        all_rmse = all_rmse + [rmse]
        if rmse <= lowest_rmse:
            lowest_rmse = rmse
            best = i
            best_model = model

    predictions = best_model.transform(test_data)
    evaluator = RegressionEvaluator(labelCol="overall",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'test_rmse': None,
        'valid_rmse_depth_5': None,
        'valid_rmse_depth_7': None,
        'valid_rmse_depth_9': None,
        'valid_rmse_depth_12': None,
    }
    # Modify res:
    res['test_rmse'] = rmse
    res['valid_rmse_depth_5'] = all_rmse[0]
    res['valid_rmse_depth_7'] = all_rmse[1]
    res['valid_rmse_depth_9'] = all_rmse[2]
    res['valid_rmse_depth_12'] = all_rmse[3]

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_8')
    return res
Example #7
def decisionTreeRegressor(data, ncolumns, schemaNames):
    from pyspark.ml.regression import DecisionTreeRegressor
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.feature import Binarizer
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    import numpy as np
    import time

    binarizer = Binarizer(
        threshold=0.00001,
        inputCol="features",
        outputCol="binarized_features",
    )
    binarizedDataFrame = binarizer.transform(data)

    (trainingData, testData) = binarizedDataFrame.randomSplit([0.9, 0.1], 50)
    dtr = DecisionTreeRegressor(labelCol="label",
                                featuresCol="binarized_features",
                                maxDepth=10,
                                maxBins=10,
                                impurity='variance')  # Spark expects lowercase 'variance'

    start = time.time()
    model = dtr.fit(trainingData)
    end = time.time()
    timer = (end - start) / 60  # training time in minutes

    prediction = model.transform(testData)
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(prediction)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    areaUC = evaluator.evaluate(prediction)

    fi = model.featureImportances
    imp_feat = np.zeros(ncolumns - 1)
    imp_feat[fi.indices] = fi.values
    x = np.arange(ncolumns - 1)
    idx = (-imp_feat).argsort()[:3]
    feat = []
    for i in idx:
        feat.append(schemaNames[i])

    return feat, rmse, areaUC, timer
Example #8
def TrainDT(trainingData, testData):
    # Train a DecisionTree model.
    dt = DecisionTreeRegressor()

    # Train model.  This also runs the indexer.
    start = time.time()
    model = dt.fit(trainingData)
    end = time.time()
    print('Training DT model took', end - start)

    # Make predictions.
    predictions = model.transform(testData)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="r2")
    r2 = evaluator.evaluate(predictions)
    print("R2 on test data = %g" % r2)

    # Make predictions for train
    predictions = model.transform(trainingData)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on train data = %g" % rmse)

    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="r2")
    r2 = evaluator.evaluate(predictions)
    print("R2 on train data = %g" % r2)

    return model
Example #9
def decisionTreeRegression(df, arguments):
    from pyspark.ml.regression import DecisionTreeRegressor
    maxDepth = 5
    minInstancesPerNode = 1
    impurity = "variance"

    if arguments.maxDepth is not None:
        maxDepth = int(arguments.maxDepth)  # Spark expects an int, not a float

    if arguments.minInstancesPerNode is not None:
        minInstancesPerNode = int(arguments.minInstancesPerNode)

    if arguments.impurity is not None:
        impurity = arguments.impurity

    dt = DecisionTreeRegressor(maxDepth=maxDepth,
                               minInstancesPerNode=minInstancesPerNode,
                               impurity=impurity)
    model = dt.fit(df)

    return model
Example #10
def decision_tree_regressor():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                (0.0, Vectors.sparse(1, [], []))],
                               ["label", "features"])
    dt = DecisionTreeRegressor(maxDepth=2, varianceCol="variance")
    model = dt.fit(df)
    model.depth
    # 1
    model.numNodes
    # 3
    model.featureImportances
    # SparseVector(1, {0: 1.0})
    model.numFeatures
    # 1
    test0 = spark.createDataFrame([(Vectors.dense(-1.0), )], ["features"])
    model.transform(test0).head().prediction
    # 0.0
    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]), )],
                                  ["features"])
    model.transform(test1).head().prediction
    # 1.0
    temp_path = "./"
    dtr_path = temp_path + "/dtr"
    dt.save(dtr_path)
    dt2 = DecisionTreeRegressor.load(dtr_path)
    dt2.getMaxDepth()
    # 2
    model_path = temp_path + "/dtr_model"
    model.save(model_path)
    model2 = DecisionTreeRegressionModel.load(model_path)
    model.numNodes == model2.numNodes
    # True
    model.depth == model2.depth
    # True
    model.transform(test1).head().variance
Example #11
# (the first lines of this example were truncated; the LinearRegression
# constructor is reconstructed from the surviving argument and the 'PE'
# label column used below)
lr = LinearRegression(featuresCol='features', labelCol='PE',
                      elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

train_df.describe().show()

lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction", "PE", "features").show(5)
lr_evaluator = RegressionEvaluator(predictionCol="prediction",
                                   labelCol="PE",
                                   metricName="r2")
print("R Squared (R2) on test data = %g" %
      lr_evaluator.evaluate(lr_predictions))

## DecisionTreeRegressor portion
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(featuresCol='features', labelCol='PE')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(labelCol="PE",
                                   predictionCol="prediction",
                                   metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print(
    "DecisionTreeRegressor Root Mean Squared Error (RMSE) on test data = %g" %
    rmse)
Example #12
    lr_evaluator = RegressionEvaluator(predictionCol="prediction",
                                       labelCol="MPG",
                                       metricName="r2")
    print("R Squared (R2) for Linear Regression on test data = %g" %
          lr_evaluator.evaluate(lr_predictions))

    # RMSE on test data
    test_result = lr_model.evaluate(test_df)
    print(
        "Root Mean Squared Error (RMSE) for Linear Regression on test data = %g\n"
        % test_result.rootMeanSquaredError)

    #############################---DECISION TREE REGRESSION---##################################

    dt = DecisionTreeRegressor(featuresCol='features', labelCol='MPG')
    decisionTree_model = dt.fit(train_df)
    decisionTree_model_predictions = decisionTree_model.transform(test_df)
    decisionTree_model_evaluator = RegressionEvaluator(
        labelCol="MPG", predictionCol="prediction", metricName="rmse")
    rmse = decisionTree_model_evaluator.evaluate(
        decisionTree_model_predictions)
    print(
        "Root Mean Squared Error (RMSE) for Decision Tree on test data = %g" %
        rmse)
    r2_dt_evaluator = RegressionEvaluator(
        labelCol="MPG", predictionCol="prediction", metricName="r2")
    print("R Squared (R2) for Decision Tree on test data = %g" %
          r2_dt_evaluator.evaluate(decisionTree_model_predictions))

    ############################---RANDOM FOREST REGRESSION---##################################
Example #13
def main():
    errorsRMSE_LR = []
    errorsR2_LR = []
    errorsR2_DT = []
    errorsR2_DT5 = []
    errorsRMSE_DT = []
    errorsRMSE_DT5 = []
    rows_training = []
    rows_testing = [[] for i in range(N_OF_CLUSTERS)]

    for week_nb in range(FIRST_WEEK, LAST_WEEK + 1):
        print('week nb : ', week_nb)
        for day_of_week in range(DAY_IN_WEEK):
            for time_of_day_code in range(TIME_SLOTS_WITHIN_DAY):
                for cid in range(N_OF_CLUSTERS):
                    curFeature = demandCache.get_demand(
                        week_nb, day_of_week, time_of_day_code, cid)
                    if curFeature != []:
                        time_of_day_code, origin, day_of_week, day, week, hour, minute, is_manhattan, is_airport, amount = extract_feature(
                            curFeature)

                        if (week_nb < WEEK_NB_TEST):
                            rows_training.append(
                                (time_of_day_code, origin, day_of_week, day,
                                 week, hour, minute, is_manhattan, is_airport,
                                 amount))
                        else:
                            rows_testing[cid].append(
                                (time_of_day_code, origin, day_of_week, day,
                                 week, hour, minute, is_manhattan, is_airport,
                                 amount))

    df_training = spark.createDataFrame(rows_training, [
        "time_of_day_code", "origin", "day_of_week", "day", "week", "hour",
        "minute", "is_manhattan", "is_airport", "amount"
    ])

    assembler = VectorAssembler(inputCols=[
        "time_of_day_code", "origin", "day_of_week", "day", "week", "hour",
        "minute", "is_manhattan", "is_airport"
    ],
                                outputCol='features')
    output_training = assembler.transform(df_training)

    final_data_training = output_training.select('features', 'amount')

    decisionTree = DecisionTreeRegressor(labelCol='amount', maxDepth=3)
    dt_model = decisionTree.fit(final_data_training)

    #print(dt_model.toDebugString)

    decisionTree5 = DecisionTreeRegressor(labelCol='amount', maxDepth=5)
    dt_model5 = decisionTree5.fit(final_data_training)

    #print(dt_model5.toDebugString)

    with open("DT_final_features_one_model_INFO.txt", "w") as file:
        file.write("DT maxDepth 3 : \n" + dt_model.toDebugString)
        file.write("DT maxDepth 5 : \n" + dt_model5.toDebugString)

    linearRegression = LinearRegression(labelCol='amount')
    lr_model = linearRegression.fit(final_data_training)

    for cid in range(N_OF_CLUSTERS):
        print('cluster: ', cid)
        df_testing = spark.createDataFrame(rows_testing[cid], [
            "time_of_day_code", "origin", "day_of_week", "day", "week", "hour",
            "minute", "is_manhattan", "is_airport", "amount"
        ])
        #df_testing.show()
        output_testing = assembler.transform(df_testing)
        final_data_testing = output_testing.select('features', 'amount')
        predictionsDT = dt_model.transform(final_data_testing)
        predictionsDT5 = dt_model5.transform(final_data_testing)
        predictionsLR = lr_model.evaluate(final_data_testing)
        """ Evaluation rmse : """
        rmse = predictionsLR.rootMeanSquaredError
        errorsRMSE_LR.append(rmse)
        #print("Root Mean Squared Error (RMSE) for LR on test data = %g" % rmse)

        r2 = predictionsLR.r2
        errorsR2_LR.append(r2)
        #print("R Squared Error (R2) for LR on test data = %g" % r2)
        """ Evaluation rmse : """
        evaluatorRMSE = RegressionEvaluator(labelCol="amount",
                                            predictionCol="prediction",
                                            metricName="rmse")
        rmse = evaluatorRMSE.evaluate(predictionsDT)
        rmse5 = evaluatorRMSE.evaluate(predictionsDT5)
        errorsRMSE_DT.append(rmse)
        errorsRMSE_DT5.append(rmse5)
        #print("Root Mean Squared Error (RMSE) for DT on test data = %g" % rmse)

        evaluatorR2 = RegressionEvaluator(labelCol="amount",
                                          predictionCol="prediction",
                                          metricName="r2")
        r2 = evaluatorR2.evaluate(predictionsDT)
        r25 = evaluatorR2.evaluate(predictionsDT5)
        errorsR2_DT.append(r2)
        errorsR2_DT5.append(r25)
        #print("R Squared Error (R2) for DT on test data = %g" % r2)
    return errorsRMSE_LR, errorsR2_LR, errorsRMSE_DT, errorsR2_DT, errorsRMSE_DT5, errorsR2_DT5
Example #14
#testLFDF.take(10)

# COMMAND ----------

#Creating an evaluator measuring our label vs our prediction using RMSE evaluation.
evaluator = RegressionEvaluator(metricName="rmse")\
  .setLabelCol("price_doc")\
  .setPredictionCol("prediction")

# COMMAND ----------

#Decision tree regression, testing on both train and test dataset.
dt = DecisionTreeRegressor(labelCol='price_doc')

#This builds the dt model using the train dataset
model = dt.fit(trainLFDF)
#This predicts dt model outcomes on train and test dataset
trainPredictions = model.transform(trainLFDF)
testPredictions = model.transform(testLFDF)

trainscore = evaluator.evaluate(trainPredictions)
testscore = evaluator.evaluate(testPredictions)
print(trainscore, testscore)

#DT 8 Vars RMSE 3493522, 3901961

# COMMAND ----------

#Gradient boosted tree regression
gbt = GBTRegressor(labelCol='price_doc')
model = gbt.fit(trainLFDF)
Example #15
# COMMAND ----------

# MAGIC %md
# MAGIC #### Regression with decision trees

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor

dtr = DecisionTreeRegressor().setLabelCol('petalWidth')
print(dtr.explainParams())

# COMMAND ----------

dtrModel = dtr.fit(irisPetal)
dtrPredictions = dtrModel.transform(irisPetal)
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'r2'}))
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'rmse'}))

# COMMAND ----------

# MAGIC %md
# MAGIC Let's also build a gradient boosted tree.

# COMMAND ----------

from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor().setLabelCol('petalWidth')
print(gbt.explainParams())
Example #17
df = df.selectExpr("fare_amount as label", 'pickup_longitude',
                   'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
                   'passenger_count')

new_df = vecAssembler.setHandleInvalid("skip").transform(df)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = new_df.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
dt = DecisionTreeRegressor()

start_time = datetime.now()

# Train model.  This also runs the indexer.
model = dt.fit(trainingData)

time_elapsed = datetime.now() - start_time
print('TIME OF DECISION TREE REGRESSION TRAINING (hh:mm:ss.ms) {}'.format(
    time_elapsed))

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")
Example #18
def spark_process(sqlContext, sc, validate, path_to_file):

    ######################
    #
    # HDFS to DataFrame
    #
    ######################

    ## all fields:
    #  ['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance',
    #   'pickup_longitude', 'pickup_latitude', 'rate_code', 'store_and_fwd_flag', 'dropoff_longitude',
    #   'dropoff_latitude', 'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount',
    #   'tolls_amount', 'total_amount']

    # columns to select
    feature_columns = [1, 2, 3, 5, 6, 9, 10]

    # read file and convert to DataFrame
    # dataframe = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load(path_to_file).cache()
    customSchema = StructType([
        StructField("vendor_id", StringType(), True),
        StructField("pickup_datetime", TimestampType(), True),
        StructField("dropoff_datetime", TimestampType(), True),
        StructField("passenger_count", StringType(), True),
        StructField("trip_distance", StringType(), True),
        StructField("pickup_longitude", DoubleType(), True),
        StructField("pickup_latitude", DoubleType(), True),
        StructField("rate_code", StringType(), True),
        StructField("store_and_fwd_flag", StringType(), True),
        StructField("dropoff_longitude", DoubleType(), True),
        StructField("dropoff_latitude", DoubleType(), True),
        StructField("payment_type", StringType(), True),
        StructField("fare_amount", StringType(), True),
        StructField("surcharge", StringType(), True),
        StructField("mta_tax", StringType(), True),
        StructField("tip_amount", StringType(), True),
        StructField("tolls_amount", StringType(), True),
        StructField("total_amount", StringType(), True)
    ])

    # pass the schema through .schema(); as a reader option it would be ignored
    dataframe = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true').schema(customSchema).load(path_to_file)
    # create dataframe with selected columns
    dataframe = dataframe.select(*(dataframe.columns[n]
                                   for n in feature_columns))

    # this number does not include the header
    # number_of_trips = dataframe.count()

    sqlContext.clearCache()
    ######################
    #
    # Preprocess data
    #
    ######################

    # filter rows with null fields
    # if passenger count is missing assign it a value of 1
    # filter invalid location: keep only areas near NYC
    dataframe = dataframe.na.drop(how='any',subset=['pickup_datetime','dropoff_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']) \
         .fillna(1,subset=["passenger_count"])     \
         .filter(dataframe.pickup_latitude>40.0)   \
         .filter(dataframe.pickup_latitude<41.0)   \
         .filter(dataframe.pickup_longitude<-73.0) \
         .filter(dataframe.pickup_longitude>-74.0) \
         .filter(dataframe.dropoff_latitude>40.0)  \
         .filter(dataframe.dropoff_latitude<41.0)  \
         .filter(dataframe.dropoff_longitude<-73.0)\
         .filter(dataframe.dropoff_longitude>-74.0)

    ######################
    #
    # features engineering
    #
    ######################

    # create new column based on time-delta (minutes)
    # convert pickup-datetime column to hour

    time_delta_udf = udf(time_delta_minutes, FloatType())

    dataframe = dataframe.withColumn('time_delta', time_delta_udf(dataframe.pickup_datetime,dataframe.dropoff_datetime)) \
          .withColumn('pick_up_hour', hour(dataframe.pickup_datetime))

    dataframe = dataframe.select(dataframe.pick_up_hour,    \
           dataframe.passenger_count.cast("integer"),  \
          dataframe.pickup_longitude.cast("double"), \
          dataframe.pickup_latitude.cast("double"),  \
          dataframe.dropoff_longitude.cast("double"),\
          dataframe.dropoff_latitude.cast("double"), \
          dataframe.time_delta.cast("double"))

    dataframe = dataframe.filter(dataframe.time_delta > 1.0).cache()

    # split dataframe into feature and label vector
    # create feature vectors and labels for model training
    feature_assembler = VectorAssembler(inputCols=[
        'pick_up_hour', 'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude', 'dropoff_latitude'
    ],
                                        outputCol='features')

    transformed = feature_assembler.transform(dataframe)
    vector_dataframe = transformed.select(
        col("time_delta").alias("label"), col("features")).cache()

    ######################
    #
    # train model
    #
    ######################

    if validate:

        ################################
        #
        # validate model on 60/40 split
        #
        ################################

        # split
        training, test = vector_dataframe.randomSplit([0.6, 0.4], seed=0)

        decision_tree_reg = DecisionTreeRegressor(maxDepth=12, maxBins=25)
        model = decision_tree_reg.fit(training)

        train_pred = model.transform(training)
        test_pred = model.transform(test)

        evaluator = RegressionEvaluator(labelCol="label",
                                        predictionCol="prediction",
                                        metricName="r2")
        r2_train = evaluator.evaluate(train_pred)

        evaluator_test = RegressionEvaluator(labelCol="label",
                                             predictionCol="prediction",
                                             metricName="r2")
        r2_test = evaluator_test.evaluate(test_pred)

        output = test_pred.select("prediction", "label", "features")

        return output, r2_test, r2_train

    else:

        ###################
        #
        # train on all data
        #
        ###################

        decision_tree_reg = DecisionTreeRegressor(maxDepth=12, maxBins=25)
        model = decision_tree_reg.fit(vector_dataframe)

        predictions = model.transform(vector_dataframe)

        output = predictions.select("prediction", "label", "features")

        ###########################
        #
        # process to send to Kafka
        #
        ###########################

        schema = StructType([
            StructField("prediction_mins", FloatType(), True),
            StructField("pick_up_hour", IntegerType(), True),
            StructField("pickup_longitude", DoubleType(), True),
            StructField("pickup_latitude", DoubleType(), True),
            StructField("dropoff_longitude", DoubleType(), True),
            StructField("dropoff_latitude", DoubleType(), True)
        ])

        # DataFrame has no .map in Spark 2+; go through the underlying RDD
        features_from_predictions = output.rdd.map(lambda row: (
            float(row.prediction), int(row.features[0]),
            float(row.features[1]), float(row.features[2]),
            float(row.features[3]), float(row.features[4]))).collect()
        sqlContext.clearCache()
        dataframe_from_prediction_vector = sqlContext.createDataFrame(
            features_from_predictions, schema).cache()

        return dataframe_from_prediction_vector
Example #20
def doGrid_one():
    grid_data = getGridData(sqlCtx, '_ngrid2500')
    errorsRMSE_LR = []
    errorsR2_LR = []
    errorsRMSE_DT = []
    errorsR2_DT = []
    hor = grid_data['horizontal_slots']
    vert = grid_data['vertical_slots']
    print(hor, vert)
    hor = 24
    vert = 24
    for x in range(hor):
        print('grid hor:', x)
        for y in range(vert):
            train, test = get_features_for_grid(spark, x, y)
            assembler = VectorAssembler(inputCols=[
                "day", "day_of_week", "hour", "is_airport", "is_manhattan",
                "minute", 'pickup_lat_slot', 'pickup_long_slot',
                "time_of_day_code", "week"
            ],
                                        outputCol='features')
            output_training = assembler.transform(train)
            output_testing = assembler.transform(test)

            final_data_training = output_training.select('features', 'amount')
            final_data_testing = output_testing.select('features', 'amount')

            final_data_training.describe().show()
            final_data_testing.describe().show()

            decisionTree = DecisionTreeRegressor(labelCol='amount', maxDepth=3)
            dt_model = decisionTree.fit(final_data_training)
            predictionsDT = dt_model.transform(final_data_testing)
            print(dt_model.toDebugString)

            linearRegression = LinearRegression(labelCol='amount')
            lr_model = linearRegression.fit(final_data_training)
            predictionsLR = lr_model.evaluate(final_data_testing)
            """ Evaluation LR : """
            rmse = predictionsLR.rootMeanSquaredError
            errorsRMSE_LR.append(rmse)
            #print("Root Mean Squared Error (RMSE) for LR on test data = ", rmse)

            r2 = predictionsLR.r2
            errorsR2_LR.append(r2)
            #print("R Squared Error (R2) for LR on test data = ", r2)
            """ Evaluation DT : """
            evaluatorRMSE = RegressionEvaluator(labelCol="amount",
                                                predictionCol="prediction",
                                                metricName="rmse")
            rmse = evaluatorRMSE.evaluate(predictionsDT)
            errorsRMSE_DT.append(rmse)
            #print("Root Mean Squared Error (RMSE) for DT on test data = ", rmse)

            evaluatorR2 = RegressionEvaluator(labelCol="amount",
                                              predictionCol="prediction",
                                              metricName="r2")
            r2 = evaluatorR2.evaluate(predictionsDT)
            errorsR2_DT.append(r2)
            #print("R Squared Error (R2) for DT on test data = ", r2)

    return hor, vert, errorsR2_DT, errorsRMSE_DT, errorsRMSE_LR, errorsR2_LR
Example #21
print("\n")

print("For the whole dataset, the DecisionTreeRegressor is starting...")
evaluator_reg = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
print("\n")
print(
    "Fetching the best values of parameters from 25% dataset and using them..."
)
rtime = time.time()
dtr = DecisionTreeRegressor(labelCol="label",
                            featuresCol="features",
                            maxDepth=maxDepth_dtr,
                            maxBins=maxBins_dtr)
model_dtr = dtr.fit(trainingData)
predictions_dtr = model_dtr.transform(testData)
binarizer = Binarizer(threshold=0.5,
                      inputCol="prediction",
                      outputCol="binarized_prediction")
binarizedDataFrame = binarizer.transform(predictions_dtr)
binarized = binarizedDataFrame.drop('prediction')
bdf_dtr = binarized.withColumnRenamed('binarized_prediction', 'prediction')
r = time.time() - rtime
evaluator_reg = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy_reg = evaluator_reg.evaluate(bdf_dtr)
print("\n")
print("Accuracy for DecisionTreeRegressor on the whole dataset = %g " %
      accuracy_reg)
evaluate_area_dtr = BinaryClassificationEvaluator(
Example #22
def Forecast(df, forecast_days, nLags, \
             timeSeriesColumn, regressor, sparksession):
    
    # this performs model training
    # this calls the machine-learning algorithms of Spark ML library
    
    #labels for machine-learning
    LeadWindow = window.Window.rowsBetween(0,forecast_days)   
    df = df.withColumn("label",func.last(df[timeSeriesColumn]).over(LeadWindow))
    
    features = [timeSeriesColumn]
    
    #Auto-regression feature
    LagTransformer = LagGather()\
                     .setLagLength(nLags)\
                     .setInputCol(timeSeriesColumn)
    df = LagTransformer.transform(df)
    featuresGenerated = LagTransformer.getFeatureNames()
    features.extend(featuresGenerated)
    
    #Other feature generators here:
    #Moving Average Smoothing
    #TrendGather

#******************************************************************************
# VECTOR ASSEMBLER
    # this assembles the all the features 
    df = df.dropna()
    vA = VectorAssembler().setInputCols(features)\
                          .setOutputCol("features")
    df_m = vA.transform(df)
#******************************************************************************
# Splitting data into train, test
    splitratio = 0.7
    df_train, df_test = TimeSeriesSplit(df_m, splitratio, sparksession)
#******************************************************************************
# DECISION-TREE REGRESSOR
    if(regressor == "DecisionTreeRegression"):
           
        dr = DecisionTreeRegressor(featuresCol = "features",\
                                   labelCol = "label", maxDepth = 5)
        model = dr.fit(df_train)
        predictions_dr_test = model.transform(df_test)
        predictions_dr_train = model.transform(df_train)
        
        # note: metricName is "r2" here, although the results are stored
        # in RMSE_*-named variables
        evaluator = RegressionEvaluator(predictionCol="prediction",
                                        labelCol="label",
                                        metricName="r2")
        
        RMSE_dr_test = evaluator.evaluate(predictions_dr_test)
        RMSE_dr_train = evaluator.evaluate(predictions_dr_train)
        return (df_test, df_train, \
                predictions_dr_test, predictions_dr_train,\
                RMSE_dr_test, RMSE_dr_train)
#******************************************************************************
# LINEAR REGRESSOR
    if(regressor == 'LinearRegression'):
        lr = LinearRegression(featuresCol = "features", labelCol="label", \
                              maxIter = 100, regParam = 0.4, \
                              elasticNetParam = 0.1)
        model = lr.fit(df_train)
        predictions_lr_test = model.transform(df_test)
        predictions_lr_train = model.transform(df_train)
        
        # note: metricName is "r2" here as well, despite the RMSE_* names
        evaluator = RegressionEvaluator(predictionCol="prediction",
                                        labelCol="label",
                                        metricName="r2")
        RMSE_lr_test= evaluator.evaluate(predictions_lr_test)
        RMSE_lr_train = evaluator.evaluate(predictions_lr_train)
        return (df_test, df_train, \
                predictions_lr_test, predictions_lr_train,\
                RMSE_lr_test, RMSE_lr_train)
    

#*****************************************************************************
# RANDOM FOREST REGRESSOR
    if(regressor == 'RandomForestRegression'):
        rfr = RandomForestRegressor(featuresCol="features",\
                                    labelCol="label",\
                                    maxDepth = 5,\
                                    subsamplingRate = 0.8,\
                                    )
        model = rfr.fit(df_train)
        predictions_rfr_test = model.transform(df_test)
        predictions_rfr_train = model.transform(df_train)
        
        # RMSE is used as evaluation metric
        evaluator = RegressionEvaluator(predictionCol="prediction",\
                                        labelCol="label",\
                                        metricName ="rmse")
        RMSE_rfr_test= evaluator.evaluate(predictions_rfr_test)
        RMSE_rfr_train = evaluator.evaluate(predictions_rfr_train)
        return (df_test, df_train, \
                predictions_rfr_test, predictions_rfr_train,\
                RMSE_rfr_test, RMSE_rfr_train)
    

#*****************************************************************************
# GRADIENT BOOSTING TREE REGRESSOR
    if(regressor == 'GBTRegression'):
        gbt = GBTRegressor(featuresCol="features",\
                           labelCol="label",\
                           maxDepth=5,\
                           subsamplingRate=0.8)
        
        model = gbt.fit(df_train)
        predictions_gbt_test = model.transform(df_test)
        predictions_gbt_train = model.transform(df_train)
        
        # RMSE is used as evaluation metric
        evaluator = RegressionEvaluator(predictionCol="prediction",\
                                        labelCol="label",\
                                        metricName ="rmse")
        
        RMSE_gbt_test= evaluator.evaluate(predictions_gbt_test)
        RMSE_gbt_train = evaluator.evaluate(predictions_gbt_train)
        return (df_test, df_train, \
                predictions_gbt_test, predictions_gbt_train,\
                RMSE_gbt_test, RMSE_gbt_train)
Example #23
trainDF.cache()
testDF.cache()


# - ##### Train a regression tree to predict the minutos variable.

# In[57]:


dt = DecisionTreeRegressor(labelCol='minutos')  # uses "features" as the input column by default


# In[58]:


model = dt.fit(trainDF)


# - ##### Evaluate the resulting model with RMSE on both the training and the test sample, and comment on the result.

# In[59]:


predictionDF = model.transform(testDF)


# In[23]:


evaluator = RegressionEvaluator(labelCol="minutos")
Example #25
def main():
    for cid in range(N_OF_CLUSTERS):
        rows_training = []
        rows_testing = []
        for week_nb in range(FIRST_WEEK, LAST_WEEK + 1):
            print('week nb : ', week_nb)
            for day_of_week in range(DAY_IN_WEEK):
                for time_of_day_code in range(TIME_SLOTS_WITHIN_DAY):
                    # retrieving the feature values:
                    curFeature = demandCache.get_demand(
                        week_nb, day_of_week, time_of_day_code, cid)
                    if curFeature != []:
                        time_of_day_code, origin, day_of_week, day, week, hour, minute, is_manhattan, is_airport, amount = extract_feature(
                            curFeature)

                        # Checking whether the current row should be added to the training or testing set:
                        if (week_nb < WEEK_NB_TEST):
                            rows_training.append(
                                (time_of_day_code, origin, day_of_week, day,
                                 week, hour, minute, amount))
                        else:
                            rows_testing.append(
                                (time_of_day_code, origin, day_of_week, day,
                                 week, hour, minute, amount))

        # Creating the dataframes for the model containing all the rows:
        df_training = spark.createDataFrame(rows_training, [
            "time_of_day_code", "origin", "day_of_week", "day", "week", "hour",
            "minute", "amount"
        ])
        df_testing = spark.createDataFrame(rows_testing, [
            "time_of_day_code", "origin", "day_of_week", "day", "week", "hour",
            "minute", "amount"
        ])

        assembler = VectorAssembler(inputCols=[
            "time_of_day_code", "origin", "day_of_week", "day", "week", "hour",
            "minute"
        ],
                                    outputCol='features')
        output_training = assembler.transform(df_training)
        output_testing = assembler.transform(df_testing)

        final_data_training = output_training.select('features', 'amount')
        final_data_testing = output_testing.select('features', 'amount')

        # Training the Decision Tree:
        decisionTree = DecisionTreeRegressor(labelCol='amount', maxDepth=3)
        dt_model = decisionTree.fit(final_data_training)
        predictionsDT = dt_model.transform(final_data_testing)
        # print(dt_model.toDebugString) # showing the decision tree

        # Training the linear regression:
        linearRegression = LinearRegression(labelCol='amount')
        lr_model = linearRegression.fit(final_data_training)
        predictionsLR = lr_model.evaluate(final_data_testing)
        """ Evaluation rmse : """
        rmse = predictionsLR.rootMeanSquaredError
        errorsRMSE_LR.append(rmse)
        print("Root Mean Squared Error (RMSE) for LR on test data = %g" % rmse)

        r2 = predictionsLR.r2
        errorsR2_LR.append(r2)
        print("R Squared Error (R2) for LR on test data = %g" % r2)
        """ Evaluation rmse : """
        evaluatorRMSE = RegressionEvaluator(labelCol="amount",
                                            predictionCol="prediction",
                                            metricName="rmse")
        rmse = evaluatorRMSE.evaluate(predictionsDT)
        errorsRMSE_DT.append(rmse)
        print("Root Mean Squared Error (RMSE) for DT on test data = %g" % rmse)

        evaluatorR2 = RegressionEvaluator(labelCol="amount",
                                          predictionCol="prediction",
                                          metricName="r2")
        r2 = evaluatorR2.evaluate(predictionsDT)
        errorsR2_DT.append(r2)
        print("R Squared Error (R2) for DT on test data = %g" % r2)
Example #26
glr = GeneralizedLinearRegression()\
  .setFamily("gaussian")\
  .setLink("identity")\
  .setMaxIter(10)\
  .setRegParam(0.3)\
  .setLinkPredictionCol("linkOut")
print(glr.explainParams())
glrModel = glr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
print(dtr.explainParams())
dtrModel = dtr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
rf = RandomForestRegressor()
print(rf.explainParams())
rfModel = rf.fit(df)
gbt = GBTRegressor()
print(gbt.explainParams())
gbtModel = gbt.fit(df)


# COMMAND ----------
Example #27
# Split the data 80-20
train_test_data = model_data.randomSplit([0.8, 0.2], 16430212)
train_data = train_test_data[0]
test_data = train_test_data[1]

print("Train DT")
rmseEvaluator = myRmseEvaluator(
    RegressionEvaluator(predictionCol="prediction",
                        labelCol="trip_duration",
                        metricName="rmse"))
maeEvaluator = RegressionEvaluator(predictionCol="prediction",
                                   labelCol="trip_duration",
                                   metricName="mae")
dtr = DecisionTreeRegressor(
    maxDepth=3).setFeaturesCol("features").setLabelCol("trip_duration")
trained_model = dtr.fit(train_data)
predictions = trained_model.transform(test_data)

# final_result = predictions.select("prediction", "trip_duration").rdd
print(trained_model)
print("RMSE for Regression Tree:", rmseEvaluator.evaluate(predictions))
print("MAE for Regression Tree:", maeEvaluator.evaluate(predictions))
"""
DecisionTreeRegressionModel: uid=DecisionTreeRegressor_826b1c042824, depth=3, numNodes=15, numFeatures=7
  If (feature 6 <= 2.7002733639393384)
   If (feature 6 <= 1.3071311166631614)
    If (feature 6 <= 0.825910208978972)
     Predict: 481.0347939172201
    Else (feature 6 > 0.825910208978972)
     Predict: 704.5021037177617
   Else (feature 6 > 1.3071311166631614)
    ...
"""
Exemplo n.º 28
0
categoricalColumns = ['store_and_fwd_flag']
stages = [] # stages in our Pipeline
for categoricalCol in categoricalColumns:
  stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
  encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
  # Add stages.  These are not run here, but will run all at once later on.
  stages += [stringIndexer, encoder]

#encColumns = ['VendorID','RatecodeID','PULocationID','DOLocationID','payment_type','Peak_Time','weekend']
encColumns = ['VendorID','RatecodeID','PULocationID','DOLocationID','payment_type']
for eCol in encColumns:
  encoder = OneHotEncoder(inputCol=eCol, outputCol=eCol+"classVec")
  stages += [encoder]
#label_stringIdx = StringIndexer(inputCol = "verified_purchase", outputCol = "label")
#stages += [label_stringIdx]

numericCols = ["trip_distance", "passenger_count", "fare_amount","tip_amount"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + \
                  [c + "classVec" for c in encColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(train_X4)
dataset = pipelineModel.transform(train_X4)
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(labelCol="total_amount", featuresCol="features", maxBins=32)
model = dt.fit(dataset)
model.write().overwrite().save("./nyc-01020304-6vm-18-DT-model")
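# A minimal follow-up sketch for reloading the persisted model later, using
# the same path as the save above:
from pyspark.ml.regression import DecisionTreeRegressionModel
reloaded = DecisionTreeRegressionModel.load("./nyc-01020304-6vm-18-DT-model")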


Exemplo n.º 29
0
from pyspark.ml.regression import DecisionTreeRegressor
dt_models = {}
dt_predictions = {}

compute_again = False
if not compute_again:
  dt_models = loadModels("TreeModel_","tree")
  for park in park_data_with_date_dict:
    dt_predictions[park] = dt_models[park].transform(test_ds[park])
else:
  for park in park_data_with_date_dict:
    #vectorAssembler = VectorAssembler(inputCols=features, outputCol="features")
    #data = vectorAssembler.transform(all_tables[park])
    #train, test = data.randomSplit([0.8,0.2], seed = 12345)
    dt = DecisionTreeRegressor()
    dt_models[park] = dt.fit(train_ds[park])
    dt_predictions[park] = dt_models[park].transform(test_ds[park])
  saveModels(dt_models,"TreeModel_","tree")

# COMMAND ----------

# NOTE: to visualize the actual trees you can call display(dt_models[park])

# COMMAND ----------

def printEvaluateModel(park,modelsCollection, predictionsCollection):
  print("EVALUATE MODEL FOR PARKING "+str(park))
  print("OVER TEST SET")
  print("Features importance:" + str(modelsCollection[park].featureImportances))
  evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
  # Root Mean Square Error on the test predictions
  rmse = evaluator.evaluate(predictionsCollection[park])
  print("RMSE: " + str(rmse))
import numpy as np
import matplotlib.pyplot as plt

# In[106]:

### model building process
#create a small sample for model testing (randomSplit normalizes the weights,
#so this keeps roughly 1/9 of the data)
sample, rest = final_data.randomSplit([0.1, 0.8])

# In[107]:

# decision trees
r2_dtr = np.zeros(10)
for i in np.arange(10):
    dtr = DecisionTreeRegressor(labelCol='mean_temp', maxDepth=(i + 1) * 3)
    dtrModel = dtr.fit(sample)
    prediction_dtr = dtrModel.transform(sample)
    r2_dtr[i] = evaluator.evaluate(prediction_dtr)
plt.plot(np.arange(3, 33, 3), r2_dtr)
# so choose 10 as the maxDepth

# In[108]:

# Random Forest
r2_rfr = np.zeros(10)
for i in np.arange(10):
    rfr = RandomForestRegressor(labelCol='mean_temp', maxDepth=(i + 1) * 3)
    rfrModel = rfr.fit(sample)
    prediction_rfr = rfrModel.transform(sample)
    r2_rfr[i] = evaluator.evaluate(prediction_rfr)
plt.plot(np.arange(3, 33, 3), r2_rfr)
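
# Note: both sweeps above fit and score on the same `sample`, so R2 is
# measured on training data. A minimal held-out variant (hypothetical,
# reusing the evaluator defined earlier for these snippets):
train, test = sample.randomSplit([0.8, 0.2], seed=42)
dtrModel = DecisionTreeRegressor(labelCol='mean_temp', maxDepth=10).fit(train)
print(evaluator.evaluate(dtrModel.transform(test)))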
    def Train(self):
        st_global = time.time()

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"initialization","info",display=True,emptyBin=False,customMsg=None,weightKey="total")

        appType = self._dataframe_context.get_app_type()
        algosToRun = self._dataframe_context.get_algorithms_to_run()
        algoSetting = [x for x in algosToRun if x.get_algorithm_slug() == self._slug][0]
        categorical_columns = self._dataframe_helper.get_string_columns()
        uid_col = self._dataframe_context.get_uid_column()
        if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
            categorical_columns = list(set(categorical_columns) - {uid_col})
        allDateCols = self._dataframe_context.get_date_columns()
        categorical_columns = list(set(categorical_columns)-set(allDateCols))
        print(categorical_columns)
        result_column = self._dataframe_context.get_result_column()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        numerical_columns = [x for x in numerical_columns if x != result_column]

        model_path = self._dataframe_context.get_model_path()
        if model_path.startswith("file"):
            model_path = model_path[7:]
        validationDict = self._dataframe_context.get_validation_dict()
        print "model_path",model_path
        pipeline_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/pipeline/"
        model_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/model"
        pmml_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/modelPmml"

        df = self._data_frame
        if self._mlEnv == "spark":
            pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,categorical_columns,result_column,algoType="regression")

            pipelineModel = pipeline.fit(df)
            indexed = pipelineModel.transform(df)
            featureMapping = sorted((attr["idx"], attr["name"]) for attr in (chain(*indexed.schema["features"].metadata["ml_attr"]["attrs"].values())))

            # print indexed.select([result_column,"features"]).show(5)
            MLUtils.save_pipeline_or_model(pipelineModel,pipeline_filepath)
            # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn")
            dtreer = DecisionTreeRegressor(labelCol=result_column, featuresCol='features',predictionCol="prediction")
            if validationDict["name"] == "kFold":
                defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
                numFold = int(validationDict["value"])
                if numFold == 0:
                    numFold = 3
                trainingData,validationData = indexed.randomSplit([defaultSplit,1-defaultSplit], seed=12345)
                # Grid over tree hyperparameters (regParam/fitIntercept/
                # elasticNetParam belong to LinearRegression, not to trees)
                paramGrid = ParamGridBuilder()\
                    .addGrid(dtreer.maxDepth, [3, 5, 7])\
                    .addGrid(dtreer.minInstancesPerNode, [1, 10])\
                    .build()
                crossval = CrossValidator(estimator=dtreer,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(predictionCol="prediction", labelCol=result_column),
                              numFolds=numFold)
                st = time.time()
                cvModel = crossval.fit(indexed)
                trainingTime = time.time()-st
                print "cvModel training takes",trainingTime
                bestModel = cvModel.bestModel
            elif validationDict["name"] == "trainAndtest":
                trainingData,validationData = indexed.randomSplit([float(validationDict["value"]),1-float(validationDict["value"])], seed=12345)
                st = time.time()
                fit = dtreer.fit(trainingData)
                trainingTime = time.time()-st
                print "time to train",trainingTime
                bestModel = fit

            featureImportance = bestModel.featureImportances
            print(featureImportance, type(featureImportance))
            # print featureImportance[0],len(featureImportance[1],len(featureImportance[2]))
            print(len(featureMapping))
            featuresArray = [(name, featureImportance[idx]) for idx, name in featureMapping]
            print(featuresArray)
            MLUtils.save_pipeline_or_model(bestModel,model_filepath)
            transformed = bestModel.transform(validationData)
            transformed = transformed.withColumn(result_column,transformed[result_column].cast(DoubleType()))
            transformed = transformed.select([result_column,"prediction",transformed[result_column]-transformed["prediction"]])
            transformed = transformed.withColumnRenamed(transformed.columns[-1],"difference")
            transformed = transformed.select([result_column,"prediction","difference",FN.abs(transformed["difference"])*100/transformed[result_column]])
            transformed = transformed.withColumnRenamed(transformed.columns[-1],"mape")
            sampleData = None
            nrows = transformed.count()
            if nrows > 100:
                sampleData = transformed.sample(False, float(100)/nrows, seed=420)
            else:
                sampleData = transformed
            sampleData.show()  # show() prints the rows itself and returns None
            evaluator = RegressionEvaluator(predictionCol="prediction",labelCol=result_column)
            metrics = {}
            metrics["r2"] = evaluator.evaluate(transformed,{evaluator.metricName: "r2"})
            metrics["rmse"] = evaluator.evaluate(transformed,{evaluator.metricName: "rmse"})
            metrics["mse"] = evaluator.evaluate(transformed,{evaluator.metricName: "mse"})
            metrics["mae"] = evaluator.evaluate(transformed,{evaluator.metricName: "mae"})
            runtime = round((time.time() - st_global),2)
            # print transformed.count()
            mapeDf = transformed.select("mape")
            # print mapeDf.show()
            mapeStats = MLUtils.get_mape_stats(mapeDf,"mape")
            mapeStatsArr = mapeStats.items()
            mapeStatsArr = sorted(mapeStatsArr,key=lambda x:int(x[0]))
            # print mapeStatsArr
            quantileDf = transformed.select("prediction")
            # print quantileDf.show()
            quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf,"prediction")
            quantileSummaryArr = quantileSummaryDict.items()
            quantileSummaryArr = sorted(quantileSummaryArr,key=lambda x:int(x[0]))
            # print quantileSummaryArr
            self._model_summary.set_model_type("regression")
            self._model_summary.set_algorithm_name("dtree Regression")
            self._model_summary.set_algorithm_display_name("Decision Tree Regression")
            self._model_summary.set_slug(self._slug)
            self._model_summary.set_training_time(runtime)
            self._model_summary.set_training_time(trainingTime)
            self._model_summary.set_target_variable(result_column)
            self._model_summary.set_validation_method(validationDict["displayName"])
            self._model_summary.set_model_evaluation_metrics(metrics)
            self._model_summary.set_model_params({p.name: v for p, v in bestModel.extractParamMap().items()})
            self._model_summary.set_quantile_summary(quantileSummaryArr)
            self._model_summary.set_mape_stats(mapeStatsArr)
            self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
            self._model_summary.set_feature_importance(featureImportance)
            # print CommonUtils.convert_python_object_to_json(self._model_summary)
        elif self._mlEnv == "sklearn":
            model_filepath = model_path+"/"+self._slug+"/model.pkl"
            x_train,x_test,y_train,y_test = self._dataframe_helper.get_train_test_data()
            x_train = MLUtils.create_dummy_columns(x_train,[x for x in categorical_columns if x != result_column])
            x_test = MLUtils.create_dummy_columns(x_test,[x for x in categorical_columns if x != result_column])
            x_test = MLUtils.fill_missing_columns(x_test,x_train.columns,result_column)

            st = time.time()
            est = DecisionTreeRegressor()

            CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"training","info",display=True,emptyBin=False,customMsg=None,weightKey="total")

            if algoSetting.is_hyperparameter_tuning_enabled():
                hyperParamInitParam = algoSetting.get_hyperparameter_params()
                evaluationMetricDict = {"name":hyperParamInitParam["evaluationMetric"]}
                evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
                hyperParamAlgoName = algoSetting.get_hyperparameter_algo_name()
                params_grid = algoSetting.get_params_dict_hyperparameter()
                params_grid = {k:v for k,v in params_grid.items() if k in est.get_params()}
                print(params_grid)
                if hyperParamAlgoName == "gridsearchcv":
                    estGrid = GridSearchCV(est,params_grid)
                    gridParams = estGrid.get_params()
                    hyperParamInitParam = {k:v for k,v in hyperParamInitParam.items() if k in gridParams}
                    estGrid.set_params(**hyperParamInitParam)
                    estGrid.fit(x_train,y_train)
                    bestEstimator = estGrid.best_estimator_
                    modelFilepath = "/".join(model_filepath.split("/")[:-1])
                    sklearnHyperParameterResultObj = SklearnGridSearchResult(estGrid.cv_results_,est,x_train,x_test,y_train,y_test,appType,modelFilepath,evaluationMetricDict=evaluationMetricDict)
                    resultArray = sklearnHyperParameterResultObj.train_and_save_models()
                    self._result_setter.set_hyper_parameter_results(self._slug,resultArray)
                    self._result_setter.set_metadata_parallel_coordinates(self._slug,{"ignoreList":sklearnHyperParameterResultObj.get_ignore_list(),"hideColumns":sklearnHyperParameterResultObj.get_hide_columns(),"metricColName":sklearnHyperParameterResultObj.get_comparison_metric_colname(),"columnOrder":sklearnHyperParameterResultObj.get_keep_columns()})

                elif hyperParamAlgoName == "randomsearchcv":
                    estRand = RandomizedSearchCV(est,params_grid)
                    estRand.set_params(**hyperParamInitParam)
                    bestEstimator = None
            else:
                evaluationMetricDict = {"name":GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC}
                evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
                algoParams = algoSetting.get_params_dict()
                algoParams = {k:v for k,v in algoParams.items() if k in est.get_params().keys()}
                est.set_params(**algoParams)
                self._result_setter.set_hyper_parameter_results(self._slug,None)
                if validationDict["name"] == "kFold":
                    defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
                    numFold = int(validationDict["value"])
                    if numFold == 0:
                        numFold = 3
                    kFoldClass = SkleanrKFoldResult(numFold,est,x_train,x_test,y_train,y_test,appType,evaluationMetricDict=evaluationMetricDict)
                    kFoldClass.train_and_save_result()
                    kFoldOutput = kFoldClass.get_kfold_result()
                    bestEstimator = kFoldClass.get_best_estimator()
                elif validationDict["name"] == "trainAndtest":
                    est.fit(x_train, y_train)
                    bestEstimator = est
            trainingTime = time.time()-st
            y_score = bestEstimator.predict(x_test)
            try:
                y_prob = bestEstimator.predict_proba(x_test)
            except AttributeError:
                # regressors do not expose predict_proba
                y_prob = [0]*len(y_score)
            featureImportance={}

            objs = {"trained_model":bestEstimator,"actual":y_test,"predicted":y_score,"probability":y_prob,"feature_importance":featureImportance,"featureList":list(x_train.columns),"labelMapping":{}}
            featureImportance = objs["trained_model"].feature_importances_
            featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)]

            if not algoSetting.is_hyperparameter_tuning_enabled():
                modelName = "M"+"0"*(GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH-1)+"1"
                modelFilepathArr = model_filepath.split("/")[:-1]
                modelFilepathArr.append(modelName+".pkl")
                joblib.dump(objs["trained_model"],"/".join(modelFilepathArr))
            metrics = {}
            metrics["r2"] = r2_score(y_test, y_score)
            metrics["mse"] = mean_squared_error(y_test, y_score)
            metrics["mae"] = mean_absolute_error(y_test, y_score)
            metrics["rmse"] = sqrt(metrics["mse"])
            transformed = pd.DataFrame({"prediction":y_score,result_column:y_test})
            transformed["difference"] = transformed[result_column] - transformed["prediction"]
            transformed["mape"] = np.abs(transformed["difference"])*100/transformed[result_column]

            sampleData = None
            nrows = transformed.shape[0]
            if nrows > 100:
                sampleData = transformed.sample(n=100,random_state=420)
            else:
                sampleData = transformed
            print(sampleData.head())

            mapeCountArr = pd.cut(transformed["mape"],GLOBALSETTINGS.MAPEBINS).value_counts().to_dict().items()
            mapeStatsArr = [(str(idx),dictObj) for idx,dictObj in enumerate(sorted([{"count":x[1],"splitRange":(x[0].left,x[0].right)} for x in mapeCountArr],key = lambda x:x["splitRange"][0]))]

            predictionColSummary = transformed["prediction"].describe().to_dict()
            quantileBins = [predictionColSummary["min"],predictionColSummary["25%"],predictionColSummary["50%"],predictionColSummary["75%"],predictionColSummary["max"]]
            print(quantileBins)
            quantileBins = sorted(list(set(quantileBins)))
            transformed["quantileBinId"] = pd.cut(transformed["prediction"],quantileBins)
            quantileDf = transformed.groupby("quantileBinId").agg({"prediction":[np.sum,np.mean,np.size]}).reset_index()
            quantileDf.columns = ["prediction","sum","mean","count"]
            print(quantileDf)
            quantileArr = quantileDf.T.to_dict().items()
            quantileSummaryArr = [(obj[0],{"splitRange":(obj[1]["prediction"].left,obj[1]["prediction"].right),"count":obj[1]["count"],"mean":obj[1]["mean"],"sum":obj[1]["sum"]}) for obj in quantileArr]
            print(quantileSummaryArr)
            runtime = round((time.time() - st_global),2)

            self._model_summary.set_model_type("regression")
            self._model_summary.set_algorithm_name("DTREE Regression")
            self._model_summary.set_algorithm_display_name("Decision Tree Regression")
            self._model_summary.set_slug(self._slug)
            self._model_summary.set_training_time(runtime)
            self._model_summary.set_training_time(trainingTime)
            self._model_summary.set_target_variable(result_column)
            self._model_summary.set_validation_method(validationDict["displayName"])
            self._model_summary.set_model_evaluation_metrics(metrics)
            self._model_summary.set_model_params(bestEstimator.get_params())
            self._model_summary.set_quantile_summary(quantileSummaryArr)
            self._model_summary.set_mape_stats(mapeStatsArr)
            self._model_summary.set_sample_data(sampleData.to_dict())
            self._model_summary.set_feature_importance(featuresArray)
            self._model_summary.set_feature_list(list(x_train.columns))


            try:
                pmml_filepath = str(model_path)+"/"+str(self._slug)+"/traindeModel.pmml"
                modelPmmlPipeline = PMMLPipeline([
                  ("pretrained-estimator", objs["trained_model"])
                ])
                modelPmmlPipeline.target_field = result_column
                modelPmmlPipeline.active_fields = np.array([col for col in x_train.columns if col != result_column])
                sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr = True)
                pmmlfile = open(pmml_filepath,"r")
                pmmlText = pmmlfile.read()
                pmmlfile.close()
                self._result_setter.update_pmml_object({self._slug:pmmlText})
            except Exception:
                # PMML export is best-effort; ignore failures
                pass
        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                        "name":self._model_summary.get_algorithm_name(),
                        "evaluationMetricValue":self._model_summary.get_model_accuracy(),
                        "evaluationMetricName":"r2",
                        "slug":self._model_summary.get_slug(),
                        "Model Id":modelName
                        }

            modelSummaryJson = {
                "dropdown":modelDropDownObj,
                "levelcount":self._model_summary.get_level_counts(),
                "modelFeatureList":self._model_summary.get_feature_list(),
                "levelMapping":self._model_summary.get_level_map_dict(),
                "slug":self._model_summary.get_slug(),
                "name":self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                        "name":self._model_summary.get_algorithm_name(),
                        "evaluationMetricValue":resultArray[0]["R-Squared"],
                        "evaluationMetricName":"r2",
                        "slug":self._model_summary.get_slug(),
                        "Model Id":resultArray[0]["Model Id"]
                        }
            modelSummaryJson = {
                "dropdown":modelDropDownObj,
                "levelcount":self._model_summary.get_level_counts(),
                "modelFeatureList":self._model_summary.get_feature_list(),
                "levelMapping":self._model_summary.get_level_map_dict(),
                "slug":self._model_summary.get_slug(),
                "name":self._model_summary.get_algorithm_name()
            }

        dtreerCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]

        for card in dtreerCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary({"dtreeregression":json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))})
        self._result_setter.set_dtree_regression_model_summart(modelSummaryJson)
        self._result_setter.set_dtreer_cards(dtreerCards)

        CommonUtils.create_update_and_save_progress_message(self._dataframe_context,self._scriptWeightDict,self._scriptStages,self._slug,"completion","info",display=True,emptyBin=False,customMsg=None,weightKey="total")
Exemplo n.º 32
0
    assembler = VectorAssembler(inputCols=[
        "slot_id", "day_of_week", "day_of_month", "week_nb", "hour", "minute"
    ],
                                outputCol='features')
    output_training = assembler.transform(df_training)
    output_testing = assembler.transform(df_testing)

    final_data_training = output_training.select('features', 'demand')
    final_data_testing = output_testing.select('features', 'demand')

    #final_data_training.describe().show()
    #final_data_testing.describe().show()
    """  Model and predictions : """
    decisionTree = DecisionTreeRegressor(labelCol='demand', maxDepth=3)
    dt_model = decisionTree.fit(final_data_training)
    predictions = dt_model.transform(final_data_testing)
    #print("Decision tree model max depth = %g" % decisionTree.getMaxDepth())
    #print(dt_model.toDebugString)
    """ Evaluation rmse : """
    evaluatorRMSE = RegressionEvaluator(labelCol="demand",
                                        predictionCol="prediction",
                                        metricName="rmse")
    rmse = evaluatorRMSE.evaluate(predictions)
    errorsRMSE.append(rmse)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    evaluatorR2 = RegressionEvaluator(labelCol="demand",
                                      predictionCol="prediction",
                                      metricName="r2")
    r2 = evaluatorR2.evaluate(predictions)
    errorsR2.append(r2)  # mirrors the RMSE bookkeeping above
    print("R squared (R2) on test data = %g" % r2)
Exemplo n.º 33
0
    assembler = VectorAssembler().setInputCols(
        ['HouseAge', 'DistanceToMRT',
         'NumberConvenienceStores']).setOutputCol('features')
    df = assembler.transform(data).select('PriceOfUnitArea', 'features')

    # Let's split our data into training data and testing data
    trainTest = df.randomSplit([0.5, 0.5])
    trainingDF = trainTest[0]
    testDF = trainTest[1]

    # Now create our decision tree regression model
    dtr = DecisionTreeRegressor().setFeaturesCol('features').setLabelCol(
        'PriceOfUnitArea')

    # Train the model using our training data
    model = dtr.fit(trainingDF)

    # Now see if we can predict values in our test data.
    # Generate predictions using our decision tree model for all features in our
    # test dataframe:
    fullPredictions = model.transform(testDF).cache()

    # Extract the predictions and the "known" correct labels.
    predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
    labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

    # Zip them together
    predictionAndLabel = predictions.zip(labels).collect()

    # Print out the predicted and actual values for each point
    for prediction in predictionAndLabel:
        print(prediction)
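
    # A follow-up sketch: the same predictions can also be scored with
    # RegressionEvaluator (assuming the pyspark.ml.evaluation import used
    # elsewhere in these examples):
    evaluator = RegressionEvaluator(labelCol="PriceOfUnitArea",
                                    predictionCol="prediction",
                                    metricName="rmse")
    print("RMSE = %g" % evaluator.evaluate(fullPredictions))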
# COMMAND ----------

# MAGIC %md
# MAGIC #### Regression with decision trees

# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor

dtr = DecisionTreeRegressor().setLabelCol('petalWidth')
print(dtr.explainParams())

# COMMAND ----------

dtrModel = dtr.fit(irisPetal)
dtrPredictions = dtrModel.transform(irisPetal)
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'r2'}))
print(regEval.evaluate(dtrPredictions, {regEval.metricName: 'rmse'}))

# COMMAND ----------

# MAGIC %md
# MAGIC Let's also build a gradient boosted tree.

# COMMAND ----------

from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor().setLabelCol('petalWidth')
print(gbt.explainParams())
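
# COMMAND ----------

# A short follow-up sketch mirroring the decision-tree cell above: fit the
# GBT on irisPetal and score it with the same regEval evaluator.
gbtModel = gbt.fit(irisPetal)
gbtPredictions = gbtModel.transform(irisPetal)
print(regEval.evaluate(gbtPredictions, {regEval.metricName: 'r2'}))
print(regEval.evaluate(gbtPredictions, {regEval.metricName: 'rmse'}))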