def updateResources(self, duration_model_path, crowdedness_model_path,
                    pipeline_path, routes_stops_path):
    self.duration_model = LinearRegressionModel.load(duration_model_path)
    self.crowdedness_model = LinearRegressionModel.load(crowdedness_model_path)
    self.pipeline = PipelineModel.load(pipeline_path)
    self.intermediate_stops_extraction_handler = IntermediateStopsExtractionHandler(
        self.sc, self.sqlContext, routes_stops_path)
def __init__(self, app_name, duration_model_path, crowdedness_model_path,
             pipeline_path, routes_stops_path):
    self.sc = SparkContext(conf=SparkConf().setAppName(app_name))
    self.sqlContext = SQLContext(self.sc)
    self.duration_model = LinearRegressionModel.load(duration_model_path)
    self.crowdedness_model = LinearRegressionModel.load(crowdedness_model_path)
    self.pipeline = PipelineModel.load(pipeline_path)
    self.intermediate_stops_extraction_handler = IntermediateStopsExtractionHandler(
        self.sc, self.sqlContext, routes_stops_path)
def predict(bucket_name, feature_path, feature_name, output_path, plot_path):
    sc = SparkContext.getOrCreate()
    sqlCtx = SQLContext(sc)
    model_path = path.join(output_path, "regression-model")
    print("Load model from:", model_path)
    lrModel = LinearRegressionModel.load(model_path)

    # read last maintenance time from json
    maintain4 = 0.0
    maintain12 = 0.0
    with open(path.join(output_path, "last_maintain.json")) as f:
        last_maintain = json.load(f)
        maintain4 = last_maintain['maintain4']
        maintain12 = last_maintain['maintain12']

    # read data from s3 for prediction
    df = read_data(bucket_name, feature_path, feature_name)

    # transform predict data
    df = df.withColumn('maintain4', lit(maintain4))
    df = df.withColumn('maintain12', lit(maintain12))
    test = df.rdd.map(
        lambda x: (
            Vectors.dense([x.amount, x.split, x.maintain4, x.maintain12]),
        )
    ).toDF(["features"])

    lrModel.transform(test).toPandas().to_csv(
        path_or_buf=path.join(output_path, "pred-" + feature_name))
def main(bootstrap_server, topic_name, time_interval, scikit_model_path,
         spark_model_path, output_attribute_index):
    # Load the Passive Aggressive Regressor model
    with hdfs.open(scikit_model_path, 'r') as opened_file:
        regressor = pickle.load(opened_file)

    # Initialize the application configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Consumer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")

    # Initialize the session
    # (required in order to load the model)
    session = SparkSession(context)

    # Load the Spark model from the given path
    model = LinearRegressionModel.load(spark_model_path)

    # Instantiate the streaming context
    # (so that processing runs every time_interval seconds)
    # and the stream for the given topic and Kafka broker
    streaming_context = StreamingContext(context, time_interval)
    stream = KafkaUtils.createDirectStream(
        streaming_context, [topic_name],
        {"metadata.broker.list": bootstrap_server})

    # For each RDD, run the prediction function
    # (passing the loaded models and the output attribute index)
    stream.foreachRDD(lambda input_data: prediction(
        input_data, regressor, model, output_attribute_index))

    # Start processing the stream
    streaming_context.start()
    streaming_context.awaitTermination()
def getOrCreateLR(self):
    try:
        if self.lrModel is None:
            self.lrModel = LinearRegressionModel.load(CONST_LR_FILE)
    except Exception:
        print("Creating LR Model")
        self.lrModel = self.createLR()
    return self.lrModel
def loadModels(path, typeofmodel):
    models = {}
    for park in park_data_with_date_dict:
        if typeofmodel == "linear":
            models[park] = LinearRegressionModel.load(path + str(park))
        elif typeofmodel == "tree":
            models[park] = DecisionTreeRegressionModel.load(path + str(park))
        elif typeofmodel == "gbt":
            models[park] = GBTRegressionModel.load(path + str(park))
    return models
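A minimal usage sketch for the loader above. The path prefix, park id, and features_df are placeholders, and the function assumes the global park_data_with_date_dict it iterates over is already populated:

# Hypothetical usage: load one linear model per park, then score a feature DataFrame.
models = loadModels("hdfs:///models/park_", "linear")   # path prefix is an assumption
predictions = models[42].transform(features_df)          # park id 42 and features_df are placeholders
predictions.select("prediction").show(5)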
def load(self, load_dir):
    if os.path.isdir(load_dir):
        if self.pm == 'PM10':
            self.model = LinearRegressionModel.load(
                os.path.join(load_dir, 'model'))
        else:
            self.model = RandomForestRegressionModel.load(
                os.path.join(load_dir, 'model'))
        self.imputer = ImputerModel.load(os.path.join(load_dir, 'imputer'))
        self.assembler = VectorAssembler.load(
            os.path.join(load_dir, 'assembler'))
    else:
        raise RuntimeError(
            'Save path: {}, does not exist or is not a directory'.format(
                load_dir))
def predict(): """ https://app.host/predict?value=0 """ value = int(request.args.get("value")) spark_session, _ = create_spark_connection() model_load = LinearRegressionModel.load( os.path.join(os.path.dirname(os.path.abspath(__file__)), "model")) predict_df = spark_session.createDataFrame([(1, Vectors.dense(value))], ["index", "features"]) predict_collected = model_load.transform(predict_df).collect()[0] features = predict_collected.features.values.tolist() prediction = predict_collected.prediction output = {"features": features, "prediction": prediction} return jsonify(output)
def process(self, data_input, data_output, model):
    """
    A Spark process to do inference
    :param data_input: data input filename
    :param data_output: data output filename
    :param model: path of the saved Linear Regression model
    """
    # Load Linear Regression model
    lr_model = LinearRegressionModel.load(model)
    new_data = self.spark.read.parquet(data_input)
    # evaluate() returns a LinearRegressionSummary, not a DataFrame;
    # its .predictions DataFrame is what gets written out
    # (this requires the label column to be present in the input).
    new_predictions = lr_model.evaluate(new_data).predictions
    # Save result as parquet
    new_predictions.write.format("parquet").mode('overwrite').option(
        "header", "true").save(data_output)
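For reference, a minimal sketch of the difference between transform() and evaluate() on a loaded LinearRegressionModel; the path and the DataFrames are placeholders:

# Sketch only: the "features" column must match what the model was trained on.
loaded = LinearRegressionModel.load("/tmp/lr_model")    # placeholder path
scored = loaded.transform(df)                           # DataFrame with an added "prediction" column
summary = loaded.evaluate(df_with_label)                # LinearRegressionSummary (needs the label column)
print(summary.rootMeanSquaredError)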
def find_rating(player_id, cur_date):
    sp_sess = SparkSession.builder.appName('Regr_Data').getOrCreate()
    play_path = "hdfs://localhost:9000/players.csv"
    players = sp_sess.read.csv(play_path, header=True, inferSchema=True)
    assembler = VectorAssembler(inputCols=['new_diff'], outputCol='features')

    name_df = players.filter(players['Id'] == int(player_id))
    player_date = name_df.select("birthDate").collect()[0].birthDate
    new_date1 = player_date.split('-')
    new_date2 = cur_date.split('-')
    d1 = datetime.date(int(new_date1[0]), int(new_date1[1]), int(new_date1[2]))
    d2 = datetime.date(int(new_date2[0]), int(new_date2[1]), int(new_date2[2]))
    diff = abs(d2 - d1).days

    my_rating = 1.000
    my_schema = StructType([
        StructField('diff', IntegerType(), True),
        StructField('rating', FloatType(), True)
    ])
    my_dict = {'diff': diff, 'rating': my_rating}
    new_df = sp_sess.createDataFrame([my_dict], my_schema)
    new_df = new_df.withColumn('new_diff', new_df['diff'] / 1000)
    new_df = new_df.withColumn('new_rating', new_df['rating'] * 10)
    test = assembler.transform(new_df)

    final_model = LinearRegressionModel.load('reg_model')
    res = final_model.evaluate(test)
    req = res.predictions.select("prediction").rdd.flatMap(lambda x: x).collect()
    final_res = req[0] / 10
    if final_res > 1:
        final_res /= 2
    if final_res > 0.9:
        final_res /= 2
    return abs(final_res)
def main():
    feature_model = VectorIndexerModel.load(featureIndexer_path)
    vector_assembler = VectorAssembler.load(vectorAssembler_path)
    ohe_model = OneHotEncoderModel.load(ohe_model_path)
    stringIndexer_model = StringIndexerModel.load(stringIndexerPath)
    lr_model = LinearRegressionModel.load(model_path)

    spark = SparkSession.builder.master("local").appName("Connection").getOrCreate()

    # request.get_json() returns a dict, so fields are read with [] lookups
    json_data = request.get_json()
    availability = json_data["availability"]
    minimum_nights = json_data["minimum_nights"]
    latitude = json_data["latitude"]
    longitude = json_data["longitude"]
    name = json_data["name"]
    neighbourhood_group = json_data["neighbourhood_group"]
    neighbourhood = json_data["neighbourhood"]
    room_type = json_data["room_type"]

    dept = [(name, neighbourhood_group, neighbourhood, room_type, latitude,
             longitude, 0.0, minimum_nights, 0.0, 1.0, availability, 0.0)]
    df = spark.createDataFrame(data=dept, schema=deptColumns)
    df = stringIndexer_model.transform(df)
    df = df.drop(*["neighbourhood_group", "neighbourhood", "room_type"])
    df = ohe_model.transform(df)
    df = df.drop(*["neighbourhood_group_int", "neighbourhood_int", "room_type_int"])
    df = df.withColumn("minimum_nights",
                       when(df["minimum_nights_int"] > 30, 30)
                       .otherwise(df["minimum_nights_int"])).drop('minimum_nights_int')
    df = df.withColumn('name_length', length('name')).drop('name')
    df = vector_assembler.transform(df)
    df = df.select(['features'])
    df = feature_model.transform(df)
    df = df.select(['features_vec'])

    lr_predictions = lr_model.transform(df)
    return jsonify(data=lr_predictions.collect()[-1].prediction)
def linear_regression():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    df = spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                (0.0, 2.0, Vectors.sparse(1, [], []))],
                               ["label", "weight", "features"])
    lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal",
                          weightCol="weight")
    model = lr.fit(df)

    test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
    abs(model.transform(test0).head().prediction - (-1.0)) < 0.001  # True
    abs(model.coefficients[0] - 1.0) < 0.001  # True
    abs(model.intercept - 0.0) < 0.001  # True

    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
    abs(model.transform(test1).head().prediction - 1.0) < 0.001  # True

    # lr.setParams("vector") raises:
    # Traceback (most recent call last):
    #     ...
    # TypeError: Method setParams forces keyword arguments.

    temp_path = "./"
    lr_path = temp_path + "/lr"
    lr.save(lr_path)
    lr2 = LinearRegression.load(lr_path)
    lr2.getMaxIter()  # 5

    model_path = temp_path + "/lr_model"
    model.save(model_path)
    model2 = LinearRegressionModel.load(model_path)
    model.coefficients[0] == model2.coefficients[0]  # True
    model.intercept == model2.intercept  # True
    model.numFeatures  # 1
def loadModelLinearRegression(conf, path):
    """
        input  : conf, path
        output : model (CrossValidatorModel / TrainValidationSplitModel / LinearRegressionModel)
    """
    # If ML tuning was used
    if conf["tuning"]:
        # With cross validation, the saved model type is CrossValidatorModel
        if conf["tuning"].get("method").lower() == "crossval":
            load_model = CrossValidatorModel.load(path)
        # With train-validation split, the saved model type is TrainValidationSplitModel
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            load_model = TrainValidationSplitModel.load(path)
    # Without ML tuning, the saved model type is LinearRegressionModel
    elif conf["tuning"] is None:
        load_model = LinearRegressionModel.load(path)
    return load_model
def loadModel(self):
    if self.algoName == "linear_reg" or self.algoName == \
            "ridge_reg" or self.algoName == "lasso_reg":
        regressionPrediction = LinearRegressionModel.load(self.modelStorageLocation)
    if self.algoName == "RandomForestAlgo":
        regressionPrediction = RandomForestRegressionModel.load(self.modelStorageLocation)
    if self.algoName == "GradientBoostAlgo":
        regressionPrediction = GBTRegressionModel.load(self.modelStorageLocation)

    # dropping the prediction column left over from a previous run of the same model
    self.dataset = self.dataset.drop(self.modelSheetName)
    predictionData = regressionPrediction.transform(self.dataset)
    predictionData = predictionData.drop(self.featuresColm)

    # dropping the extra added columns
    if self.indexedFeatures:
        self.indexedFeatures.extend(self.oneHotEncodedFeaturesList)
        predictionData = predictionData.drop(*self.indexedFeatures)
    else:
        predictionData = predictionData

    # overwriting the original dataset
    '''This write step is needed because Spark does not read or write the whole
    dataset at once -- it only brings limited data into memory and evaluates
    lazily -- so overwriting the same dataset that is already in memory is not
    possible directly.'''
    emptyUserId = ''
    fileNameWithPathTemp = self.locationAddress + emptyUserId + self.datasetName + "_temp.parquet"
    predictionData.write.parquet(fileNameWithPathTemp, mode="overwrite")
    predictionDataReadAgain = self.spark.read.parquet(fileNameWithPathTemp)
    predictionTableData = \
        PredictiveUtilities.writeToParquet(fileName=self.datasetName,
                                           locationAddress=self.locationAddress,
                                           userId=emptyUserId,
                                           data=predictionDataReadAgain)
    return predictionTableData
def __init__(self):
    print('== [Model] Creating spark session...')
    self.spark = SparkSession.builder.appName('lin_reg_api').getOrCreate()
    # self.spark = SparkSession.newSession()
    print('== [Model] spark version', self.spark.version)

    print('== [Model] Loading model...')
    self.model = LinearRegressionModel.load('model_lin_reg')
    print('== [Model] Loading complete...')

    self.entire_Set = self.spark.read.csv('./airfoil_self_noise.csv',
                                          header=True, inferSchema=True)

    # define transformers...
    self.airfoil_assembler = VectorAssembler(inputCols=X_Cols,
                                             outputCol='_features')
    freq_scaler = StandardScaler(inputCol="_features", outputCol="features")
    tuned_input_vec = self.airfoil_assembler.transform(
        self.entire_Set).select('_features')
    self.std_scaler = freq_scaler.fit(tuned_input_vec)
    return
dataset.groupby("quality").count().show() # ################################################################################################################ # export the trained model and create a zip file for ease of download import shutil from pyspark.ml.regression import LinearRegressionModel regressor.write().overwrite().save("cs643") path_drv = shutil.make_archive("cs643", format='zip', base_dir="cs643") shutil.unpack_archive( "cs643.zip", "test", format='zip', ) loadedRegressor = LinearRegressionModel.load("test/cs643") predictions = loadedRegressor.transform(valid_finalized_data) print(loadedRegressor.numFeatures) predictions.show() # ################################################################################################################ # run some equick evaluations from pyspark.ml.evaluation import RegressionEvaluator eval = RegressionEvaluator(labelCol=dataset.columns[11], predictionCol="prediction", metricName="rmse") # Root Mean Square Error rmse = eval.evaluate(pred.predictions) print("RMSE: %.3f" % rmse) # Mean Square Error mse = eval.evaluate(pred.predictions, {eval.metricName: "mse"})
def main():
    # Silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*', action='ignore',
                            category=DeprecationWarning)
    model_name = 'Distr_LinearRegression'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict, 'supervision')
    name_dict,options,task_id,job_id,train_result_dir,\
    names_str,names_num,names_show,Y_names,dir_of_inputdata,\
    dir_of_outputdata,open_pca,train_size,test_size,normalized_type = bag

    dir_of_storePara = train_result_dir + '/%s_Parameters.json' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)
    dir_of_storeModel = train_result_dir + '/%s_model' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)

    # Configure the Spark client
    sess = SparkSession\
        .builder\
        .master("local[4]")\
        .appName("LinearRegression_spark")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    sc = sess.sparkContext
    sc.setLogLevel("ERROR")

    if options == 'train':
        time_start = time()
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # For testing only
        # dataset = dataset[0:1000]
        Y_datavec = dataset[Y_names].values
        # Get the string and numeric fields separately, then merge them
        X_datavec, X_columns, vocabset, datavec_show_list = too.Merge_form(
            dataset, names_str, names_num, names_show, 'vocabset', 'open')
        # Normalize the data
        X_datavec = too.Data_process(X_datavec, normalized_type)
        # Handle data imbalance
        # X,Y = mlp.KMeans_unbalanced(X_datavec,Y_datavec,X_columns,Y_names)
        # X,Y = mlp.Sample_unbalanced(X_datavec,Y_datavec)
        X, Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            pca_num, ret = mlp.GS_PCA(X)
            print('PCA Information:', pca_num, ret)
            print('----------------------------------------------')
            ret_num = ret['99%']
            X = mlp.Model_PCA(X, ret_num)
        # Store the vocabset list and ret_num
        too.StorePara(dir_of_storePara, vocabset, ret_num)

        print('--------------Train data shape----------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('Y.shape:', Y.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X, )
        targets = pd.DataFrame(Y, columns=['Y'])
        # Concatenate the matrices
        merged = pd.concat([features, targets], axis=1)
        # Create a Spark DataFrame
        raw_df = sess.createDataFrame(merged)
        # Extract features and target
        fomula = RFormula(formula='Y ~ .', featuresCol="features",
                          labelCol="label")
        raw_df = fomula.fit(raw_df).transform(raw_df)
        # Split into training and test sets
        xy_train, xy_test = raw_df.randomSplit([train_size, test_size],
                                               seed=666)
        # Fit the model
        clf_model = dmp.Distr_LinearRegression(xy_train, xy_test)
        # Save the model
        clf_model.write().overwrite().save(dir_of_storeModel)
        print('----------------------------------------------')
        dmp.Predict_test_data(xy_test, datavec_show_list, names_show,
                              clf_model, dir_of_outputdata, 'reg')
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)

    if options == 'predict':
        time_start = time()
        with open(dir_of_storePara, 'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # Get the string and numeric fields separately, then merge them
        X_datavec, datavec_show_list = too.Merge_form(dataset, names_str,
                                                      names_num, names_show,
                                                      vocabset, 'close')
        # Normalize the data
        X = too.Data_process(X_datavec, normalized_type)
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X, ret_num)

        print('-------------Predict data shape---------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X, )
        # Create a Spark DataFrame
        raw_features = sess.createDataFrame(features)
        raw_x = VectorAssembler(inputCols=raw_features.columns,
                                outputCol='features').transform(raw_features)
        clf_model = LinearRegressionModel.load(dir_of_storeModel)
        dmp.Predict_data(raw_x, datavec_show_list, names_show, clf_model,
                         dir_of_outputdata, 'reg')
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.regression import LinearRegressionModel

sc = SparkContext()
sqlContext = SQLContext(sc)

model_1 = LinearRegressionModel.load("My_Model")
print("Model loaded successfully")
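A minimal follow-up sketch for the snippet above, assuming "My_Model" was trained on a single-element vector column named "features"; the sample value is a placeholder:

from pyspark.ml.linalg import Vectors

# Score a hypothetical one-row DataFrame with the loaded model (sketch only).
sample_df = sqlContext.createDataFrame([(Vectors.dense([1.0]),)], ["features"])
model_1.transform(sample_df).select("features", "prediction").show()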
# need to load in testing dataset
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

sc = SparkContext()
sqlContext = SQLContext(sc)
print(sys.argv[1])

test_df = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true', inferschema='true', delimiter=';').load(sys.argv[1])
print(test_df.take(1))

from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(
    inputCols=['"""""fixed acidity""""', '""""volatile acidity""""',
               '""""citric acid""""', '""""residual sugar""""',
               '""""chlorides""""', '""""free sulfur dioxide""""',
               '""""total sulfur dioxide""""', '""""density""""',
               '""""pH""""', '""""sulphates""""', '""""alcohol""""'],
    outputCol='features')
vtest_df = vectorAssembler.transform(test_df)
vtest_df = vtest_df.select(['features', '""""quality"""""'])
vtest_df.show(3)

from pyspark.ml.regression import LinearRegressionModel

lr_model = LinearRegressionModel.load('model')
lr_predictions = lr_model.transform(vtest_df)
lr_predictions.select('prediction', '""""quality"""""', 'features').show(5)

from pyspark.ml.evaluation import RegressionEvaluator

lr_evaluator = RegressionEvaluator(predictionCol='prediction',
                                   labelCol='""""quality"""""',
                                   metricName='r2')
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))
def main():
    if len(sys.argv) > 1:
        predictionFile = sys.argv[1]
        if path.isfile(predictionFile):
            print("Processing File " + predictionFile)
        else:
            print("File not found " + predictionFile)
            usageMessage()
            exit()
    else:
        usageMessage()

    spark = SparkSession.builder.master("local[*]").getOrCreate()

    # load trained model
    loadedRegressor = LinearRegressionModel.load("/cs643")

    # read dataset to predict
    validationdataset = spark.read.option("delimiter", ";").csv(
        predictionFile, inferSchema=True, header=True)
    # validationdataset.printSchema()

    # Process the data set into the expected format:
    # combine the feature columns into attributes.
    # Because of the data file format, use the column list rather than field names explicitly.
    # For reference, here are the expected column names:
    # TrainingDataset.csv': b'"""""fixed acidity"""";""""volatile acidity"""";""""citric acid"""";""""residual sugar"""";""""chlorides"""";
    # """"free sulfur dioxide"""";""""total sulfur dioxide"""";""""density"""";""""pH"""";""""sulphates"""";""""alcohol""""
    assembler = VectorAssembler(
        inputCols=[validationdataset.columns[1], validationdataset.columns[2],
                   validationdataset.columns[3], validationdataset.columns[4],
                   validationdataset.columns[5], validationdataset.columns[6],
                   validationdataset.columns[7], validationdataset.columns[8],
                   validationdataset.columns[9], validationdataset.columns[10]],
        outputCol="Attributes")
    valid_output = assembler.transform(validationdataset)
    valid_finalized_data = valid_output.select("Attributes",
                                               validationdataset.columns[11])
    # valid_finalized_data.show()

    # predict the quality
    predictions = loadedRegressor.transform(valid_finalized_data)

    evaluator = RegressionEvaluator(labelCol=validationdataset.columns[11],
                                    predictionCol="prediction",
                                    metricName="rmse")

    # Root Mean Square Error
    rmse = evaluator.evaluate(predictions)
    print("RMSE: %.3f" % rmse)

    # Mean Square Error
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    print("MSE: %.3f" % mse)

    # Mean Absolute Error
    mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
    print("MAE: %.3f" % mae)

    # r2 - coefficient of determination
    r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
    print("r2: %.3f" % r2)

    # display results
    predictions.show(2000, truncate=False)
    # We could do this on a row count rather than 2000, but what if we end up
    # with a million-row model somehow?
## Import Libraries
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.regression import LinearRegression, LinearRegressionModel

sc = SparkContext()
spark = SparkSession(sc)

## Load model
lrModel = LinearRegressionModel.load(
    'gs://spark-training-data/ml_models/sample_model.model')

## Read in the data from model_test_jc
df = spark.read.format('libsvm').load(
    'gs://spark-training-data/datasets/sample_linear_regression_data.txt')
df.show(5)

## Predict Results
predictions = lrModel.transform(df)
predictions.show(5)
    .filter(df.dayofyear.isNotNull())

# create a features column: list of open prices averaged by day
df = df.groupby('symbol').agg(collect_list('avg(open)').alias("features"))

# add a yearly average column
yearly_avg = udf(lambda x: sum(x) / len(x), DoubleType())
df = df.withColumn("yearly_average", yearly_avg("features"))

# convert to vectors for the linear regression model
array_to_vector = udf(lambda x: Vectors.dense(x[0]), VectorUDT())
df = df.withColumn("features", array_to_vector("features"))

# load the model and apply it
model_path = "s3://" + bucket_name + "/models/lr_model"
loaded_model = LinearRegressionModel.load(model_path)
results = loaded_model.evaluate(df)
predictions = results.predictions

predictions = predictions.withColumn(
    "performance", ((col("prediction") / col("yearly_average")) - 1) * 100)
performances = predictions.select("performance").rdd.map(
    lambda x: x[0]).collect()
min_value = min(performances)
max_value = max(performances)
normalize = udf(lambda x: (x - min_value) / (max_value - min_value),
                FloatType())

# the score is the predicted price compared to the yearly average (normalized)
predictions = predictions.select("symbol", "prediction", "performance") \
    .withColumn("price_score", normalize("performance")) \
    .drop("performance")
import sys
import json
import pyspark
import time
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml.feature import VectorAssembler
import pandas as pd
import multiprocessing
import threading
import pika  # used below for the RabbitMQ connection

database_features_ordered = ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
                             'passenger_count', 'trip_distance', 'RatecodeID',
                             'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
                             'payment_type', 'fare_amount', 'extra', 'mta_tax',
                             'tip_amount', 'tolls_amount', 'improvement_surcharge',
                             'total_amount']

sc = pyspark.sql.SparkSession.builder.appName("nycApp").getOrCreate()
sc.sparkContext._conf.set('spark.executor.cores', multiprocessing.cpu_count())
print(sc.sparkContext._conf.getAll())

# load() is a classmethod, so the model is loaded directly from the saved path
model_1 = LinearRegressionModel.load("/home/gcpkey/lr.model")

topic = "streaming_data"
credentials = pika.PlainCredentials('user', 'QwwyqaQj1C4i')
parameters = pika.ConnectionParameters('35.247.117.124', 5672, '/', credentials)
connection = pika.BlockingConnection(parameters)
connection1 = pika.BlockingConnection(parameters)
channel = connection.channel()
channel1 = connection1.channel()
channel1.queue_declare(queue="receivePredictedFareClient1")
channel.queue_declare(queue=topic)


def callback(ch, method, properties, body):
    df_message = pd.DataFrame.from_dict([json.loads(body.decode())])
    df_message = df_message[database_features_ordered]
    df_message_pyspark = sc.createDataFrame(df_message)
    df_message_pyspark.write.csv("hdfs://cluster-9bfd-m/hadoop/data1.csv",
                                 header=True, mode='append')
    start = time.time()
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import hour, minute, second, col, avg, when
import pyspark.sql.functions as sql_functions

'''import kafka library for consumer'''
from kafka import KafkaConsumer
'''import kafka library for producer'''
from kafka import KafkaProducer

'''import pyspark mllib library'''
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml.feature import VectorAssembler

# json and pandas are needed below to decode the consumed messages
import json
import pandas as pd

sc = SparkContext()
sqlContext = SQLContext(sc)

try:
    Model_Path = "stockModel"
    load_model = LinearRegressionModel.load(Model_Path)
except Exception:
    print("Model not Found")

consumer = KafkaConsumer('stock_price')


def stock_price_prediction():
    try:
        for msg in consumer:
            res_dict = json.loads(msg.value.decode('utf-8'))
            data_list = list(res_dict.values())
            dataframe = pd.DataFrame(
                [data_list],
                columns=['Open', 'Close', 'Volume', 'High', 'Low'])
valid_data_final.show()

# Split training data into 80% and 20%
train_data, test_data = data_final.randomSplit([0.8, 0.2])

regressor = LinearRegression(featuresCol='Attributes',
                             labelCol=dataset.columns[11])

# Train using training data
regressor = regressor.fit(train_data)
pred = regressor.evaluate(test_data)

# Predict the model
pred.predictions.show()

predictions = regressor.transform(valid_data_final)
predictions.show()

# Save the model so that we can export it for later use
regressor.write().overwrite().save("trained-model")
path_drv = shutil.make_archive("trained-model", format='zip',
                               base_dir="trained-model")
shutil.unpack_archive("trained-model.zip", "trained-model-sample",
                      format='zip')

loadedRegressor = LinearRegressionModel.load("trained-model-sample/trained-model")
predictions = loadedRegressor.transform(valid_data_final)
print(loadedRegressor.numFeatures)
predictions.show()

spark.stop()
def loadModel(dataset_add, feature_colm, label_colm, relation_list, relation):
    try:
        # dataset = spark.read.csv('/home/fidel/mltest/testData.csv', header=True, inferSchema=True)
        # testDataFetched = testDataFetched.select('Independent_features', 'MPG')
        # testDataFetched.show()
        # testDataFetched.printSchema()
        dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)
        dataset.show()

        # renaming the column
        # print(label_colm)
        # dataset.withColumnRenamed(label_colm, "label")
        # print(label_colm)
        # dataset.show()
        label = ''
        for y in label_colm:
            label = y
        print(label)

        dictionary_list = {
            'log_list': ["CYLINDERS"],
            'sqrt_list': ["WEIGHT"],
            'cubic_list': ["ACCELERATION"]
        }
        relationship_val = 'linear_reg'
        if relationship_val == 'linear_reg':
            print('linear relationship')
        else:
            dataset = Relationship(dataset, dictionary_list)
        dataset.show()

        # applying the vector assembler
        featureassembler = VectorAssembler(inputCols=feature_colm,
                                           outputCol="Independent_features")
        output = featureassembler.transform(dataset)
        output.show()
        output = output.select("Independent_features")
        # finalized_data = output.select("Independent_features", label)
        # finalized_data.show()

        regressorTest = LinearRegressionModel.load(
            '/home/fidel/mltest/linearRegressorFitModel')
        predictedData = regressorTest.transform(output)
        predictedData.show()
    except Exception as e:
        print('exception ' + str(e))


# if __name__ == '__main__':
#     loadModel()
def predict(sql, sc, columns, station_id, currentWeather):
    columnsToPredict = [
        "max_temp", "med_temp", "min_temp", "max_pressure", "min_pressure",
        "precip", "insolation"
    ]
    returnedPredictions = []

    # schema = StructType([])
    field = [StructField("station_id", StringType(), True),
             StructField("max_temp", FloatType(), True),
             StructField("med_temp", FloatType(), True),
             StructField("min_temp", FloatType(), True),
             StructField("max_pressure", FloatType(), True),
             StructField("min_pressure", FloatType(), True),
             StructField("precip", FloatType(), True),
             StructField("insolation", FloatType(), True),
             StructField("prediction_max_temp", FloatType(), True),
             StructField("prediction_med_temp", FloatType(), True),
             StructField("prediction_min_temp", FloatType(), True),
             StructField("prediction_max_pressure", FloatType(), True),
             StructField("prediction_min_pressure", FloatType(), True),
             StructField("prediction_precip", FloatType(), True),
             StructField("prediction_insolation", FloatType(), True)]
    schema = StructType(field)
    resultDataframe = sql.createDataFrame(sc.emptyRDD(), schema)

    fields1 = [StructField("station_id", StringType(), True),
               StructField("max_temp", FloatType(), True),
               StructField("med_temp", FloatType(), True),
               StructField("min_temp", FloatType(), True),
               StructField("max_pressure", FloatType(), True),
               StructField("min_pressure", FloatType(), True),
               StructField("precip", FloatType(), True),
               StructField("insolation", FloatType(), True)]
    schema1 = StructType(fields1)
    resultDataframe = sql.createDataFrame(sc.emptyRDD(), schema)

    firstTime = True
    for column in columns:
        modelPath = "models/" + station_id + "__" + column
        if not os.path.exists(modelPath):
            logger.info("####No Model")
            break
        lrModel = LinearRegressionModel.load(modelPath)
        assembler = VectorAssembler(inputCols=[column], outputCol="features")

        df_for_predict = sql.createDataFrame(
            [(
                currentWeather["station_id"],
                float(currentWeather["max_temp"]),    # if column != "max_temp" else None,
                float(currentWeather["med_temp"]),    # if column != "med_temp" else None,
                float(currentWeather["min_temp"]),    # if column != "min_temp" else None,
                float(currentWeather["max_pres"]),    # if column != "max_pres" else None,
                float(currentWeather["min_pres"]),    # if column != "min_pres" else None,
                float(currentWeather["precip"]),      # if column != "precip" else None,
                float(currentWeather["insolation"]),  # if column != "insolation" else None,
            )], schema1)
        assembledTestData = assembler.transform(df_for_predict)
        prediction_data = assembledTestData.withColumn(
            "label", df_for_predict[column]).withColumn(
            "features", assembledTestData.features)
        prediction_data1 = clearColumn(prediction_data, "label")
        # transform() takes no extra params here; the intercept is fixed when the model is trained
        predictions = lrModel.transform(prediction_data1).select(
            "station_id", column, "prediction")
        predictions.show()
        predictions1 = predictions.withColumn(str("prediction_" + column),
                                              predictions.prediction)
        returnedPredictions.append(
            generalFunctions.dataframeToJson(predictions1))
    return json.dumps(returnedPredictions)
scalerModel = MinMaxScalerModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/Scaler.model')
scaledData = scalerModel.transform(vector_vehicle_df)

NoScale_Pca = PCAModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/NoScale_Pca.model')
Scaled_Pca = PCAModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/Scaled_Pca.model')
NoScale_Pca = NoScale_Pca.transform(vector_vehicle_df).select(
    ["og_features", "features"])
Scaled_Pca = Scaled_Pca.transform(scaledData).select(
    ["og_features", "features"])

# Loading models
lr_model = LinearRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/lr_model.model')
dtr_model = DecisionTreeRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/dtr_model.model')
gbt_model = GBTRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/gbt_model.model')
rf_model = RandomForestRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/rfr_model.model')

# Generate predictions
lr_pred = lr_model.transform(NoScale_Pca).select(
    'prediction').collect()[0]['prediction']
dtr_pred = dtr_model.transform(Scaled_Pca).select(
    'prediction').collect()[0]['prediction']
gbt_pred = gbt_model.transform(Scaled_Pca).select(
    'prediction').collect()[0]['prediction']
rfr_pred = rf_model.transform(NoScale_Pca).select(
conf = SparkConf().setAppName(appName).setMaster("spark://ubuntu:7077")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# load data
data = None
if dataType == "libsvm":
    data = sqlContext.read.format("libsvm").load(dataPath)

# load model
if algoName == "LogisticRegression":
    from pyspark.ml.classification import LogisticRegressionModel
    model = LogisticRegressionModel.load(modelPath)
elif algoName == "LinearRegression":
    from pyspark.ml.regression import LinearRegressionModel
    model = LinearRegressionModel.load(modelPath)
elif algoName == "DecisionTreeClassification":
    from pyspark.ml.classification import DecisionTreeClassificationModel
    model = DecisionTreeClassificationModel.load(modelPath)
elif algoName == "DecisionTreeRegression":
    from pyspark.ml.regression import DecisionTreeRegressionModel
    model = DecisionTreeRegressionModel.load(modelPath)
elif algoName == "RandomForestClassification":
    from pyspark.ml.classification import RandomForestClassificationModel
    model = RandomForestClassificationModel.load(modelPath)
elif algoName == "RandomForestRegression":
    from pyspark.ml.regression import RandomForestRegressionModel
    model = RandomForestRegressionModel.load(modelPath)
elif algoName == "GBTClassification":
    from pyspark.ml.classification import GBTClassificationModel
    model = GBTClassificationModel.load(modelPath)
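A short hedged continuation of the dispatch above, assuming one of the branches matched and the libsvm data was loaded; it reuses the snippet's own model and data variables:

# Sketch only: every model class dispatched above shares the same transform() API,
# so scoring can be written once regardless of which branch loaded the model.
predictions = model.transform(data)
predictions.select("features", "prediction").show(10)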
def load_json_and_predict(spark, sqlContext, json_file):
    # Load data to predict
    # predict_df = spark.read.json(JSON_DATA_TO_PREDICT)
    print("Loading prediction data from ", json_file)
    predict_df = spark.read.json(json_file)
    print("Done")

    # Apply same process as historical data to convert/map.
    # Drop rows with NA columns
    print("Preprocessing...")
    predict_df_1 = predict_df.dropna()
    predict_df_1 = predict_df_1[
        (predict_df_1.subtotal > 0) &
        (predict_df_1.min_item_price > 0) &
        (predict_df_1.max_item_price > 0) &
        (predict_df_1.total_onshift_runners >= 0) &
        (predict_df_1.total_busy_runners >= 0) &
        (predict_df_1.total_outstanding_orders >= 0) &
        (predict_df_1.estimated_order_place_duration > 0) &
        (predict_df_1.estimated_store_to_consumer_driving_duration > 0) &
        (predict_df_1.market_id != "NA") &
        (predict_df_1.store_primary_category != "NA") &
        (predict_df_1.order_protocol != "NA")]

    udf_rdd_datetimesec_to_sec = fn.udf(
        rdd_datetimesec_to_sec, IntegerType())  # LongType() not available for now
    predict_df_1 = predict_df_1.withColumn(
        'created_at', udf_rdd_datetimesec_to_sec(fn.col('created_at')))

    # Map store_id string to unique number
    stringindexer = StringIndexer().setInputCol("store_id").setOutputCol(
        "store_id_int")
    modelc = stringindexer.fit(predict_df_1)
    predict_df_1 = modelc.transform(predict_df_1)

    # Map store_primary_category to unique number
    stringindexer = StringIndexer().setInputCol(
        "store_primary_category").setOutputCol("store_primary_category_int")
    modelc = stringindexer.fit(predict_df_1)
    predict_df_1 = modelc.transform(predict_df_1)

    predict_df_1 = predict_df_1.withColumn(
        "market_id", predict_df_1["market_id"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "order_protocol", predict_df_1["order_protocol"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_onshift_runners",
        predict_df_1["total_onshift_runners"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_busy_runners",
        predict_df_1["total_busy_runners"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_outstanding_orders",
        predict_df_1["total_outstanding_orders"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "estimated_store_to_consumer_driving_duration",
        predict_df_1["estimated_store_to_consumer_driving_duration"].cast(
            IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "subtotal", predict_df_1["subtotal"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "num_distinct_items",
        predict_df_1["num_distinct_items"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "estimated_order_place_duration",
        predict_df_1["estimated_order_place_duration"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_items", predict_df_1["total_items"].cast(IntegerType()))
    print("Done")

    # Use same features as in historical data.
    # Other columns in test data ('store_id', 'store_primary_category',
    # 'min_item_price', 'max_item_price') will be dropped by the
    # VectorAssembler transformation.
    print("Vectorize...")
    pvectorAssembler = VectorAssembler(inputCols=feature_list,
                                       outputCol='features')
    vectorized_predict_df = pvectorAssembler.transform(predict_df_1)
    vectorized_predict_df = vectorized_predict_df.select(['features'])
    print("Done...")

    txt_file = open(MODEL_NAME_FILE, "r")
    model_name = txt_file.read()
    print("Read model: ", model_name)
    txt_file.close()

    print("Loading model " + model_name + " from " + MODEL_DIR)
    if (model_name == DT_MODEL):
        predict_model = DecisionTreeRegressionModel.load(MODEL_DIR)
    if (model_name == GBT_MODEL):
        predict_model = GBTRegressionModel.load(MODEL_DIR)
    if (model_name == LR_MODEL):
        predict_model = LinearRegressionModel.load(MODEL_DIR)
    if (model_name == RF_MODEL):
        predict_model = RandomForestRegressionModel.load(MODEL_DIR)
    print("Done")

    print("Predicting...")
    model_predictions = predict_model.transform(vectorized_predict_df)
    print("Done")

    df1 = predict_df_1.select('delivery_id').withColumn(
        "id", monotonically_increasing_id())
    df2 = model_predictions.select('prediction').withColumnRenamed(
        'prediction', 'predicted_delivery_seconds').withColumn(
        "id", monotonically_increasing_id())

    # Perform a join on the ids.
    prediction_results_df = df1.join(df2, "id", "left").drop("id")
    prediction_results_df = prediction_results_df.withColumn(
        "predicted_delivery_seconds",
        prediction_results_df["predicted_delivery_seconds"].cast(
            IntegerType()))

    return prediction_results_df