def updateResources(self, duration_model_path, crowdedness_model_path,
                    pipeline_path, routes_stops_path):
    self.duration_model = LinearRegressionModel.load(duration_model_path)
    self.crowdedness_model = LinearRegressionModel.load(crowdedness_model_path)
    self.pipeline = PipelineModel.load(pipeline_path)
    self.intermediate_stops_extraction_handler = IntermediateStopsExtractionHandler(
        self.sc, self.sqlContext, routes_stops_path)
def __init__(self, app_name, duration_model_path, crowdedness_model_path,
             pipeline_path, routes_stops_path):
    self.sc = SparkContext(conf=SparkConf().setAppName(app_name))
    self.sqlContext = SQLContext(self.sc)
    self.duration_model = LinearRegressionModel.load(duration_model_path)
    self.crowdedness_model = LinearRegressionModel.load(crowdedness_model_path)
    self.pipeline = PipelineModel.load(pipeline_path)
    self.intermediate_stops_extraction_handler = IntermediateStopsExtractionHandler(
        self.sc, self.sqlContext, routes_stops_path)
def predict(bucket_name, feature_path, feature_name, output_path, plot_path):
    sc = SparkContext.getOrCreate()
    sqlCtx = SQLContext(sc)
    model_path = path.join(output_path, "regression-model")
    print("Load model from:", model_path)
    lrModel = LinearRegressionModel.load(model_path)

    # read last maintenance time from json
    maintain4 = 0.0
    maintain12 = 0.0
    with open(path.join(output_path, "last_maintain.json")) as f:
        last_maintain = json.load(f)
        maintain4 = last_maintain['maintain4']
        maintain12 = last_maintain['maintain12']

    # read data from s3 for prediction
    df = read_data(bucket_name, feature_path, feature_name)

    # transform predict data
    df = df.withColumn('maintain4', lit(maintain4))
    df = df.withColumn('maintain12', lit(maintain12))
    test = df.rdd.map(
        lambda x: (
            Vectors.dense([x.amount, x.split, x.maintain4, x.maintain12]),
        )
    ).toDF(["features"])

    lrModel.transform(test).toPandas().to_csv(
        path_or_buf=path.join(output_path, "pred-" + feature_name))
def main(bootstrap_server, topic_name, time_interval, scikit_model_path,
         spark_model_path, output_attribute_index):
    # Load the Passive Aggressive Regressor model
    with hdfs.open(scikit_model_path, 'r') as opened_file:
        regressor = pickle.load(opened_file)

    # Initialize the application configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Consumer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")

    # Initialize the session
    # (required in order to load the model)
    session = SparkSession(context)

    # Load the Spark model from the given path
    model = LinearRegressionModel.load(spark_model_path)

    # Instantiate the streaming context
    # (so that processing runs every time_interval seconds)
    # and the stream for the given topic and Kafka broker
    streaming_context = StreamingContext(context, time_interval)
    stream = KafkaUtils.createDirectStream(
        streaming_context, [topic_name],
        {"metadata.broker.list": bootstrap_server})

    # For each RDD, run the prediction function
    # (passing the loaded models and the output attribute index)
    stream.foreachRDD(lambda input_data: prediction(
        input_data, regressor, model, output_attribute_index))

    # Start processing the stream
    streaming_context.start()
    streaming_context.awaitTermination()
def getOrCreateLR(self):
    try:
        if self.lrModel is None:
            self.lrModel = LinearRegressionModel.load(CONST_LR_FILE)
    except Exception:
        print("Creating LR Model")
        self.lrModel = self.createLR()
    return self.lrModel
def loadModels(path, typeofmodel):
    models = {}
    for park in park_data_with_date_dict:
        if typeofmodel == "linear":
            models[park] = LinearRegressionModel.load(path + str(park))
        elif typeofmodel == "tree":
            models[park] = DecisionTreeRegressionModel.load(path + str(park))
        elif typeofmodel == "gbt":
            models[park] = GBTRegressionModel.load(path + str(park))
    return models
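A minimal usage sketch for the loader above. The path prefix, park id, and features_df are placeholders, and the function assumes the global park_data_with_date_dict it iterates over is already populated:

# Hypothetical usage: load one linear model per park, then score a feature DataFrame.
models = loadModels("hdfs:///models/park_", "linear")   # path prefix is an assumption
predictions = models[42].transform(features_df)          # park id 42 and features_df are placeholders
predictions.select("prediction").show(5)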
def load(self, load_dir):
    if os.path.isdir(load_dir):
        if self.pm == 'PM10':
            self.model = LinearRegressionModel.load(
                os.path.join(load_dir, 'model'))
        else:
            self.model = RandomForestRegressionModel.load(
                os.path.join(load_dir, 'model'))
        self.imputer = ImputerModel.load(os.path.join(load_dir, 'imputer'))
        self.assembler = VectorAssembler.load(
            os.path.join(load_dir, 'assembler'))
    else:
        raise RuntimeError(
            'Save path: {}, does not exist or is not a directory'.format(
                load_dir))
def predict(): """ https://app.host/predict?value=0 """ value = int(request.args.get("value")) spark_session, _ = create_spark_connection() model_load = LinearRegressionModel.load( os.path.join(os.path.dirname(os.path.abspath(__file__)), "model")) predict_df = spark_session.createDataFrame([(1, Vectors.dense(value))], ["index", "features"]) predict_collected = model_load.transform(predict_df).collect()[0] features = predict_collected.features.values.tolist() prediction = predict_collected.prediction output = {"features": features, "prediction": prediction} return jsonify(output)
def process(self, data_input, data_output, model):
    """
    A Spark process to do inference
    :param data_input: data input filename
    :param data_output: data output filename
    :param model: path of the saved Linear Regression model
    """
    # Load Linear Regression model
    lr_model = LinearRegressionModel.load(model)
    new_data = self.spark.read.parquet(data_input)
    # evaluate() returns a LinearRegressionSummary, not a DataFrame;
    # its .predictions DataFrame is what gets written out
    # (this requires the label column to be present in the input).
    new_predictions = lr_model.evaluate(new_data).predictions
    # Save result as parquet
    new_predictions.write.format("parquet").mode('overwrite').option(
        "header", "true").save(data_output)
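For reference, a minimal sketch of the difference between transform() and evaluate() on a loaded LinearRegressionModel; the path and the DataFrames are placeholders:

# Sketch only: the "features" column must match what the model was trained on.
loaded = LinearRegressionModel.load("/tmp/lr_model")    # placeholder path
scored = loaded.transform(df)                           # DataFrame with an added "prediction" column
summary = loaded.evaluate(df_with_label)                # LinearRegressionSummary (needs the label column)
print(summary.rootMeanSquaredError)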
def find_rating(player_id, cur_date):
    sp_sess = SparkSession.builder.appName('Regr_Data').getOrCreate()
    play_path = "hdfs://localhost:9000/players.csv"
    players = sp_sess.read.csv(play_path, header=True, inferSchema=True)
    assembler = VectorAssembler(inputCols=['new_diff'], outputCol='features')

    name_df = players.filter(players['Id'] == int(player_id))
    player_date = name_df.select("birthDate").collect()[0].birthDate
    new_date1 = player_date.split('-')
    new_date2 = cur_date.split('-')
    d1 = datetime.date(int(new_date1[0]), int(new_date1[1]), int(new_date1[2]))
    d2 = datetime.date(int(new_date2[0]), int(new_date2[1]), int(new_date2[2]))
    diff = abs(d2 - d1).days

    my_rating = 1.000
    my_schema = StructType([
        StructField('diff', IntegerType(), True),
        StructField('rating', FloatType(), True)
    ])
    my_dict = {'diff': diff, 'rating': my_rating}
    new_df = sp_sess.createDataFrame([my_dict], my_schema)
    new_df = new_df.withColumn('new_diff', new_df['diff'] / 1000)
    new_df = new_df.withColumn('new_rating', new_df['rating'] * 10)
    test = assembler.transform(new_df)

    final_model = LinearRegressionModel.load('reg_model')
    res = final_model.evaluate(test)
    req = res.predictions.select("prediction").rdd.flatMap(lambda x: x).collect()
    final_res = req[0] / 10
    if final_res > 1:
        final_res /= 2
    if final_res > 0.9:
        final_res /= 2
    return abs(final_res)
def main():
    feature_model = VectorIndexerModel.load(featureIndexer_path)
    vector_assembler = VectorAssembler.load(vectorAssembler_path)
    ohe_model = OneHotEncoderModel.load(ohe_model_path)
    stringIndexer_model = StringIndexerModel.load(stringIndexerPath)
    lr_model = LinearRegressionModel.load(model_path)

    spark = SparkSession.builder.master("local").appName("Connection").getOrCreate()

    # request.get_json() returns a dict, so fields are read with [] lookups
    json_data = request.get_json()
    availability = json_data["availability"]
    minimum_nights = json_data["minimum_nights"]
    latitude = json_data["latitude"]
    longitude = json_data["longitude"]
    name = json_data["name"]
    neighbourhood_group = json_data["neighbourhood_group"]
    neighbourhood = json_data["neighbourhood"]
    room_type = json_data["room_type"]

    dept = [(name, neighbourhood_group, neighbourhood, room_type, latitude,
             longitude, 0.0, minimum_nights, 0.0, 1.0, availability, 0.0)]
    df = spark.createDataFrame(data=dept, schema=deptColumns)
    df = stringIndexer_model.transform(df)
    df = df.drop(*["neighbourhood_group", "neighbourhood", "room_type"])
    df = ohe_model.transform(df)
    df = df.drop(*["neighbourhood_group_int", "neighbourhood_int", "room_type_int"])
    df = df.withColumn("minimum_nights",
                       when(df["minimum_nights_int"] > 30, 30)
                       .otherwise(df["minimum_nights_int"])).drop('minimum_nights_int')
    df = df.withColumn('name_length', length('name')).drop('name')
    df = vector_assembler.transform(df)
    df = df.select(['features'])
    df = feature_model.transform(df)
    df = df.select(['features_vec'])

    lr_predictions = lr_model.transform(df)
    return jsonify(data=lr_predictions.collect()[-1].prediction)
def linear_regression():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    df = spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                (0.0, 2.0, Vectors.sparse(1, [], []))],
                               ["label", "weight", "features"])
    lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal",
                          weightCol="weight")
    model = lr.fit(df)

    test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
    abs(model.transform(test0).head().prediction - (-1.0)) < 0.001  # True
    abs(model.coefficients[0] - 1.0) < 0.001  # True
    abs(model.intercept - 0.0) < 0.001  # True

    test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
    abs(model.transform(test1).head().prediction - 1.0) < 0.001  # True

    # lr.setParams("vector") raises:
    # Traceback (most recent call last):
    #     ...
    # TypeError: Method setParams forces keyword arguments.

    temp_path = "./"
    lr_path = temp_path + "/lr"
    lr.save(lr_path)
    lr2 = LinearRegression.load(lr_path)
    lr2.getMaxIter()  # 5

    model_path = temp_path + "/lr_model"
    model.save(model_path)
    model2 = LinearRegressionModel.load(model_path)
    model.coefficients[0] == model2.coefficients[0]  # True
    model.intercept == model2.intercept  # True
    model.numFeatures  # 1
def loadModelLinearRegression(conf, path):
    """
        input  : conf, path
        output : model (CrossValidatorModel / TrainValidationSplitModel / LinearRegressionModel)
    """
    # If ML tuning was used
    if conf["tuning"]:
        # With cross validation, the saved model type is CrossValidatorModel
        if conf["tuning"].get("method").lower() == "crossval":
            load_model = CrossValidatorModel.load(path)
        # With train-validation split, the saved model type is TrainValidationSplitModel
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            load_model = TrainValidationSplitModel.load(path)
    # Without ML tuning, the saved model type is LinearRegressionModel
    elif conf["tuning"] is None:
        load_model = LinearRegressionModel.load(path)
    return load_model
def loadModel(self):
    if self.algoName == "linear_reg" or self.algoName == \
            "ridge_reg" or self.algoName == "lasso_reg":
        regressionPrediction = LinearRegressionModel.load(self.modelStorageLocation)
    if self.algoName == "RandomForestAlgo":
        regressionPrediction = RandomForestRegressionModel.load(self.modelStorageLocation)
    if self.algoName == "GradientBoostAlgo":
        regressionPrediction = GBTRegressionModel.load(self.modelStorageLocation)

    # dropping the prediction column left over from a previous run of the same model
    self.dataset = self.dataset.drop(self.modelSheetName)
    predictionData = regressionPrediction.transform(self.dataset)
    predictionData = predictionData.drop(self.featuresColm)

    # dropping the extra added columns
    if self.indexedFeatures:
        self.indexedFeatures.extend(self.oneHotEncodedFeaturesList)
        predictionData = predictionData.drop(*self.indexedFeatures)
    else:
        predictionData = predictionData

    # overwriting the original dataset
    '''This write step is needed because Spark does not read or write the whole
    dataset at once -- it only brings limited data into memory and evaluates
    lazily -- so overwriting the same dataset that is already in memory is not
    possible directly.'''
    emptyUserId = ''
    fileNameWithPathTemp = self.locationAddress + emptyUserId + self.datasetName + "_temp.parquet"
    predictionData.write.parquet(fileNameWithPathTemp, mode="overwrite")
    predictionDataReadAgain = self.spark.read.parquet(fileNameWithPathTemp)
    predictionTableData = \
        PredictiveUtilities.writeToParquet(fileName=self.datasetName,
                                           locationAddress=self.locationAddress,
                                           userId=emptyUserId,
                                           data=predictionDataReadAgain)
    return predictionTableData
def __init__(self):
    print('== [Model] Creating spark session...')
    self.spark = SparkSession.builder.appName('lin_reg_api').getOrCreate()
    # self.spark = SparkSession.newSession()
    print('== [Model] spark version', self.spark.version)

    print('== [Model] Loading model...')
    self.model = LinearRegressionModel.load('model_lin_reg')
    print('== [Model] Loading complete...')

    self.entire_Set = self.spark.read.csv('./airfoil_self_noise.csv',
                                          header=True, inferSchema=True)

    # define transformers...
    self.airfoil_assembler = VectorAssembler(inputCols=X_Cols,
                                             outputCol='_features')
    freq_scaler = StandardScaler(inputCol="_features", outputCol="features")
    tuned_input_vec = self.airfoil_assembler.transform(
        self.entire_Set).select('_features')
    self.std_scaler = freq_scaler.fit(tuned_input_vec)
    return
dataset.groupby("quality").count().show() # ################################################################################################################ # export the trained model and create a zip file for ease of download import shutil from pyspark.ml.regression import LinearRegressionModel regressor.write().overwrite().save("cs643") path_drv = shutil.make_archive("cs643", format='zip', base_dir="cs643") shutil.unpack_archive( "cs643.zip", "test", format='zip', ) loadedRegressor = LinearRegressionModel.load("test/cs643") predictions = loadedRegressor.transform(valid_finalized_data) print(loadedRegressor.numFeatures) predictions.show() # ################################################################################################################ # run some equick evaluations from pyspark.ml.evaluation import RegressionEvaluator eval = RegressionEvaluator(labelCol=dataset.columns[11], predictionCol="prediction", metricName="rmse") # Root Mean Square Error rmse = eval.evaluate(pred.predictions) print("RMSE: %.3f" % rmse) # Mean Square Error mse = eval.evaluate(pred.predictions, {eval.metricName: "mse"})
def main():
    # Silence sklearn deprecation warnings
    warnings.filterwarnings(module='sklearn*', action='ignore',
                            category=DeprecationWarning)
    model_name = 'Distr_LinearRegression'
    dir_of_dict = sys.argv[1]
    bag = too.Read_info(dir_of_dict, 'supervision')
    name_dict,options,task_id,job_id,train_result_dir,\
    names_str,names_num,names_show,Y_names,dir_of_inputdata,\
    dir_of_outputdata,open_pca,train_size,test_size,normalized_type = bag

    dir_of_storePara = train_result_dir + '/%s_Parameters.json' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)
    dir_of_storeModel = train_result_dir + '/%s_model' % (
        str(task_id) + '_' + str(job_id) + '_' + model_name)

    # Configure the Spark client
    sess = SparkSession\
        .builder\
        .master("local[4]")\
        .appName("LinearRegression_spark")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    sc = sess.sparkContext
    sc.setLogLevel("ERROR")

    if options == 'train':
        time_start = time()
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # For testing only
        # dataset = dataset[0:1000]
        Y_datavec = dataset[Y_names].values
        # Get the string and numeric fields separately, then merge them
        X_datavec, X_columns, vocabset, datavec_show_list = too.Merge_form(
            dataset, names_str, names_num, names_show, 'vocabset', 'open')
        # Normalize the data
        X_datavec = too.Data_process(X_datavec, normalized_type)
        # Handle data imbalance
        # X,Y = mlp.KMeans_unbalanced(X_datavec,Y_datavec,X_columns,Y_names)
        # X,Y = mlp.Sample_unbalanced(X_datavec,Y_datavec)
        X, Y = X_datavec, Y_datavec
        ret_num = 'no_num'
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            pca_num, ret = mlp.GS_PCA(X)
            print('PCA Information:', pca_num, ret)
            print('----------------------------------------------')
            ret_num = ret['99%']
            X = mlp.Model_PCA(X, ret_num)
        # Store the vocabset list and ret_num
        too.StorePara(dir_of_storePara, vocabset, ret_num)

        print('--------------Train data shape----------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('Y.shape:', Y.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X, )
        targets = pd.DataFrame(Y, columns=['Y'])
        # Concatenate the matrices
        merged = pd.concat([features, targets], axis=1)
        # Create a Spark DataFrame
        raw_df = sess.createDataFrame(merged)
        # Extract features and target
        fomula = RFormula(formula='Y ~ .', featuresCol="features",
                          labelCol="label")
        raw_df = fomula.fit(raw_df).transform(raw_df)
        # Split into training and test sets
        xy_train, xy_test = raw_df.randomSplit([train_size, test_size],
                                               seed=666)
        # Fit the model
        clf_model = dmp.Distr_LinearRegression(xy_train, xy_test)
        # Save the model
        clf_model.write().overwrite().save(dir_of_storeModel)
        print('----------------------------------------------')
        dmp.Predict_test_data(xy_test, datavec_show_list, names_show,
                              clf_model, dir_of_outputdata, 'reg')
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)

    if options == 'predict':
        time_start = time()
        with open(dir_of_storePara, 'r') as f:
            para_dict = json.load(f)
        vocabset = para_dict['vocabset']
        ret_num = para_dict['ret_num']
        # Load the data
        dataset = pd.read_csv(dir_of_inputdata)
        # Get the string and numeric fields separately, then merge them
        X_datavec, datavec_show_list = too.Merge_form(dataset, names_str,
                                                      names_num, names_show,
                                                      vocabset, 'close')
        # Normalize the data
        X = too.Data_process(X_datavec, normalized_type)
        # PCA dimensionality reduction
        if open_pca == 'open_pca':
            X = mlp.Model_PCA(X, ret_num)

        print('-------------Predict data shape---------------')
        print('X.shape:', X.shape)
        print('----------------------------------------------')
        print('--------------Start %s model------------------' % model_name)

        features = pd.DataFrame(X, )
        # Create a Spark DataFrame
        raw_features = sess.createDataFrame(features)
        raw_x = VectorAssembler(inputCols=raw_features.columns,
                                outputCol='features').transform(raw_features)
        clf_model = LinearRegressionModel.load(dir_of_storeModel)
        dmp.Predict_data(raw_x, datavec_show_list, names_show, clf_model,
                         dir_of_outputdata, 'reg')
        duration = too.Duration(time() - time_start)
        print('Total run time: %s' % duration)
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.regression import LinearRegressionModel

sc = SparkContext()
sqlContext = SQLContext(sc)

model_1 = LinearRegressionModel.load("My_Model")
print("Model loaded successfully")
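A minimal follow-up sketch for the snippet above, assuming "My_Model" was trained on a single-element vector column named "features"; the sample value is a placeholder:

from pyspark.ml.linalg import Vectors

# Score a hypothetical one-row DataFrame with the loaded model (sketch only).
sample_df = sqlContext.createDataFrame([(Vectors.dense([1.0]),)], ["features"])
model_1.transform(sample_df).select("features", "prediction").show()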
# need to load in testing dataset
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

sc = SparkContext()
sqlContext = SQLContext(sc)
print(sys.argv[1])

test_df = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true', inferschema='true', delimiter=';').load(sys.argv[1])
print(test_df.take(1))

from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(
    inputCols=['"""""fixed acidity""""', '""""volatile acidity""""',
               '""""citric acid""""', '""""residual sugar""""',
               '""""chlorides""""', '""""free sulfur dioxide""""',
               '""""total sulfur dioxide""""', '""""density""""',
               '""""pH""""', '""""sulphates""""', '""""alcohol""""'],
    outputCol='features')
vtest_df = vectorAssembler.transform(test_df)
vtest_df = vtest_df.select(['features', '""""quality"""""'])
vtest_df.show(3)

from pyspark.ml.regression import LinearRegressionModel

lr_model = LinearRegressionModel.load('model')
lr_predictions = lr_model.transform(vtest_df)
lr_predictions.select('prediction', '""""quality"""""', 'features').show(5)

from pyspark.ml.evaluation import RegressionEvaluator

lr_evaluator = RegressionEvaluator(predictionCol='prediction',
                                   labelCol='""""quality"""""',
                                   metricName='r2')
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))
def main():
    if len(sys.argv) > 1:
        predictionFile = sys.argv[1]
        if path.isfile(predictionFile):
            print("Processing File " + predictionFile)
        else:
            print("File not found " + predictionFile)
            usageMessage()
            exit()
    else:
        usageMessage()

    spark = SparkSession.builder.master("local[*]").getOrCreate()

    # load trained model
    loadedRegressor = LinearRegressionModel.load("/cs643")

    # read dataset to predict
    validationdataset = spark.read.option("delimiter", ";").csv(
        predictionFile, inferSchema=True, header=True)
    # validationdataset.printSchema()

    # Process the data set into the expected format:
    # combine the feature columns into attributes.
    # Because of the data file format, use the column list rather than field names explicitly.
    # For reference, here are the expected column names:
    # TrainingDataset.csv': b'"""""fixed acidity"""";""""volatile acidity"""";""""citric acid"""";""""residual sugar"""";""""chlorides"""";
    # """"free sulfur dioxide"""";""""total sulfur dioxide"""";""""density"""";""""pH"""";""""sulphates"""";""""alcohol""""
    assembler = VectorAssembler(
        inputCols=[validationdataset.columns[1], validationdataset.columns[2],
                   validationdataset.columns[3], validationdataset.columns[4],
                   validationdataset.columns[5], validationdataset.columns[6],
                   validationdataset.columns[7], validationdataset.columns[8],
                   validationdataset.columns[9], validationdataset.columns[10]],
        outputCol="Attributes")
    valid_output = assembler.transform(validationdataset)
    valid_finalized_data = valid_output.select("Attributes",
                                               validationdataset.columns[11])
    # valid_finalized_data.show()

    # predict the quality
    predictions = loadedRegressor.transform(valid_finalized_data)

    evaluator = RegressionEvaluator(labelCol=validationdataset.columns[11],
                                    predictionCol="prediction",
                                    metricName="rmse")

    # Root Mean Square Error
    rmse = evaluator.evaluate(predictions)
    print("RMSE: %.3f" % rmse)

    # Mean Square Error
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    print("MSE: %.3f" % mse)

    # Mean Absolute Error
    mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
    print("MAE: %.3f" % mae)

    # r2 - coefficient of determination
    r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
    print("r2: %.3f" % r2)

    # display results
    predictions.show(2000, truncate=False)
    # We could do this on a row count rather than 2000, but what if we end up
    # with a million-row model somehow?
## Import Libraries
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.regression import LinearRegression, LinearRegressionModel

sc = SparkContext()
spark = SparkSession(sc)

## Load model
lrModel = LinearRegressionModel.load(
    'gs://spark-training-data/ml_models/sample_model.model')

## Read in the data from model_test_jc
df = spark.read.format('libsvm').load(
    'gs://spark-training-data/datasets/sample_linear_regression_data.txt')
df.show(5)

## Predict Results
predictions = lrModel.transform(df)
predictions.show(5)
    .filter(df.dayofyear.isNotNull())

# create a features column: list of open prices averaged by day
df = df.groupby('symbol').agg(collect_list('avg(open)').alias("features"))

# add a yearly average column
yearly_avg = udf(lambda x: sum(x) / len(x), DoubleType())
df = df.withColumn("yearly_average", yearly_avg("features"))

# convert to vectors for the linear regression model
array_to_vector = udf(lambda x: Vectors.dense(x[0]), VectorUDT())
df = df.withColumn("features", array_to_vector("features"))

# load the model and apply it
model_path = "s3://" + bucket_name + "/models/lr_model"
loaded_model = LinearRegressionModel.load(model_path)
results = loaded_model.evaluate(df)
predictions = results.predictions

predictions = predictions.withColumn(
    "performance", ((col("prediction") / col("yearly_average")) - 1) * 100)
performances = predictions.select("performance").rdd.map(
    lambda x: x[0]).collect()
min_value = min(performances)
max_value = max(performances)
normalize = udf(lambda x: (x - min_value) / (max_value - min_value),
                FloatType())

# the score is the predicted price compared to the yearly average (normalized)
predictions = predictions.select("symbol", "prediction", "performance") \
    .withColumn("price_score", normalize("performance")) \
    .drop("performance")
import sys
import json
import pyspark
import time
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml.feature import VectorAssembler
import pandas as pd
import multiprocessing
import threading
import pika  # used below for the RabbitMQ connection

database_features_ordered = ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
                             'passenger_count', 'trip_distance', 'RatecodeID',
                             'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
                             'payment_type', 'fare_amount', 'extra', 'mta_tax',
                             'tip_amount', 'tolls_amount', 'improvement_surcharge',
                             'total_amount']

sc = pyspark.sql.SparkSession.builder.appName("nycApp").getOrCreate()
sc.sparkContext._conf.set('spark.executor.cores', multiprocessing.cpu_count())
print(sc.sparkContext._conf.getAll())

# load() is a classmethod, so the model is loaded directly from the saved path
model_1 = LinearRegressionModel.load("/home/gcpkey/lr.model")

topic = "streaming_data"
credentials = pika.PlainCredentials('user', 'QwwyqaQj1C4i')
parameters = pika.ConnectionParameters('35.247.117.124', 5672, '/', credentials)
connection = pika.BlockingConnection(parameters)
connection1 = pika.BlockingConnection(parameters)
channel = connection.channel()
channel1 = connection1.channel()
channel1.queue_declare(queue="receivePredictedFareClient1")
channel.queue_declare(queue=topic)


def callback(ch, method, properties, body):
    df_message = pd.DataFrame.from_dict([json.loads(body.decode())])
    df_message = df_message[database_features_ordered]
    df_message_pyspark = sc.createDataFrame(df_message)
    df_message_pyspark.write.csv("hdfs://cluster-9bfd-m/hadoop/data1.csv",
                                 header=True, mode='append')
    start = time.time()
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import hour, minute, second, col, avg, when
import pyspark.sql.functions as sql_functions

'''import kafka library for consumer'''
from kafka import KafkaConsumer
'''import kafka library for producer'''
from kafka import KafkaProducer

'''import pyspark mllib library'''
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml.feature import VectorAssembler

# json and pandas are needed below to decode the consumed messages
import json
import pandas as pd

sc = SparkContext()
sqlContext = SQLContext(sc)

try:
    Model_Path = "stockModel"
    load_model = LinearRegressionModel.load(Model_Path)
except Exception:
    print("Model not Found")

consumer = KafkaConsumer('stock_price')


def stock_price_prediction():
    try:
        for msg in consumer:
            res_dict = json.loads(msg.value.decode('utf-8'))
            data_list = list(res_dict.values())
            dataframe = pd.DataFrame(
                [data_list],
                columns=['Open', 'Close', 'Volume', 'High', 'Low'])
valid_data_final.show()

# Split training data into 80% and 20%
train_data, test_data = data_final.randomSplit([0.8, 0.2])

regressor = LinearRegression(featuresCol='Attributes',
                             labelCol=dataset.columns[11])

# Train using training data
regressor = regressor.fit(train_data)
pred = regressor.evaluate(test_data)

# Predict the model
pred.predictions.show()

predictions = regressor.transform(valid_data_final)
predictions.show()

# Save the model so that we can export it for later use
regressor.write().overwrite().save("trained-model")
path_drv = shutil.make_archive("trained-model", format='zip',
                               base_dir="trained-model")
shutil.unpack_archive("trained-model.zip", "trained-model-sample",
                      format='zip')

loadedRegressor = LinearRegressionModel.load("trained-model-sample/trained-model")
predictions = loadedRegressor.transform(valid_data_final)
print(loadedRegressor.numFeatures)
predictions.show()

spark.stop()
def loadModel(dataset_add, feature_colm, label_colm, relation_list, relation):
    try:
        # dataset = spark.read.csv('/home/fidel/mltest/testData.csv', header=True, inferSchema=True)
        # testDataFetched = testDataFetched.select('Independent_features', 'MPG')
        # testDataFetched.show()
        # testDataFetched.printSchema()
        dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)
        dataset.show()

        # renaming the column
        # print(label_colm)
        # dataset.withColumnRenamed(label_colm, "label")
        # print(label_colm)
        # dataset.show()
        label = ''
        for y in label_colm:
            label = y
        print(label)

        dictionary_list = {
            'log_list': ["CYLINDERS"],
            'sqrt_list': ["WEIGHT"],
            'cubic_list': ["ACCELERATION"]
        }
        relationship_val = 'linear_reg'
        if relationship_val == 'linear_reg':
            print('linear relationship')
        else:
            dataset = Relationship(dataset, dictionary_list)
        dataset.show()

        # applying the vector assembler
        featureassembler = VectorAssembler(inputCols=feature_colm,
                                           outputCol="Independent_features")
        output = featureassembler.transform(dataset)
        output.show()
        output = output.select("Independent_features")
        # finalized_data = output.select("Independent_features", label)
        # finalized_data.show()

        regressorTest = LinearRegressionModel.load(
            '/home/fidel/mltest/linearRegressorFitModel')
        predictedData = regressorTest.transform(output)
        predictedData.show()
    except Exception as e:
        print('exception ' + str(e))


# if __name__ == '__main__':
#     loadModel()
def predict(sql, sc, columns, station_id, currentWeather):
    columnsToPredict = [
        "max_temp", "med_temp", "min_temp", "max_pressure", "min_pressure",
        "precip", "insolation"
    ]
    returnedPredictions = []

    # schema = StructType([])
    field = [StructField("station_id", StringType(), True),
             StructField("max_temp", FloatType(), True),
             StructField("med_temp", FloatType(), True),
             StructField("min_temp", FloatType(), True),
             StructField("max_pressure", FloatType(), True),
             StructField("min_pressure", FloatType(), True),
             StructField("precip", FloatType(), True),
             StructField("insolation", FloatType(), True),
             StructField("prediction_max_temp", FloatType(), True),
             StructField("prediction_med_temp", FloatType(), True),
             StructField("prediction_min_temp", FloatType(), True),
             StructField("prediction_max_pressure", FloatType(), True),
             StructField("prediction_min_pressure", FloatType(), True),
             StructField("prediction_precip", FloatType(), True),
             StructField("prediction_insolation", FloatType(), True)]
    schema = StructType(field)
    resultDataframe = sql.createDataFrame(sc.emptyRDD(), schema)

    fields1 = [StructField("station_id", StringType(), True),
               StructField("max_temp", FloatType(), True),
               StructField("med_temp", FloatType(), True),
               StructField("min_temp", FloatType(), True),
               StructField("max_pressure", FloatType(), True),
               StructField("min_pressure", FloatType(), True),
               StructField("precip", FloatType(), True),
               StructField("insolation", FloatType(), True)]
    schema1 = StructType(fields1)
    resultDataframe = sql.createDataFrame(sc.emptyRDD(), schema)

    firstTime = True
    for column in columns:
        modelPath = "models/" + station_id + "__" + column
        if not os.path.exists(modelPath):
            logger.info("####No Model")
            break
        lrModel = LinearRegressionModel.load(modelPath)
        assembler = VectorAssembler(inputCols=[column], outputCol="features")

        df_for_predict = sql.createDataFrame(
            [(
                currentWeather["station_id"],
                float(currentWeather["max_temp"]),    # if column != "max_temp" else None,
                float(currentWeather["med_temp"]),    # if column != "med_temp" else None,
                float(currentWeather["min_temp"]),    # if column != "min_temp" else None,
                float(currentWeather["max_pres"]),    # if column != "max_pres" else None,
                float(currentWeather["min_pres"]),    # if column != "min_pres" else None,
                float(currentWeather["precip"]),      # if column != "precip" else None,
                float(currentWeather["insolation"]),  # if column != "insolation" else None,
            )], schema1)
        assembledTestData = assembler.transform(df_for_predict)
        prediction_data = assembledTestData.withColumn(
            "label", df_for_predict[column]).withColumn(
            "features", assembledTestData.features)
        prediction_data1 = clearColumn(prediction_data, "label")
        # transform() takes no extra params here; the intercept is fixed when the model is trained
        predictions = lrModel.transform(prediction_data1).select(
            "station_id", column, "prediction")
        predictions.show()
        predictions1 = predictions.withColumn(str("prediction_" + column),
                                              predictions.prediction)
        returnedPredictions.append(
            generalFunctions.dataframeToJson(predictions1))
    return json.dumps(returnedPredictions)
scalerModel = MinMaxScalerModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/Scaler.model')
scaledData = scalerModel.transform(vector_vehicle_df)

NoScale_Pca = PCAModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/NoScale_Pca.model')
Scaled_Pca = PCAModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/Scaled_Pca.model')
NoScale_Pca = NoScale_Pca.transform(vector_vehicle_df).select(
    ["og_features", "features"])
Scaled_Pca = Scaled_Pca.transform(scaledData).select(
    ["og_features", "features"])

# Loading models
lr_model = LinearRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/lr_model.model')
dtr_model = DecisionTreeRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/dtr_model.model')
gbt_model = GBTRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/gbt_model.model')
rf_model = RandomForestRegressionModel.load(
    'file:///bd-fs-mnt/Spark_RA3/models/Spark_Vehicles/rfr_model.model')

# Generate predictions
lr_pred = lr_model.transform(NoScale_Pca).select(
    'prediction').collect()[0]['prediction']
dtr_pred = dtr_model.transform(Scaled_Pca).select(
    'prediction').collect()[0]['prediction']
gbt_pred = gbt_model.transform(Scaled_Pca).select(
    'prediction').collect()[0]['prediction']
rfr_pred = rf_model.transform(NoScale_Pca).select(
conf = SparkConf().setAppName(appName).setMaster("spark://ubuntu:7077")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# load data
data = None
if dataType == "libsvm":
    data = sqlContext.read.format("libsvm").load(dataPath)

# load model
if algoName == "LogisticRegression":
    from pyspark.ml.classification import LogisticRegressionModel
    model = LogisticRegressionModel.load(modelPath)
elif algoName == "LinearRegression":
    from pyspark.ml.regression import LinearRegressionModel
    model = LinearRegressionModel.load(modelPath)
elif algoName == "DecisionTreeClassification":
    from pyspark.ml.classification import DecisionTreeClassificationModel
    model = DecisionTreeClassificationModel.load(modelPath)
elif algoName == "DecisionTreeRegression":
    from pyspark.ml.regression import DecisionTreeRegressionModel
    model = DecisionTreeRegressionModel.load(modelPath)
elif algoName == "RandomForestClassification":
    from pyspark.ml.classification import RandomForestClassificationModel
    model = RandomForestClassificationModel.load(modelPath)
elif algoName == "RandomForestRegression":
    from pyspark.ml.regression import RandomForestRegressionModel
    model = RandomForestRegressionModel.load(modelPath)
elif algoName == "GBTClassification":
    from pyspark.ml.classification import GBTClassificationModel
    model = GBTClassificationModel.load(modelPath)
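A short hedged continuation of the dispatch above, assuming one of the branches matched and the libsvm data was loaded; it reuses the snippet's own model and data variables:

# Sketch only: every model class dispatched above shares the same transform() API,
# so scoring can be written once regardless of which branch loaded the model.
predictions = model.transform(data)
predictions.select("features", "prediction").show(10)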
def load_json_and_predict(spark, sqlContext, json_file):
    # Load data to predict
    # predict_df = spark.read.json(JSON_DATA_TO_PREDICT)
    print("Loading prediction data from ", json_file)
    predict_df = spark.read.json(json_file)
    print("Done")

    # Apply same process as historical data to convert/map.
    # Drop rows with NA columns
    print("Preprocessing...")
    predict_df_1 = predict_df.dropna()
    predict_df_1 = predict_df_1[
        (predict_df_1.subtotal > 0) &
        (predict_df_1.min_item_price > 0) &
        (predict_df_1.max_item_price > 0) &
        (predict_df_1.total_onshift_runners >= 0) &
        (predict_df_1.total_busy_runners >= 0) &
        (predict_df_1.total_outstanding_orders >= 0) &
        (predict_df_1.estimated_order_place_duration > 0) &
        (predict_df_1.estimated_store_to_consumer_driving_duration > 0) &
        (predict_df_1.market_id != "NA") &
        (predict_df_1.store_primary_category != "NA") &
        (predict_df_1.order_protocol != "NA")]

    udf_rdd_datetimesec_to_sec = fn.udf(
        rdd_datetimesec_to_sec, IntegerType())  # LongType() not available for now
    predict_df_1 = predict_df_1.withColumn(
        'created_at', udf_rdd_datetimesec_to_sec(fn.col('created_at')))

    # Map store_id string to unique number
    stringindexer = StringIndexer().setInputCol("store_id").setOutputCol(
        "store_id_int")
    modelc = stringindexer.fit(predict_df_1)
    predict_df_1 = modelc.transform(predict_df_1)

    # Map store_primary_category to unique number
    stringindexer = StringIndexer().setInputCol(
        "store_primary_category").setOutputCol("store_primary_category_int")
    modelc = stringindexer.fit(predict_df_1)
    predict_df_1 = modelc.transform(predict_df_1)

    predict_df_1 = predict_df_1.withColumn(
        "market_id", predict_df_1["market_id"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "order_protocol", predict_df_1["order_protocol"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_onshift_runners",
        predict_df_1["total_onshift_runners"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_busy_runners",
        predict_df_1["total_busy_runners"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_outstanding_orders",
        predict_df_1["total_outstanding_orders"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "estimated_store_to_consumer_driving_duration",
        predict_df_1["estimated_store_to_consumer_driving_duration"].cast(
            IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "subtotal", predict_df_1["subtotal"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "num_distinct_items",
        predict_df_1["num_distinct_items"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "estimated_order_place_duration",
        predict_df_1["estimated_order_place_duration"].cast(IntegerType()))
    predict_df_1 = predict_df_1.withColumn(
        "total_items", predict_df_1["total_items"].cast(IntegerType()))
    print("Done")

    # Use same features as in historical data.
    # Other columns in test data ('store_id', 'store_primary_category',
    # 'min_item_price', 'max_item_price') will be dropped by the
    # VectorAssembler transformation.
    print("Vectorize...")
    pvectorAssembler = VectorAssembler(inputCols=feature_list,
                                       outputCol='features')
    vectorized_predict_df = pvectorAssembler.transform(predict_df_1)
    vectorized_predict_df = vectorized_predict_df.select(['features'])
    print("Done...")

    txt_file = open(MODEL_NAME_FILE, "r")
    model_name = txt_file.read()
    print("Read model: ", model_name)
    txt_file.close()

    print("Loading model " + model_name + " from " + MODEL_DIR)
    if (model_name == DT_MODEL):
        predict_model = DecisionTreeRegressionModel.load(MODEL_DIR)
    if (model_name == GBT_MODEL):
        predict_model = GBTRegressionModel.load(MODEL_DIR)
    if (model_name == LR_MODEL):
        predict_model = LinearRegressionModel.load(MODEL_DIR)
    if (model_name == RF_MODEL):
        predict_model = RandomForestRegressionModel.load(MODEL_DIR)
    print("Done")

    print("Predicting...")
    model_predictions = predict_model.transform(vectorized_predict_df)
    print("Done")

    df1 = predict_df_1.select('delivery_id').withColumn(
        "id", monotonically_increasing_id())
    df2 = model_predictions.select('prediction').withColumnRenamed(
        'prediction', 'predicted_delivery_seconds').withColumn(
        "id", monotonically_increasing_id())

    # Perform a join on the ids.
    prediction_results_df = df1.join(df2, "id", "left").drop("id")
    prediction_results_df = prediction_results_df.withColumn(
        "predicted_delivery_seconds",
        prediction_results_df["predicted_delivery_seconds"].cast(
            IntegerType()))

    return prediction_results_df