def test_save_load_trained_model(self):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid,
                               evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    lrModel = tvsModel.bestModel

    tvsModelPath = temp_path + "/tvsModel"
    lrModel.save(tvsModelPath)
    loadedLrModel = LogisticRegressionModel.load(tvsModelPath)
    self.assertEqual(loadedLrModel.uid, lrModel.uid)
    self.assertEqual(loadedLrModel.intercept, lrModel.intercept)
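# Hedged follow-up sketch: SPARK-13786 has since been resolved, so assuming
# Spark 2.3+ the tuned TrainValidationSplitModel can be persisted directly,
# not just its bestModel. This is meant to extend the test body above and
# reuses its names; the "/tvs" path is illustrative.
from pyspark.ml.tuning import TrainValidationSplitModel

tvsPath = temp_path + "/tvs"  # illustrative path under the same temp dir
tvsModel.write().overwrite().save(tvsPath)  # persist the full tuned model
loadedTvsModel = TrainValidationSplitModel.load(tvsPath)
assert loadedTvsModel.bestModel.uid == lrModel.uid  # round-trip sanity check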
def __init__(self, model_name, model_base_path):
    """
    Initialize the service.

    Args:
        model_name: The name of the model.
        model_base_path: The file path of the model.

    Returns:
        None
    """
    super(SparkInferenceService, self).__init__()

    # TODO: Download the model files
    # local_model_base_path = filesystem_util.download_hdfs_models(
    #     model_base_path)

    self.model_name = model_name
    self.model_base_path = model_base_path
    self.model_version_list = [1]
    self.model_graph_signature = ""
    self.platform = "Spark"
    self.preprocess_function, self.postprocess_function = \
        preprocess_util.get_preprocess_postprocess_function_from_model_path(
            self.model_base_path)

    # Load the model
    from pyspark.sql import SparkSession
    from pyspark.ml.classification import LogisticRegressionModel

    self.spark_session = SparkSession.builder.appName(
        "libsvm_lr").getOrCreate()
    # TODO: Support other model types
    self.spark_model = LogisticRegressionModel.load(self.model_base_path)
    # TODO: Add a signature for the Spark model
    self.model_graph_signature = "No signature for Spark MLlib models"
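# Hypothetical usage sketch, assuming the surrounding SparkInferenceService
# class is importable; the model name and path below are illustrative, not
# from the original.
service = SparkInferenceService(
    model_name="sentiment_lr",
    model_base_path="./lr_model/")
print(service.platform)  # "Spark"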
# Model-building algorithm (logistic regression estimator)
lr = LogisticRegression(maxIter=10, regParam=0.01, labelCol="gender")

# Build the model
model = lr.fit(assembled_training)

# Generate predictions
model.transform(assembled_training).show()

# Pipeline
pipeline = Pipeline(stages=[assembler, lr])

# Build the pipeline model
pipelineModel = pipeline.fit(training)

# Generate predictions with the pipeline model
pipelineModel.transform(training).show()

path1 = "/Users/beginspark/Temp/regression-model"
path2 = "/Users/beginspark/Temp/pipelinemodel"

# Save the models
model.write().overwrite().save(path1)
pipelineModel.write().overwrite().save(path2)

# Load the saved models
loadedModel = LogisticRegressionModel.load(path1)
loadedPipelineModel = PipelineModel.load(path2)

spark.stop()
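# Verification sketch (run before spark.stop()): the loaded models should
# reproduce the predictions of the originals on the same data.
loadedModel.transform(assembled_training).show()
loadedPipelineModel.transform(training).show()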
#!/usr/bin/env python

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml.linalg import SparseVector

spark = SparkSession.builder.appName("libsvm_lr").getOrCreate()

# Load the model
model_path = "./lr_model/"
lrModel = LogisticRegressionModel.load(model_path)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# Construct data
# testset = spark.read.format("libsvm").load("./sample_libsvm_data.txt")
testset = spark.createDataFrame(
    [(1.0, SparseVector(692, [128, 129, 130], [51, 159, 20]))],
    ['label', 'features'])

# Make inference
result = lrModel.transform(testset)
result = result.first()
print("Prediction: {}, probability_of_0: {}, probability_of_1: {}".format(
    result.prediction, result.probability[0], result.probability[1]))
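# Batch-scoring sketch for more than one row; vector_to_array requires
# Spark 3.0+, which is an assumption here rather than something the original
# script states.
from pyspark.ml.functions import vector_to_array

scored = lrModel.transform(testset)
scored.select(
    "prediction",
    vector_to_array("probability")[1].alias("p_of_1")  # P(label = 1)
).show()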
df = sql_sc.read.csv('hdfs:///project_data/pets/train/train.csv',
                     header=True, inferSchema=True) \
    .drop('Name').drop('State')
input_cols = [a for a, b in df.dtypes if b == 'int']
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
    for column in ["AdoptionSpeed"]
]
pipeline = Pipeline(stages=indexers)

# print('param', str(sys.argv[1]))
# csv_df = sql_sc.read.format("csv").option("header", "true").load(
#     "hdfs:///project_data/pets/train/train.csv")

kafkaStream = KafkaUtils.createStream(ssc, 'gpu17:2181',
                                      'test-consumer-group', {input_topic: 2})
producer = KafkaProducer(bootstrap_servers='gpu17:9092')

lr_test = LogisticRegressionModel.load('hdfs:///lr')
featurizer_test = dl.DeepImageFeaturizer(inputCol="image",
                                         outputCol="features",
                                         modelName="InceptionV3")
p_lr_test = PipelineModel(stages=[featurizer_test, lr_test])
feature = VectorAssembler(inputCols=input_cols, outputCol="features")


def handler(message):
    records = message.collect()
    for record in records:
        print('record', record, type(record))
        print('-----------')
        print('tuple', record[0], record[1], type(record[0]), type(record[1]))
        # producer.send(output_topic, b'message received')
        key = record[0]
transformers = [
    regexTokenizer, stopwordsRemover, countVectors, label_stringIdx
]
pipeline = Pipeline(stages=transformers)
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)

### Randomly split data into training and test sets; set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))
testData.show(5)

results = testData.toJSON().map(lambda j: json.loads(j)).collect()
# print(results)
with open('/tmp/lr.model.testdata.txt', 'w') as f:
    f.write(json.dumps(results))

##########################################################
################## Train/load the model ##################
##########################################################
lrModel = LogisticRegressionModel.load("lr.model.savepoint")

predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 7) \
    .select("Descript", "Category", "probability", "label", "prediction") \
    .orderBy("probability", ascending=False) \
    .show(n=10, truncate=30)
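# Evaluation sketch for the loaded model on the held-out split; assumes the
# indexed label column produced by label_stringIdx is named "label".
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
print("Test accuracy: {}".format(evaluator.evaluate(predictions)))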
df = sc.createDataFrame(lines)
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Extract the features
hashing_tf = HashingTF(numFeatures=2**16, inputCol="words", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5)
lines = Pipeline(stages=[tokenizer, hashing_tf, idf])

# Get the data to test
line_fit = lines.fit(df)
test_model = line_fit.transform(df)

# Load the trained models
sentimentModel = LogisticRegressionModel.load(
    "./MLModels/logisticRegressionSentiment")
sarcasmModel = LogisticRegressionModel.load(
    "./MLModels/logisticRegressionSarcasm")

# Apply the models
result = sentimentModel.transform(test_model)
result = result.withColumnRenamed("prediction", "sentiment")
result = result.drop("words", "tf", "features", "rawPrediction", "probability")

line_fit = lines.fit(result)
test_model = line_fit.transform(result)
result = sarcasmModel.transform(test_model)
result = result.withColumnRenamed("prediction", "sarcasm")
result = result.drop("words", "tf", "features", "rawPrediction", "probability")
result.show(10, False)
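# Design note, as a hedged sketch: calling lines.fit(...) a second time
# re-estimates the IDF weights on the intermediate data, which can drift from
# the features the two classifiers were trained on. Persisting the fitted
# feature pipeline once and reusing it keeps the transform fixed; the path
# below is illustrative.
from pyspark.ml import PipelineModel

line_fit.write().overwrite().save("./MLModels/featurePipeline")
line_fit = PipelineModel.load("./MLModels/featurePipeline")
test_model = line_fit.transform(df)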
def fetchmodel():
    regression_model = LogisticRegressionModel.load(
        "s3://winequalitybucket/regression.model/")
    return regression_model
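# Usage sketch: LogisticRegressionModel.load needs an active SparkSession, and
# reading from S3 assumes the cluster's Hadoop connector and credentials are
# already configured (an assumption, not part of the original).
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("wine_quality").getOrCreate()
model = fetchmodel()
print(model.coefficients)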
def multinomialRegression(df,
                          feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
                          maxIter=100, regParam=0.0, elasticNetParam=0.0,
                          threshold=0.5, overwrite_model=False):
    # Checks if there is a SparkContext running; if so, grab it, otherwise
    # start a new one
    # sc = SparkContext.getOrCreate()
    # sqlContext = SQLContext(sc)
    # sqlContext.setLogLevel('INFO')
    feature_list.sort()
    feature_name = '_'.join(feature_list)
    param_name = '_'.join(
        [str(regParam), str(elasticNetParam), str(maxIter), str(threshold)])
    model_path_name = (model_dir + 'MultinomialRegression/' +
                       feature_name + '_' + param_name)
    model = None

    vector_assembler = VectorAssembler(inputCols=feature_list,
                                       outputCol="features")
    df_temp = vector_assembler.transform(df)
    df = df_temp.select(['label', 'features'])
    trainingData, testData = df.randomSplit([0.7, 0.3])

    if os.path.isdir(model_path_name) and not overwrite_model:
        print('Loading model from ' + model_path_name)
        model = LogisticRegressionModel.load(model_path_name)
    else:
        lr = LogisticRegression(labelCol="label", maxIter=maxIter,
                                regParam=regParam,
                                elasticNetParam=elasticNetParam)
        model = lr.fit(trainingData)

    print('Making predictions on validation data')
    predictions = model.transform(testData)
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction")

    evaluator.setMetricName('accuracy')
    print('Evaluating accuracy')
    accuracy = evaluator.evaluate(predictions)

    evaluator.setMetricName('f1')
    print('Evaluating f1')
    f1 = evaluator.evaluate(predictions)

    evaluator.setMetricName('weightedPrecision')
    print('Evaluating weightedPrecision')
    weightedPrecision = evaluator.evaluate(predictions)

    evaluator.setMetricName('weightedRecall')
    print('Evaluating weightedRecall')
    weightedRecall = evaluator.evaluate(predictions)

    print('accuracy {}'.format(accuracy))
    print('f1 {}'.format(f1))
    print('weightedPrecision {}'.format(weightedPrecision))
    print('weightedRecall {}'.format(weightedRecall))

    # Test distribution of outputs
    total = df.select('label').count()
    tape = df.filter(df.label == 0).count()
    disk = df.filter(df.label == 1).count()
    cloud = df.filter(df.label == 2).count()

    # Print outputs
    print('Multinomial Regression Classification')
    print(feature_list)
    print('Data distribution')
    print('Total Observations {}'.format(total))
    print('    Cloud %{}'.format((cloud / total) * 100))
    print('    Disk %{}'.format((disk / total) * 100))
    print('    Tape %{}\n'.format((tape / total) * 100))
    print("    Test Error = {}".format((1.0 - accuracy) * 100))
    print("    Test Accuracy = {}\n".format(accuracy * 100))

    print('Error distribution')
    misses = predictions.filter(predictions.label != predictions.prediction)
    # Now get the percentage of errors per class
    tape_misses = misses.filter(misses.label == 0).count()
    disk_misses = misses.filter(misses.label == 1).count()
    cloud_misses = misses.filter(misses.label == 2).count()
    tape_pred = predictions.filter(predictions.label == 0).count()
    disk_pred = predictions.filter(predictions.label == 1).count()
    cloud_pred = predictions.filter(predictions.label == 2).count()
    print('    Cloud Misses %{}'.format((cloud_misses / cloud_pred) * 100))
    print('    Disk Misses %{}'.format((disk_misses / disk_pred) * 100))
    print('    Tape Misses %{}'.format((tape_misses / tape_pred) * 100))

    if accuracy > 0.80:
        if os.path.isdir(model_path_name):
            if overwrite_model:
                print('Saving model to ' + model_path_name)
                model.write().overwrite().save(model_path_name)
        else:
            print('Saving model to ' + model_path_name)
            model.save(model_path_name)

    metrics = {
        'data': {
            'Total': total,
            'Cloud': (cloud / total) * 100,
            'Disk': (disk / total) * 100,
            'Tape': (tape / total) * 100
        },
        'metrics': {
            'Accuracy': accuracy * 100,
            'f1': f1 * 100,
            'Weighted Precision': weightedPrecision * 100,
            'Weighted Recall': weightedRecall * 100
        },
        'error_percentage': {
            'Cloud': cloud_misses / cloud_pred * 100,
            'Disk': disk_misses / disk_pred * 100,
            'Tape': tape_misses / tape_pred * 100
        },
        'params': {
            'Regularization Parameter': regParam,
            'Maximum Iteration': maxIter,
            'ElasticNet Mixing Parameter': elasticNetParam,
            'Threshold': threshold
        },
        'name': 'Multinomial Regression Classification',
        'features': feature_list
    }
    with open('tmp/temp2.yml', 'w') as outfile:
        yaml.dump(metrics, outfile)

    return metrics, model
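# Usage sketch; assumes `df` has a numeric "label" column (0 = tape, 1 = disk,
# 2 = cloud) plus the feature columns, and that `model_dir` is defined at
# module level as the function body requires. The parameter values are
# illustrative.
metrics, model = multinomialRegression(
    df,
    feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
    maxIter=100, regParam=0.1, elasticNetParam=0.0,
    overwrite_model=True)
print(metrics['metrics']['Accuracy'])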