Пример #1
0
    def test_save_load_trained_model(self):
        """Check that the best model of a fitted TrainValidationSplit survives save/load.

        Fits a TrainValidationSplit around LogisticRegression on a small
        in-memory dataset, saves ``bestModel`` under a temporary directory,
        loads it back with ``LogisticRegressionModel.load`` and compares the
        ``uid`` and ``intercept`` of the two models.
        """
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        import shutil  # local import: only needed to clean up the temp directory

        temp_path = tempfile.mkdtemp()
        try:
            dataset = self.spark.createDataFrame(
                [(Vectors.dense([0.0]), 0.0),
                 (Vectors.dense([0.4]), 1.0),
                 (Vectors.dense([0.5]), 0.0),
                 (Vectors.dense([0.6]), 1.0),
                 (Vectors.dense([1.0]), 1.0)] * 10,
                ["features", "label"])
            lr = LogisticRegression()
            grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
            evaluator = BinaryClassificationEvaluator()
            tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
            tvsModel = tvs.fit(dataset)
            lrModel = tvsModel.bestModel

            tvsModelPath = temp_path + "/tvsModel"
            lrModel.save(tvsModelPath)
            loadedLrModel = LogisticRegressionModel.load(tvsModelPath)
            self.assertEqual(loadedLrModel.uid, lrModel.uid)
            self.assertEqual(loadedLrModel.intercept, lrModel.intercept)
        finally:
            # mkdtemp() leaves deletion to the caller; clean up even when an
            # assertion fails. Best-effort so a cleanup problem cannot mask
            # the real test failure.
            shutil.rmtree(temp_path, ignore_errors=True)
  def __init__(self, model_name, model_base_path):
    """
    Set up an inference service backed by a Spark MLlib model.

    Creates (or reuses) a SparkSession and loads a LogisticRegressionModel
    from ``model_base_path``.

    Args:
      model_name: The name of the model.
      model_base_path: The path the model is loaded from.
    """

    super(SparkInferenceService, self).__init__()

    # TODO: Download the model files from HDFS before loading them
    # (a filesystem_util helper taking model_base_path was sketched here).

    # Basic service metadata
    self.model_name = model_name
    self.model_base_path = model_base_path
    self.model_version_list = [1]
    self.model_graph_signature = ""
    self.platform = "Spark"

    # Pre/post-processing hooks are looked up from the model path
    processing_functions = preprocess_util.get_preprocess_postprocess_function_from_model_path(
        self.model_base_path)
    self.preprocess_function, self.postprocess_function = processing_functions

    # Spark is imported here rather than at module level
    from pyspark.sql import SparkSession
    from pyspark.ml.classification import LogisticRegressionModel

    session_builder = SparkSession.builder.appName("libsvm_lr")
    self.spark_session = session_builder.getOrCreate()

    # TODO: Support other model
    self.spark_model = LogisticRegressionModel.load(self.model_base_path)

    # TODO: Add signature for Spark model
    self.model_graph_signature = "No signature for Spark MLlib models"
Пример #3
0
# Model-building algorithm (logistic regression estimator)
lr = LogisticRegression(maxIter=10, regParam=0.01, labelCol="gender")

# Fit the model
model = lr.fit(assembled_training)

# Generate predictions
model.transform(assembled_training).show()

# Pipeline: feature assembler followed by the estimator
pipeline = Pipeline(stages=[assembler, lr])

# Fit the pipeline model
pipelineModel = pipeline.fit(training)

# Generate predictions with the pipeline model
pipelineModel.transform(training).show()

path1 = "/Users/beginspark/Temp/regression-model"
path2 = "/Users/beginspark/Temp/pipelinemodel"

# Save both models, replacing anything already stored at the paths
model.write().overwrite().save(path1)
pipelineModel.write().overwrite().save(path2)

# Load the saved models back
loadedModel = LogisticRegressionModel.load(path1)
loadedPipelineModel = PipelineModel.load(path2)

# stop() must be called; the bare attribute reference `spark.stop` did nothing
spark.stop()
#!/usr/bin/env python

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml.linalg import SparseVector

spark = (SparkSession.builder
         .appName("libsvm_lr")
         .getOrCreate())

# Load the saved model and print its parameters
model_path = "./lr_model/"
lrModel = LogisticRegressionModel.load(model_path)
print("Coefficients: {}".format(lrModel.coefficients))
print("Intercept: {}".format(lrModel.intercept))

# Build a one-row test set by hand.
# Alternative: spark.read.format("libsvm").load("./sample_libsvm_data.txt")
testset = spark.createDataFrame(
    data=[(1.0, SparseVector(692, [128, 129, 130], [51, 159, 20]))],
    schema=['label', 'features'])

# Run inference and report the first (only) row
result = lrModel.transform(testset).first()
print(
    "Prediction: {}, probability_of_0: {}, probability_of_1: {}".format(
        result.label,
        result.probability[0],
        result.probability[1]))
Пример #5
0
# Read the training CSV (header row, schema inference requested) and drop the
# 'Name' and 'State' columns.
df = sql_sc.read.csv(
    'hdfs:///project_data/pets/train/train.csv',
    header=True,
    inferSchema='True'
).drop('Name').drop('State')

# Every column reported with dtype 'int' becomes an assembler input.
input_cols = [col_name for col_name, col_type in df.dtypes if col_type == 'int']

# One fitted StringIndexer per listed column (currently only "AdoptionSpeed").
indexers = [
    StringIndexer(inputCol=target, outputCol=target + "_index").fit(df)
    for target in ["AdoptionSpeed"]
]
pipeline = Pipeline(stages=indexers)

# Kafka input stream and output producer.
kafkaStream = KafkaUtils.createStream(
    ssc, 'gpu17:2181', 'test-consumer-group', {input_topic: 2})
producer = KafkaProducer(bootstrap_servers='gpu17:9092')

# Saved logistic regression model chained behind an InceptionV3 image featurizer.
lr_test = LogisticRegressionModel.load('hdfs:///lr')
featurizer_test = dl.DeepImageFeaturizer(
    inputCol="image", outputCol="features", modelName="InceptionV3")
p_lr_test = PipelineModel(stages=[featurizer_test, lr_test])

feature = VectorAssembler(inputCols=input_cols, outputCol="features")


def handler(message):
    records = message.collect()
    for record in records:
        print('record', record, type(record))
        print('-----------')
        print('tuple', record[0], record[1], type(record[0]), type(record[1]))
        # producer.send(output_topic, b'message received')
        key = record[0]
    regexTokenizer, stopwordsRemover, countVectors, label_stringIdx
]
# Fit the feature-transformation pipeline on the data and apply it.
pipeline = Pipeline(stages=transformers)
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)

# Randomly split the data into training and test sets; the seed is fixed for
# reproducibility.
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

testData.show(5)

# Collect the test rows to the driver and dump them as one JSON document.
results = testData.toJSON().map(json.loads).collect()
# The context manager closes the file even if the write fails.
with open('/tmp/lr.model.testdata.txt', 'w') as testdata_file:
    testdata_file.write(json.dumps(results))

##########################################################
################## Train/load the model ##################
##########################################################
lrModel = LogisticRegressionModel.load("lr.model.savepoint")

predictions = lrModel.transform(testData)

# Show the ten rows predicted as class 7, ordered by descending "probability".
predictions.filter(predictions['prediction'] == 7)  \
    .select("Descript","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
Пример #7
0
df = sc.createDataFrame(lines)

# Feature pipeline: tokenize "text" into "words", hash into "tf", then IDF
# into "features".
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashing_tf = HashingTF(numFeatures=2 ** 16, inputCol="words", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5)
# NOTE: `lines` is rebound here from the input rows to the Pipeline object.
lines = Pipeline(stages=[tokenizer, hashing_tf, idf])

# Fit the feature pipeline on the data and featurize it
line_fit = lines.fit(df)
test_model = line_fit.transform(df)

# Load the two trained classifiers
sentimentModel = LogisticRegressionModel.load(
    "./MLModels/logisticRegressionSentiment")
sarcasmModel = LogisticRegressionModel.load(
    "./MLModels/logisticRegressionSarcasm")

# First pass: sentiment. Keep only the renamed prediction column and drop the
# intermediate columns so the feature pipeline can be run again.
# NOTE(review): the column name is spelled "sentimment"; kept as-is because
# downstream consumers may rely on it.
result = (
    sentimentModel.transform(test_model)
    .withColumnRenamed("prediction", "sentimment")
    .drop("words", "tf", "features", "rawPrediction", "probability")
)

# Re-fit and re-apply the feature pipeline on the sentiment-annotated rows
line_fit = lines.fit(result)
test_model = line_fit.transform(result)

# Second pass: sarcasm, cleaned up the same way
result = (
    sarcasmModel.transform(test_model)
    .withColumnRenamed("prediction", "sarcasm")
    .drop("words", "tf", "features", "rawPrediction", "probability")
)
result.show(10, False)
Пример #8
0
def fetchmodel():
    """Load and return the LogisticRegressionModel stored at the S3 path below."""
    return LogisticRegressionModel.load(
        "s3://winequalitybucket/regression.model/")
Пример #9
0
def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is zero."""
    if not whole:
        return 0.0
    return (part / whole) * 100


def multinomialRegression(df,
                          feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
                          maxIter=100,
                          regParam=0.0,
                          elasticNetParam=0.0,
                          threshold=0.5,
                          overwrite_model=False):
    """Train (or load) a logistic regression classifier and report its metrics.

    The feature columns are assembled into a ``features`` vector, the data is
    split 70/30 into training and test sets, and the model is evaluated on the
    test split. A previously saved model for the same feature/parameter
    combination is loaded instead of training unless ``overwrite_model`` is
    set. The model is written to disk only when accuracy exceeds 0.80.

    Side effects: prints progress and metrics, may save the model under
    ``model_dir``, and writes the metrics to ``tmp/temp2.yml``.

    Args:
        df: DataFrame containing the columns in ``feature_list`` and a
            ``label`` column. Labels 0, 1 and 2 are reported as Tape, Disk
            and Cloud respectively.
        feature_list: Names of the feature columns. Not modified; a sorted
            copy is used internally.
        maxIter: Maximum number of iterations passed to LogisticRegression.
        regParam: Regularization parameter passed to LogisticRegression.
        elasticNetParam: ElasticNet mixing parameter passed to
            LogisticRegression.
        threshold: Recorded in the model path and in the returned params.
            NOTE(review): it is not passed to LogisticRegression.
        overwrite_model: When True, retrain even if a saved model exists and
            overwrite it if the accuracy bar is met.

    Returns:
        Tuple ``(metrics, model)`` where ``metrics`` is a dict with the keys
        'data', 'metrics', 'error_percentage', 'params', 'name' and
        'features'.
    """
    # Work on a sorted copy so neither the caller's list nor the shared
    # default argument is mutated.
    feature_list = sorted(feature_list)
    feature_name = '_'.join(feature_list)
    param_name = '_'.join(
        [str(regParam),
         str(elasticNetParam),
         str(maxIter),
         str(threshold)])
    model_path_name = model_dir + 'MultinomialRegression/' + feature_name + '_' + param_name

    vector_assembler = VectorAssembler(inputCols=feature_list,
                                       outputCol="features")
    df = vector_assembler.transform(df).select(['label', 'features'])

    trainingData, testData = df.randomSplit([0.7, 0.3])

    if os.path.isdir(model_path_name) and not overwrite_model:
        print('Loading model from ' + model_path_name)
        model = LogisticRegressionModel.load(model_path_name)
    else:
        lr = LogisticRegression(labelCol="label",
                                maxIter=maxIter,
                                regParam=regParam,
                                elasticNetParam=elasticNetParam)
        model = lr.fit(trainingData)

    print('Making predictions on validation data')
    predictions = model.transform(testData)

    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction")

    # Evaluate each metric in turn with the same evaluator.
    scores = {}
    for metric_name in ('accuracy', 'f1', 'weightedPrecision',
                        'weightedRecall'):
        evaluator.setMetricName(metric_name)
        print('Evaluating ' + metric_name)
        scores[metric_name] = evaluator.evaluate(predictions)

    accuracy = scores['accuracy']
    f1 = scores['f1']
    weightedPrecision = scores['weightedPrecision']
    weightedRecall = scores['weightedRecall']

    print('accuracy {}'.format(accuracy))
    print('f1 {}'.format(f1))
    print('weightedPrecision {}'.format(weightedPrecision))
    print('weightedRecall {}'.format(weightedRecall))

    # Label distribution over the whole (train + test) data set.
    total = df.select('label').count()
    tape = df.filter(df.label == 0).count()
    disk = df.filter(df.label == 1).count()
    cloud = df.filter(df.label == 2).count()

    # _percentage() guards against an empty data set (total == 0).
    cloud_pct = _percentage(cloud, total)
    disk_pct = _percentage(disk, total)
    tape_pct = _percentage(tape, total)

    print('Multinomial Regression Classification')
    print(feature_list)
    print('Data distribution')
    print('Total Observations {}'.format(total))
    print(' Cloud %{}'.format(cloud_pct))
    print(' Disk %{}'.format(disk_pct))
    print(' Tape %{}\n'.format(tape_pct))

    print(" Test Error = {}".format((1.0 - accuracy) * 100))
    print(" Test Accuracy = {}\n".format(accuracy * 100))

    print('Error distribution')
    misses = predictions.filter(predictions.label != predictions.prediction)
    # Misclassified test rows per true label ...
    tape_misses = misses.filter(misses.label == 0).count()
    disk_misses = misses.filter(misses.label == 1).count()
    cloud_misses = misses.filter(misses.label == 2).count()

    # ... relative to all test rows with that true label.
    tape_rows = predictions.filter(predictions.label == 0).count()
    disk_rows = predictions.filter(predictions.label == 1).count()
    cloud_rows = predictions.filter(predictions.label == 2).count()

    # A class can be absent from the random test split; _percentage() then
    # reports 0.0 instead of raising ZeroDivisionError.
    cloud_miss_pct = _percentage(cloud_misses, cloud_rows)
    disk_miss_pct = _percentage(disk_misses, disk_rows)
    tape_miss_pct = _percentage(tape_misses, tape_rows)

    print(' Cloud Misses %{}'.format(cloud_miss_pct))
    print(' Disk Misses %{}'.format(disk_miss_pct))
    print(' Tape Misses %{}'.format(tape_miss_pct))

    # Persist only sufficiently accurate models. An existing saved model is
    # replaced only when overwrite_model is set.
    if accuracy > 0.80:
        if not os.path.isdir(model_path_name):
            print('Saving model to ' + model_path_name)
            model.save(model_path_name)
        elif overwrite_model:
            print('Saving model to ' + model_path_name)
            model.write().overwrite().save(model_path_name)

    metrics = {
        'data': {
            'Total': total,
            'Cloud': cloud_pct,
            'Disk': disk_pct,
            'Tape': tape_pct
        },
        'metrics': {
            'Accuracy': accuracy * 100,
            'f1': f1 * 100,
            'Weighted Precision': weightedPrecision * 100,
            'Weighted Recall': weightedRecall * 100
        },
        'error_percentage': {
            'Cloud': cloud_miss_pct,
            'Disk': disk_miss_pct,
            'Tape': tape_miss_pct
        },
        'params': {
            'Regularization Parameter': regParam,
            'Maximum Iteration': maxIter,
            'ElasticNet Mixing Parameter': elasticNetParam,
            'Threshold': threshold
        },
        'name': 'Multinomial Regression Classification',
        'features': feature_list
    }

    with open('tmp/temp2.yml', 'w') as outfile:
        yaml.dump(metrics, outfile)

    return metrics, model