Example #1
def lr_second_predict(lr_model_path, df, condition):
    """
    lr二分类预测
    :param lr_model_path: 模型地址
    :param df: 数据
    :param condition: {"features": [12, 13, 14, 15], "label": "label"}
    特征列
    :return: 预测结果 spark dataframe
    """
    feature_indexs = condition['features']
    label_index = condition['label']

    if label_index is None or label_index == "":  # no label column
        # 1. prepare the data
        def func(x):
            features_data = []
            for feature in feature_indexs:
                features_data.append(x[feature])
            return Row(features=Vectors.dense(features_data))

        training_set = df.rdd.map(lambda x: func(x)).toDF()

        # 2. load the model
        lr_model = LogisticRegressionModel.load(lr_model_path)

        # 3. predict
        prediction_df = lr_model.transform(training_set).select(
            "prediction", "features")
        return prediction_df
    else:  # label column present
        # 1. prepare the data
        def func(x):
            features_data = []
            for feature in feature_indexs:
                features_data.append(x[feature])
            return Row(label=x[label_index],
                       features=Vectors.dense(features_data))

        training_set = df.rdd.map(lambda x: func(x)).toDF()

        # 2. load the model
        print("*****lr_model_path:", lr_model_path)
        lr_model = LogisticRegressionModel.load(lr_model_path)

        # 3. predict
        prediction_df = lr_model.transform(training_set).select(
            "prediction", "label", "features")
        return prediction_df
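A minimal usage sketch for the function above; the SparkSession setup, the toy DataFrame, and the model path are assumptions, and the path must point to a directory previously written by a saved LogisticRegressionModel.

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("lr_predict_demo").getOrCreate()

# toy input: four feature columns and a label column
df = spark.createDataFrame(
    [(0.1, 0.2, 0.3, 0.4, 1.0),
     (0.5, 0.6, 0.7, 0.8, 0.0)],
    ["f0", "f1", "f2", "f3", "label"])

condition = {"features": [0, 1, 2, 3], "label": "label"}
predictions = lr_second_predict("/tmp/models/lr_model", df, condition)  # placeholder path
predictions.show()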
Example #2
def logistic_regression_evaluator(test_data, deal_id):
    ####In:
    #A testing data set, as generated by data_prep()
    #The deal_id you want to test a model for
    #NB: The model has to be already saved to the cloud

    ####Out
    #An update message is outputted
    #an evaluator

    model = LogisticRegressionModel.load(
        f"/mnt/lotte/logistic_regression/{deal_id}/")
    predictions = model.transform(test_data.withColumnRenamed(
        deal_id, 'label'))
    # compute area under the precision-recall curve on the test set
    evaluator = BinaryClassificationEvaluator(
        labelCol="label",
        rawPredictionCol="prediction",
        metricName="areaUnderPR"
    )  # alternatively, use areaUnderROC to get the ROC AUC instead of the PR AUC

    area_under_pr = evaluator.evaluate(predictions)
    print("Logistic Regression area under PR " + deal_id + " = " +
          str(area_under_pr))

    return evaluator
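A hypothetical call of the evaluator above; the deal identifier is made up and data_prep is the project helper mentioned in the comments, so its signature is an assumption.

deal_id = "deal_123"            # placeholder deal identifier
test_data = data_prep(deal_id)  # assumed helper that builds the test DataFrame
evaluator = logistic_regression_evaluator(test_data, deal_id)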
Example #3
def predict_prob(lrModelPath, test_data):
    lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)
    result = predictions.select(['_c0', '_c1', '_c2', 'probability'])
    print('*************** result **************')
    print(result.show(5))
    # result.write.csv('file:///opt/int_group/result123')

    vs = VectorSlicer(inputCol="probability", outputCol="prob_1", indices=[1])
    prob_1 = vs.transform(result)
    print('*************** prob_1 **************')
    print(prob_1.show(5))
    result_prob1 = prob_1.select(['_c0', '_c1', '_c2', 'prob_1'])
    print('*************** result_prob1 **************')
    print(result_prob1.show(5))

    new_result_prob1 = result_prob1.select([
        '_c0', '_c1', '_c2',
        result_prob1['prob_1'].cast('string').alias('prob_1_str')
    ])
    print('*************** new_result_prob1 **************')
    print(new_result_prob1.show(10))
    print(new_result_prob1)
    # find null rows
    final_null_rows = new_result_prob1.filter(new_result_prob1._c0.isNull() | new_result_prob1._c1.isNull()\
        | new_result_prob1._c2.isNull() | new_result_prob1.prob_1_str.isNull())
    print('########### find null rows #############')
    final_null_rows.show(100)
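A shorter alternative sketch for pulling out P(class = 1); it assumes Spark 3.0+ for pyspark.ml.functions.vector_to_array and is not part of the original snippet.

from pyspark.ml.functions import vector_to_array
from pyspark.sql import functions as F

def extract_prob_1(predictions):
    # add a plain double column 'prob_1' holding the probability of class 1
    return predictions.withColumn(
        "prob_1", vector_to_array(F.col("probability")).getItem(1))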
Example #4
def init():
    global model
    # note: "iris.model" is the name of the model registered in the workspace;
    # this call returns the local path to the downloaded model files.
    model_path = Model.get_model_path('iris.model')
    # Load the saved Spark LogisticRegressionModel from that path
    model = LogisticRegressionModel.load(model_path)
Example #5
def post_homeLoanDefault_predictions(Path):
    '''
    Purpose : Generate predictions from a saved home-loan-default classifier.
    Args    : Path (list of dicts with 'modelPath' and 'dataPath')
    Output  : predictions (list of int)
    '''
    
    from pyspark.sql import SparkSession
    from pyspark.ml.classification import LogisticRegressionModel
    
    for item in Path:
        modelPath = item['modelPath']
        dataPath = item['dataPath']
    
    spark = SparkSession.builder.appName('HomeCredit').getOrCreate()
    data = spark.read.parquet(dataPath)
    
    #loading Model
    mm = LogisticRegressionModel.load(modelPath)
    
    #calculate predictions
    predicted = mm.transform(data)
    
    predictList = predicted.select('prediction').collect()
    predictList = [int(i.prediction) for i in predictList]
    
    return predictList
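A hypothetical invocation of post_homeLoanDefault_predictions; both paths are placeholders for a saved LogisticRegressionModel directory and a parquet test set.

Path = [{"modelPath": "/models/home_loan_lr", "dataPath": "/data/home_loan_test.parquet"}]
predictions = post_homeLoanDefault_predictions(Path)
print(predictions[:10])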
Example #6
    def gen_lr_sort_model_metrics(test_df):
        from pyspark.ml.classification import LogisticRegressionModel
        logistic_regression_model = LogisticRegressionModel.load(
            "hdfs://192.168.0.1:9000/user/models/logistic_regression/lr.model")
        lr_result = logistic_regression_model.evaluate(test_df).predictions
        lr_result.show()
 
        def vector_to_double(row):
            return float(row.click_flag), float(row.probability[1])
        score_labels = lr_result.select(["click_flag", "probability"]).rdd.map(vector_to_double)
        score_labels.collect()
 
        from pyspark.mllib.evaluation import BinaryClassificationMetrics
        binary_classification_metrics = BinaryClassificationMetrics(scoreAndLabels=score_labels)
        area_under_roc = binary_classification_metrics.areaUnderROC
        print(area_under_roc)
 
        tp = lr_result[(lr_result.click_flag == 1) & (lr_result.prediction == 1)].count()
        tn = lr_result[(lr_result.click_flag == 0) & (lr_result.prediction == 0)].count()
        fp = lr_result[(lr_result.click_flag == 0) & (lr_result.prediction == 1)].count()
        fn = lr_result[(lr_result.click_flag == 1) & (lr_result.prediction == 0)].count()
        print "tp {} tn {} fp {} fn {}".format(tp, tn, fp, fn)
        print('accuracy is : %f' % ((tp + tn) / (tp + tn + fp + fn)))
        print('recall is : %f' % (tp / (tp + fn)))
        print('precision is : %f' % (tp / (tp + fp)))
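A compact alternative sketch for the confusion-matrix bookkeeping above, using Spark's built-in multiclass evaluator; the column names ('click_flag', 'prediction') are taken from the snippet, everything else is an assumption.

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

def print_basic_metrics(lr_result):
    # the evaluator expects a numeric (double) label column
    lr_result = lr_result.withColumn("click_flag", lr_result["click_flag"].cast("double"))
    for metric in ("accuracy", "weightedPrecision", "weightedRecall"):
        evaluator = MulticlassClassificationEvaluator(
            labelCol="click_flag", predictionCol="prediction", metricName=metric)
        print(metric, evaluator.evaluate(lr_result))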
Example #7
def load_LogisticReg_Model(dataset):
    print("areaUnderROC of best LR model with CrossValidation:")
    evaluator = BinaryClassificationEvaluator()
    best_LRModel = LogisticRegressionModel.load("model/LR1/")
    predictions = best_LRModel.transform(dataset)
    area_under_roc = evaluator.evaluate(predictions)
    print("The areaUnderROC = %g" % area_under_roc)
Example #8
    def on_data(self, data):
        try:
            s = self.client_socket
            s.listen(5)  # Now wait for client connection.
            c, addr = s.accept()  # Establish connection with client.
            print("Received request from: " + str(addr))
            msg = json.loads(data)

            tweet_time = msg['created_at']
            text = msg['text'].replace('\n', '')
            hashtags = " "
            if msg['entities'] is not None:
                if msg['entities']['hashtags'] is not None:
                    for hashtag in msg['entities']['hashtags']:
                        hashtags = hashtags + " " + hashtag['text']

            model = PipelineModel.load(Constants.sentiment_tf_idf_model_path)
            v = sql_context.createDataFrame([
                ("a", msg['text'].replace('\n', '')),
            ], ["_c0", "text"])
            v = model.transform(v)
            model2 = LogisticRegressionModel.load(Constants.sentiment_analysis_model_path)
            v = model2.transform(v)
            v_list = v.select('prediction').collect()
            sentiment = str(v_list[0].prediction)

            s_data = tweet_time + ' ~@ ' + text + ' ~@ ' + sentiment + ' ~@ ' + str(hashtags)

            print(s_data.encode('utf-8'))
            c.send(s_data.encode('utf-8'))
            c.close()
        except BaseException as e:
            print("Error on_data: %s" % str(e))
        return True
Example #9
def predict(test_path, model_name, output_path):
    if model_name is None:
        model_name = 'model'
    if output_path is None:
        output_path = os.path.join(dirname(os.getcwd()), 'predict.csv')

    model_path = os.path.join(dirname(os.getcwd()), 'models', model_name)

    spark = SparkSession \
        .builder \
        .master('local') \
        .appName('Logistic App') \
        .getOrCreate()

    # todo Delete the next line
    spark.sparkContext.setLogLevel('OFF')

    model = LogisticRegressionModel.load(path=model_path)
    raw_data = spark.read.csv(test_path, header=True)

    dataset = mature_data(raw_data)

    prediction_df = model.transform(dataset).select(
        col('id'),
        col('prediction').cast('int'))
    prediction_df = prediction_df.toPandas()
    prediction_df.to_csv(output_path, index=False)
Example #10
    def read_model(self):

        if "LogisticRegression" in self.best_model_path:
            classifier = LogisticRegressionModel.load(self.best_model_path)

        elif "DecisionTree" in self.best_model_path:
            classifier = DecisionTreeClassificationModel.load(
                self.best_model_path)

        elif "RandomForest" in self.best_model_path:
            classifier = RandomForestClassificationModel.load(
                self.best_model_path)

        elif "LinearSVC" in self.best_model_path:
            classifier = LinearSVCModel.load(self.best_model_path)

        if "VGG16" in self.best_model_path:
            featurizer_name = "VGG16"

        elif "VGG19" in self.best_model_path:
            featurizer_name = "VGG19"

        elif "InceptionV3" in self.best_model_path:
            featurizer_name = "InceptionV3"

        elif "Xception" in self.best_model_path:
            featurizer_name = "Xception"

        elif "ResNet50" in self.best_model_path:
            featurizer_name = "ResNet50"

        return featurizer_name, classifier
Example #11
def testJustify(lrModelPath, test_data):
    lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)

    # note: the evaluator's default metric is areaUnderROC; rawPredictionCol must name the raw prediction column
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    auc = evaluator.evaluate(predictions)
    print('############ areaUnderROC: {} ############'.format(auc))
    return predictions
Example #12
def predict():
    model_1 = LogisticRegressionModel.load('model')

    if request.method == 'POST':
        message = request.form['message']
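        # NOTE: transform() expects a Spark DataFrame with a vector 'features' column,
        # so the raw message must be featurized (e.g. by the training pipeline) before
        # scoring; passing a plain Python list as below will not work as written.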
        data = [message]
        my_prediction = model_1.transform(data)
    return render_template('result.html', prediction=my_prediction)
Example #13
    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.spark_sql_context = SparkUtility.get_spark_sql_context()
        self.spark_session = SparkUtility.get_spark_session()
        model_path = os.path.join(
            Utility.get_data_folder(DataFolder.Stock_Model, Market.US),
            'lr_model')
        self.model = LogisticRegressionModel.load(model_path)
Example #14
def load_model():
    lr_model = LogisticRegressionModel.load('s3a://trainingmodel/lr')
    featurizer = DeepImageFeaturizer(inputCol="image",
                                     outputCol="features",
                                     modelName="InceptionV3")
    p_test = PipelineModel(stages=[featurizer, lr_model])

    return p_test
Example #15
    def __init__(self, model_path, scaler_path):

        self.model = LogisticRegressionModel.load(model_path)
        self.scaler = StandardScalerModel.load(scaler_path)
        self.au_PRC = 0
        self.precision = []
        self.recall = []
        self.thresholds = []
        self.matrix = []
Example #16
    def getOrCreateLRC(self):
        try:
            if self.LRCModel is None:
                self.LRCModel = LogisticRegressionModel.load(CONST_LRC_FILE)
        except Exception:
            print("Creating LogisticRegression model")
            self.LRCModel = self.createLRC()

        return self.LRCModel
Example #17
    def __init__(self):
        self.spark = SparkSession.builder \
            .master('yarn') \
            .appName("Yelp Online Testing") \
            .getOrCreate()

        self.lda_model = PipelineModel.load(
            'hdfs:///project/small_data/lda_model_10')
        self.lr_model = LogisticRegressionModel.load(
            'hdfs:///project/small_data/lr-model-10')
Example #18
def modelPredicting(testSetWoeDF, fn):
    # pre-transform the data to satisfy the ML LogisticRegression input format requirements
    strInd = StringIndexerModel.load(savePath +
                                     '{}/{}/strInd'.format(curDate, fn))
    lrModel = LogisticRegressionModel.load(savePath +
                                           '{}/{}/lrModel'.format(curDate, fn))
    testSetVecAse = vecAseembler.transform(testSetWoeDF)
    testSetVecAseStrInd = strInd.transform(testSetVecAse)
    testSetWithProba = lrModel.transform(testSetVecAseStrInd)
    return (testSetWithProba)
Example #19
    def classify(self, inputJson):

        self.hdfs = PyWebHdfsClient(host=self.config.acm.servers.hdfs.host,port=self.config.acm.servers.hdfs.restPort, user_name=self.config.acm.servers.hdfs.fileOwner)
        self.hdfsServerUrl = "hdfs://"+self.config.acm.servers.hdfs.host+":"+str(self.config.acm.servers.hdfs.port)

        if hasattr(self, 'sc')==False: 
            self.sc =SparkContext()
        if hasattr(self, 'sqlContext')==False:
            self.sqlContext = SQLContext(self.sc)


        schema = StructType([StructField('Category', StringType(), True),
                     StructField('Descript', StringType(), True),
                     StructField('Dates', StringType(), True),
                     StructField('DayOfWeek', StringType(), True),
                     StructField('PdDistrict', StringType(), True),
                     StructField('Resolution', StringType(), True),
                     StructField('Address', StringType(), True),
                     StructField('X', DoubleType(), True),
                     StructField('Y', DoubleType(), True)
                    ])
        test = self.sqlContext.createDataFrame(inputJson, schema)

        #pipeline= PipelineModel.load("/home/halil/gitlab/acm/pyspark/acm-text-classification-rest/lr.model.pipeline.savepoint")
        pipeline= PipelineModel.load(self.pipelineHdfsPath)


        testData = pipeline.transform(test)
        print("Test Dataset Count: " + str(testData.count()))

        ########################################################## 
        ################## Train/load the model ################## 
        ########################################################## 

        #lrModel = LogisticRegressionModel.load("/home/halil/gitlab/acm/pyspark/acm-text-classification-rest/lr.model.savepoint")
        lrModel = LogisticRegressionModel.load(self.modelHdfsPath)

        predictions = lrModel.transform(testData)

        predictions.filter(predictions['prediction'] == 7)  \
            .select("Descript","Category","probability","label","prediction") \
            .orderBy("probability", ascending=False) \
            .show(n = 10, truncate = 30)

        #.select("probability","label","prediction") \
        resultJson = predictions.filter(predictions['prediction'] == 7)  \
            .select("prediction") \
            .orderBy("probability", ascending=False) \
            .toJSON().collect()
        self.sc.stop()

        return ["al sana ML!", resultJson]
Example #20
def predict_prob(lrModelPath, test_data):
    lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)
    result = predictions.select(['_c0', '_c1', '_c2', 'probability'])
    print('*************** result **************')
    print(result.show(5))
    # result.write.csv('file:///opt/int_group/result123')

    vs = VectorSlicer(inputCol="probability", outputCol="prob_1", indices=[1])
    prob_1 = vs.transform(result)
    print('*************** prob_1 **************')
    print(prob_1.show(5))
    result_prob1 = prob_1.select(['_c0', '_c1', '_c2', 'prob_1'])
    print('*************** result_prob1 **************')
    print(result_prob1.show(5))
    # for i in range(800, 802):
    #     g = i / 1000
    #     h = g + 0.001
    #     sqlTrans = SQLTransformer(statement="SELECT _c0, _c1, _c2, prob_1[0] AS prob FROM __THIS__ WHERE prob_1[0] < h  AND prob_1[0] >= g")
    #     dd = sqlTrans.transform(result_prob1)
    #     dd.write.csv('file:///opt/int_group/sql_test')

    new_result_prob1 = result_prob1.select([
        '_c0', '_c1', '_c2',
        result_prob1['prob_1'].cast('string').alias('prob_1_str')
    ])
    print('*************** new_result_prob1 **************')
    print(new_result_prob1.show(5))
    print(new_result_prob1)

    dd = new_result_prob1.head(1000)
    dd_df = spark.createDataFrame(dd)
    dd_df.write.csv('file:///opt/int_group/head_1kw_test')
    # for i in [1,2,3,4,5]:
    #     dd = new_result_prob1.head(i)
    #     dd_df = spark.createDataFrame(dd)
    #     dd_df.write.csv('file:///opt/int_group/head_test', mode='append')

    # DataFrame[_c0: string, _c1: string, _c2: string, prob_1_str: string]

    ###
    '''
    Error:
    Exception: Python in worker has different version 2.7 than that in driver 3.6, 
    PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and 
    PYSPARK_DRIVER_PYTHON are correctly set.
    '''
    # new_result_prob1.toPandas().to_csv('file:///opt/int_group/result.csv')

    # new_result_prob1.toPandas().to_csv('hdfs://bcg/opt/int_group/result/result.csv')

Example #21
def lr_second_evaluation(spark_session, lr_model_path, df, predict_condition,
                         condition):
    """
    lr二分类评估
    :param spark_session:
    :param lr_model_path: 模型地址
    :param df: 预测数据
    :param predict_condition: 预测算子(父算子)配置
    :param condition: 该算子配置 {"label":"标签"}
    :return:
    """

    feature_indexs = predict_condition['features']
    label_index = condition['label']

    # 1. prepare the data
    def func(x):
        features_data = []
        for feature in feature_indexs:
            features_data.append(x[feature])
        return Row(label=x[label_index], features=Vectors.dense(features_data))

    predict_data = df.rdd.map(lambda x: func(x)).toDF()

    # 2. load the model
    print("*****lr_model_path:", lr_model_path)
    lr_model = LogisticRegressionModel.load(lr_model_path)

    # compute evaluation metrics
    result = lr_model.transform(predict_data)
    print(result.prediction)
    lrTotalCorrect = result.rdd.map(lambda r: 1 if (r.prediction == r.label)
                                    else 0).reduce(lambda x, y: x + y)

    lrAccuracy = lrTotalCorrect / float(
        predict_data.count())  # 0.5136044023234485
    # clearing the default threshold would output the raw prediction scores (i.e. results with confidence values)
    lrPredictionAndLabels = result.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))
    lrmetrics = BinaryClassificationMetrics(lrPredictionAndLabels)

    print("Area under PR = %s" % lrmetrics.areaUnderPR)
    print("Area under ROC = %s" % lrmetrics.areaUnderROC)

    # return the results
    result = [("correct count", float(lrTotalCorrect)), ("accuracy", float(lrAccuracy)),
              ("Area under PR", float(lrmetrics.areaUnderPR)),
              ("Area under ROC", float(lrmetrics.areaUnderROC))]
    return spark_session.createDataFrame(result, schema=['metric', 'value'])
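A shorter sketch of the same evaluation through the DataFrame API; it assumes the transformed result still carries the standard 'rawPrediction' and 'label' columns.

from pyspark.ml.evaluation import BinaryClassificationEvaluator

def binary_auc(result_df):
    # area under ROC computed directly on the prediction DataFrame
    evaluator = BinaryClassificationEvaluator(
        labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
    return evaluator.evaluate(result_df)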
Example #22
def LinearEvaluation(data):
    path = 'modelo_LogisticRegression/modelLogisticRegression'
    lrModel = LogisticRegressionModel.load(path)
    #print(lrModel.coefficientMatrix)
    #predictions=lrModel.transform(data)
    predictions = lrModel.transform(data)  # VERDADERO (true) = 0, FALSO (false) = 1
    prediccion = predictions.select(
        'prediction', 'probability').rdd.flatMap(lambda x: x).collect()
    print(prediccion[0])
    if prediccion[0] == 1.0:
        prediccionLabel = 'FALSO'
    else:
        prediccionLabel = 'VERDADERO'

    return prediccionLabel, prediccion[1][0] * 100
Example #23
def testResult(lrModelPath, test_data, threshold):
    lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)
    label = predictions.select('label').collect()
    label_list = [label[i][0] for i in range(0, len(label))]
    probability = predictions.select('probability').collect()
    prob_list = [probability[i][0][1] for i in range(0, len(probability))]  # take the probability of class 1 here

    # tag
    flag = []
    for prob in prob_list:
        if prob >= threshold:
            flag.append(float(1))
        else:
            flag.append(float(0))

    # evaluation
    acc = 0
    for j in range(0, len(label_list)):
        if label_list[j] == flag[j]:
            acc += 1
    accuracy = acc / len(label_list)
    print('-------accuracy--------: {}'.format(accuracy))

    tp, fn, tn, fp = 0, 0, 0, 0
    length = len(label_list)
    for i in range(0, length):
        if label_list[i] == 0.0 and flag[i] == 0.0:
            tn += 1
        if label_list[i] == 1.0 and flag[i] == 1.0:
            tp += 1
        if label_list[i] == 1.0 and flag[i] == 0.0:
            fn += 1
        if label_list[i] == 0.0 and flag[i] == 1.0:
            fp += 1
    # precision
    total = tn + tp + fn + fp
    print('tn:', tn)
    print('tp:', tp)
    print('fn', fn)
    print('fp:', fp)
    print('total:', total)
    precision = tp / (tp + fp)
    print('-------precision--------: {}'.format(precision))
    # recall
    recall = tp / (tp + fn)
    print('-------recall--------: {}'.format(recall))
    f1_score = 2 * ((precision * recall) / (precision + recall))
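A distributed sketch of the same thresholded evaluation that avoids collecting every row to the driver; it assumes Spark 3.0+ for vector_to_array and the usual 'label' and 'probability' columns.

from pyspark.ml.functions import vector_to_array
from pyspark.sql import functions as F

def threshold_metrics(predictions, threshold):
    # flag each row as positive if P(class = 1) reaches the threshold
    scored = predictions.withColumn(
        "flag", (vector_to_array(F.col("probability")).getItem(1) >= threshold).cast("double"))
    tp = scored.filter((F.col("label") == 1.0) & (F.col("flag") == 1.0)).count()
    fp = scored.filter((F.col("label") == 0.0) & (F.col("flag") == 1.0)).count()
    fn = scored.filter((F.col("label") == 1.0) & (F.col("flag") == 0.0)).count()
    tn = scored.filter((F.col("label") == 0.0) & (F.col("flag") == 0.0)).count()
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    return accuracy, precision, recall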
Example #24
    def classify(self, inputJson):
        sc = SparkContext()
        sqlContext = SQLContext(sc)

        schema = StructType([
            StructField('Category', StringType(), True),
            StructField('Descript', StringType(), True),
            StructField('Dates', StringType(), True),
            StructField('DayOfWeek', StringType(), True),
            StructField('PdDistrict', StringType(), True),
            StructField('Resolution', StringType(), True),
            StructField('Address', StringType(), True),
            StructField('X', DoubleType(), True),
            StructField('Y', DoubleType(), True)
        ])
        test = sqlContext.createDataFrame(inputJson, schema)

        pipeline = PipelineModel.load(
            "/home/halil/gitlab/acm/pyspark/acm-text-classification-rest/lr.model.pipeline.savepoint"
        )

        testData = pipeline.transform(test)
        print("Test Dataset Count: " + str(testData.count()))

        ##########################################################
        ################## Train/load the model ##################
        ##########################################################

        lrModel = LogisticRegressionModel.load(
            "/home/halil/gitlab/acm/pyspark/acm-text-classification-rest/lr.model.savepoint"
        )

        predictions = lrModel.transform(testData)

        predictions.filter(predictions['prediction'] == 7)  \
            .select("Descript","Category","probability","label","prediction") \
            .orderBy("probability", ascending=False) \
            .show(n = 10, truncate = 30)

        #.select("probability","label","prediction") \
        resultJson = predictions.filter(predictions['prediction'] == 7)  \
            .select("prediction") \
            .orderBy("probability", ascending=False) \
            .toJSON().collect()

        return ["al sana ML!", resultJson]
Example #25
def hello():
    form = ReusableForm(request.form)

    print(form.errors)
    if request.method == 'POST':
        name = request.form['name']
        print(name)

        if form.validate():
            # Save the comment here.
            sc = SparkContext()
            sc.setLogLevel("ERROR")

            app = Flask(__name__)
            #Schema of the trained data
            schema = StructType([
                StructField("_c0", StringType()),
                StructField("_c1", StringType())
            ])
            #Schema for the input features
            predict_schema = StructType([StructField("_c1", StringType())])

            #Load the Pipeline and the Classification Model
            pipelineModel = PipelineModel.load("pipeline_Model")
            lfModel = LogisticRegressionModel.load("lr_Model")

            spark = SparkSession.builder.getOrCreate()
            input_features = [[(name)]]
            #Making predictions from the model
            predict_df = spark.createDataFrame(data=input_features,
                                               schema=predict_schema)
            transformed_pred_df = pipelineModel.transform(predict_df)
            predictions = lfModel.transform(transformed_pred_df)
            probs = predictions.select('probability').take(1)[0][0]

            n_predictions = len(probs)
            labels = pipelineModel.stages[-1].labels
            result_dict = {labels[i]: probs[i] for i in range(n_predictions)}
            #results = jsonify(result_dict)
            #displaying the predictions
            flash(result_dict)

        else:
            flash('All the form fields are required. ')

    return render_template('hello.html', form=form)
Example #26
    def test_save_load_trained_model(self):
        # This tests saving and loading the trained model only.
        # Save/load for CrossValidator will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        cv = CrossValidator(
            estimator=lr,
            estimatorParamMaps=grid,
            evaluator=evaluator,
            collectSubModels=True,
            numFolds=4,
            seed=42
        )
        cvModel = cv.fit(dataset)
        lrModel = cvModel.bestModel

        lrModelPath = temp_path + "/lrModel"
        lrModel.save(lrModelPath)
        loadedLrModel = LogisticRegressionModel.load(lrModelPath)
        self.assertEqual(loadedLrModel.uid, lrModel.uid)
        self.assertEqual(loadedLrModel.intercept, lrModel.intercept)

        # SPARK-32092: Saving and then loading CrossValidatorModel should not change the params
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedCvModel = CrossValidatorModel.load(cvModelPath)
        for param in [
            lambda x: x.getNumFolds(),
            lambda x: x.getSeed(),
            lambda x: len(x.subModels)
        ]:
            self.assertEqual(param(cvModel), param(loadedCvModel))

        self.assertTrue(all(
            loadedCvModel.isSet(param) for param in loadedCvModel.params
        ))
Example #27
def payload(json):
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    sc = SparkContext.getOrCreate()
    modelka = mdl.load('./models/amounts.model')
    va = VectorAssembler(
        inputCols=['user', 'special', 'amount', 'percent', 'term'],
        outputCol="features")
    df = spark.read.json(sc.parallelize([json]))
    test = va.transform(df)
    pred = modelka.transform(test)
    approved = pred.take(1)[0][-1]
    spark.stop()
    sc.stop()
    return approved
Example #28
def predict():

    content = request.get_json(force=True)

    f1 = content["feature1"]
    f2 = content["feature2"]
    f3 = content["feature3"]
    f4 = content["feature4"]

    ####### Initializing a Spark Session #######
    spark = SparkSession.builder.appName('abc').getOrCreate()

    pipelineModel = LogisticRegressionModel.load("model")

    data = spark.createDataFrame(
        [(f1, f2, f3, f4)],
        ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"])

    feature_cols = [
        "SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"
    ]

    for col in feature_cols:
        data = data.withColumn(col, data[col].cast(DoubleType()))

    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    test = assembler.transform(data)

    ####### Getting prediction value #######

    prediction = pipelineModel.transform(test)

    model_prediction = prediction.select('prediction').collect()[0][0]

    if model_prediction == 0.0:
        result = "Iris-setosa"
    elif model_prediction == 1.0:
        result = "Iris-versicolor"
    elif model_prediction == 2.0:
        result = "Iris-vifginica"

    print("Result: {}".format(result), file=sys.stderr)

    return 'OK'
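A hypothetical client call for the endpoint above; the route '/predict', host and port are assumptions, since the Flask routing is not shown in the snippet.

import requests

payload = {"feature1": 5.1, "feature2": 3.5, "feature3": 1.4, "feature4": 0.2}
response = requests.post("http://localhost:5000/predict", json=payload)
print(response.text)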
Example #29
    def test_save_load_trained_model(self):
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(
            estimator=lr,
            estimatorParamMaps=grid,
            evaluator=evaluator,
            collectSubModels=True,
            seed=42
        )
        tvsModel = tvs.fit(dataset)
        lrModel = tvsModel.bestModel

        lrModelPath = temp_path + "/lrModel"
        lrModel.save(lrModelPath)
        loadedLrModel = LogisticRegressionModel.load(lrModelPath)
        self.assertEqual(loadedLrModel.uid, lrModel.uid)
        self.assertEqual(loadedLrModel.intercept, lrModel.intercept)

        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedTvsModel = TrainValidationSplitModel.load(tvsModelPath)
        for param in [
            lambda x: x.getSeed(),
            lambda x: x.getTrainRatio(),
        ]:
            self.assertEqual(param(tvsModel), param(loadedTvsModel))

        self.assertTrue(all(
            loadedTvsModel.isSet(param) for param in loadedTvsModel.params
        ))
Example #30
def get_model(s3_name):
    model_name = s3_name + ".model.zip"
    print(model_name)
    get_file_from_bucket('models-dpa', model_name, 'aux.zip')

    with zipfile.ZipFile("aux.zip", 'r') as zip_ref:
        zip_ref.extractall("model")
    os.remove("aux.zip")

    spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

    model_path = "model/" + s3_name + ".model"
    model = LogisticRegressionModel.load(model_path)
    print(model)

    shutil.rmtree("model", ignore_errors=True)
    return model
Example #31
    def test_save_load_trained_model(self):
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        lrModel = tvsModel.bestModel

        lrModelPath = temp_path + "/lrModel"
        lrModel.save(lrModelPath)
        loadedLrModel = LogisticRegressionModel.load(lrModelPath)
        self.assertEqual(loadedLrModel.uid, lrModel.uid)
        self.assertEqual(loadedLrModel.intercept, lrModel.intercept)
Example #32
  def __init__(self, model_name, model_base_path):
    """
    Initialize the service.
        
    Args:
      model_name: The name of the model.
      model_base_path: The file path of the model.
    Return:
      None
    """

    super(SparkInferenceService, self).__init__()

    # TODO: Download the model files
    #local_model_base_path = filesystem_util.download_hdfs_moels(
    #    model_base_path)

    self.model_name = model_name
    self.model_base_path = model_base_path
    self.model_version_list = [1]
    self.model_graph_signature = ""
    self.platform = "Spark"

    self.preprocess_function, self.postprocess_function = preprocess_util.get_preprocess_postprocess_function_from_model_path(
        self.model_base_path)

    # Load model
    from pyspark.sql import SparkSession
    from pyspark.ml.classification import LogisticRegressionModel

    self.spark_session = SparkSession.builder.appName("libsvm_lr").getOrCreate()
    # TODO: Support other model
    self.spark_model = LogisticRegressionModel.load(self.model_base_path)

    # TODO: Add signature for Spark model
    self.model_graph_signature = "No signature for Spark MLlib models"
Example #33
# model-building algorithm (logistic regression estimator)
lr = LogisticRegression(maxIter=10, regParam=0.01, labelCol="gender")

# build the model
model = lr.fit(assembled_training)

# generate predictions
model.transform(assembled_training).show()

# pipeline
pipeline = Pipeline(stages=[assembler, lr])

# build the pipeline model
pipelineModel = pipeline.fit(training)

# generate predictions with the pipeline model
pipelineModel.transform(training).show()

path1 = "/Users/beginspark/Temp/regression-model"
path2 = "/Users/beginspark/Temp/pipelinemodel"

# save the models
model.write().overwrite().save(path1)
pipelineModel.write().overwrite().save(path2)

# load the saved models
loadedModel = LogisticRegressionModel.load(path1)
loadedPipelineModel = PipelineModel.load(path2)

spark.stop()
Example #34
#!/usr/bin/env python

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml.linalg import SparseVector

spark = SparkSession.builder.appName("libsvm_lr").getOrCreate()

# Load model
model_path = "./lr_model/"
lrModel = LogisticRegressionModel.load(model_path)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# Construct data
#testset = spark.read.format("libsvm").load("./sample_libsvm_data.txt")
testset = spark.createDataFrame(
    [(1.0, SparseVector(692, [128, 129, 130], [51, 159, 20]))],
    ['label', 'features'])

# Make inference
result = lrModel.transform(testset)
result = result.first()
print("Prediction: {}, probability_of_0: {}, probability_of_1: {}".format(
    result.label, result.probability[0], result.probability[1]))