Example #1
def EvaluateModel(model, validationData):
    # The Python DecisionTreeModel currently cannot be used inside an RDD map the way Scala can, i.e.
    # "validationData.map(lambda point: (model.predict(point.features), point.label))" does not work here,
    # so zip() is used instead (a minimal standalone sketch of this pattern follows the function).
    # scoresAndLabels = validationData.map(lambda point: (model.predict(point.features), point.label))
    # metrics = BinaryClassificationMetrics(scoresAndLabels)
    if (model.__class__.__name__ == "DecisionTreeModel"):
        predictedLabel = model.predict(
            validationData.map(lambda line: line.features))
        scoresAndLabels = validationData.map(lambda line: line.label).zip(
            predictedLabel)
        areaUnderROC = float(
            BinaryClassificationMetrics(scoresAndLabels).areaUnderROC)
        matchedNum = scoresAndLabels.filter(
            lambda pair: pair[0] == pair[1]).count()
        accRate = float(matchedNum) / validationData.count()
    else:
        scoresAndLabels = validationData.map(
            lambda line: (model.predict(line.features), line.label)).collect()
        scoresAndLabels = [[float(i), j] for i, j in scoresAndLabels]
        rdd_scoresAndLabels = globalVal.sc.parallelize(scoresAndLabels)
        areaUnderROC = float(
            BinaryClassificationMetrics(rdd_scoresAndLabels).areaUnderROC)
        matchedNum = rdd_scoresAndLabels.filter(
            lambda pair: pair[0] == pair[1]).count()
        accRate = float(matchedNum) / validationData.count()
        # matchedNum = validationData.map(lambda line: 1 if (model.predict(line.features) == line.label) else 0 ).sum()
        # accRate = float(matchedNum) / validationData.count()
    return areaUnderROC, accRate
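The zip-based workaround above is the usual pattern for MLlib tree models, whose predict method cannot be invoked inside a transformation over the same RDD. A minimal standalone sketch of the pattern, assuming an existing pyspark.mllib DecisionTreeModel and a LabeledPoint RDD (tree_model and validation_rdd are hypothetical names):

from pyspark.mllib.evaluation import BinaryClassificationMetrics

# predict on the whole features RDD at once, then zip the scores back onto the labels
predictions = tree_model.predict(validation_rdd.map(lambda p: p.features))
score_and_labels = predictions.map(float).zip(
    validation_rdd.map(lambda p: p.label))
print(BinaryClassificationMetrics(score_and_labels).areaUnderROC)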
Example #2
    def gen_lr_sort_model_metrics(test_df):
        from pyspark.ml.classification import LogisticRegressionModel
        logistic_regression_model = LogisticRegressionModel.load(
            "hdfs://192.168.0.1:9000/user/models/logistic_regression/lr.model")
        lr_result = logistic_regression_model.evaluate(test_df).predictions
        lr_result.show()
 
        def vector_to_double(row):
            return float(row.click_flag), float(row.probability[1])
        score_labels = lr_result.select(["click_flag", "probability"]).rdd.map(vector_to_double)
        score_labels.collect()
 
        from pyspark.mllib.evaluation import BinaryClassificationMetrics
        binary_classification_metrics = BinaryClassificationMetrics(scoreAndLabels=score_labels)
        area_under_roc = binary_classification_metrics.areaUnderROC
        print(area_under_roc)
 
        tp = lr_result[(lr_result.click_flag == 1) & (lr_result.prediction == 1)].count()
        tn = lr_result[(lr_result.click_flag == 0) & (lr_result.prediction == 0)].count()
        fp = lr_result[(lr_result.click_flag == 0) & (lr_result.prediction == 1)].count()
        fn = lr_result[(lr_result.click_flag == 1) & (lr_result.prediction == 0)].count()
        print("tp {} tn {} fp {} fn {}".format(tp, tn, fp, fn))
        print('accuracy is : %f' % (float(tp + tn) / (tp + tn + fp + fn)))
        print('recall is : %f' % (float(tp) / (tp + fn)))
        print('precision is : %f' % (float(tp) / (tp + fp)))
Example #3
def print_performance_metrics(predictions):
    # Evaluate model
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    auc = evaluator.evaluate(predictions,
                             {evaluator.metricName: "areaUnderROC"})
    aupr = evaluator.evaluate(predictions,
                              {evaluator.metricName: "areaUnderPR"})
    print("auc = {}".format(auc))
    print("aupr = {}".format(aupr))

    # Get RDD of predictions and labels for eval metrics
    predictionAndLabels = predictions.select("prediction", "label").rdd

    # Instantiate metrics objects
    binary_metrics = BinaryClassificationMetrics(predictionAndLabels)
    multi_metrics = MulticlassMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = {}".format(binary_metrics.areaUnderPR))
    # Area under ROC curve
    print("Area under ROC = {}".format(binary_metrics.areaUnderROC))
    # Accuracy
    print("Accuracy = {}".format(multi_metrics.accuracy))
    # Confusion Matrix
    print(multi_metrics.confusionMatrix())
    # F1
    print("F1 = {}".format(multi_metrics.fMeasure(1.0)))
    # Precision
    print("Precision = {}".format(multi_metrics.precision(1.0)))
    # Recall
    print("Recall = {}".format(multi_metrics.recall(1.0)))
    # FPR
    print("FPR = {}".format(multi_metrics.falsePositiveRate(1.0)))
    # TPR
    print("TPR = {}".format(multi_metrics.truePositiveRate(1.0)))
Example #4
def random_forest(training_data, test_data, output_str):
    rf = RandomForestClassifier(labelCol="label", featuresCol="features")
    paramGrid = ParamGridBuilder() \
        .addGrid(rf.numTrees, [20, 50, 80]) \
        .addGrid(rf.maxDepth, [3, 5, 10, 15]) \
        .build()
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    crossval = CrossValidator(estimator=rf,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=5)

    rfmodel = crossval.fit(training_data)

    rfPredictions = rfmodel.transform(test_data)

    # Evaluate bestModel found from Cross Validation
    # (BinaryClassificationEvaluator's default metric is areaUnderROC, not accuracy)
    auc = evaluator.evaluate(rfPredictions)
    output_str = output_str + "Random Forest areaUnderROC is: " + str(
        auc) + "\n"

    predictionandLabels = rfPredictions.withColumn(
        'label1',
        rfPredictions["label"].cast("double")).select("prediction",
                                                      "label1").rdd
    metrics = BinaryClassificationMetrics(predictionandLabels)

    auroc = metrics.areaUnderROC
    aupr = metrics.areaUnderPR
    output_str = output_str + "RF Area under ROC Curve: " + str(auroc) + "\n"
    output_str = output_str + "RF Area under PR Curve: " + str(aupr) + "\n"

    return output_str
Example #5
def naive_bayes(training_data, test_data, output_str):
    nb = NaiveBayes(modelType="multinomial")
    paramGrid = ParamGridBuilder().addGrid(nb.smoothing,
                                           [0.01, 0.1, 1.0, 10, 100]).build()
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    cv = CrossValidator(estimator=nb,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=5)
    cvModel = cv.fit(training_data)
    cvPredictions = cvModel.transform(test_data)

    # Evaluate bestModel found from Cross Validation
    # (BinaryClassificationEvaluator's default metric is areaUnderROC, not accuracy)
    auc = evaluator.evaluate(cvPredictions)
    output_str = output_str + "Naive Bayes areaUnderROC is: " + str(
        auc) + "\n"

    predictionandLabels = cvPredictions.withColumn(
        'label1',
        cvPredictions["label"].cast("double")).select("prediction",
                                                      "label1").rdd
    metrics = BinaryClassificationMetrics(predictionandLabels)

    auroc = metrics.areaUnderROC
    aupr = metrics.areaUnderPR
    output_str = output_str + "NB Area under ROC Curve: " + str(auroc) + "\n"
    output_str = output_str + "NB Area under PR Curve: " + str(aupr) + "\n"

    return output_str
Example #6
def test_confusion_matrix(sdf):
    assem = VectorAssembler(inputCols=['Fare', 'Pclass', 'Age'],
                            outputCol='features')
    rf = RandomForestClassifier(featuresCol='features',
                                labelCol='Survived',
                                numTrees=20)
    pipeline = Pipeline(stages=[assem, rf])
    model = pipeline.fit(sdf.fillna(0.0))
    predictions = model.transform(sdf.fillna(0.0)).select(
        'probability', 'Survived')
    bcm = BinaryClassificationMetrics(predictions,
                                      scoreCol='probability',
                                      labelCol='Survived')

    predictions = predictions.toHandy().to_metrics_RDD('probability',
                                                       'Survived')
    predictions = np.array(predictions.collect())

    scm = bcm.confusionMatrix().toArray()
    pcm = confusion_matrix(predictions[:, 1], predictions[:, 0] > .5)
    npt.assert_array_almost_equal(scm, pcm)

    scm = bcm.confusionMatrix(.3).toArray()
    pcm = confusion_matrix(predictions[:, 1], predictions[:, 0] > .3)
    npt.assert_array_almost_equal(scm, pcm)
Example #7
def plot_roc(model, test_data, name, label_col):

    transformed = model.transform(test_data)
    results = transformed.select(["probability", label_col])
    results_collect = results.collect()
    results_list = [(float(i[0][1]), float(i[1])) for i in results_collect]
    score_and_labels = sc.parallelize(results_list)

    metrics = BinaryClassificationMetrics(score_and_labels)
    print("The ROC score for " + name + " is : ", metrics.areaUnderROC)

    y_test = [i[1] for i in results_list]
    y_score = [i[0] for i in results_list]

    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    for fp, tp, thresh in zip(fpr, tpr, thresholds):
        print("fpr: ", fp, " tpr: ", tp, " threshold: ", thresh)

    plt.clf()
    plt.figure()
    plt.plot(fpr, tpr, label="ROC curve (area = %0.2f)" % roc_auc)
    plt.plot([0, 1], [0, 1], "k--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic for " + name)
    plt.legend(loc="lower right")
    if not os.path.isdir(os.path.join(script_dir, png_dir)):
        os.makedirs(os.path.join(script_dir, png_dir))
    plt.savefig(
        os.path.join(script_dir, png_dir + name.replace(" ", "") + ".png"))
Example #8
def model(classifiers, training, testing, week):

    results = ""
    timing = []

    for classifier in classifiers:

        timeStart = time.time()

        clf = get_classifier(classifier)

        labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
        featureIndexer = VectorIndexer(inputCol="features",
                                       outputCol="indexedFeatures")

        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
        model = pipeline.fit(training)

        prediction = model.transform(testing)

        # BinaryClassificationMetrics expects (score, label) pairs
        metrics = BinaryClassificationMetrics(
            prediction.select("prediction", "label").rdd)

        results = results + "new," + classifier + "," + week + "," + str(
            metrics.areaUnderROC) + "," + str(metrics.areaUnderPR) + "\n"

        timing.append(time.time() - timeStart)

    return results, timing
Example #9
def evaluateModel(model, validationData):
    score = model.predict(validationData.map(lambda p: p.features))
    score = score.map(lambda p: float(p))
    scoreAndLabels = score.zip(validationData.map(lambda p: p.label))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    AUC = metrics.areaUnderROC
    return AUC
Example #10
def Print_class_info(xy_predict):
    '''
    Print classification-quality metrics.
    xy_predict: DataFrame of model predictions
    '''
    predict_and_target_rdd = xy_predict.rdd.map(
        lambda row: (float(row.prediction), float(row.label)))
    metrics = BinaryClassificationMetrics(predict_and_target_rdd)

    correct_amount = xy_predict.filter(
        xy_predict['label'] == xy_predict['prediction']).count()
    total_amount = xy_predict.count()
    accuracy_rate = float(correct_amount) / total_amount
    positive_precision_amount = xy_predict.filter(
        xy_predict['label'] == 1).filter(
            xy_predict['prediction'] == 1).count()
    positive_amount = xy_predict.filter(xy_predict['label'] == 1).count()
    predict_amount = xy_predict.filter(xy_predict['prediction'] == 1).count()

    recall_rate = float(positive_precision_amount) / positive_amount
    precision_rate = float(positive_precision_amount) / predict_amount

    print('----------------------------------------------')
    print("Precision score: %s" % precision_rate)
    print("Recall score: %s" % recall_rate)
    print("Accuracy score: %s" % accuracy_rate)
    print("Area under PR: %s" % metrics.areaUnderPR)
    print("Area under ROC: %s" % metrics.areaUnderROC)
    print('----------------------------------------------')
Example #11
def predict():

	testData = MLUtils.loadLibSVMFile(sc,INPUT_DATA_PATH)
	print("[INFO] load complete.")

	model = RandomForestModel.load(sc,TEST_MODEL_PATH)

	# Evaluate model on test instances and compute test error
	predictions = model.predict(testData.map(lambda x: x.features))

	lst = predictions.collect()
	with open(TEST_PREDICT_PATH+"/"+time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())+".txt",'w') as f:
		for k in lst:
			f.write(str(k)+"\n")

	labelsAndPredictions = testData.map(lambda lp: tobin(lp.label)).zip(predictions.map(lambda lp: tobin(lp)))

	#print(labelsAndPredictions.collect())

	metrics = BinaryClassificationMetrics(labelsAndPredictions)

	# Area under precision-recall curve
	print("Area under PR = %s" % metrics.areaUnderPR)

	# Area under ROC curve
	print("Area under ROC = %s" % metrics.areaUnderROC)
	#print(labelsAndPredictions.collect())

	testErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() / float(testData.count())
	print('[INFO] Test Error = ' + str(testErr))
Example #12
def svmClassification(trainSetFile,testSetFile):

    data1 = sc.textFile(directory_supervised + trainSetFile)
    trainData = data1.map(parsePoint)
    data2 = sc.textFile(directory_supervised + testSetFile)
    testData = data2.map(parsePoint)

    # Build the model
    model = SVMWithSGD.train(trainData, iterations=10)

    # Evaluating the model on training data
    '''labelsAndPreds = trainData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(trainData.count())
    print("Training Error = " + str(trainErr))
    labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
    testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
    print("Test Error = " + str(testErr))
    return testErr'''
    #labelsAndPreds = testData.map(lambda p: (p.label, float(model.predict(p.features))))
    #truePos = labelsAndPreds.filter(lambda p: p[0] == p[1]).count()
    #print("True pos : " + str(truePos))
    #metrics1 = MulticlassMetrics(labelsAndPreds)
    #print("Recall : " + str(metrics1.recall()))
    #print("Precision : " + str(metrics1.precision()))
    #print(metrics1.confusionMatrix())

    model.clearThreshold()
    scoreAndLabels = testData.map(lambda p: (float(model.predict(p.features)), p.label))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    return metrics.areaUnderROC
Example #13
def score(model, test_data):
    predictions = model.predict(test_data.map(lambda x: x.features))
    labels = test_data.map(lambda x: x.label)
    preds_and_labels = predictions.zip(labels)
    metrics = BinaryClassificationMetrics(preds_and_labels)

    return (metrics.areaUnderPR, metrics.areaUnderROC)
Example #14
def predict_SVMWithSGD(numIterations, step, regParam, regType):
    """
    SVMWithSGD.train(data,iterations=100, step=1.0, regParam=0.01, miniBatchFraction=1.0, initialWeights=None, regType='l2',intercept=False, validateData=True,convergenceTol=0.001)
    data: the training data, an RDD of LabeledPoint
    iterations: the number of iterations, default 100
    step: the step parameter used in SGD, default 1.0
    regParam: the regularizer parameter, default 0.01
    miniBatchFraction: fraction of data to be used for each SGD iteration, default 1.0
    initialWeights: the initial weights, default None
    regType: the type of regularizer used for training our model, allowed values ('l1':for using L1 regularization; 'l2':for using L2 regularization, default; None: for no regularization)
    intercept: boolean parameter which indicates the use or not of the augmented representation for training data (i.e. whether bias feature are activated or not, default False)
    validateData: boolean parameter which indicates if the algorithm should validate data before training, default True
    convergenceTol: a condition which decides iteration termination, default 0.001
    """
    svmModel = SVMWithSGD.train(scaledData,
                                iterations=numIterations,
                                step=step,
                                regParam=regParam,
                                regType=regType)
    svmMetrics = scaledData.map(lambda p:
                                (float(svmModel.predict(p.features)), p.label))
    svmAccuracy = svmMetrics.filter(
        lambda pl: pl[0] == pl[1]).count() * 1.0 / scaledData.count()
    metrics = BinaryClassificationMetrics(svmMetrics)
    #print "SVMWithSGD model accuracy is: %f in %d iterations,step:%f;regParam:%f;regType:%s" % (svmAccuracy, numIterations,step,regParam,regType)
    return svmAccuracy
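The docstring above documents the SVMWithSGD.train parameters; the helper itself reads the pre-scaled training set from a module-level scaledData RDD. A minimal sketch of how the helper might be swept over a small hyperparameter grid (the parameter values are hypothetical, and scaledData is assumed to be defined as in the original):

# try a few regularization settings and print the training accuracy of each
for reg_param in [0.001, 0.01, 0.1]:
    for reg_type in ["l1", "l2"]:
        acc = predict_SVMWithSGD(numIterations=100, step=1.0,
                                 regParam=reg_param, regType=reg_type)
        print("regParam=%s regType=%s accuracy=%s" % (reg_param, reg_type, acc))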
Example #15
def predictData(sc, model):
    #----------------------1. Load and transform the data-------------
    print("Loading data...")
    rawDataWithHeader = sc.textFile("s3n://bigdata17demo/train.csv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    lines = rawData.map(lambda x: x.split(","))
    print("共計:" + str(lines.count()) + "筆")
    #----------------------2.建立訓練評估所需資料 LabeledPoint RDD-------------
    labelpointRDD = lines.map(
        lambda r: LabeledPoint(extract_label(r), extract_features(r)))
    print(labelpointRDD.first())
    testData = labelpointRDD
    print("testData:" + str(testData.count()))
    labelsAndPreds = testData.map(lambda p:
                                  (p.label, float(model.predict(p.features))))
    metrics = BinaryClassificationMetrics(labelsAndPreds)
    print("Area under PR = %s" % metrics.areaUnderPR)
    print("Area under ROC = %s" % metrics.areaUnderROC)
    testErr = labelsAndPreds.filter(
        lambda seq: seq[0] != seq[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    #----------------------4. Predict and write out the results--------------

    # write the predictions to a file
    f = open('workfile', 'w')
    for lp in labelpointRDD.take(499999):
        predict = int(model.predict(lp.features))
        dataDesc = "  " + str(predict) + " "
        f.write(dataDesc)
    f.close()
Example #16
def main():
    start = time.time()
    conf = SparkConf().setMaster("local").setAppName("income")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    income_df = load(sqlContext, csv_path=CSV_PATH)
    # income_df.show()
    # print(income_df.dtypes)
    # print(income_df.count())

    features_df = preprocess(data_frame=income_df)

    # train, test split
    train_df, test_df = features_df.randomSplit([7.0, 3.0], 100)

    # logistic regression

    income_lr = LogisticRegression(featuresCol="features",
                                   labelCol="income_index",
                                   regParam=0.0,
                                   elasticNetParam=0.0,
                                   maxIter=200)
    income_model = income_lr.fit(train_df)

    # modeling
    print("Training:")
    training_summary = income_model.summary
    training_FPR = training_summary.roc.select('FPR').collect()
    training_TPR = training_summary.roc.select('TPR').collect()
    plot_roc(training_FPR, training_TPR, "pic/training_roc.jpg")

    training_recall = training_summary.pr.select('recall').collect()
    training_precision = training_summary.pr.select('precision').collect()
    # Area under ROC curve
    print("Training Area under ROC = %s" % training_summary.areaUnderROC)
    # accuracy
    print("Training Accuracy = %s" % training_summary.accuracy)
    plot_pr(training_recall, training_precision, "pic/training_pr.jpg")

    # evaluation
    print()
    print("Evaluation:")
    pred_df = income_model.transform(test_df).select("prediction",
                                                     "income_index")
    raw_pred_df = income_model.transform(test_df).select(
        "probability",
        "income_index").rdd.map(lambda l: (float(l[0][1]), l[1]))
    metrics = BinaryClassificationMetrics(raw_pred_df)
    # Area under ROC curve
    print("Testing Area under ROC = %s" % metrics.areaUnderROC)
    # accuracy
    metrics = MulticlassMetrics(pred_df.rdd)
    print("Testing Accuracy = %s" % metrics.accuracy)

    # confusion matrix
    print("Testing Confusion Matrix:")
    print(metrics.confusionMatrix().toArray())
    print("Total cost %fs" % (time.time() - start))
    print("Done!")
Example #17
def evaluateModel(model, validationData):
    score = model.predict(validationData.map(lambda p: p.features))
    print(score)
    scoreAndLabels = score.zip(validationData.map(lambda p: p.label)).map(
        lambda xy: (float(xy[0]), float(xy[1])))
    print(scoreAndLabels.take(1))
    metrics = BinaryClassificationMetrics(scoreAndLabels)

    return metrics.areaUnderROC
Example #18
def evaluate_model(model, validationData):
    from pyspark.mllib.evaluation import BinaryClassificationMetrics
    score = model.predict(validationData.map(lambda p: p.features))
    scoreAndLabels = score.zip(validationData.map(lambda p: p.label))
    scoreAndLabels.take(5)
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    print("auc: ", metrics.areaUnderROC)
    return metrics.areaUnderROC
Example #19
def EvaluateModel(model, validationData):
    score = model.predict(validationData.map(lambda p: p.features))
    # score is an RDD of ints here; convert to float
    score = score.map(lambda x: float(x))
    scoreAndLabels = score.zip(validationData.map(lambda p: p.label))
    metric = BinaryClassificationMetrics(scoreAndLabels)
    AUC = metric.areaUnderROC
    return (AUC)
Example #20
def main(sc):
    train_data = sc.textFile(
        "/data/scratch/vw/criteo-display-advertising-dataset/train.txt").map(
            parsePoint)
    model = LogisticRegressionWithSGD.train(train_data,
                                            iterations=1000,
                                            miniBatchFraction=0.0001,
                                            step=.001,
                                            regType="l2")

    valid_data = sc.textFile("input/valid_data.txt").map(parsePoint)
    labelsAndPreds = valid_data.map(
        lambda p: (float(model.predict(p.features)), p.label))
    Accuracy = labelsAndPreds.filter(
        lambda pl: pl[1] == pl[0]).count() / float(valid_data.count())
    FP = labelsAndPreds.filter(
        lambda pl: pl[1] == 0 and pl[0] == 1).count()
    N = float(labelsAndPreds.filter(lambda pl: pl[1] == 0).count())
    FPR = FP / N
    output = "Accuracy valid = " + str(Accuracy) + "\nFPR valid = " + str(FPR)
    print(output)
    metrics = BinaryClassificationMetrics(labelsAndPreds)
    output += "\nArea under ROC valid = " + str(metrics.areaUnderROC)

    print(output)

    test_data = sc.textFile(
        "/data/scratch/vw/criteo-display-advertising-dataset/test.txt").map(
            parsePoint)
    labelsAndPreds = test_data.map(lambda p:
                                   (float(model.predict(p.features)), p.label))
    Accuracy = labelsAndPreds.filter(
        lambda pl: pl[1] == pl[0]).count() / float(test_data.count())
    FP = labelsAndPreds.filter(
        lambda pl: pl[1] == 0 and pl[0] == 1).count()
    N = float(labelsAndPreds.filter(lambda pl: pl[1] == 0).count())
    FPR = FP / N
    output += "\nAccuracy test = " + str(Accuracy) + "\nFPR test = " + str(FPR)
    print(output)
    metrics = BinaryClassificationMetrics(labelsAndPreds)
    output += "\nArea under ROC test = " + str(metrics.areaUnderROC)

    print(output)

    output = sc.parallelize([output])
    output.saveAsTextFile("str")
Example #21
def evaluate_lrmodel():
    scoreAndLabels = data.map(
        lambda point: (float(lrModel.predict(point.features)), point.label))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    # area under precision-recall
    print("area under PR %f" % metrics.areaUnderPR)
    # area under ROC
    print("area under ROC %f" % metrics.areaUnderROC)
Example #22
 def printRddBinaryClassificationMetrics(self, predictions_and_labels):
     metrics = BinaryClassificationMetrics(predictions_and_labels)
     print "KAPPA=" + str(
         self.computeKappa(np.array(metrics.confusionMatrix().toArray())))
     print "BA=" + str(
         self.computeBA(np.array(metrics.confusionMatrix().toArray())))
     CMarray = metrics.confusionMatrix().toArray()
     #CMstring = ','.join(['%.5f' % num for num in CMarray])
     print "CM=" + str(CMarray)
Example #23
def get_model_eval_metrics(labeled_test_comments, model, model_name,
                           label_name):
    transformed_comments = model.transform(labeled_test_comments)
    predictionAndLabels = transformed_comments.rdd.map(\
        lambda lp: (float(lp.probability[1]), float(lp[label_name]))\
    )
    metrics = BinaryClassificationMetrics(predictionAndLabels)
    print("auPR for {}: {}".format(model_name, metrics.areaUnderPR))
    print("auROC for {}: {}".format(model_name, metrics.areaUnderROC))
Example #24
def evaluate_model(model, valid_data):
    # model.predict returns ints (0 or 1)
    score = model.predict(valid_data.map(lambda p: p.features))
    # convert the scores to float before zipping them with the labels
    score_and_labels = score.map(lambda x: float(x)).zip(
        valid_data.map(lambda p: p.label))
    metrics = BinaryClassificationMetrics(score_and_labels)
    AUC = metrics.areaUnderROC
    return AUC
Example #25
def printMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    metrics2 = BinaryClassificationMetrics(predictions_and_labels)
    print('Precision of True ', metrics.precision(1))
    print('Precision of False', metrics.precision(0))
    print('Recall of True    ', metrics.recall(1))
    print('Recall of False   ', metrics.recall(0))
    print('areaUnderROC      ', metrics2.areaUnderROC)
    print('areaUnderPR       ', metrics2.areaUnderPR)
Example #26
def get_auc_roc(classifier, training, test):
    model = classifier.fit(training)
    out = model.transform(test) \
        .select("prediction", "label") \
        .rdd.map(lambda x: (float(x[0]), float(x[1])))
    metrics = BinaryClassificationMetrics(out)
    print("Model: {1}. Area under ROC: {0:2f}".format(metrics.areaUnderROC,
                                                      clf.__class__))
    return model, out, metrics
Example #27
def main():
    # Reading from the hdfs, removing the header
    # read the titanic train, test csv here 
    trainTitanic = sc.textFile( srcDir + "titanic_train.csv")
    # remove the header 
    trainHeader = trainTitanic.first()
    trainTitanic = trainTitanic.filter(lambda line: line != trainHeader).mapPartitions(lambda x: csv.reader(x))
    trainTitanic.first()
     
     
    # Data Transformations and filter lines with empty strings
    trainTitanic=trainTitanic.map(lambda line: line[1:3]+sexTransformMapper(line[4])+line[5:11])
    trainTitanic=trainTitanic.filter(lambda line: line[3] != '' ).filter(lambda line: line[4] != '' )
    trainTitanic.take(10)
     
    # creating "labeled point" rdds specific to MLlib "(label (v1, v2...vp])"
    trainTitanicLP=trainTitanic.map(lambda line: LabeledPoint(line[0],[line[1:5]]))
    trainTitanicLP.first()
     
    # splitting dataset into train and test set
    # 70% train, 30% test 
    (trainData, testData) = trainTitanicLP.randomSplit([0.7, 0.3])
     
    # Random forest : same parameters as sklearn (?)
    from pyspark.mllib.tree import RandomForest
     
    time_start=time.time()
    model_rf = RandomForest.trainClassifier(trainData, numClasses = 2,
            categoricalFeaturesInfo = {}, numTrees = 100,
            featureSubsetStrategy='auto', impurity='gini', maxDepth=12,
            maxBins=32, seed=None)
     
      
    model_rf.numTrees()
    model_rf.totalNumNodes()
    time_end=time.time()
    time_rf=(time_end - time_start)
    print("RF takes %d s" %(time_rf))
     
    # Predictions on test set
    predictions = model_rf.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
     
    # first metrics
    from pyspark.mllib.evaluation import BinaryClassificationMetrics
    metrics = BinaryClassificationMetrics(labelsAndPredictions)
     
    print ('=====================================================')
    print (' output : ')

    # Area under precision-recall curve
    print("Area under PR = %s" % metrics.areaUnderPR)
     
    # Area under ROC curve
    print("Area under ROC = %s" % metrics.areaUnderROC)

    print ('=====================================================')
Example #28
def evaluateModel(model, validationData):
    # Compute AUC (area under the ROC curve)
    score = model.predict(validationData.map(lambda x: x.features))
    print(score)
    scoreAndLabels = score.zip(validationData.map(lambda x: x.label))
    print("scoreAndLabels的前5项", scoreAndLabels.take(5))
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    AUC = metrics.areaUnderROC
    return (AUC)
Example #29
def evaluateModel(model, validationData):
    """
    模型评估AUC
    """
    score = model.predict(validationData.map(lambda p: p.feature))
    scoreAndLabels = score.zip(
        validationData.map(lambda p: p.label))  # [(s1, l1), (s2, l2), ...]
    metrics = BinaryClassificationMetrics(scoreAndLabels)
    AUC = metrics.areaUnderROC
    return AUC
Example #30
def evaluateMetrics(model, data, label):
    labelsAndScores = data.map(lambda lp: (
        lp.label, getP(lp.features, model.weights, model.intercept)))

    auc = BinaryClassificationMetrics(labelsAndScores).areaUnderROC
    log_loss = evaluateResults(model, data)

    sys.stderr.write('\n LogLoss {0} = {1}'.format(label, log_loss))
    sys.stderr.write('\n AUC {0} = {1}\n'.format(label, auc))

    return (label, log_loss, auc)