Example #1
 def testLogisticMLPipeline1(self):
     training = sqlCtx.createDataFrame([
         ("a b c d e spark", 1.0),
         ("b d", 2.0),
         ("spark f g h", 1.0),
         ("hadoop mapreduce", 2.0),
         ("b spark who", 1.0),
         ("g d a y", 2.0),
         ("spark fly", 1.0),
         ("was mapreduce", 2.0),
         ("e spark program", 1.0),
         ("a e c l", 2.0),
         ("spark compile", 1.0),
         ("hadoop software", 2.0)
         ], ["text", "label"])
     tokenizer = Tokenizer(inputCol="text", outputCol="words")
     hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
     lr = LogisticRegression(sqlCtx)
     pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
     model = pipeline.fit(training)
     test = sqlCtx.createDataFrame([
         ("spark i j k", 1.0),
         ("l m n", 2.0),
         ("mapreduce spark", 1.0),
         ("apache hadoop", 2.0)], ["text", "label"])
     result = model.transform(test)
     predictionAndLabels = result.select("prediction", "label")
     evaluator = MulticlassClassificationEvaluator()
     score = evaluator.evaluate(predictionAndLabels)
     self.assertTrue(score == 1.0)
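The imports are not shown above; a minimal sketch of what the test assumes. Note that LogisticRegression(sqlCtx) matches Apache SystemML's mllearn wrapper, which accepts a SQLContext, unlike pyspark.ml's estimator; the mllearn import is an assumption based on that call signature.

# Imports assumed by the test above.
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from systemml.mllearn import LogisticRegression  # assumption: SystemML wrapper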
Example #2
def RunRandomForest(tf, ctx):
	sqlContext = SQLContext(ctx)
	rdd = tf.map(parseForRandomForest)
	# The schema is encoded in a string.
	schema = ['genre', 'track_id', 'features']
	# Apply the schema to the RDD.
	songDF = sqlContext.createDataFrame(rdd, schema)

	# Register the DataFrame as a table.
	songDF.registerTempTable("genclass")
	labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)

	trainingData, testData = songDF.randomSplit([0.8, 0.2])

	labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

	rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
	#rfc = SVMModel([.5, 10, 20], 5)
	#rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

	pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
	model = pipeline.fit(trainingData)

	predictions = model.transform(testData)
	predictions.show()

	evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
	accuracy = evaluator.evaluate(predictions)
	print('Accuracy of RandomForest = ', accuracy * 100)
	print("Test Error = ", (1.0 - accuracy) * 100)
Example #3
def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    lrModel = model.stages[2]
    print(lrModel)  # summary only
Example #4
def textPredict(request):
    """6. Text clustering and popularity prediction"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    """处理数据集,生成特征向量"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    """决策树模型培训"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    """模型测试"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    """用户数据测试,单个新闻测试"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """模型评估"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request, 'prediction.html', {'resultList': resultList})  # template name is a placeholder
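convertDfToList is another helper defined elsewhere; a plausible sketch that collects the prediction DataFrame for the template context:

def convertDfToList(df):
    # Hypothetical helper: collect the predictions into plain Python dicts
    # so the Django view can render them.
    return [row.asDict() for row in df.collect()]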
Example #5
def sparking_your_interest():
	sqlContext = SQLContext(sc)  # assumes an existing SparkContext named sc
	df = sqlContext.read.json('speeches_dataset.json')
	df_fillna=df.fillna("")
	print(df_fillna.count())
	df_fillna.printSchema()

	df_utf=call_utf_encoder(df)
	df_cleaned=call_para_cleanup(df_utf)
	print(df_cleaned)
	df_with_bigrams = call_ngrams(df_cleaned, 2)
	df_with_trigrams = call_ngrams(df_with_bigrams, 3)
	df_with_4grams = call_ngrams(df_with_trigrams, 4)
	df_with_5grams = call_ngrams(df_with_4grams, 5)
	df_with_6grams = call_ngrams(df_with_5grams, 6)
	df_with_vocab_score = call_speech_vocab(df_with_6grams)

	df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score,100,'2grams')
	df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors,100,'3grams')
	df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors,100,'4grams')
	assembler = VectorAssembler(
	    inputCols=["2gramsfeatures", "3gramsfeatures", "4gramsfeatures", "vocab_score"],
	    outputCol="features")
	assembler_output = assembler.transform(df_with_4grams_idf_vectors)
	output = assembler_output.selectExpr('speaker','speech_id','para_cleaned_text','features')
	output.show()
	print(output.count())

	output_tordd = output.rdd
	train_rdd,test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
	train_df = train_rdd.toDF()
	test_df = test_rdd.toDF()
	print(train_df)
	print(test_df)

	print('Train DF - Count: ')
	print(train_df.count())
	print('Test DF - Count: ')
	print(test_df.count())

	print("Initializing RF Model")
	labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)       
	rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features",numTrees=1000, featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
	pipeline = Pipeline(stages=[labelIndexer,rf])
	model = pipeline.fit(train_df)
	print("Completed RF Model")

	predictions = model.transform(test_df)
	evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
	accuracy = evaluator.evaluate(predictions)
	print("Test Error = %g" % (1.0 - accuracy))
	rfModel = model.stages[1]
	print(rfModel)  # summary only
	print("Predictions: ")
	predictions.show()
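call_ngrams is not shown; a sketch of what it might look like with pyspark.ml.feature.NGram, assuming para_cleaned_text already holds token arrays and that the output column is named like "2grams":

from pyspark.ml.feature import NGram

def call_ngrams(df, n):
    # Hypothetical helper: emit an "<n>grams" column from the cleaned tokens.
    ngram = NGram(n=n, inputCol="para_cleaned_text", outputCol="{}grams".format(n))
    return ngram.transform(df)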
Example #6
def model(classifier, ftrain, fvalid, fprediction):

    startTime = time.time()

    ctx = SparkContext(appName="model_on_Spark")
    sqlContext = SQLContext(ctx)
    logger = SparkLogger(ctx)
    logger.set_level('ERROR')

    # load and prepare training and validation data
    rawTrain, train = prepData(sqlContext, ctx, ftrain)
    rawValid, valid = prepData(sqlContext, ctx, fvalid)

    # indexing is needed so the prediction and raw columns can be joined later
    valid = indexData(valid)
    rawValid = indexData(rawValid)

    classifiers = {
        "RandomForestClassifier" : RFC
    }

    clf = classifiers[classifier]()

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

    # train and predict
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
    model = pipeline.fit(train)

    predictions = model.transform(valid)

    # write to file:

    subsetPrediction = predictions.select("prediction", "index")
    subsetValidData = rawValid.select("dataset", "index")

    output = (subsetValidData
               .join(subsetPrediction, subsetPrediction.index == subsetValidData.index)
                    .drop("index")
                    .drop("index"))

    lines = output.rdd.map(toCSVLine)
    lines.saveAsTextFile('output')

    evaluator = MulticlassClassificationEvaluator(
       labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    executionTime = time.time() - startTime
    row=classifier+','+str(executionTime)
    ctx.parallelize([row]).saveAsTextFile("timing")
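toCSVLine is assumed to serialise each joined row; a minimal sketch:

def toCSVLine(row):
    # Hypothetical helper: join all row values into one CSV line.
    return ",".join(str(v) for v in row)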
Example #7
def price_predict(path, windows=5, spark_context=None, sql_context=None):
    if spark_context is None:
        spark_context, sql_context = load_spark_context()
    input_data = DataParser(path=path, window_size=windows)
    close_train_df, close_test_df, open_train_df, open_test_df = input_data.get_n_days_history_data(
        data_type=DATA_FRAME, spark_context=spark_context, sql_context=sql_context)
    evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")

    # handle open data
    open_trainer = MultilayerPerceptronClassifier(maxIter=1, layers=[4, 5, 4, 3], blockSize=128,
                                                  featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    open_model = open_trainer.fit(open_train_df)
    open_result = open_model.transform(open_test_df)
    open_prediction_labels = open_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(open_prediction_labels)))

    # handle close data
    close_trainer = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3], blockSize=128,
                                                   featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    close_model = close_trainer.fit(close_train_df)
    close_result = close_model.transform(close_test_df)
    close_prediction_labels = close_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(close_prediction_labels)))
Example #8
def print_evaluation_metrics(model, test_df, labelCol="label", featuresCol="features"):
    """
    Prints evaluation metrics (accuracy, F1, weighted precision and weighted recall).
    :param model: fitted model to evaluate.
    :param test_df: DataFrame containing the test data.
    :param labelCol: label column name.
    :param featuresCol: features column name.
    :return: None; the metrics are printed.
    """
    predictions = model.transform(test_df)


    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol=labelCol, predictionCol="prediction",)
    accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
    f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
    weighted_precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
    weighted_recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
    print "Accuracy:", accuracy
    print "f1:", f1
    print "Precision:", weighted_precision
    print "Recall:", weighted_recall
Example #9
def calculate_accuracy_metrics(predictions):

    """
    Calculates accuracy metrics for a Prediction DataFrame

    :param predictions:
    :return:
    """
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction")
    accuracy = round(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}), 2)
    recall = round(evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}), 2)

    positive_cases = predictions.filter(predictions["indexedLabel"] == 1.0)
    negative_cases = predictions.filter(predictions["indexedLabel"] == 0.0)
    false_positive_cases = negative_cases.filter(negative_cases["prediction"] == 1.0)
    false_negative_cases = positive_cases.filter(positive_cases["prediction"] == 0.0)

    return [accuracy,
            recall,
            positive_cases.count(),
            negative_cases.count(),
            false_positive_cases.count(),
            false_negative_cases.count()]
Example #10
def build_decision_tree(sqlContext, features, interested):
	print('-----------------------------------------')
	data = sqlContext.createDataFrame(
			[Row(label=interested[i], features=Vectors.dense(features[i])) for i in range(len(features))])
	data.printSchema()
	data.show(5)
	print('created data frame')

	# Index the label column & adding metadata.
	labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
	print('created label indexer')

	# Mark the features with < 4 distinct values as categorical
	featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

	# Split the data into training and test sets
	(trainingData, testData) = data.randomSplit([0.8, 0.2])

	# Train a RandomForest model (DecisionTree and GBT alternatives are commented out below)
	dt = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
#	dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
#	dt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)

	# Chain the indexers together with DecisionTree
	pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

	# Train the model
	model = pipeline.fit(trainingData)

	# Make predictions
	predictions = model.transform(testData)

	predictions.select("prediction", "indexedLabel", "features").show(5)

	# Select (prediction, true label) & compute test error
	evaluator = MulticlassClassificationEvaluator(
			labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
	accuracy = evaluator.evaluate(predictions)

	treeModel = model.stages[2]
	return (1 - accuracy, model)
Example #11
def naiveBayeseian():
    # Note: despite its name, this function trains a multilayer perceptron.
    def parseLine(line):
        keys = [float(x) for x in line.split(",")]
        # pyspark.ml estimators need a DataFrame with label/features columns
        return (keys[0], Vectors.dense(keys[1:]))
    scdata1 = sc.textFile("/home/ubantu/TwoClassfeatureSet.csv")
    data = scdata1.map(parseLine).toDF(["label", "features"])
    splits = data.randomSplit([0.8, 0.2], 1234)
    train = splits[0]
    test = splits[1]
    layers = [30, 20, 20, 2]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model
    model = trainer.fit(train)
    # compute precision on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")
    print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
Example #12

#gbt = GBTClassifier(numTrees = 10, maxDepth = 3, maxBins = 64)
gbt = GBTClassifier(maxIter=30, maxDepth=2)  # GBTClassifier has no gini impurity option

#gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)
##rf = RandomForestClassifier(numTrees = 25, maxDepth = 4, maxBins = 64)
pipeline = Pipeline(stages=[gbt])
pipelineModel = pipeline.fit(training)

testPredictions = pipelineModel.transform(test)
testPredictions.select("prediction", "label", "features").show(5)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")#.setMetricName("accuracy")
evaluatorParaMap = {evaluator.metricName: "f1"}
f1Test = evaluator.evaluate(testPredictions, evaluatorParaMap)


from pyspark.ml.tuning import *

paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [1,5]).build()

cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid).setNumFolds(3)

cvModel = cv.fit(training)
cvPredictions = cvModel.transform(test)
cvF1Test = evaluator.evaluate(cvPredictions, evaluatorParaMap)

print("pipeline Test AUC: %g" % aucTest)
print("Cross-Validation test AUC: %g" % cvAUCTest)
end = time.time()
Example #13
    adam_config = build_adam_config(learning_rate=0.001, beta1=0.9, beta2=0.999)

    va = VectorAssembler(inputCols=df.columns[1:785], outputCol='features').transform(df)
    encoded = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False).transform(va).select(['features', 'labels'])

    #demonstration of options. Not all are required
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='out:0',
        tfOptimizer='adam',
        miniBatchSize=300,
        miniStochasticIters=1,
        shufflePerIter=True,
        iters=50,
        predictionCol='predicted',
        labelCol='labels',
        partitions=3,
        verbose=1,
        optimizerOptions=adam_config
    )

    spark_model.fit(encoded).save('simple_dnn')
    predictions = SparkAsyncDLModel.load("simple_dnn").transform(encoded)
    evaluator = MulticlassClassificationEvaluator(
        labelCol="labels", predictionCol="predicted", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
Example #14
])

df_train = sqlContext.createDataFrame(
    train.map(
        lambda x: Row([float(m) for m in x[0:-1]], transform_label(x[-1]))),
    schema)
df_test = sqlContext.createDataFrame(
    test.map(
        lambda x: Row([float(m) for m in x[0:-1]], transform_label(x[-1]))),
    schema)

list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
df_train = df_train.select(
    list_to_vector_udf(df_train["features"]).alias("features"),
    'label').cache()
df_test = df_test.select(
    list_to_vector_udf(df_test["features"]).alias("features"),
    'label').cache()

rf = RandomForestClassifier(maxDepth=30)
rfmodel = rf.fit(df_train)

rfpredicts = rfmodel.transform(df_test)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="f1")

print("F1 = %0.4f" % evaluator.evaluate(rfpredicts))
#########
sc.stop()
Example #15
    data = sqlContext.read.format("libsvm")\
        .load("data/mllib/sample_multiclass_classification_data.txt")
    # Split the data into train and test
    
    data.show() 
    data.printSchema()
    data.select('features').show()
    splits = data.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    print (train.count())
    train.show()
    test = splits[1]
    
    
    # specify layers for the neural network:
    # input layer of size 4 (features), two intermediate of size 5 and 4
    # and output of size 3 (classes)
    layers = [4, 5, 4, 3]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model
    model = trainer.fit(train)
    # compute accuracy on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    sc.stop()
Example #16
standardizer = StandardScaler(  # estimator name reconstructed from the pipeline below
                              inputCol='features',
                              outputCol='std_features')
layers = [16, 20, 20, 10]

# create the trainer and set its parameters
mlp = MultilayerPerceptronClassifier( layers=layers,  labelCol="label", featuresCol="std_features")

pipeline = Pipeline(stages=[standardizer , mlp])


mlpModel = pipeline.fit(trainingData)
mlpPredictions = mlpModel.transform(testingData)
mlpPredictions.select("prediction", "label", "std_features").show(5)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(mlpPredictions)

print("Accuracy on test data = %g" % accuracy)

paramGrid = ParamGridBuilder().\
    addGrid(mlp.maxIter, [ 50,100,150]).\
    addGrid(mlp.blockSize, [ 64,128]). \
    addGrid(mlp.layers, [(16, 10, 10, 10),(16, 32, 32, 10)]). \
    build()


tvs = TrainValidationSplit(estimator=pipeline,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator,
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)
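The TrainValidationSplit above is configured but never run; fitting and scoring it would presumably look like:

# Sketch: fit the grid on the training data and score the best model.
tvsModel = tvs.fit(trainingData)
tvsPredictions = tvsModel.transform(testingData)
print("Best-model accuracy = %g" % evaluator.evaluate(tvsPredictions))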
Example #17
    def _execute(self):
        df = self.df_from_temp_table(self.kwargs["previous_job_temp_table"])
        if self.target_label in df.columns:
            df = df.drop(self.target_label)
        cols_to_index = [
            k for k, v in df.dtypes
            if (v == "string" and k != self.target_label)
        ]
        cols_not_to_index = [k for k, v in df.dtypes if v != "string"]

        feature_cols = cols_not_to_index + [
            "indexed{}".format(col_) for col_ in cols_to_index
        ]

        df = self.create_feature_vector(df, feature_cols)

        # Index labels, adding metadata to the label column.
        # Fit on whole dataset to include all labels in index.
        label_indexer = StringIndexer(
            inputCol=self.target_label,
            outputCol="{}loan_status".format("indexed"))
        label_indexer.setHandleInvalid("skip")
        label_indexer = label_indexer.fit(df)

        # Automatically identify categorical features, and index them.
        # Set maxCategories so features with > 12 distinct values are
        # treated as continuous.
        feature_indexer = VectorIndexer(inputCol="features",
                                        outputCol="indexedFeatures",
                                        maxCategories=12)
        feature_indexer.setHandleInvalid("skip")

        # Split the data into training and test sets (30% held out for testing)
        (trainingData, testData) = df.randomSplit([0.7, 0.3])

        # Train a RandomForest model.
        rf = RandomForestClassifier(labelCol="{}loan_status".format("indexed"),
                                    featuresCol="indexedFeatures",
                                    predictionCol="prediction",
                                    numTrees=10)

        # Convert indexed labels back to original labels.
        label_converter = IndexToString(inputCol="prediction",
                                        outputCol="predictedLabel",
                                        labels=label_indexer.labels)

        # Chain indexers and forest in a Pipeline
        pipeline = Pipeline(
            stages=[label_indexer, feature_indexer, rf, label_converter])

        # Train model.  This also runs the indexers.
        model = pipeline.fit(trainingData)

        # Make predictions.
        predictions = model.transform(testData)

        # Select (prediction, true label) and compute test error
        evaluator = MulticlassClassificationEvaluator(
            labelCol="{}loan_status".format("indexed"),
            predictionCol="prediction",
            metricName="accuracy")

        accuracy = evaluator.evaluate(predictions)

        self.metrics["accuracy"] = accuracy

        return str(accuracy)
Example #18
# Loading the dataset
KCDPFinal = spark.read.format("csv").option("header", True).option(
    "inferSchema",
    True).option("delimiter",
                 ",").load("C:/KCcrimeForAnalytics.csv").withColumnRenamed(
                     "Firearm_Used_Flag", "label")
KCDPFinal  # display the loaded DataFrame (notebook-style)

# Create vector assembler for feature columns
VAssembler = VectorAssembler(inputCols=KCDPFinal.columns[1:19],
                             outputCol="features")
KCDPFinal = VAssembler.transform(KCDPFinal)

# Split the crime dataset into training and testing data sets
trainingData, testingData = KCDPFinal.select("label", "features").randomSplit(
    [0.7, 0.3])

# Use the training set for model training
from pyspark.ml.classification import NaiveBayes
NaiveBayesModel = NaiveBayes()
model = NaiveBayesModel.fit(trainingData)

# Generate prediction from test dataset
CrimepredKC = model.transform(testingData)

# Evaluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(CrimepredKC)

# Show model accuracy
print("Accuracy:", accuracy)
Example #19
    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(
        stages=[labelIndexer, featureIndexer, rf, labelConverter])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select('predictedLabel', 'label', 'features').show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(labelCol='indexedLabel',
                                                  predictionCol='prediction',
                                                  metricName='accuracy')
    accuracy = evaluator.evaluate(predictions)
    print('Test Error = %g' % (1.0 - accuracy))
    print('Accuracy = ', accuracy)

    rfModel = model.stages[2]
    print(rfModel)  # summary only

    # Compute the AUC
    evaluator = BinaryClassificationEvaluator()
    evaluation = evaluator.evaluate(model.transform(testData))
    print('AUC:', evaluation)

    # Stop
    sc.stop()
Example #20
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("naive_bayes_example")\
        .getOrCreate()

    # $example on$
    # Load training data
    data = spark.read.format("libsvm") \
        .load("data/mllib/sample_libsvm_data.txt")
    # Split the data into train and test
    splits = data.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # train the model
    model = nb.fit(train)
    # compute accuracy on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    spark.stop()
Example #21
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="indexed",metricName="accuracy")

#Create the Decision Trees model
dtClassifer = DecisionTreeClassifier(labelCol="indexed", \
                featuresCol="features")
dtModel = dtClassifer.fit(trainingData)
#Predict on the test data
predictions = dtModel.transform(testData)
predictions.select("prediction", "indexed", "label", "features").collect()
print("Results of Decision Trees : ", evaluator.evaluate(predictions))

#Create the Random Forest model
rmClassifer = RandomForestClassifier(labelCol="indexed", \
                featuresCol="features")
rmModel = rmClassifer.fit(trainingData)
#Predict on the test data
predictions = rmModel.transform(testData)
predictions.select("prediction", "indexed", "label", "features").collect()
print("Results of Random Forest : ", evaluator.evaluate(predictions))

#Create the Naive Bayes model
nbClassifer = NaiveBayes(labelCol="indexed", \
                featuresCol="features")
nbModel = nbClassifer.fit(trainingData)
#Predict on the test data
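The snippet is cut off after fitting the Naive Bayes model; the evaluation step presumably mirrors the two above:

# Sketch of the truncated Naive Bayes evaluation, following the pattern above.
predictions = nbModel.transform(testData)
predictions.select("prediction", "indexed", "label", "features").collect()
print("Results of Naive Bayes : ", evaluator.evaluate(predictions))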
Example #22
# Fit a logistic regression on the training data; regParam is the lasso (L1) regularisation parameter
lrModel = LogisticRegression(regParam=0.1).fit(trainData)

# make prediction on test data
pred = lrModel.transform(testData)

pred.select('catLabel', 'label', 'prediction').show()


evaluator1 = BinaryClassificationEvaluator(labelCol='label', metricName="areaUnderROC")
evaluator2 = MulticlassClassificationEvaluator(labelCol='label', metricName="f1")
metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple))

print('AUC ROC of Logistic Regression model is %f' % evaluator1.evaluate(pred))
print('F1 score of Logistic Regression model is %f' % evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()


# <a id="context322"></a>
# #### 3.2.2. Decision Tree

# In[18]:

from pyspark.ml.classification import DecisionTreeClassifier

# Fit a decision tree on the training data; maxDepth is the hyperparameter to tune
dtModel = DecisionTreeClassifier(maxDepth=3).fit(trainData)

# make prediction on test data
pred = dtModel.transform(testData)

Example #23
# Decision tree classifier
dt = DecisionTreeClassifier(labelCol="label",
                            featuresCol="features",
                            maxDepth=13,
                            maxBins=64,
                            impurity='entropy')
time1 = time.time()
dtc_model = dt.fit(trainingData)
time2 = time.time()
dtc_time = time2 - time1
dtc_prediction = dtc_model.transform(testData)
evaluator = MulticlassClassificationEvaluator\
      (labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(dtc_prediction)
dtc_3 = get_top_3(trainingData, dtc_model.featureImportances)

# Decision tree regressor
dt = DecisionTreeRegressor(labelCol="label",
                           featuresCol="features",
                           maxDepth=12,
                           maxBins=64,
                           minInstancesPerNode=2,
                           minInfoGain=0.0)
time1 = time.time()
dtr_model = dt.fit(trainingData)
time2 = time.time()
dtr_time = time2 - time1
dtr_prediction = dtr_model.transform(testData)
dtr_3 = get_top_3(trainingData, dtr_model.featureImportances)
Example #24
(trainingData, testData) = forData.randomSplit([0.8,0.2],seed=0)
print(trainingData.take(1))
rfClassifier = RandomForestClassifier(numTrees=10,maxDepth=10, seed=0,labelCol="indexed")

start_time = time.time()
modelClassifier = rfClassifier.fit(trainingData)
end_time = time.time()

cost_time = end_time - start_time
print("spark rf time  :",cost_time)


predictionsClassifier = modelClassifier.transform(testData)

evaluator= MulticlassClassificationEvaluator().setLabelCol("indexed").setPredictionCol("prediction")
print("accuracy = ",evaluator.evaluate(predictionsClassifier, {evaluator.metricName: "accuracy"}))
print("weightedPrecision = ",evaluator.evaluate(predictionsClassifier, {evaluator.metricName: "weightedPrecision"}))
print("weightedRecall = ",evaluator.evaluate(predictionsClassifier, {evaluator.metricName: "weightedRecall"}))
print("f1 = ",evaluator.evaluate(predictionsClassifier, {evaluator.metricName: "f1"}))











Example #25
    if len(sys.argv) != 6:  # guard reconstructed from the usage message below
        print("%s <input> <model_path> <stop_file> class_num appname" % sys.argv[0], file=sys.stderr)
        sys.exit(1)

    input_path = sys.argv[1]
    model_path = sys.argv[2]
    stop_file = sys.argv[3]
    class_num = int(sys.argv[4])
    appname = sys.argv[5]

    conf = SparkConf().setAppName(appname)
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    data_df = text_to_df(sc, sqlContext, input_path)
    print "*** create data frame ***" 
    splits = data_df.randomSplit([0.8, 0.2], 1234)
    training = splits[0].cache()
    test = splits[1].cache()

    stopwords = get_stopwords(stop_file)
    print "*** load %s stopwords ***" % len(stopwords)
    pipeline = get_pipeline(vector_size=50, class_num=class_num, stopwords=stopwords) 
    model = pipeline.fit(training)
    result = model.transform(test)
  
    pred_label = result.select("prediction", "indexLabel")
    evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision", predictionCol="prediction", labelCol="indexLabel")
    print("Precision: " + str(evaluator.evaluate(pred_label)))    


Example #26
                            .build()
# 2^3 models = 8
lr_grid = ParamGridBuilder().baseOn({pipeline.stages:[dtc_assembler, lr]})\
                             .addGrid(lr.regParam, [0.01, 0.05])\
                             .addGrid(lr.maxIter, [10, 100])\
                             .addGrid(lr.elasticNetParam, [0.0, 0.1])\
                             .build()

paramGrid = dtc_grid + dtr_grid + lr_grid

#fit cv

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, parallelism=10, numFolds=3, seed=16)
cvModel = cv.fit(train)
predictions = cvModel.transform(test)
accuracy = evaluator.evaluate(predictions)

print(cvModel.avgMetrics)
print("trained models",len(cvModel.avgMetrics))
print("\n")
print("Decision tree classifier models accuracy", cvModel.avgMetrics[:6])
print("\n")
print("Decision tree regression models accuracy", cvModel.avgMetrics[6:15])
print("\n")
print("Logistic regression models accuracy", cvModel.avgMetrics[15:23])

#GET BEST DTC model
print("\n")
print("Best decision tree classifier model")
print("accuracy", cvModel.avgMetrics[np.argmax(cvModel.avgMetrics[:6])])
print(cvModel.getEstimatorParamMaps()[np.argmax(cvModel.avgMetrics[:6])])
Example #27
prediction = pipelineModel.transform(test)
predicted = prediction.select("features", "prediction", "trueLabel")
predicted.show(100, truncate=False)

# COMMAND ----------

# MAGIC %md
# MAGIC The following cell measures the accuracy of the algorithm with MulticlassClassificationEvaluator, evaluated on the predicted data.

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluation = MulticlassClassificationEvaluator(
    labelCol="trueLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluation.evaluate(prediction)
print(accuracy)

# COMMAND ----------

# MAGIC %md The following code computes the overall test error.

# COMMAND ----------

print("Test Error = %g" % (1.0 - accuracy))

# COMMAND ----------

# MAGIC %md #### Gradient Boosting Evaluator
# MAGIC Calculate AUC using Gradient Boosting Evaluator.
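No code follows the AUC heading; a sketch of the announced calculation with BinaryClassificationEvaluator, reusing the prediction DataFrame from above (the rawPrediction column is assumed to come from the fitted classifier):

from pyspark.ml.evaluation import BinaryClassificationEvaluator

gbt_evaluator = BinaryClassificationEvaluator(
    labelCol="trueLabel", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
print("AUC = %g" % gbt_evaluator.evaluate(prediction))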
Example #28
multiEvaluator = MulticlassClassificationEvaluator()

#Setting Paramaters for Crossvalidation 
mlp_cv = CrossValidator( estimator=pipeline, evaluator=evaluator, estimatorParamMaps=mlp_paramGrid, numFolds=user_mlp_param_numFolds)
mlp_cvmodel = mlp_cv.fit(train)

#Evaluating Multilayer Perceptron Model Performance 
from pyspark.sql.functions import udf

mlp_predictions = mlp_cvmodel.transform(test)

auroc = evaluator.evaluate(mlp_predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(mlp_predictions, {evaluator.metricName: "areaUnderPR"})
"The AUROC is %s and the AUPR is %s" % (auroc, aupr)

f1score = multiEvaluator.evaluate(mlp_predictions, {multiEvaluator.metricName: "f1"})
weightedPrecision = multiEvaluator.evaluate(mlp_predictions, {multiEvaluator.metricName: "weightedPrecision"})
weightedRecall = multiEvaluator.evaluate(mlp_predictions, {multiEvaluator.metricName: "weightedRecall"})

"The F1 score: %s the Weighted Precision: %s the Weighted Recall is %s" % (f1score, weightedPrecision, weightedRecall)

#Select The Best Multilayer Perceptron Model After Crossvalidation
mlpmodel = mlp_cvmodel.bestModel 
bestMLPModel = mlpmodel.stages[-1]

#Retrieving Paramaters from the Best MLP Model 
#param_BestModel_Layers = bestMLPModel._java_obj.layers
#param_BestModel_Iter = bestMLPModel._java_obj.maxIter

### Stop Timer
stopTime = time.process_time()
Example #29
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
grid=(ParamGridBuilder()
     .baseOn([evaluator.metricName,'accuracy'])
     .addGrid(dt.maxDepth, [10,20])
     .build())
cv = CrossValidator(estimator=dt, estimatorParamMaps=grid,evaluator=evaluator)


# In[332]:

print "Fitting the decision tree on selected features"
t0 = time()
cv_model = cv.fit(dfTrainIndexed)
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[302]:

dfTestIndexed = string_indexer_model.transform(dfTestSelect)
df_test_pred = cv_model.transform(dfTestIndexed)
res=evaluator.evaluate(df_test_pred)
print(res)
Example #30
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf, lag, date_add, explode, lit, concat, unix_timestamp, sum, abs
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import PipelineModel

sc = SparkContext(appName="MyFirstApp3_Task_task2")
spark = SparkSession(sc)

df_node16 = spark.read.format("parquet").load(
    path="hdfs://namenode:9000/example3/test.parquet")
model_node17 = PipelineModel.load("hdfs://namenode:9000/example3/model/")
df_node18 = model_node17.transform(df_node16)

evaluator_node19 = MulticlassClassificationEvaluator(
    labelCol="indexedSurvived",
    predictionCol="prediction",
    metricName="accuracy")
score_node19 = evaluator_node19.evaluate(df_node18)
df_node19 = spark.createDataFrame([(score_node19, )], ["score"])

df_node19.write.format("csv").save(
    path="hdfs://namenode:9000/example3/EvalResult3.csv",
    quote="\"",
    header=True,
    sep=",")
Example #31
 title_category = news_data.select("TITLE", "CATEGORY")
 title_category = title_category.dropna()
 title_category = title_category.withColumn(
     "only_str", regexp_replace(col('TITLE'), r'\d+', ''))
 regex_tokenizer = RegexTokenizer(inputCol="only_str",
                                  outputCol="words",
                                  pattern="\\W")
 raw_words = regex_tokenizer.transform(title_category)
 remover = StopWordsRemover(inputCol="words", outputCol="filtered")
 words_df = remover.transform(raw_words)
 indexer = StringIndexer(inputCol="CATEGORY", outputCol="categoryIndex")
 feature_data = indexer.fit(words_df).transform(words_df)
 cv = CountVectorizer(inputCol="filtered", outputCol="features")
 model = cv.fit(feature_data)
 countVectorizer_features = model.transform(feature_data)
 (trainingData,
  testData) = countVectorizer_features.randomSplit([0.8, 0.2], seed=11)
 nb = NaiveBayes(modelType="multinomial",
                 labelCol="categoryIndex",
                 featuresCol="features")
 nbModel = nb.fit(trainingData)
 nb_predictions = nbModel.transform(testData)
 evaluator = MulticlassClassificationEvaluator(labelCol="categoryIndex",
                                               predictionCol="prediction",
                                               metricName="accuracy")
 nb_accuracy = evaluator.evaluate(nb_predictions)
 print("Accuracy of NaiveBayes is = %g" % (nb_accuracy))
 print("Test Error of NaiveBayes = %g " % (1.0 - nb_accuracy))
 time2 = datetime.datetime.now()
 elapsedTime = time2 - time1
 print(elapsedTime)
Example #32
#predictions.show()


# In[120]:


from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# In[121]:


####### Random Forest accuracy - not using this as simple linear regression is giving better results ############
evalAcc = MulticlassClassificationEvaluator(
    labelCol="quality", predictionCol="prediction", metricName="accuracy")
accuracy = evalAcc.evaluate(predictions)

##print("accuracy Test Error = %g" % (1.0 - accuracy))


transformed_data = model.transform(val)
transformed_data = transformed_data.withColumn("prediction", func.round("prediction"))
##print(evalAcc.getMetricName(), 'accuracy:', evalAcc.evaluate(transformed_data))


# In[122]:


####### Random Forest f1 - not using this as simple linear regression is giving better results ############
evalVal = MulticlassClassificationEvaluator(
    labelCol="quality", predictionCol="prediction", metricName="f1")
Example #33
def f(x):  # row-builder reconstructed from the Row(**f(p)) call below
    rel = {}
    rel['features'] = Vectors.dense(float(x[0]), float(x[1]), float(x[2]), float(x[3]))
    rel['label'] = str(x[4])
    return rel


data = spark.sparkContext.textFile("E:/iris.txt").map(lambda line: line.split(',')).map(lambda p: Row(**f(p))).toDF()
data.createOrReplaceTempView("iris")
df = spark.sql("select * from iris")
# rel = df.rdd.map(lambda t : str(t[1])+":"+str(t[0])).collect()
# for item in rel:
#     print(item)
labelIndexer = StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(df)
featureIndexer = VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(df)
trainingData, testData = df.randomSplit([0.7, 0.3])
dtClassifier = DecisionTreeClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures")
labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
pipelinedClassifier = Pipeline().setStages([labelIndexer, featureIndexer, dtClassifier, labelConverter])

modelClassifier = pipelinedClassifier.fit(trainingData)
predictionsClassifier = modelClassifier.transform(testData)
predictionsClassifier.select("predictedLabel", "label", "features").show(20)

evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol(
    "prediction").setMetricName("accuracy")
lrAccuracy = evaluator.evaluate(predictionsClassifier)
print("Test Error = " + str(1.0 - lrAccuracy))
treeModelClassifier = modelClassifier.stages[2]
print("Learned classification tree model:\n" + str(treeModelClassifier.toDebugString))

Example #34
num_test_samples = 10000
num_train_samples = 60000

test_df = spark.read.csv(test_datafile,header=False,inferSchema="true")
train_df = spark.read.csv(train_datafile,header=False,inferSchema="true")

assembler = VectorAssembler(inputCols=train_df.columns[1:],outputCol="feature")
train_vector=assembler.transform(train_df).select("_c0","feature")
pca = PCA(k=99, inputCol="feature", outputCol="features")#PCA 784 to 99
model = pca.fit(train_vector)
train_pca_result = model.transform(train_vector).select('_c0','features')
new_train_pca_result=train_pca_result.withColumnRenamed("_c0", "label")

assembler = VectorAssembler(inputCols=test_df.columns[1:],outputCol="feature")
test_vector=assembler.transform(test_df).select("_c0","feature")
test_pca_result = model.transform(test_vector).select('_c0','features')
new_test_pca_result=test_pca_result.withColumnRenamed("_c0", "label")

# train the model
# create the trainer and set its parameters
treeNumber = 100 #set tree number#
trainer = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=treeNumber)
trainmodel = trainer.fit(new_train_pca_result)#label/features

results = trainmodel.transform(new_test_pca_result)
predictionAndLabels = results.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print(""Test set accuracy of treeNumber = " + str(treeNumber1) + " is " + str(evaluator.evaluate(predictionAndLabels)))
#evaluator.saveAsTextFile(output_path)
Example #35
regexTokenizer = RegexTokenizer(inputCol="_c1",  # constructor reconstructed; the input column is an assumption
                                outputCol="words",
                                pattern="\\W")
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
label_stringIdx = StringIndexer(inputCol="_c0", outputCol="label")

pipeline = Pipeline(stages=[regexTokenizer, hashingTF, idf, label_stringIdx])
pipelineModel = pipeline.fit(dataset)
dataset = pipelineModel.transform(dataset)

#splitting dataset
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=100)

#Building the Logistic Regression Model
lr = LogisticRegression(maxIter=20, regParam=0.01, elasticNetParam=0)
lrModel = lr.fit(trainingData)

#Evaluating the classification model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
predictions = lrModel.transform(testData)
acc = evaluator.evaluate(predictions)
print("Accuracy on testset is:", acc)

#Saving the model
lrModel.write().overwrite().save("lr_Model")
pipelineModel.write().overwrite().save("pipeline_Model")

print("Stored pipeline and model.")
Example #36
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0) \
    .select("Text","Sentiment","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

# COMMAND ----------

#finding the accuracy
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

# COMMAND ----------

#applying cross validation
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
Example #37
    # $example on$
    # Load training data
    data = spark.read.format("libsvm")\
        .load("data/mllib/sample_multiclass_classification_data.txt")

    # Split the data into train and test
    splits = data.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # specify layers for the neural network:
    # input layer of size 4 (features), two intermediate of size 5 and 4
    # and output of size 3 (classes)
    layers = [4, 5, 4, 3]

    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

    # train the model
    model = trainer.fit(train)

    # compute accuracy on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    spark.stop()
Example #38
# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

treeModel = model.stages[2]
# summary only
print(treeModel)

# see for more: https://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-classifier

# Churn - which customers (of a telecommunications company) are likely to stop using their service
# Churn dataset provided by the UC Irvine machine-learning repository hosted by SGI
# Data from https://www.sgi.com/tech/mlc/db/churn.all
# $ wget https://www.sgi.com/tech/mlc/db/churn.all

# Classification - Random Forest
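The Random Forest section is cut off after its heading; a minimal sketch following the indexer/forest pattern used throughout these examples (churn_df, its label column and numeric_cols are all assumptions):

# Hypothetical continuation on the churn data loaded above.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

labelIndexer = StringIndexer(inputCol="churned", outputCol="label")        # label column assumed
assembler = VectorAssembler(inputCols=numeric_cols, outputCol="features")  # numeric_cols assumed
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20)
pipeline = Pipeline(stages=[labelIndexer, assembler, rf])
trainingData, testData = churn_df.randomSplit([0.7, 0.3])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
print("Accuracy = %g" % evaluator.evaluate(predictions))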
Example #39
si  = StringIndexer(inputCol="purpose", outputCol="purpose_index")
hot = OneHotEncoder(inputCol="purpose_index", outputCol="purpose_features")
va  = VectorAssembler(inputCols=["loan_amnt", "interest_rate", "employment_length", "home_owner", "income", "verified", "open_accts", "credit_debt", "purpose_features"], outputCol="features")
dtr = DecisionTreeRegressor(featuresCol="features", labelCol="default", predictionCol="prediction", maxDepth=2, varianceCol="variance")
gbr = GBTRegressor(featuresCol="features", labelCol="default", predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)
gbc = GBTClassifier(featuresCol="features", labelCol="default", predictionCol="prediction", maxDepth=5, maxIter=20, seed=12345)

pipeline = Pipeline(stages=[si, hot, va, gbc])

model = pipeline.fit(training)
model.write().overwrite().save('hdfs:///tmp/spark_model')

predictions = model.transform(testing)

predictions.select(['default','prediction']).sort(col('prediction').desc()).show(25,False)

#evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="default")
#rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
#r2   = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

#evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="default")
#evaluator.evaluate(predictions)
#evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="default")
evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})


#ZEND
Example #40
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)

tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[19]:

print "Testing precision of the model"
t0 = time()

dfValidSelect=dfValid.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(['selectedFeatures','label']).cache()
dfValidIndexed = string_indexer_model.transform(dfValidSelect).cache()
df_valid_pred = lrModel.transform(dfValidIndexed).cache()
res=evaluator.evaluate(df_valid_pred)
print(res)

tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[12]:

import loadFiles as lf
print "Start loading  and preprocessing test data "
t0 = time()

test,names=lf.loadUknown('./data/test')
text_name=zip(test,names)
dfTest = sc.parallelize(text_name).toDF(['review','label']).cache()
Example #41
def run_MLA(XX,XXpredict,yy,yypredict,unique_IDS_tr,unique_IDS_pr,uniquetarget_tr,uniquetarget_pr,n_feat,ind_run_name,n_run):
    logger.info('Starting MLA run')
    logger.info('------------')
    if settings.pyspark_on == 1:                # Use pyspark or not? Pyspark makes cross-node (HPC) calculation possible.
        from pyspark import SparkContext        # It's slower, and manages resources between nodes over HTTP.
        from pyspark.sql import SQLContext      # So far it does not expose feature-importance outputs,
        from pyspark.ml import Pipeline         # so I would have to implement feature importances myself. May be time consuming.
        from pyspark.ml.feature import VectorAssembler
        from pyspark.ml.classification import RandomForestClassifier
        from pyspark.ml.feature import StringIndexer, VectorIndexer
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        # pyspark go
        
        if settings.pyspark_remake_csv == 1: # Making the csv files for the pyspark MLA to read in is time consuming, turn off the file generation?
            logger.info('Remaking csvs for pysparks...')
            numpy.savetxt(temp_train, XX, delimiter=",")
            logger.info('Training csv saved')
            numpy.savetxt(temp_pred, XXpredict, delimiter=",")
            logger.info('Predict csv saved')
        sc = SparkContext(appName="ML_RF") # Initiate spark
        
        sclogger=sc._jvm.org.apache.log4j # Initiate spark logging
        sclogger.LogManager.getLogger("org").setLevel(sclogger.Level.ERROR)
        sclogger.LogManager.getLogger("akka").setLevel(sclogger.Level.ERROR)
        sqlContext=SQLContext(sc)
        # Read in data
        data_tr = sqlContext.read.format("com.databricks.spark.csv").options(header='false',inferSchema='true').load(temp_train)
        data_pr = sqlContext.read.format("com.databricks.spark.csv").options(header='false',inferSchema='true').load(temp_pred)
        data_tr=data_tr.withColumnRenamed(data_tr.columns[-1],"label") # rename last column (answers), to label
        data_pr=data_pr.withColumnRenamed(data_pr.columns[-1],"label")
        
        assembler=VectorAssembler(inputCols=data_tr.columns[:-1],outputCol="features")
        reduced=assembler.transform(data_tr.select('*')) # Assemble feature vectors for the Spark MLA
        
        assembler_pr=VectorAssembler(inputCols=data_pr.columns[:-1],outputCol="features")
        reduced_pr=assembler_pr.transform(data_pr.select('*'))
        
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(reduced) # Index vectors        
        featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(reduced)
        # Initiate MLA alg
        rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",numTrees=100,maxDepth=5,maxBins=200)
        
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf]) # Set up fitting pipeline
        start, end=[],[] # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        model=pipeline.fit(reduced) # Fit
        end = time.time()
        logger.info('Fit ended in %s seconds' %(end-start))
        logger.info('------------')
        start, end=[],[]
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        predictions = model.transform(reduced_pr) # Predict
        evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",predictionCol="prediction",metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        logger.info("Test Error = %g" %(1.0-accuracy))
        logger.info('------------')
        logger.info('Pulling results ...')
        yypredict=numpy.array(predictions.select("indexedLabel").collect()) # Pulls all results into numpy arrays to continue program
        yypredict=yypredict[:,0]
        result=numpy.array(predictions.select("prediction").collect())
        result=result[:,0]
        XXpredict=numpy.array(predictions.select("indexedFeatures").collect())
        XXpredict=XXpredict[:,0]
        probs=numpy.array(predictions.select("probability").collect())
        probs=probs[:,0]
        XXpredict=numpy.column_stack((XXpredict,yypredict))
        end=time.time()
        logger.info('Predict ended in %s seconds' %(end-start))
        logger.info('------------')
    
    else:
        # Run sklearn MLA switch
        MLA = get_function(settings.MLA) # Pulls in machine learning algorithm from settings
        clf = MLA().set_params(**settings.MLAset)
        logger.info('MLA settings') 
        logger.info(clf)
        logger.info('------------')    
        start, end=[],[] # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        clf = clf.fit(XX[:,0:n_feat],yy) # XX is train array, yy is training answers
        end = time.time()
        logger.info('Fit ended in %s seconds' %(end-start))
        logger.info('------------')
        score = clf.score
        if 'OvsA' not in ind_run_name:
            if settings.output_all_trees == 1:
                i_tree = 0
                for tree_in_forest in clf.estimators_:
                    with open('plots/tree_' + str(i_tree) + '.dot', 'w') as my_file:
                        my_file = tree.export_graphviz(tree_in_forest, out_file = my_file,feature_names=feat_names,class_names=uniquetarget_tr[0], filled=True)
                    os.system('dot -Tpng plots/tree_%s.dot -o plots/tree_%s.png' %(i_tree,i_tree))
                    os.remove('plots/tree_%s.dot' %i_tree)
                    i_tree = i_tree + 1        
            else:
                with open('plots/tree_example.dot', 'w') as my_file:
                    my_file = tree.export_graphviz(clf.estimators_[0], out_file = my_file,feature_names=feat_names,class_names=uniquetarget_tr[0], filled=True)
                os.system('dot -Tpng plots/tree_example.dot -o plots/tree_example.png')
                os.remove('plots/tree_example.dot')
        start, end=[],[]
        # Split cats for RAM management
        numcats = numpy.int64((2*(XXpredict.size/1024/1024)*clf.n_jobs))
        if settings.get_contributions ==1:
            numcats=100
        if numcats < 1:
            numcats = 1
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        result,probs,bias,contributions,train_contributions=[],[],[],[],[]
        XXpredict_cats=numpy.array_split(XXpredict,numcats)
        logger.info('Splitting predict array into %s' %numcats)
        logger.info('------------')
        for i in range(len(XXpredict_cats)):
            logger.info('Predicting cat %s/%s' %(i+1,len(XXpredict_cats)))
            result.extend(clf.predict(XXpredict_cats[i][:,0:n_feat])) # XX is predict array.
            probs.extend(clf.predict_proba(XXpredict_cats[i][:,0:n_feat])) # Only take from 0:n_feat because answers are tacked on end
            if 'OvsA' not in ind_run_name:            
                if (settings.get_contributions == 1) | (settings.get_perfect_contributions==1):           
                    logger.info('Getting contributions from predict catalogue %s' %i)
                    tiresult = ti.predict(clf,XXpredict_cats[i][:,0:n_feat])
                    contributions.extend(tiresult[2])
                    bias = tiresult[1][0]
        feat_importance = clf.feature_importances_
        result=numpy.float32(result)
        probs=numpy.float32(probs)
        if 'OvsA' not in ind_run_name:            
            if settings.get_contributions == 1: 
                numpy.save('contributions',contributions)
            if settings.get_perfect_contributions == 1: 
                numpy.save('perfect_contributions',contributions)
            if settings.compute_contribution_mic == 1:
                logger.info('Getting contributions from train catalogue (for plot_mic_cont)')
                tiresult_train = ti.predict(clf,XX[:,0:n_feat])
                train_contributions=tiresult_train[2]
                bias_train = tiresult_train[1][0]
        
        # sklearn metric functions expect (y_true, y_pred)
        accuracy = metrics.accuracy_score(yypredict,result)
        recall = metrics.recall_score(yypredict,result,average=None)
        precision = metrics.precision_score(yypredict,result,average=None)
        score = metrics.f1_score(yypredict,result,average=None)
        
        end = time.time()
        logger.info('Predict ended in %s seconds' %(end-start))
        logger.info('------------')

    logger.info('Recall Score: %s' %recall)
    logger.info('Precision Score: %s' %precision)
    logger.info('Accuracy Score: %s' %accuracy)
    logger.info('F1 Score: %s' %score)
    percentage=(n/predictdatanum)*100
    
    run_opts.diagnostics([result,yypredict,unique_IDS_tr, unique_IDS_pr,uniquetarget_tr,uniquetarget_pr],'result')
#    stats=numpy.array([])
#    stats=numpy.column_stack((clf.n_estimators,traindatanum,predictdatanum,percentage))
    # SAVE
    if settings.saveresults == 1:
        logger.info('Saving results')
        logger.info('------------')

        numpy.savetxt(settings.result_outfile+('_%s' %ind_run_name)+'.txt',numpy.column_stack((yypredict,result)),header="True_target Predicted_target")
        numpy.savetxt(settings.prob_outfile+('_%s' %ind_run_name)+'.txt',probs)
        numpy.savetxt(settings.feat_outfile+('_%s' %ind_run_name)+'.txt',feat_importance)
        numpy.savetxt(settings.stats_outfile+('_%s' %ind_run_name)+'.txt',numpy.column_stack((clf.n_estimators,traindatanum,predictdatanum,percentage,clf.max_depth)),header="n_est traindatanum predictdatanum percentage max_depth",fmt="%s")
    
    return result,feat_importance,probs,bias,contributions,accuracy,recall,precision,score,clf,train_contributions
예제 #42
gbt = GBTClassifier(labelCol='PrivateIndex', featuresCol='features')

# fitting train data (dtc and rfc are assumed to be a DecisionTreeClassifier and
# a RandomForestClassifier defined earlier; the snippet starts mid-script)
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

# get predictions
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

my_binary_eval = BinaryClassificationEvaluator(labelCol="PrivateIndex")

print "Decision Tree Classification: "
print my_binary_eval.evaluate(dtc_preds)
print "Random Forest Classification: "
print my_binary_eval.evaluate(rfc_preds)

# GBT models in older Spark versions expose no rawPrediction column, so evaluate on 'prediction'
my_binary_eval_2 = BinaryClassificationEvaluator(labelCol="PrivateIndex",
                                                 rawPredictionCol='prediction')
print "Gradient tree booster: "
print my_binary_eval_2.evaluate(gbt_preds)

acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex',
                                             metricName='accuracy')

rfc_acc = acc_eval.evaluate(rfc_preds)

print "this is the new rfc_acc: {}".format(rfc_acc)
print("Processing crossvalidation with 3-fold & 200/500 hidden layer units")
crossval = CrossValidator(estimator=pipeline,
                  estimatorParamMaps=paramGrid_MLP,
                  evaluator=evaluator,
                  numFolds=3)
starttime = datetime.datetime.now()
CV_model = crossval.fit(vectorizedData)
print CV_model.bestModel.stages[2]
print('Done on fitting model:%s'%(datetime.datetime.now()-starttime))

print("Transforming testing data...")
vectorized_test_data = testing_data.toDF()

#transformed_data1 = CV_model.transform(vectorizedData)
#print evaluator.getMetricName(), 'accuracy:', evaluator.evaluate(transformed_data1)
transformed_data = CV_model.transform(vectorized_test_data)
#print transformed_data.first()
print("Fitting testing data into model...")
print evaluator.getMetricName(), 'accuracy:', evaluator.evaluate(transformed_data)

predictions = transformed_data.select('indexedLabel', 'prediction')
predictions.describe().show()
print predictions.take(10)
predictions.where(predictions.prediction != predictions.indexedLabel).show()



#predictAndLabel=valid.map(lambda p: (model.predict(p.features),p.label))
#accuracy = 1.0*predictAndLabel.filter(lambda (x, v): x == v).count()/valid.count()
#accuracy
예제 #44
File: V.py  Project: Inscrutive/spark
ax0.set_title('First Model', color='#999999')
ax1.set_title('Second Model', color='#999999')
generateROC(axList[0], labelsAndScores)
generateROC(axList[1], labelsAndScores2)
display(fig)

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

metric = 'precision'

multiclassEval = MulticlassClassificationEvaluator()

multiclassEval.setMetricName(metric)
print 'Model one {0}: {1:.3f}'.format(metric, multiclassEval.evaluate(irisTestPredictions))
print 'Model two {0}: {1:.3f}\n'.format(metric, multiclassEval.evaluate(irisTestPredictions2))

# COMMAND ----------

import inspect
print inspect.getsource(MulticlassClassificationEvaluator)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Using MLlib instead of ML
# MAGIC  
# MAGIC We've been using `ml` transformers, estimators, pipelines, and evaluators.  How can we accomplish the same things with MLlib?

# COMMAND ----------
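
# MAGIC %md A minimal sketch (an addition, not part of the original notebook) of the same workflow with the RDD-based MLlib API: train a multinomial logistic regression on an RDD of `LabeledPoint`s and evaluate it with `MulticlassMetrics`.  The `irisTrain`/`irisTest` DataFrames with `features`/`label` columns are assumed from earlier cells.

# COMMAND ----------

from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.regression import LabeledPoint

# Convert DataFrame rows to mllib LabeledPoints
irisTrainRDD = irisTrain.rdd.map(lambda row: LabeledPoint(row.label, row.features))

# Train a multinomial logistic regression with L-BFGS (three iris classes)
mllibModel = LogisticRegressionWithLBFGS.train(irisTrainRDD, numClasses=3)

# Score the test set and compute multiclass metrics on (prediction, label) pairs
predictionAndLabelRDD = irisTest.rdd.map(
    lambda row: (float(mllibModel.predict(row.features)), row.label))
mllibMetrics = MulticlassMetrics(predictionAndLabelRDD)
print 'MLlib weighted precision: {0:.3f}'.format(mllibMetrics.weightedPrecision)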
# COMMAND ----------

# MAGIC %md Run `CrossValidator`.  `CrossValidator` checks to see if an MLflow tracking server is available.  If so, it logs runs within MLflow:
# MAGIC
# MAGIC * Under the current active run, log info for `CrossValidator`.  (Create a new run if none are active.)
# MAGIC * For each submodel (number of folds of cross-validation x number of ParamMaps tested)
# MAGIC   * Log a run for this submodel, along with the evaluation metric on the held-out data.

# COMMAND ----------

# Explicitly create a new run.
# This allows this cell to be run multiple times.
# If you omit mlflow.start_run(), then this cell could run once,
# but a second run would hit conflicts when attempting to overwrite the first run.
import mlflow
with mlflow.start_run():
    cvModel = cv.fit(training)
    test_metric = evaluator.evaluate(cvModel.transform(test))
    mlflow.log_metric('test_' + evaluator.getMetricName(),
                      test_metric)  # Logs additional metrics

# COMMAND ----------

# MAGIC %md To view the MLflow experiment associated with the notebook, click the **Runs** icon in the notebook context bar on the upper right.  There, you can view all runs. To more easily compare their results, click the button on the upper right that reads "View Experiment UI" when you hover over it.
# MAGIC
# MAGIC To understand the effect of tuning `maxDepth`:
# MAGIC
# MAGIC 1. Filter by `params.maxBins = "8"`.
# MAGIC 1. Select the resulting runs and click **Compare**.
# MAGIC 1. In the Scatter Plot, select X-axis **maxDepth** and Y-axis **avg_weightedPrecision**.
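
# COMMAND ----------

# A sketch (an addition, not part of the original notebook) of the same comparison
# done programmatically. mlflow.search_runs returns a pandas DataFrame of logged
# runs; the params./metrics. column names assume the fields described above.
import mlflow

runs = mlflow.search_runs(filter_string="params.maxBins = '8'")
print(runs[["params.maxDepth", "metrics.avg_weightedPrecision"]])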
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)

tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[25]:

print "Testing precision of the model"
t0 = time()

dfValidSelect=dfValid.map(partial(vectorize, dicoUni=dict_broad.value, dicoTri=dictTri_broad.value)).toDF(['selectedFeatures','label']).cache()
dfValidIndexed = string_indexer_model.transform(dfValidSelect)
df_valid_pred = lrModel.transform(dfValidIndexed).cache()
res=evaluator.evaluate(df_valid_pred)
print res

tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[28]:

import loadFiles as lf
print "Start loading  and preprocessing test data "
t0 = time()

test,names=lf.loadUknown('./data/test')
text_name=zip(test[1:2000],names[1:2000])
dfTest = sc.parallelize(text_name).toDF(['review','label']).cache()
def main(base_path):
  APP_NAME = "train_spark_mllib_model.py"
  
  # If there is no SparkSession, create the environment
  try:
    sc and spark
  except NameError as e:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession.builder.appName(APP_NAME).getOrCreate()
  
  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf
  
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),
    StructField("CRSArrTime", TimestampType(), True),
    StructField("CRSDepTime", TimestampType(), True),
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Route", StringType(), True),
    StructField("TailNum", StringType(), True),
    StructField("EngineManufacturer", StringType(), True),
    StructField("EngineModel", StringType(), True),
    StructField("Manufacturer", StringType(), True),
    StructField("ManufacturerYear", StringType(), True),
    StructField("OwnerState", StringType(), True),
  ])
  
  input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  features.first()
  
  #
  # Add the hour of day of scheduled arrival/departure
  #
  from pyspark.sql.functions import hour
  features_with_hour = features.withColumn(
    "CRSDepHourOfDay",
    hour(features.CRSDepTime)
  )
  features_with_hour = features_with_hour.withColumn(
    "CRSArrHourOfDay",
    hour(features.CRSArrTime)
  )
  features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()
  
  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
  cols_with_nulls = list(filter(lambda x: x[1] > 0, null_counts))
  print("\nNull Value Report")
  print("-----------------")
  print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))
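
  # Sketch (an addition, not in the original): if the report above shows nulls,
  # drop or fill them before Spark ML, e.g.:
  #   features_with_hour = features_with_hour.na.drop(subset=["ArrDelay", "DepDelay"])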
  
  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into early, on-time, slightly late, very late (buckets 0-3)
  #
  from pyspark.ml.feature import Bucketizer
  
  # Setup the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )
  
  # Save the model
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
  
  # Apply the model
  ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()
  
  #
  # Extract features with the tools in pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler
  
  # Turn category fields into indexes
  string_columns = ["Carrier", "Origin", "Dest", "Route",
                    "TailNum"]
  for column in string_columns:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )
    
    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
    
    # Save the string indexer model
    string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)
  
  # Combine continuous, numeric fields with indexes of nominal ones
  # ...into one feature vector
  numeric_columns = [
    "DepDelay", "Distance",
    "DayOfYear",
    "CRSDepHourOfDay",
    "CRSArrHourOfDay"]
  index_columns = [column + "_index" for column in string_columns]
  
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)
  
  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)
  
  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)
  
  # Inspect the finalized features
  final_vectorized_features.show()
  
  #
  # Cross validate, train and evaluate classifier: loop split_count times over 4 metrics
  #
  
  from collections import defaultdict
  scores = defaultdict(list)
  feature_importances = defaultdict(list)
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3
  
  for i in range(1, split_count + 1):
    print("\nRun {} out of {} of test/train splits in cross validation...".format(
      i,
      split_count,
    )
    )
    
    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])
    
    # Instantiate and fit a random forest classifier on the training split
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4896,
    )
    model = rfc.fit(training_data)
    
    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)
    
    # Evaluate model using test data
    predictions = model.transform(test_data)
    
    # Evaluate this split's results for each metric
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    for metric_name in metric_names:
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)
      
      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))
    
    #
    # Collect feature importances
    #
    feature_names = vector_assembler.getInputCols()
    feature_importance_list = model.featureImportances
    for feature_name, feature_importance in zip(feature_names, feature_importance_list):
      feature_importances[feature_name].append(feature_importance)
  
  #
  # Evaluate average and STD of each metric and print a table
  #
  import numpy as np
  score_averages = defaultdict(float)
  
  # Compute the table data
  average_stds = []  # ha
  for metric_name in metric_names:
    metric_scores = scores[metric_name]
    
    average_accuracy = sum(metric_scores) / len(metric_scores)
    score_averages[metric_name] = average_accuracy
    
    std_accuracy = np.std(metric_scores)
    
    average_stds.append((metric_name, average_accuracy, std_accuracy))
  
  # Print the table
  print("\nExperiment Log")
  print("--------------")
  print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))
  
  #
  # Persist the score to a score log that persists between runs
  #
  import pickle
  
  # Load the score log or initialize an empty one
  try:
    score_log_filename = "{}/models/score_log.pickle".format(base_path)
    score_log = pickle.load(open(score_log_filename, "rb"))
    if not isinstance(score_log, list):
      score_log = []
  except IOError:
    score_log = []
  
  # Compute this run's score log entry
  score_log_entry = {
    metric_name: score_averages[metric_name] for metric_name in metric_names
  }
  
  # Compute and display the change in score for each metric
  try:
    last_log = score_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_log = score_log_entry
  
  experiment_report = []
  for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    experiment_report.append((metric_name, run_delta))
  
  print("\nExperiment Report")
  print("-----------------")
  print(tabulate(experiment_report, headers=["Metric", "Score"]))
  
  # Append this run's average scores to the log
  score_log.append(score_log_entry)
  
  # Persist the log for next run
  pickle.dump(score_log, open(score_log_filename, "wb"))
  
  #
  # Analyze and report feature importance changes
  #
  
  # Compute averages for each feature
  feature_importance_entry = defaultdict(float)
  for feature_name, value_list in feature_importances.items():
    average_importance = sum(value_list) / len(value_list)
    feature_importance_entry[feature_name] = average_importance
  
  # Sort the feature importances in descending order and print
  import operator
  sorted_feature_importances = sorted(
    feature_importance_entry.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  print("\nFeature Importances")
  print("-------------------")
  print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))
  
  #
  # Compare this run's feature importances with the previous run's
  #
  
  # Load the feature importance log or initialize an empty one
  try:
    feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
    feature_log = pickle.load(open(feature_log_filename, "rb"))
    if not isinstance(feature_log, list):
      feature_log = []
  except IOError:
    feature_log = []
  
  # Compute and display the change in score for each feature
  try:
    last_feature_log = feature_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_feature_log = defaultdict(float)
    for feature_name, importance in feature_importance_entry.items():
      last_feature_log[feature_name] = importance
  
  # Compute the deltas
  feature_deltas = {}
  for feature_name in feature_importances.keys():
    run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
    feature_deltas[feature_name] = run_delta
  
  # Sort feature deltas, biggest change first
  import operator
  sorted_feature_deltas = sorted(
    feature_deltas.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  # Display sorted feature deltas
  print("\nFeature Importance Delta Report")
  print("-------------------------------")
  print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))
  
  # Append this run's average feature importances to the log
  feature_log.append(feature_importance_entry)
  
  # Persist the log for next run
  pickle.dump(feature_log, open(feature_log_filename, "wb"))
예제 #48
                           stderr=sys.stderr,
                           prefix_output_with_timestamp=True)
    keras_estimator = hvd.KerasEstimator(
        backend=backend,
        store=store,
        model=model,
        optimizer=optimizer,
        loss=loss,
        metrics=['accuracy'],
        feature_cols=['features'],
        label_cols=['label_vec'],
        batch_size=args.batch_size,
        epochs=args.epochs,
        random_seed=1,
        inmemory_cache_all=True,
        verbose=1,
        callbacks=[keras.callbacks.TensorBoard(profile_batch=5)])

    keras_model = keras_estimator.fit(train_df).setOutputCols(['label_prob'])

    # Evaluate the model on the held-out test DataFrame
    pred_df = keras_model.transform(test_df)
    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))
    evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred',
                                                  labelCol='label',
                                                  metricName='accuracy')
    print('Test accuracy:', evaluator.evaluate(pred_df))

    spark.stop()
예제 #49
    bst_model_path = model_save_path + "_bst_model"
    train_df, test_df = train_df.randomSplit([0.8, 0.2], seed=12345)
    bst_model = train_with_tune(train_df)
    bst_model.write().overwrite().save(bst_model_path)

    # Use the best model from tuning to predict on the test data.
    # The prediction rows have a structure roughly like:
    #      features = Vectors.dense(...)
    #      label=0,
    #      rawPrediction=DenseVector([0.048, -0.048]),
    #      probability=DenseVector([0.512, 0.488]),
    #      prediction=0.0
    loaded_bst_model = PipelineModel.load(bst_model_path)
    result = loaded_bst_model.transform(train_df)
    predict_result = loaded_bst_model.transform(test_df)
    print("predicted sample :", predict_result.take(3))

    # Evaluate the trained binary classification model
    bin_eval = BinaryClassificationEvaluator()
    predict_metric = bin_eval.evaluate(predict_result, {bin_eval.metricName: "areaUnderROC"})
    print("trained model test auc metric", predict_metric)

    # Inspect the detailed classification metrics; f1 is computed by default
    mm = MulticlassClassificationEvaluator()
    f1 = mm.evaluate(predict_result)
    accuracy = mm.evaluate(predict_result, {mm.metricName: "accuracy"})
    precision = mm.evaluate(predict_result, {mm.metricName: "weightedPrecision"})
    recall = mm.evaluate(predict_result, {mm.metricName: "weightedRecall"})
    print("predict trained model precision: %f, recall: %f, acc: %s, f1: %f " \
          % (precision, recall, accuracy, f1))
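
    # A sketch (an addition, not in the original): the actual confusion matrix the
    # comment above alludes to, via the RDD-based MulticlassMetrics.
    from pyspark.mllib.evaluation import MulticlassMetrics
    pred_and_label = predict_result.select("prediction", "label") \
        .rdd.map(lambda row: (float(row.prediction), float(row.label)))
    print("confusion matrix:\n%s" % MulticlassMetrics(pred_and_label).confusionMatrix().toArray())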
예제 #50
    "predictionLabel").setLabels(label_indexer.labels)
"""把数据集分成训练集和测试集"""
training_data, test_data = df.randomSplit([0.7, 0.3])
"""4 构建决策树分类模型"""
# 导入需要的包
from pyspark.ml.classification import DecisionTreeClassificationModel, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train the decision tree model. Parameters can be set via setter methods or with a
# ParamMap (see the Spark MLlib site for details); explainParams() lists all of them.
dt_classifier = DecisionTreeClassifier().setLabelCol(
    "indexedLabel").setFeaturesCol("indexedFeatures")
# Configure the stages in a pipeline
pipeline_classifier = Pipeline().setStages(
    [label_indexer, feature_indexer, dt_classifier, label_converter])
# Fit the decision tree model
model_classifier = pipeline_classifier.fit(training_data)
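
# A sketch (an addition, not in the original) of what the comment above describes:
# explainParams() lists every settable parameter, and a ParamMap passed to fit()
# overrides parameters without calling setters.
print(dt_classifier.explainParams())
model_deeper = pipeline_classifier.fit(training_data, {dt_classifier.maxDepth: 10})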
# Make predictions
predictions_classifier = model_classifier.transform(test_data)
# Inspect some of the predictions
predictions_classifier.select("predictionLabel", "label", "features").show(20)
"""5 评估决策树分类模型"""
evaluator_classifier = MulticlassClassificationEvaluator().setLabelCol(
    "indexedLabel").setPredictionCol("prediction").setMetricName("accuracy")
accuracy = evaluator_classifier.evaluate(predictions_classifier)
print("Test Error: ", str(1.0 - accuracy))
tree_model_classifier = model_classifier.stages[2]
print("Learned classification tree model:\n",
      str(tree_model_classifier.toDebugString))
spark.stop()
예제 #51
pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])


# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)

grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()

print "Grid is build"

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator)

print "CV Estimator is defined"

cv_model = cv.fit(dfTrain)

print "Model is fitted"

df_test_pred = cv_model.transform(dfTest)

print "Labels are predicted"

print evaluator.evaluate(df_test_pred)
예제 #52
data = vector_assembler.transform(data)
# data.show()
# Encode city as a numeric index
label_indexer = ft.StringIndexer(inputCol="city",
                                 outputCol="city_int").fit(data)
label_converter = ft.IndexToString(inputCol="pred_int",
                                   outputCol="pred",
                                   labels=label_indexer.labels)

train, test = data.randomSplit([0.7, 0.3])
# Define the random forest classifier
classifier = RandomForestClassifier(labelCol="city_int",
                                    featuresCol="features",
                                    predictionCol="pred_int",
                                    maxDepth=8,
                                    maxBins=128,
                                    maxMemoryInMB=512,
                                    numTrees=50)

# Train the model and make predictions
pipeline = Pipeline(stages=[label_indexer, classifier, label_converter])
model = pipeline.fit(train)
prediction = model.transform(test)
prediction.select("city", "city_int", "pred", "pred_int", "features").show(100)
# Evaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='pred_int',
                                              labelCol='city_int',
                                              metricName='accuracy')
score = evaluator.evaluate(prediction)
print("准确率: ", score)
예제 #53
    # $example on$
    # load data file.
    inputData = spark.read.format("libsvm") \
        .load("data/mllib/sample_multiclass_classification_data.txt")

    # generate the train/test split.
    (train, test) = inputData.randomSplit([0.8, 0.2])

    # instantiate the base classifier.
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

    # instantiate the One Vs Rest Classifier.
    ovr = OneVsRest(classifier=lr)

    # train the multiclass model.
    ovrModel = ovr.fit(train)

    # score the model on test data.
    predictions = ovrModel.transform(test)

    # obtain evaluator.
    evaluator = MulticlassClassificationEvaluator(metricName="precision")

    # compute the classification error on test data.
    precision = evaluator.evaluate(predictions)
    print("Test Error : " + str(1 - precision))
    # $example off$

    spark.stop()
예제 #54
    "clicks_in_15", "clicks_in_16", "clicks_in_17", "clicks_in_18",
    "clicks_in_19"
],
                            outputCol="features")
vd = assembler.transform(df_4)

training, test = vd.randomSplit([0.6, 0.4], 1234138471039)

# layer sizes: 19 inputs, one hidden layer of 5 units, 3 output classes
layers = [19, 5, 3]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100,
                                         layers=layers,
                                         blockSize=1000,
                                         seed=1234138471039)

# train the model
model = trainer.fit(training)

result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
params = model.explainParams()
print(params)

mlflow.log_param("seed", 1234138471039)
mlflow.log_param("maxIter", 100)
mlflow.log_param("blockSize", 1000)
mlflow.log_metric("Test set accuracy", evaluator.evaluate(predictionAndLabels))
예제 #55
data_rdd = data_rdd.map(lambda x: convertToDataFrame(x)).cache()
data_df = spark.createDataFrame(data_rdd)
# Split the data into train and test
splits = data_df.randomSplit([0.6, 0.4])
train = splits[0]
test = splits[1]

# 3. training
print '>>>>> training  '
from pyspark.ml.classification import MultilayerPerceptronClassifier

mlp = MultilayerPerceptronClassifier(
    maxIter=1000,
    tol=1e-4,
    seed=1,
    layers=[n_features, n_features, n_features, n_features, n_classes],
    blockSize=100,
    stepSize=0.03,
    solver="l-bfgs",
    initialWeights=None)
model = mlp.fit(train)

# 4. compute accuracy on the test set
print '>>>>> testing  '
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Spark Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
예제 #56
train_data_indexed = si_model_training.transform(train_data)
# index the test data with an indexer fitted on the training data
si_model_test = stringIndexer.fit(train_data)
test_data_indexed = si_model_test.transform(test_data)

# Random Forest model
random_forest_class = RandomForestClassifier(numTrees=5,
                                             maxDepth=4,
                                             labelCol="indexed",
                                             seed=42)
random_forest_model = random_forest_class.fit(train_data_indexed)

# Prediction using Random Forest
predicted_result = random_forest_model.transform(test_data_indexed)
combined_result = predicted_result.select("label", "features", "prediction")
combined_result.show(80, False)
print 'Random Forest :: Weighted Precision of model : ', weighted_precision.evaluate(
    predicted_result)
print 'Random Forest :: Weighted Recall of model    : ', weighted_recall.evaluate(
    predicted_result)
'''
 Random Forest :: Weighted Precision of model : 0.776882205236
 Random Forest :: Weighted Recall of model    : 0.799712452997
'''

# Decision Tree model
decision_tree_class = DecisionTreeClassifier(maxDepth=4, labelCol="indexed")
decision_tree_model = decision_tree_class.fit(train_data_indexed)
# Prediction using Decision Tree
predicted_result = decision_tree_model.transform(test_data_indexed)
combined_result = predicted_result.select("label", "features", "prediction")
combined_result.show(80, False)
print 'Decision Tree :: Weighted Precision of model : ', weighted_precision.evaluate(
    predicted_result)
예제 #57
    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                                   labels=labelIndexer.labels)

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("predictedLabel", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    rfModel = model.stages[2]
    print(rfModel)  # summary only
    # $example off$

    spark.stop()


예제 #58
            'LateAircraftDelay'],
              outputCol="features")

output = assembler.transform(df2)

# Spark's MLlib cannot deal directly with "Yes"/"No" label values, so we use the StringIndexer method.
# isDelayIndex is created by transforming the string values to 0 and 1.
indexer = StringIndexer(inputCol="isDelay", outputCol="isDelayIndex")

output_fixed = indexer.fit(output).transform(output)

final_data = output_fixed.select("features",'isDelayIndex')

# note: this split uses only 30% of the data for training and 70% for testing
train_data,test_data = final_data.randomSplit([0.3,0.7])

rfc = RandomForestClassifier(labelCol='isDelayIndex',featuresCol='features')

rfc_model = rfc.fit(train_data)

rfc_predictions = rfc_model.transform(test_data)

acc_evaluator = MulticlassClassificationEvaluator(labelCol="isDelayIndex", predictionCol="prediction", metricName="accuracy")

rfc_acc = acc_evaluator.evaluate(rfc_predictions)

print("Here are the results!")

print('A random forest ensemble had an accuracy of: {0:2.2f}%'.format(rfc_acc*100))


예제 #59
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("NaiveBayesExample")\
        .getOrCreate()

    # $example on$
    # Load training data
    data = spark.read.format("libsvm") \
        .load("data/mllib/sample_libsvm_data.txt")

    # Split the data into train and test
    splits = data.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # train the model
    model = nb.fit(train)

    # compute accuracy on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    spark.stop()
예제 #60
#estima = NaiveBayes()
#grid = ParamGridBuilder().addGrid(5, [0, 2]).build()
lr = LogisticRegression(featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=20)  # choose the model
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
# the grid is built to find the best 'alpha' parameter for the model's regularization term: it is an elastic net
# maxIter is 30 by default; we could change its value
# so we will try 30 values between 0 and 1
# alpha=0 is an L2 regularization,
# alpha=1 is an L1 regularization
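
# A sketch (an addition, not in the original) of the grid the comment above actually
# describes: varying the elastic-net mixing parameter alpha rather than maxIter.
grid_enet = ParamGridBuilder().addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]).build()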
print "Cross validation debut"

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label",metricName='precision')	#choose the evaluator
cv = CrossValidator(estimator=lr, evaluator=evaluator)  # performs the cross validation and keeps the best value of maxIter (note: estimatorParamMaps=grid would normally be passed here)
#cvModel = cv.fit(dfTrain)	#train the model on the whole training set
model = lr.fit(dfTrain)
resultat=evaluator.evaluate(model.transform(dfTest))	#compute the percentage of success on test set
print "Pourcentage de bonne classification(0-1): ",resultat

##Train NaiveBayes
#model=NaiveBayes.train(labeledRDD)
##broadcast the model
#mb=sc.broadcast(model)
#
#test,names=lf.loadUknown('./data/test')
#name_text=zip(names,test)
##for each doc :(name,text):
##apply the model on the vector representation of the text
##return the name and the class
#predictions=sc.parallelize(name_text).map(partial(Predict,dictionary=dict_broad.value,model=mb.value)).collect()
#
#output=file('./classifications.txt','w')