def testLogisticMLPipeline1(self):
    """Fit a Tokenizer -> HashingTF -> LogisticRegression pipeline on a small
    labelled text corpus and assert that the default multiclass evaluator
    scores the held-out sentences at exactly 1.0.

    Uses the module-level ``sqlCtx`` both to build the DataFrames and to
    construct the classifier.
    """
    training = sqlCtx.createDataFrame([
        ("a b c d e spark", 1.0),
        ("b d", 2.0),
        ("spark f g h", 1.0),
        ("hadoop mapreduce", 2.0),
        ("b spark who", 1.0),
        ("g d a y", 2.0),
        ("spark fly", 1.0),
        ("was mapreduce", 2.0),
        ("e spark program", 1.0),
        ("a e c l", 2.0),
        ("spark compile", 1.0),
        ("hadoop software", 2.0)
    ], ["text", "label"])
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
    lr = LogisticRegression(sqlCtx)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    model = pipeline.fit(training)
    test = sqlCtx.createDataFrame([
        ("spark i j k", 1.0),
        ("l m n", 2.0),
        ("mapreduce spark", 1.0),
        ("apache hadoop", 2.0)
    ], ["text", "label"])
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator()
    score = evaluator.evaluate(predictionAndLabels)
    # `failUnless` is a deprecated alias that no longer exists in Python 3.12;
    # assertEqual performs the same `==` check and reports both values on failure.
    self.assertEqual(score, 1.0)
def RunRandomForest(tf, ctx):
    """Train and evaluate a random forest genre classifier.

    :param tf: RDD whose records are mapped through ``parseForRandomForest``.
    :param ctx: SparkContext used to create the SQLContext.

    Prints the predictions, the evaluator score (as a percentage) and the
    corresponding test error. Returns nothing.
    """
    sqlContext = SQLContext(ctx)
    rdd = tf.map(parseForRandomForest)

    # Column names applied to the parsed RDD.
    schema = ['genre', 'track_id', 'features']
    songDF = sqlContext.createDataFrame(rdd, schema)

    # Register the DataFrame as a table.
    songDF.registerTempTable("genclass")

    # The label indexer is fitted on the full data set, before the split.
    labelIndexer = (StringIndexer()
                    .setInputCol("genre")
                    .setOutputCol("indexedLabel")
                    .fit(songDF))

    trainingData, testData = songDF.randomSplit([0.8, 0.2])

    labelConverter = (IndexToString()
                      .setInputCol("prediction")
                      .setOutputCol("predictedLabel")
                      .setLabels(labelIndexer.labels))

    rfc = (RandomForestClassifier()
           .setMaxDepth(10)
           .setNumTrees(2)
           .setLabelCol("indexedLabel")
           .setFeaturesCol("features"))
    #rfc = SVMModel([.5, 10, 20], 5)
    #rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

    pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
    model = pipeline.fit(trainingData)

    predictions = model.transform(testData)
    predictions.show()

    evaluator = (MulticlassClassificationEvaluator()
                 .setLabelCol("indexedLabel")
                 .setPredictionCol("prediction")
                 .setMetricName("precision"))
    accuracy = evaluator.evaluate(predictions)

    # Single-argument print calls behave the same on Python 2 and 3; the
    # double space reproduces what the old `print a, b` statement emitted.
    print('Accuracy of RandomForest =  %s' % (accuracy * 100))
    print("Test Error =  %s" % ((1.0 - accuracy) * 100))
def main(sc, spark):
    """Train a multinomial logistic regression on the corpus and report its
    test error.

    :param sc: SparkContext, forwarded to ``load_corpus``.
    :param spark: SparkSession, forwarded to ``load_corpus``.
    """
    # Corpus and fitted vectorizer
    corpus = load_corpus(sc, spark)
    fitted_vectorizer = make_vectorizer().fit(corpus)

    # Label indexer, fitted on the whole corpus
    label_indexer = StringIndexer(
        inputCol="label", outputCol="indexedLabel"
    ).fit(corpus)

    # 80/20 train/test split
    training, test = corpus.randomSplit([0.8, 0.2])

    classifier = LogisticRegression(
        maxIter=10,
        regParam=0.3,
        elasticNetParam=0.8,
        family="multinomial",
        labelCol="indexedLabel",
        featuresCol="tfidf",
    )

    stages = [fitted_vectorizer, label_indexer, classifier]
    model = Pipeline(stages=stages).fit(training)

    # Predict on the held-out split and preview a few rows
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Accuracy on (prediction, true label), reported as test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel",
        predictionCol="prediction",
        metricName="accuracy",
    )
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    # Third pipeline stage is the fitted classifier; print its summary only
    fitted_classifier = model.stages[2]
    print(fitted_classifier)
def textPredict(request):
    """6. Text clustering and popularity prediction.

    Django view. Reads ``label`` and ``title`` from ``request.POST``, trains a
    decision tree on TF-IDF features of the stored news titles, scores the
    submitted title with it and renders the resulting prediction list.

    The SparkContext created here is always stopped, even when a step fails,
    so a failed request does not leave a live context behind.

    :raises KeyError: via ``request.POST[...]`` when a field is missing.
    """
    label = request.POST['label']
    title = request.POST['title']
    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    try:
        sqlContext = SQLContext(sc)

        # Process the data set and build the feature vectors.
        dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
        print(dfTitles.dtypes)
        tokenizer = Tokenizer(inputCol="title", outputCol="words")
        wordsData = tokenizer.transform(dfTitles)
        hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
        featurizedData = hashingTF.transform(wordsData)
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        rescaledData = idfModel.transform(featurizedData)
        rescaledData.show()
        for features_label in rescaledData.select("features", "rawFeatures").take(3):
            print(features_label)

        # Train the decision tree model.
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
        featureIndexer = VectorIndexer(
            inputCol="features", outputCol="indexedFeatures", maxCategories=4
        ).fit(rescaledData)
        (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
        dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
        model = pipeline.fit(trainingData)

        # Test the model on the held-out split.
        predictions = model.transform(testData)
        predictions.show()
        predictions.select("prediction", "indexedLabel", "features").show(5)

        # Score the user-supplied single news item. The tokenizer and hashing
        # stage are stateless, so the instances built above are reused; the IDF
        # weights come from the model fitted on the stored titles.
        sentenceData = sqlContext.createDataFrame([
            (label, title),
        ], ['label', "title"])
        wordsData = tokenizer.transform(sentenceData)
        featurizedData = hashingTF.transform(wordsData)
        rescaledData = idfModel.transform(featurizedData)
        myprediction = model.transform(rescaledData)
        print("==================================================")
        myprediction.show()
        resultList = convertDfToList(myprediction)

        # Evaluate the model.
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g " % (1.0 - accuracy))

        treeModel = model.stages[2]
        print(treeModel)
    finally:
        # Always release the context; a leaked one would block the next request
        # from creating its own.
        sc.stop()

    # NOTE(review): no template name is passed here; if `render` is
    # django.shortcuts.render it expects (request, template_name, context) - confirm.
    return render(request, {'resultList': resultList})
def sparking_your_interest():
    """Build n-gram/vocabulary features from the speeches data set, train a
    random forest that predicts the speaker, and print its test error.

    The model is fitted on the training split only, so the printed test error
    is measured on rows the model has not seen.
    """
    df = SQLContext.read.json('speeches_dataset.json')
    df_fillna = df.fillna("")
    print(df_fillna.count())
    print(df_fillna.printSchema())

    # NOTE(review): the null-filled frame above is only printed; the feature
    # pipeline continues from the raw `df`. Confirm this is intended.
    df_utf = call_utf_encoder(df)
    df_cleaned = call_para_cleanup(df_utf)
    print(df_cleaned)

    df_with_bigrams = call_ngrams(df_cleaned, 2)
    df_with_trigrams = call_ngrams(df_with_bigrams, 3)
    df_with_4grams = call_ngrams(df_with_trigrams, 4)
    # NOTE(review): the "5grams"/"6grams" frames are built with n=4 - looks
    # like a copy-paste slip; confirm against call_ngrams.
    df_with_5grams = call_ngrams(df_with_4grams, 4)
    df_with_6grams = call_ngrams(df_with_5grams, 4)
    df_with_vocab_score = call_speech_vocab(df_with_6grams)

    df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score, 100, '2grams')
    df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors, 100, '3grams')
    df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors, 100, '4grams')

    # NOTE(review): "2gramsfeatures" is listed three times; presumably the
    # 3-gram and 4-gram feature columns were meant - confirm the column names
    # produced by tf_feature_vectorizer before changing.
    assembler = VectorAssembler(
        inputCols=["2gramsfeatures", "2gramsfeatures", "2gramsfeatures", "vocab_score"],
        outputCol="features")
    assembler_output = assembler.transform(df_with_4grams_idf_vectors)
    output = assembler_output.selectExpr('speaker', 'speech_id', 'para_cleaned_text', 'features')
    print(output.show())
    print(output.count())

    # Seeded 80/20 split, done on the RDD and converted back to DataFrames.
    output_tordd = output.rdd
    train_rdd, test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
    train_df = train_rdd.toDF()
    test_df = test_rdd.toDF()
    print(train_df)
    print(test_df)
    print('Train DF - Count: ')
    print(train_df.count())
    print('Test DF - Count: ')
    print(test_df.count())

    print("Initializing RF Model")
    labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=1000,
                                featureSubsetStrategy="auto", impurity='gini',
                                maxDepth=4, maxBins=32)
    pipeline = Pipeline(stages=[labelIndexer, rf])
    # Fit on the training split only; fitting on `output` also trained on the
    # test rows that are scored below.
    model = pipeline.fit(train_df)
    print("Completed RF Model")

    predictions = model.transform(test_df)
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    rfModel = model.stages[1]
    print(rfModel)  # summary only
    print("Predictions: ")
    print(predictions.show())
def model(classifier, ftrain, fvalid, fprediction):
    """Train the named classifier, write predictions and timing to text files.

    :param classifier: key into the local classifier table
        (currently only ``"RandomForestClassifier"``).
    :param ftrain: training data source, passed to ``prepData``.
    :param fvalid: validation data source, passed to ``prepData``.
    :param fprediction: not used by this function; the predictions are saved
        under the fixed path ``'output'``.
    :raises KeyError: when ``classifier`` is not in the classifier table.
    """
    startTime = time.time()
    ctx = SparkContext(appName="model_on_Spark")
    sqlContext = SQLContext(ctx)
    logger = SparkLogger(ctx)
    logger.set_level('ERROR')

    # load and prepare training and validation data
    rawTrain, train = prepData(sqlContext, ctx, ftrain)
    rawValid, valid = prepData(sqlContext, ctx, fvalid)

    # the index column is needed to join predictions back to the raw rows
    valid = indexData(valid)
    rawValid = indexData(rawValid)

    classifiers = {
        "RandomForestClassifier": RFC
    }
    clf = classifiers[classifier]()

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

    # train and predict
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
    model = pipeline.fit(train)
    predictions = model.transform(valid)

    # write to file:
    subsetPrediction = predictions.select("prediction", "index")
    subsetValidData = rawValid.select("dataset", "index")
    output = (subsetValidData
              .join(subsetPrediction, subsetPrediction.index == subsetValidData.index)
              .drop("index")
              .drop("index"))
    # Go through `.rdd`: DataFrame.map only exists on Spark 1.x, while
    # `.rdd.map` is available on both 1.x and 2.x.
    lines = output.rdd.map(toCSVLine)
    lines.saveAsTextFile('output')

    # NOTE(review): the indexer writes to "indexed" but the evaluator reads
    # "label" - confirm which column the score should be computed against.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    executionTime = time.time() - startTime
    row = classifier + ',' + str(executionTime)
    ctx.parallelize([row]).saveAsTextFile("timing")
def price_predict(path, windows=5, spark_contest=None, sql_context=None):
    """Train one multilayer perceptron on the open-price data and one on the
    close-price data, printing the evaluator score of each on its test split.

    :param path: data location handed to ``DataParser``.
    :param windows: window size handed to ``DataParser``.
    :param spark_contest: Spark context; when None a context pair is obtained
        from ``load_spark_context()`` (which also replaces ``sql_context``).
    :param sql_context: SQL context used when ``spark_contest`` is supplied.
    """
    if spark_contest is None:
        spark_contest, sql_context = load_spark_context()

    parser = DataParser(path=path, window_size=windows)
    close_train_df, close_test_df, open_train_df, open_test_df = parser.get_n_days_history_data(
        data_type=DATA_FRAME, spark_context=spark_contest, sql_context=sql_context)

    # NOTE(review): PREDICTION is also used below as a column name in
    # select(); confirm it is really a valid evaluator metric name.
    evaluator = MulticlassClassificationEvaluator(metricName=PREDICTION)

    # Open data first (a single iteration), then close data (100 iterations).
    runs = (
        (1, open_train_df, open_test_df),
        (100, close_train_df, close_test_df),
    )
    for iterations, train_df, test_df in runs:
        trainer = MultilayerPerceptronClassifier(
            maxIter=iterations, layers=[4, 5, 4, 3], blockSize=128,
            featuresCol=FEATURES, labelCol=LABEL, seed=1234)
        fitted = trainer.fit(train_df)
        scored = fitted.transform(test_df)
        prediction_labels = scored.select(PREDICTION, LABEL)
        print("Precision:" + str(evaluator.evaluate(prediction_labels)))
def print_evaluation_metrics(model, test_df, labelCol="label", featuresCol="features"):
    """
    Prints evaluation metrics (accuracy, f1, weighted precision, weighted
    recall) of ``model`` on ``test_df``.

    :param model: fitted model; its ``transform`` is applied to ``test_df``.
    :param test_df: dataframe containing test data.
    :param labelCol: label column.
    :param featuresCol: features column (accepted for interface
        compatibility; not read by this function).
    :return: None. The metrics are only printed.
    """
    predictions = model.transform(test_df)

    # One evaluator, re-used with a per-call metric override.
    evaluator = MulticlassClassificationEvaluator(
        labelCol=labelCol, predictionCol="prediction")
    accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
    f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
    weighted_precision = evaluator.evaluate(
        predictions, {evaluator.metricName: "weightedPrecision"})
    weighted_recall = evaluator.evaluate(
        predictions, {evaluator.metricName: "weightedRecall"})

    # Single-argument print calls give the same output on Python 2 and 3.
    print("Accuracy: %s" % accuracy)
    print("f1: %s" % f1)
    print("Precision: %s" % weighted_precision)
    print("Recall: %s" % weighted_recall)
def calculate_accuracy_metrics(predictions):
    """
    Calculates accuracy metrics for a Prediction DataFrame

    :param predictions: DataFrame with ``indexedLabel`` and ``prediction``
        columns, where 1.0 marks the positive class and 0.0 the negative one.
    :return: list of ``[precision, recall, positive count, negative count,
        false positive count, false negative count]``; the two scores are
        rounded to two decimals.
    """
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction")
    # Note: the first value is the evaluator's "precision" metric.
    accuracy = round(evaluator.evaluate(predictions, {evaluator.metricName: "precision"}), 2)
    recall = round(evaluator.evaluate(predictions, {evaluator.metricName: "recall"}), 2)

    positive_cases = predictions.filter(predictions["indexedLabel"] == 1.0)
    negative_cases = predictions.filter(predictions["indexedLabel"] == 0.0)

    # Each frame is filtered on its own `prediction` column (the false
    # positive filter previously borrowed the column from `positive_cases`).
    false_positive_cases = negative_cases.filter(negative_cases["prediction"] == 1.0)
    false_negative_cases = positive_cases.filter(positive_cases["prediction"] == 0.0)

    return [accuracy,
            recall,
            positive_cases.count(),
            negative_cases.count(),
            false_positive_cases.count(),
            false_negative_cases.count()]
def build_decision_tree(sqlContext, features, interested):
    """Train a random forest on ``features`` / ``interested`` and score it.

    :param sqlContext: SQLContext used to create the DataFrame.
    :param features: sequence of feature rows, each convertible by
        ``Vectors.dense``.
    :param interested: labels, indexed in parallel with ``features``.
    :return: ``(1 - precision, model)`` where ``precision`` is the evaluator
        score on a random 20% hold-out and ``model`` is the fitted pipeline.
    :raises IndexError: when ``interested`` is shorter than ``features``.
    """
    print('-----------------------------------------')
    data = sqlContext.createDataFrame([
        Row(label=interested[i], features=Vectors.dense(row))
        for i, row in enumerate(features)
    ])
    data.printSchema()
    data.show(5)
    print('created data frame')

    # Index the label column & adding metadata.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
    print('created label indexer')

    # Features with at most 4 distinct values are treated as categorical.
    featureIndexer = VectorIndexer(
        inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

    # Split the data into training and test sets
    (trainingData, testData) = data.randomSplit([0.8, 0.2])

    # Train a random forest (the variable keeps its historical name `dt`).
    dt = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    # dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    # dt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)

    # Chain the indexers together with the classifier
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

    # Train the model
    model = pipeline.fit(trainingData)

    # Make predictions
    predictions = model.transform(testData)
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) & compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    precision = evaluator.evaluate(predictions)

    return (1 - precision, model)
def naiveBayeseian():
    """Train a multilayer perceptron on the two-class feature CSV and print
    the evaluator's precision on a seeded 20% hold-out.

    Despite its name, the function builds a MultilayerPerceptronClassifier.
    It reads the module-level ``sc``.
    """
    def parseLine(line):
        # One CSV record -> list of floats.
        #return LabeledPoint(keys[0],keys[1:])
        return [float(field) for field in line.split(",")]

    scdata1 = sc.textFile("/home/ubantu/TwoClassfeatureSet.csv")
    data = scdata1.map(parseLine)

    train, test = data.randomSplit([0.8, 0.2], 1234)

    # Network layout: 30 inputs, two hidden layers of 20, 2 outputs.
    layers = [30, 20, 20, 2]
    trainer = MultilayerPerceptronClassifier(
        maxIter=100, layers=layers, blockSize=128, seed=1234)

    # NOTE(review): `train` is an RDD of float lists here, while pyspark.ml
    # estimators normally take a DataFrame with label/features columns - confirm.
    model = trainer.fit(train)

    # Score the hold-out split.
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="precision")
    precision = evaluator.evaluate(predictionAndLabels)
    print("Precision:" + str(precision))
#gbt = GBTClassifier(numTrees = 10, maxDepth = 3, maxBins = 64)
# NOTE(review): `impurityType = gini` passes an unquoted name as an
# `impurityType` keyword; confirm both the keyword and the value against the
# GBTClassifier signature in use.
gbt = GBTClassifier(maxIter = 30, maxDepth = 2, impurityType = gini)
#gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)
##rf = RandomForestClassifier(numTrees = 25, maxDepth = 4, maxBins = 64)

# Fit a single-stage pipeline and preview its predictions on the test split.
pipeline = Pipeline(stages=[gbt])
pipelineModel = pipeline.fit(training)
testPredictions = pipelineModel.transform(test)
testPredictions.select("prediction", "label", "features").show(5)

# The evaluator is asked for the f1 metric through a param-map override.
# (The result variables keep their historical "AUC" names.)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")#.setMetricName("accuracy")
evaluatorParaMap = {evaluator.metricName: "f1"}
aucTest = evaluator.evaluate(testPredictions, evaluatorParaMap)

from pyspark.ml.tuning import *

# 3-fold cross-validation over maxIter in {1, 5}.
paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [1,5]).build()
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid).setNumFolds(3)
cvModel = cv.fit(training)
cvPredictions = cvModel.transform(test)
cvAUCTest = evaluator.evaluate(cvPredictions, evaluatorParaMap)

# The reported values are f1 scores, so label them as such.
print("pipeline Test F1: %g" % aucTest)
print("Cross-Validation test F1: %g" % cvAUCTest)
end = time.time()
# Optimizer settings handed to the model below.
adam_config = build_adam_config(learning_rate=0.001, beta1=0.9, beta2=0.999)

# Columns 1..784 of `df` become the feature vector; column `_c0` is one-hot
# encoded into `labels`.
feature_columns = df.columns[1:785]
va = VectorAssembler(inputCols=feature_columns, outputCol='features').transform(df)
label_encoder = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False)
encoded = label_encoder.transform(va).select(['features', 'labels'])

#demonstration of options. Not all are required
spark_model = SparkAsyncDL(
    inputCol='features',
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel='y:0',
    tfOutput='out:0',
    tfOptimizer='adam',
    miniBatchSize=300,
    miniStochasticIters=1,
    shufflePerIter=True,
    iters=50,
    predictionCol='predicted',
    labelCol='labels',
    partitions=3,
    verbose=1,
    optimizerOptions=adam_config
)

# Fit, persist, then reload the saved model and score the same data.
spark_model.fit(encoded).save('simple_dnn')
reloaded_model = SparkAsyncDLModel.load("simple_dnn")
predictions = reloaded_model.transform(encoded)

# NOTE(review): `labels` is the one-hot encoded column; confirm the evaluator
# accepts it as a label column.
evaluator = MulticlassClassificationEvaluator(
    labelCol="labels", predictionCol="predicted", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
]) df_train = sqlContext.createDataFrame( train.map( lambda x: Row([float(m) for m in x[0:-1]], transform_label(x[-1]))), schema) df_test = sqlContext.createDataFrame( test.map( lambda x: Row([float(m) for m in x[0:-1]], transform_label(x[-1]))), schema) list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT()) df_train = df_train.select( list_to_vector_udf(df_train["features"]).alias("features"), 'label').cache() df_test = df_test.select( list_to_vector_udf(df_test["features"]).alias("features"), 'label').cache() rf = RandomForestClassifier(maxDepth=30) rfmodel = rf.fit(df_train) rfpredicts = rfmodel.transform(df_test) evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1") print("F1 = %0.4f" % evaluator.evaluate(rfpredicts)) ######### sc.stop()
# Load the multiclass sample data in libsvm format.
libsvm_reader = sqlContext.read.format("libsvm")
data = libsvm_reader.load("data/mllib/sample_multiclass_classification_data.txt")

# Inspect the data before splitting.
data.show()
data.printSchema()
data.select('features').show()

# Seeded 60/40 train/test split.
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
print (train.count())
train.show()
test = splits[1]

# Network layout: 4 input features, hidden layers of 5 and 4 units,
# 3 output classes.
layers = [4, 5, 4, 3]

# Trainer and its parameters.
trainer = MultilayerPerceptronClassifier(
    maxIter=100,
    layers=layers,
    blockSize=128,
    seed=1234,
)

# Fit on the training split.
model = trainer.fit(train)

# Precision on the test split.
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="precision")
print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
# $example off$

sc.stop()
inputCol='features', outputCol='std_features') layers = [16, 20, 20, 10] # create the trainer and set its parameters mlp = MultilayerPerceptronClassifier( layers=layers, labelCol="label", featuresCol="std_features") pipeline = Pipeline(stages=[standardizer , mlp]) mlpModel=pipeline.fit(trainingData); mlpPredictions=mlpModel.transform(testingData); mlpPredictions.select("prediction", "label", "std_features").show(5) evaluator = MulticlassClassificationEvaluator( labelCol="label", predictionCol="prediction", metricName="accuracy") accuracy = evaluator.evaluate(mlpPredictions) print("Accuracy on test data = %g" % accuracy) paramGrid = ParamGridBuilder().\ addGrid(mlp.maxIter, [ 50,100,150]).\ addGrid(mlp.blockSize, [ 64,128]). \ addGrid(mlp.layers, [(16, 10, 10, 10),(16, 32, 32, 10)]). \ build() tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, # 80% of the data will be used for training, 20% for validation. trainRatio=0.8)
def _execute(self):
    """Train a random forest on the previous job's table and record accuracy.

    The label column ``self.target_label`` is kept in the DataFrame, because
    the label indexer reads it, and is excluded from the feature columns
    instead of being dropped.

    :return: the accuracy on a random 30% hold-out, as a string; the numeric
        value is also stored in ``self.metrics["accuracy"]``.
    """
    df = self.df_from_temp_table(self.kwargs["previous_job_temp_table"])

    # String columns get indexed; everything else is used as-is. The label is
    # left out of both lists so it never becomes a feature.
    # NOTE(review): assumes create_feature_vector keeps non-feature columns
    # (the label) in its result - confirm.
    cols_to_index = [
        k for k, v in df.dtypes
        if v == "string" and k != self.target_label
    ]
    cols_not_to_index = [
        k for k, v in df.dtypes
        if v != "string" and k != self.target_label
    ]
    feature_cols = cols_not_to_index + [
        "indexed{}".format(col_) for col_ in cols_to_index
    ]
    df = self.create_feature_vector(df, feature_cols)

    indexed_label_col = "{}loan_status".format("indexed")

    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    label_indexer = StringIndexer(inputCol=self.target_label,
                                  outputCol=indexed_label_col)
    label_indexer.setHandleInvalid("skip")
    label_indexer = label_indexer.fit(df)

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 12 distinct values are
    # treated as continuous.
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=12)
    feature_indexer.setHandleInvalid("skip")

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = df.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestClassifier(labelCol=indexed_label_col,
                                featuresCol="indexedFeatures",
                                predictionCol="prediction",
                                numTrees=10)

    # Convert indexed labels back to original labels.
    label_converter = IndexToString(inputCol="prediction",
                                    outputCol="predictedLabel",
                                    labels=label_indexer.labels)

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(
        stages=[label_indexer, feature_indexer, rf, label_converter])

    # Train model. This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol=indexed_label_col,
        predictionCol="prediction",
        metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    self.metrics["accuracy"] = accuracy
    return str(accuracy)
# Load the dataset; the `Firearm_Used_Flag` column becomes the label.
KCDPFinal = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .option("delimiter", ",")
    .load("C:/KCcrimeForAnalytics.csv")
    .withColumnRenamed("Firearm_Used_Flag", "label")
)
KCDPFinal

# Create vector assembler for feature columns (columns 1..18).
VAssembler = VectorAssembler(inputCols=KCDPFinal.columns[1:19],
                             outputCol="features")
KCDPFinal = VAssembler.transform(KCDPFinal)

# Split the crime dataset into training and testing data sets
trainingData, testingData = KCDPFinal.select("label", "features").randomSplit(
    [0.7, 0.3])

# Use the training set for model training.
from pyspark.ml.classification import NaiveBayes
NaiveBayesModel = NaiveBayes()
model = NaiveBayesModel.fit(trainingData)

# Generate predictions from the test dataset
CrimepredKC = model.transform(testingData)

# Evaluate the accuracy of the model. The metric is named explicitly because
# the evaluator's default metric is f1, not accuracy.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(CrimepredKC)

# Show model accuracy
print("Accuracy:", accuracy)
# Chain indexers and forest in a Pipeline
pipeline = Pipeline(
    stages=[labelIndexer, featureIndexer, rf, labelConverter])

# Train model. This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select('predictedLabel', 'label', 'features').show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol='indexedLabel',
                                              predictionCol='prediction',
                                              metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print('Test Error = %g' % (1.0 - accuracy))
print('Accuracy = ', accuracy)

rfModel = model.stages[2]
print(rfModel)  # summary only

# Compute AUC. The predictions built above are reused instead of running
# model.transform(testData) a second time.
# NOTE(review): this evaluator uses its default label column, not
# 'indexedLabel' - confirm that is the intended column.
evaluator = BinaryClassificationEvaluator()
evaluation = evaluator.evaluate(predictions)
print('AUC:', evaluation)

# Stop the Spark context.
sc.stop()
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("naive_bayes_example")\
        .getOrCreate()

    # $example on$
    # Load training data
    data = spark.read.format("libsvm") \
        .load("data/mllib/sample_libsvm_data.txt")

    # Split the data into train and test (seeded 60/40)
    splits = data.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # train the model
    model = nb.fit(train)

    # Compute accuracy on the test set. The SparkSession-era evaluator has no
    # "precision" metric; "accuracy" is the supported equivalent.
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    spark.stop()
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One accuracy evaluator shared by all models below.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="indexed",
    metricName="accuracy",
)

# --- Decision tree: fit, predict on the test data, report ---
dtClassifer = DecisionTreeClassifier(
    labelCol="indexed",
    featuresCol="features",
)
dtModel = dtClassifer.fit(trainingData)
predictions = dtModel.transform(testData)
predictions.select("prediction", "indexed", "label", "features").collect()
print("Results of Decision Trees : ", evaluator.evaluate(predictions))

# --- Random forest: fit, predict on the test data, report ---
rmClassifer = RandomForestClassifier(
    labelCol="indexed",
    featuresCol="features",
)
rmModel = rmClassifer.fit(trainingData)
predictions = rmModel.transform(testData)
predictions.select("prediction", "indexed", "label", "features").collect()
print("Results of Random Forest : ", evaluator.evaluate(predictions))

# --- Naive Bayes: fit ---
nbClassifer = NaiveBayes(
    labelCol="indexed",
    featuresCol="features",
)
nbModel = nbClassifer.fit(trainingData)

#Predict on the test data
# Fit on the training data. regParam is the regularisation strength; with the
# estimator's default elasticNetParam the penalty is presumably L2 rather
# than lasso (L1) - confirm against the Spark version in use.
lr_estimator = LogisticRegression(regParam=0.1)
lrModel = lr_estimator.fit(trainData)

# Predict on the test data and preview label columns next to the prediction.
pred = lrModel.transform(testData)
pred.select('catLabel', 'label', 'prediction').show()

# Area under ROC, F1, and the RDD-based multiclass metrics.
evaluator1 = BinaryClassificationEvaluator(labelCol='label', metricName="areaUnderROC")
evaluator2 = MulticlassClassificationEvaluator(labelCol='label', metricName="f1")
label_prediction_pairs = pred.select('label', 'prediction').rdd.map(tuple)
metrics = MulticlassMetrics(label_prediction_pairs)

print('AUC ROC of Logistic Regression model is %f' % evaluator1.evaluate(pred))
print('F1 score of Logistic Regression model is %f' % evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()


# <a id="context322"></a>
# #### 3.2.2. Decision Tree

# In[18]:

from pyspark.ml.classification import DecisionTreeClassifier

# Fit on the training data; maxDepth is the hyperparameter.
dt_estimator = DecisionTreeClassifier(maxDepth=3)
dtModel = dt_estimator.fit(trainData)

# Predict on the test data.
pred = dtModel.transform(testData)
# --- Decision tree classifier: timed fit, accuracy, top-3 features ---
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    maxDepth=13,
    maxBins=64,
    impurity='entropy',
)
time1 = time.time()
dtc_model = dt.fit(trainingData)
time2 = time.time()
dtc_time = time2 - time1  # wall-clock seconds spent fitting

dtc_prediction = dtc_model.transform(testData)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(dtc_prediction)
dtc_3 = get_top_3(trainingData, dtc_model.featureImportances)

# --- Decision tree regressor: timed fit, top-3 features ---
dt = DecisionTreeRegressor(
    labelCol="label",
    featuresCol="features",
    maxDepth=12,
    maxBins=64,
    minInstancesPerNode=2,
    minInfoGain=0.0,
)
time1 = time.time()
dtr_model = dt.fit(trainingData)
time2 = time.time()
dtr_time = time2 - time1  # wall-clock seconds spent fitting

dtr_prediction = dtr_model.transform(testData)
dtr_3 = get_top_3(trainingData, dtr_model.featureImportances)
# Seeded 80/20 split.
trainingData, testData = forData.randomSplit([0.8, 0.2], seed=0)
print(trainingData.take(1))

rfClassifier = RandomForestClassifier(
    numTrees=10,
    maxDepth=10,
    seed=0,
    labelCol="indexed",
)

# Time the fit only.
start_time = time.time()
modelClassifier = rfClassifier.fit(trainingData)
end_time = time.time()
cost_time = end_time - start_time
print("spark rf time :",cost_time)

predictionsClassifier = modelClassifier.transform(testData)

# One evaluator, re-used with a per-call metric override.
evaluator = MulticlassClassificationEvaluator()
evaluator = evaluator.setLabelCol("indexed").setPredictionCol("prediction")
print("accuracy = ",
      evaluator.evaluate(predictionsClassifier, {evaluator.metricName: "accuracy"}))
print("weightedPrecision = ",
      evaluator.evaluate(predictionsClassifier, {evaluator.metricName: "weightedPrecision"}))
print("weightedRecall = ",
      evaluator.evaluate(predictionsClassifier, {evaluator.metricName: "weightedRecall"}))
print("f1 = ",
      evaluator.evaluate(predictionsClassifier, {evaluator.metricName: "f1"}))
print >> sys.stderr, "%s <input> <model_path> <stop_file> class_num appname" % sys.argv[0] sys.exit(1) input_path = sys.argv[1] model_path = sys.argv[2] stop_file = sys.argv[3] class_num = int(sys.argv[4]) appname = sys.argv[5] conf = SparkConf().setAppName(appname) sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) data_df = text_to_df(sc, sqlContext, input_path) print "*** create data frame ***" splits = data_df.randomSplit([0.8, 0.2], 1234) training = splits[0].cache() test = splits[1].cache() stopwords = get_stopwords(stop_file) print "*** load %s stopwords ***" % len(stopwords) pipeline = get_pipeline(vector_size=50, class_num=class_num, stopwords=stopwords) model = pipeline.fit(training) result = model.transform(test) pred_label = result.select("prediction", "indexLabel") evaluator = MulticlassClassificationEvaluator(metricName="precision", predictionCol="prediction", labelCol="indexLabel") print("Precision: " + str(evaluator.evaluate(pred_label)))
.build() # 2^3 models = 8 lr_grid = ParamGridBuilder().baseOn({pipeline.stages:[dtc_assembler, lr]})\ .addGrid(lr.regParam, [0.01, 0.05])\ .addGrid(lr.maxIter, [10, 100])\ .addGrid(lr.elasticNetParam, [0.0, 0.1])\ .build() paramGrid = dtc_grid + dtr_grid + lr_grid #fit cv cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, parallelism=10, numFolds=3, seed=16) cvModel = cv.fit(train) predictions = cvModel.transform(test) accuracy = evaluator.evaluate(predictions) print(cvModel.avgMetrics) print("trained models",len(cvModel.avgMetrics)) print("\n") print("Decision tree classifier models accuracy", cvModel.avgMetrics[:6]) print("\n") print("Decision tree regression models accuracy", cvModel.avgMetrics[6:15]) print("\n") print("Logistic regression models accuracy", cvModel.avgMetrics[15:23]) #GET BEST DTC model print("\n") print("Best decision tree classifier model") print("accuracy", cvModel.avgMetrics[np.argmax(cvModel.avgMetrics[:6])]) print(cvModel.getEstimatorParamMaps()[np.argmax(cvModel.avgMetrics[:6])])
# Score the test split and preview up to 100 rows of features, prediction and true label.
prediction = piplineModel.transform(test)
predicted = prediction.select("features", "prediction", "trueLabel")
predicted.show(100, truncate=False)

# COMMAND ----------

# MAGIC %md
# MAGIC The following cell measures the accuracy of the algorithm using MulticlassClassificationEvaluator, evaluated on the predicted data.

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluation = MulticlassClassificationEvaluator(
    labelCol="trueLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluation.evaluate(prediction)
print(accuracy)

# COMMAND ----------

# MAGIC %md The following code prints the total test error (1 - accuracy).

# COMMAND ----------

print("Test Error = %g" % (1.0 - accuracy))

# COMMAND ----------

# MAGIC %md #### Gradient Boosting Evaluator
# MAGIC Calculate AUC using Gradient Boosting Evaluator.
multiEvaluator = MulticlassClassificationEvaluator()

# Cross-validation settings
mlp_cv = CrossValidator(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=mlp_paramGrid,
    numFolds=user_mlp_param_numFolds,
)
mlp_cvmodel = mlp_cv.fit(train)

# Evaluate the cross-validated multilayer perceptron on the test split
from pyspark.sql.functions import udf

mlp_predictions = mlp_cvmodel.transform(test)

# `evaluator` (defined earlier in the script) supplies the ROC / PR areas.
auroc = evaluator.evaluate(mlp_predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(mlp_predictions, {evaluator.metricName: "areaUnderPR"})
"The AUROC is %s and the AUPR is %s" % (auroc, aupr)

# The multiclass evaluator supplies f1 and the weighted precision / recall.
f1score = multiEvaluator.evaluate(
    mlp_predictions, {multiEvaluator.metricName: "f1"})
weightedPrecision = multiEvaluator.evaluate(
    mlp_predictions, {multiEvaluator.metricName: "weightedPrecision"})
weightedRecall = multiEvaluator.evaluate(
    mlp_predictions, {multiEvaluator.metricName: "weightedRecall"})
"The F1 score: %s the Weighted Precision: %s the Weighted Recall is %s" % (f1score, weightedPrecision, weightedRecall)

# Best model found by cross-validation; its last pipeline stage is kept
# as the best MLP model.
mlpmodel = mlp_cvmodel.bestModel
bestMLPModel = mlpmodel.stages[-1]

#Retrieving Paramaters from the Best MLP Model
#param_BestModel_Layers = bestMLPModel._java_obj.layers
#param_BestModel_Iter = bestMLPModel._java_obj.maxIter

### Stop Timer
stopTime = time.process_time()
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

# Grid over the tree depth; the evaluator metric is pinned to 'precision'
# for every candidate via baseOn.
grid = (ParamGridBuilder()
        .baseOn([evaluator.metricName, 'precision'])
        .addGrid(dt.maxDepth, [10, 20])
        .build())
cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)


# In[332]:

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Fitting the decision tree on selected features")
t0 = time()
cv_model = cv.fit(dfTrainIndexed)
tt = time() - t0
print("Done in {} second".format(round(tt, 3)))


# In[302]:

# A stray bare name `pr` used to sit here and raised NameError when the
# cell ran; it has been removed.
dfTestIndexed = string_indexer_model.transform(dfTestSelect)
df_test_pred = cv_model.transform(dfTestIndexed)
res = evaluator.evaluate(df_test_pred)
print(res)
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark import SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf, lag, date_add, explode, lit, concat, unix_timestamp, sum, abs
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import PipelineModel

# Spark entry points for this task.
sc = SparkContext(appName="MyFirstApp3_Task_task2")
spark = SparkSession(sc)

# Node 16: load the test data set.
df_node16 = (
    spark.read
    .format("parquet")
    .load(path="hdfs://namenode:9000/example3/test.parquet")
)

# Node 17/18: load the persisted pipeline model and apply it to the test data.
model_node17 = PipelineModel.load("hdfs://namenode:9000/example3/model/")
df_node18 = model_node17.transform(df_node16)

# Node 19: compute accuracy and wrap the score in a one-row DataFrame so it
# can be written out as CSV.
evaluator_node19 = MulticlassClassificationEvaluator(
    labelCol="indexedSurvived",
    predictionCol="prediction",
    metricName="accuracy",
)
score_node19 = evaluator_node19.evaluate(df_node18)
df_node19 = spark.createDataFrame([(score_node19, )], ["score"])

csv_writer = df_node19.write.format("csv")
csv_writer.save(
    path="hdfs://namenode:9000/example3/EvalResult3.csv",
    quote="\"",
    header=True,
    sep=",",
)
# Keep only the headline and its category, dropping incomplete rows.
title_category = news_data.select("TITLE", "CATEGORY")
title_category = title_category.dropna()

# Strip digits from the title. The pattern is a raw string: '\d' in a normal
# string literal is an invalid escape sequence that newer Python versions
# warn about.
title_category = title_category.withColumn(
    "only_str", regexp_replace(col('TITLE'), r'\d+', ''))

# Tokenise on non-word characters and remove stop words.
regex_tokenizer = RegexTokenizer(inputCol="only_str", outputCol="words", pattern="\\W")
raw_words = regex_tokenizer.transform(title_category)
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
words_df = remover.transform(raw_words)

# Encode the category as a numeric label.
indexer = StringIndexer(inputCol="CATEGORY", outputCol="categoryIndex")
feature_data = indexer.fit(words_df).transform(words_df)

# Bag-of-words features.
cv = CountVectorizer(inputCol="filtered", outputCol="features")
model = cv.fit(feature_data)
countVectorizer_feateures = model.transform(feature_data)

# Fixed seed so the 80/20 split is reproducible.
(trainingData, testData) = countVectorizer_feateures.randomSplit([0.8, 0.2], seed=11)

nb = NaiveBayes(modelType="multinomial", labelCol="categoryIndex", featuresCol="features")
nbModel = nb.fit(trainingData)
nb_predictions = nbModel.transform(testData)

evaluator = MulticlassClassificationEvaluator(
    labelCol="categoryIndex", predictionCol="prediction", metricName="accuracy")
nb_accuracy = evaluator.evaluate(nb_predictions)
print("Accuracy of NaiveBayes is = %g" % (nb_accuracy))
print("Test Error of NaiveBayes = %g " % (1.0 - nb_accuracy))

# time1 is set before this section (outside this view).
time2 = datetime.datetime.now()
elapsedTime = time2 - time1
print(elapsedTime)
#predictions.show()


# In[120]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# In[121]:

####### Random Forest accuracy - not used, as simple linear regression gives better results ############
evalAcc = MulticlassClassificationEvaluator(
    labelCol="quality",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evalAcc.evaluate(predictions)
##print("accuracy Test Error = %g" % (1.0 - accuracy))

# Score the validation set and round the predictions to whole numbers.
transformed_data = model.transform(val).withColumn(
    "prediction", func.round("prediction"))
##print(evalAcc.getMetricName(), 'accuracy:', evalAcc.evaluate(transformed_data))


# In[122]:

####### Random Forest f1 - not used, as simple linear regression gives better results ############
evalVal = MulticlassClassificationEvaluator(
    labelCol="quality",
    predictionCol="prediction",
    metricName="f1",
)
rel = {} rel['features'] = Vectors.dense(float(x[0]), float(x[1]), float(x[2]), float(x[3])) rel['label'] = str(x[4]) return rel data = spark.sparkContext.textFile("E:/iris.txt").map(lambda line: line.split(',')).map(lambda p: Row(**f(p))).toDF() data.createOrReplaceTempView("iris") df = spark.sql("select * from iris") # rel = df.rdd.map(lambda t : str(t[1])+":"+str(t[0])).collect() # for item in rel: # print(item) labelIndexer = StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(df) featureIndexer = VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(df) trainingData, testData = df.randomSplit([0.7, 0.3]) dtClassifier = DecisionTreeClassifier().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures") labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels) pipelinedClassifier = Pipeline().setStages([labelIndexer, featureIndexer, dtClassifier, labelConverter]) modelClassifier = pipelinedClassifier.fit(trainingData) predictionsClassifier = modelClassifier.transform(testData) predictionsClassifier.select("predictedLabel", "label", "features").show(20) evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol( "prediction").setMetricName("accuracy") lrAccuracy = evaluator.evaluate(predictionsClassifier) print("Test Error = " + str(1.0 - lrAccuracy)) treeModelClassifier = modelClassifier.stages[2] print("Learned classification tree model:\n" + str(treeModelClassifier.toDebugString))
num_test_samples = 10000
num_train_samples = 60000

# Column _c0 holds the label; the remaining columns are the raw features.
test_df = spark.read.csv(test_datafile, header=False, inferSchema="true")
train_df = spark.read.csv(train_datafile, header=False, inferSchema="true")

# Assemble the training features and fit PCA on them (784 -> 99 per the
# original note).
assembler = VectorAssembler(inputCols=train_df.columns[1:], outputCol="feature")
train_vector = assembler.transform(train_df).select("_c0", "feature")
pca = PCA(k=99, inputCol="feature", outputCol="features")
model = pca.fit(train_vector)
train_pca_result = model.transform(train_vector).select('_c0', 'features')
new_train_pca_result = train_pca_result.withColumnRenamed("_c0", "label")

# Project the test set with the PCA model fitted on the training set.
assembler = VectorAssembler(inputCols=test_df.columns[1:], outputCol="feature")
test_vector = assembler.transform(test_df).select("_c0", "feature")
test_pca_result = model.transform(test_vector).select('_c0', 'features')
new_test_pca_result = test_pca_result.withColumnRenamed("_c0", "label")

# train the model
# create the trainer and set its parameters
treeNumber = 100  # set tree number
trainer = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=treeNumber)
trainmodel = trainer.fit(new_train_pca_result)  # label/features
results = trainmodel.transform(new_test_pca_result)
predictionAndLabels = results.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# Fixed: the message had a doubled opening quote (syntax error) and used the
# undefined name treeNumber1 instead of treeNumber.
print("Test set accuracy of treeNumber = " + str(treeNumber) + " is " + str(evaluator.evaluate(predictionAndLabels)))
#evaluator.saveAsTextFile(output_path)
outputCol="words", pattern="\\W") hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10000) idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) label_stringIdx = StringIndexer(inputCol="_c0", outputCol="label") pipeline = Pipeline(stages=[regexTokenizer, hashingTF, idf, label_stringIdx]) pipelineModel = pipeline.fit(dataset) dataset = pipelineModel.transform(dataset) #splitting dataset trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=100) #Building the Logistic Regression Model lr = LogisticRegression(maxIter=20, regParam=0.01, elasticNetParam=0) lrModel = lr.fit(trainingData) #Evaluating the classification model evaluator = MulticlassClassificationEvaluator(predictionCol="prediction") predictions = lrModel.transform(testData) acc = evaluator.evaluate(predictions) print("Accuracy on testset is:", acc) #Saving the model lrModel.write().overwrite().save("lr_Model") pipelineModel.write().overwrite().save("pipeline_Model") print("Stored pipeline and model.")
# Fit a ridge-regularised logistic regression and score the test split.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

# Show the ten rows predicted as class 0, ordered by descending probability.
(
    predictions.filter(predictions['prediction'] == 0)
    .select("Text", "Sentiment", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n = 10, truncate = 30)
)

# COMMAND ----------

# Score the predictions. No metricName is passed, so the evaluator's default
# metric is used (f1 according to the PySpark documentation), not accuracy.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

# COMMAND ----------

# Applying cross validation: rebuild the feature pipeline, transform the
# full data set and redo the 70/30 split with a fixed seed.
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
# $example on$
# Load training data
data = (
    spark.read.format("libsvm")
    .load("data/mllib/sample_multiclass_classification_data.txt")
)

# Split the data into train and test (60/40, fixed seed)
splits = data.randomSplit([0.6, 0.4], 1234)
train, test = splits[0], splits[1]

# Network topology: 4 input features, two hidden layers of 5 and 4 units,
# and 3 output classes.
layers = [4, 5, 4, 3]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(
    maxIter=100,
    layers=layers,
    blockSize=128,
    seed=1234,
)

# train the model
model = trainer.fit(train)

# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
# $example off$

spark.stop()
# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

# The tree model is the third pipeline stage (after the two indexers).
treeModel = model.stages[2]
# summary only
print(treeModel)

# see for more: https://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-classifier

# Churn - which customers (of a telecommunications company) are likely to stop using their service
# Churn dataset provided by the UC Irvine machine-learning repository hosted by SGI
# Data from https://www.sgi.com/tech/mlc/db/churn.all
# Shell command, kept as a comment because a bare `$ wget ...` line is not
# valid Python:
#   $ wget https://www.sgi.com/tech/mlc/db/churn.all

# Classification - Random Forest
# Encode the categorical "purpose" column: index it, then one-hot encode it.
si = StringIndexer(inputCol="purpose", outputCol="purpose_index")
hot = OneHotEncoder(inputCol="purpose_index", outputCol="purpose_features")

# Assemble every model input into a single feature vector.
va = VectorAssembler(
    inputCols=[
        "loan_amnt",
        "interest_rate",
        "employment_length",
        "home_owner",
        "income",
        "verified",
        "open_accts",
        "credit_debt",
        "purpose_features",
    ],
    outputCol="features",
)

# Candidate estimators; only the GBT classifier is wired into the pipeline.
dtr = DecisionTreeRegressor(
    featuresCol="features", labelCol="default", predictionCol="prediction",
    maxDepth=2, varianceCol="variance")
gbr = GBTRegressor(
    featuresCol="features", labelCol="default", predictionCol="prediction",
    maxDepth=5, maxBins=32, maxIter=20, seed=12345)
gbc = GBTClassifier(
    featuresCol="features", labelCol="default", predictionCol="prediction",
    maxDepth=5, maxIter=20, seed=12345)

pipeline = Pipeline(stages=[si, hot, va, gbc])
model = pipeline.fit(training)
# Persist the fitted pipeline, replacing any previous copy.
model.write().overwrite().save('hdfs:///tmp/spark_model')

predictions = model.transform(testing)
ranked = predictions.select(['default','prediction']).sort(col('prediction').desc())
ranked.show(25,False)

#evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="default")
#rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
#r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

#evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="default")
#evaluator.evaluate(predictions)
#evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="default")
evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
#ZEND
# Index the training labels, fit the model and report the elapsed time
# (t0 is set before this section).
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)
tt = time() - t0
print("Done in {} second".format(round(tt, 3)))


# In[19]:

print("Testing precision of the model")
t0 = time()
# Vectorise the validation reviews with the broadcast dictionary.
vectorize_valid = partial(vectorizeBi, dico=dictSel_broad.value)
dfValidSelect = dfValid.map(vectorize_valid).toDF(['selectedFeatures', 'label']).cache()
dfValidIndexed = string_indexer_model.transform(dfValidSelect).cache()
df_valid_pred = lrModel.transform(dfValidIndexed).cache()
res = evaluator.evaluate(df_valid_pred)
print(res)
tt = time() - t0
print("Done in {} second".format(round(tt, 3)))


# In[12]:

import loadFiles as lf
print("Start loading and preprocessing test data ")
t0 = time()
test, names = lf.loadUknown('./data/test')
# Pair each test text with its name; the name is stored in the 'label' column.
text_name = zip(test, names)
dfTest = sc.parallelize(text_name).toDF(['review', 'label']).cache()
def run_MLA(XX,XXpredict,yy,yypredict,unique_IDS_tr,unique_IDS_pr,uniquetarget_tr,uniquetarget_pr,n_feat,ind_run_name,n_run):
    """Fit a classifier on the training set, predict the predict set and score it.

    Two back ends are selected by ``settings.pyspark_on``: a PySpark random
    forest pipeline, or the estimator named by ``settings.MLA``.

    Parameters
    ----------
    XX : training array; the first ``n_feat`` columns are used as features.
    XXpredict : predict array; the first ``n_feat`` columns are features (the
        answers are tacked on the end).
    yy : training answers.
    yypredict : true targets for the predict set.
    unique_IDS_tr, unique_IDS_pr, uniquetarget_tr, uniquetarget_pr :
        forwarded to ``run_opts.diagnostics``; ``uniquetarget_tr[0]`` is also
        used as class names when exporting trees.
    n_feat : number of leading feature columns.
    ind_run_name : run name used in output file names; when it contains
        ``'OvsA'`` the tree export and contribution steps are skipped.
    n_run : not used in this function.

    Returns
    -------
    tuple
        ``(result, feat_importance, probs, bias, contributions, accuracy,
        recall, precision, score, clf, train_contributions)``

    NOTE(review): the original file was whitespace-mangled and this layout is
    reconstructed - confirm the nesting against the upstream file. The pyspark
    branch does not bind ``clf``, ``feat_importance``, ``bias``,
    ``contributions``, ``recall``, ``precision``, ``score`` or
    ``train_contributions``, so the shared tail cannot complete after that
    branch. ``n``, ``predictdatanum``, ``traindatanum``, ``feat_names``,
    ``temp_train`` and ``temp_pred`` are read from enclosing scope.
    """
    logger.info('Starting MLA run')
    logger.info('------------')
    if settings.pyspark_on == 1:                # Use pyspark or not? Pyspark makes cross node (HPC) calculation possible.
        from pyspark import SparkContext        # It's slower, manages resources between nodes using HTTP.
        from pyspark.sql import SQLContext      # So far, it does not include feature importance outputs.
        from pyspark.ml import Pipeline         # I would have to program feature importances myself. May be time consuming.
        from pyspark.ml.feature import VectorAssembler
        from pyspark.ml.classification import RandomForestClassifier
        from pyspark.ml.feature import StringIndexer, VectorIndexer
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        # pyspark go

        if settings.pyspark_remake_csv == 1:    # Making the csv files for the pyspark MLA to read in is time consuming, turn off the file generation?
            logger.info('Remaking csvs for pysparks...')
            numpy.savetxt(temp_train, XX, delimiter=",")
            logger.info('Training csv saved')
            numpy.savetxt(temp_pred, XXpredict, delimiter=",")
            logger.info('Predict csv saved')
        sc = SparkContext(appName="ML_RF")      # Initiate spark

        sclogger=sc._jvm.org.apache.log4j       # Initiate spark logging
        sclogger.LogManager.getLogger("org").setLevel(sclogger.Level.ERROR)
        sclogger.LogManager.getLogger("akka").setLevel(sclogger.Level.ERROR)
        sqlContext=SQLContext(sc)
        # Read in data
        data_tr = sqlContext.read.format("com.databricks.spark.csv").options(header='false',inferSchema='true').load(temp_train)
        data_pr = sqlContext.read.format("com.databricks.spark.csv").options(header='false',inferSchema='true').load(temp_pred)
        data_tr=data_tr.withColumnRenamed(data_tr.columns[-1],"label") # rename last column (answers), to label
        data_pr=data_pr.withColumnRenamed(data_pr.columns[-1],"label")

        assembler=VectorAssembler(inputCols=data_tr.columns[:-1],outputCol="features")
        reduced=assembler.transform(data_tr.select('*')) # Assemble feature vectors for spark MLA

        assembler_pr=VectorAssembler(inputCols=data_pr.columns[:-1],outputCol="features")
        reduced_pr=assembler_pr.transform(data_pr.select('*'))

        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(reduced) # Index vectors
        featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(reduced)
        # Initiate MLA alg
        rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",numTrees=100,maxDepth=5,maxBins=200)

        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf]) # Set up fitting pipeline
        start, end=[],[] # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        model=pipeline.fit(reduced) # Fit
        end = time.time()
        logger.info('Fit ended in %s seconds' %(end-start))
        logger.info('------------')
        start, end=[],[]
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        predictions = model.transform(reduced_pr) # Predict
        evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",predictionCol="prediction",metricName="precision")
        accuracy = evaluator.evaluate(predictions)
        logger.info("Test Error = %g" %(1.0-accuracy))
        logger.info('------------')
        logger.info('Pulling results ...')
        yypredict=numpy.array(predictions.select("indexedLabel").collect()) # Pulls all results into numpy arrays to continue program
        yypredict=yypredict[:,0]
        result=numpy.array(predictions.select("prediction").collect())
        result=result[:,0]
        XXpredict=numpy.array(predictions.select("indexedFeatures").collect())
        XXpredict=XXpredict[:,0]
        probs=numpy.array(predictions.select("probability").collect())
        probs=probs[:,0]
        XXpredict=numpy.column_stack((XXpredict,yypredict))
        end=time.time()
        logger.info('Predict ended in %s seconds' %(end-start))
        logger.info('------------')

    else:
        # Run sklearn MLA switch
        MLA = get_function(settings.MLA) # Pulls in machine learning algorithm from settings
        clf = MLA().set_params(**settings.MLAset)
        logger.info('MLA settings')
        logger.info(clf)
        logger.info('------------')
        start, end=[],[] # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        clf = clf.fit(XX[:,0:n_feat],yy) # XX is train array, yy is training answers
        end = time.time()
        logger.info('Fit ended in %s seconds' %(end-start))
        logger.info('------------')
        score = clf.score
        if 'OvsA' not in ind_run_name:
            if settings.output_all_trees == 1:
                # Export every tree of the forest to PNG via graphviz `dot`,
                # removing the intermediate .dot file.
                i_tree = 0
                for tree_in_forest in clf.estimators_:
                    with open('plots/tree_' + str(i_tree) + '.dot', 'w') as my_file:
                        my_file = tree.export_graphviz(tree_in_forest, out_file = my_file,feature_names=feat_names,class_names=uniquetarget_tr[0], filled=True)
                    os.system('dot -Tpng plots/tree_%s.dot -o plots/tree_%s.png' %(i_tree,i_tree))
                    os.remove('plots/tree_%s.dot' %i_tree)
                    i_tree = i_tree + 1
            else:
                # Only export the first tree as an example.
                with open('plots/tree_example.dot', 'w') as my_file:
                    my_file = tree.export_graphviz(clf.estimators_[0], out_file = my_file,feature_names=feat_names,class_names=uniquetarget_tr[0], filled=True)
                os.system('dot -Tpng plots/tree_example.dot -o plots/tree_example.png')
                os.remove('plots/tree_example.dot')
        start, end=[],[]
        # Split cats for RAM management
        numcats = numpy.int64((2*(XXpredict.size/1024/1024)*clf.n_jobs))
        if settings.get_contributions ==1:
            numcats=100
        if numcats < 1:
            numcats = 1
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        result,probs,bias,contributions,train_contributions=[],[],[],[],[]
        XXpredict_cats=numpy.array_split(XXpredict,numcats)
        logger.info('Splitting predict array into %s' %numcats)
        logger.info('------------')
        for i in range(len(XXpredict_cats)):
            logger.info('Predicting cat %s/%s' %(i,len(XXpredict_cats)))
            result.extend(clf.predict(XXpredict_cats[i][:,0:n_feat])) # XX is predict array.
            probs.extend(clf.predict_proba(XXpredict_cats[i][:,0:n_feat])) # Only take from 0:n_feat because answers are tacked on end
            if 'OvsA' not in ind_run_name:
                if (settings.get_contributions == 1) | (settings.get_perfect_contributions==1):
                    logger.info('Getting contributions from predict catalogue %s' %i)
                    tiresult = ti.predict(clf,XXpredict_cats[i][:,0:n_feat])
                    contributions.extend(tiresult[2])
                    bias = tiresult[1][0]
        feat_importance = clf.feature_importances_
        result=numpy.float32(result)
        probs=numpy.float32(probs)
        if 'OvsA' not in ind_run_name:
            if settings.get_contributions == 1:
                numpy.save('contributions',contributions)
            if settings.get_perfect_contributions == 1:
                numpy.save('perfect_contributions',contributions)
            if settings.compute_contribution_mic == 1:
                logger.info('Getting contributions from train catalogue (for plot_mic_cont)')
                tiresult_train = ti.predict(clf,XX[:,0:n_feat])
                train_contributions=tiresult_train[2]
                bias_train = tiresult_train[1][0]

        # scikit-learn metrics take (y_true, y_pred). The arguments used to be
        # passed as (result, yypredict), which swapped the reported recall and
        # precision. yypredict holds the true targets, result the predictions.
        accuracy = metrics.accuracy_score(yypredict,result)
        recall = metrics.recall_score(yypredict,result,average=None)
        precision = metrics.precision_score(yypredict,result,average=None)
        score = metrics.f1_score(yypredict,result,average=None)

        end = time.time()
        logger.info('Predict ended in %s seconds' %(end-start))
        logger.info('------------')

        logger.info('Recall Score: %s' %recall)
        logger.info('Precision Score: %s' %precision)
        logger.info('Accuracy Score: %s' %accuracy)
        logger.info('F1 Score: %s' %score)
    percentage=(n/predictdatanum)*100

    run_opts.diagnostics([result,yypredict,unique_IDS_tr, unique_IDS_pr,uniquetarget_tr,uniquetarget_pr],'result')
#    stats=numpy.array([])
#    stats=numpy.column_stack((clf.n_estimators,traindatanum,predictdatanum,percentage))
    # SAVE
    if settings.saveresults == 1:
        logger.info('Saving results')
        logger.info('------------')

        numpy.savetxt(settings.result_outfile+('_%s' %ind_run_name)+'.txt',numpy.column_stack((yypredict,result)),header="True_target Predicted_target")
        numpy.savetxt(settings.prob_outfile+('_%s' %ind_run_name)+'.txt',probs)
        numpy.savetxt(settings.feat_outfile+('_%s' %ind_run_name)+'.txt',feat_importance)
        numpy.savetxt(settings.stats_outfile+('_%s' %ind_run_name)+'.txt',numpy.column_stack((clf.n_estimators,traindatanum,predictdatanum,percentage,clf.max_depth)),header="n_est traindatanum predictdatanum percentage max_depth",fmt="%s")

    return result,feat_importance,probs,bias,contributions,accuracy,recall,precision,score,clf,train_contributions
gbt = GBTClassifier(labelCol='PrivateIndex', featuresCol='features')

# Fit each classifier on the training data ...
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

# ... and score the test data with each fitted model.
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

# Binary evaluator with its default raw-prediction column for the decision
# tree and random forest outputs.
my_binary_eval = BinaryClassificationEvaluator(labelCol="PrivateIndex")

print("Decision Tree Classification: ")
print(my_binary_eval.evaluate(dtc_preds))

print("Random Forest Classification: ")
print(my_binary_eval.evaluate(rfc_preds))

# The GBT output is evaluated on the 'prediction' column instead.
my_binary_eval_2 = BinaryClassificationEvaluator(
    labelCol="PrivateIndex", rawPredictionCol='prediction')

print("Gradient tree booster: ")
print(my_binary_eval_2.evaluate(gbt_preds))

# Plain accuracy for the random forest.
acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex', metricName='accuracy')
rfc_acc = acc_eval.evaluate(rfc_preds)
print("this is the new rfc_acc: {}".format(rfc_acc))
print("Processing crossvalidation with 3-fold & 200/500 hidden layer units")
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid_MLP,
                          evaluator=evaluator,
                          numFolds=3)
starttime = datetime.datetime.now()
CV_model = crossval.fit(vectorizedData)
# Stage index 2 of the best pipeline is printed as the selected model.
print(CV_model.bestModel.stages[2])
print('Done on fitting model:%s'%(datetime.datetime.now()-starttime))

print("Transforming testing data...")
vectorized_test_data = testing_data.toDF()
#transformed_data1 = CV_model.transform(vectorizedData)
#print evaluator.getMetricName(), 'accuracy:', evaluator.evaluate(transformed_data1)
transformed_data = CV_model.transform(vectorized_test_data)
#print transformed_data.first()

print("Fitting testing data into model...")
# One formatted string gives the same output as the former multi-argument
# Python 2 print statement and also runs on Python 3.
print("%s accuracy: %s" % (evaluator.getMetricName(), evaluator.evaluate(transformed_data)))
predictions = transformed_data.select('indexedLabel', 'prediction')
# show() prints the table itself and returns None; wrapping it in print
# added a spurious "None" line, so it is called directly.
predictions.describe().show()
print(predictions.take(10))
print(predictions.where(predictions.prediction != predictions.indexedLabel))
#predictAndLabel=valid.map(lambda p: (model.predict(p.features),p.label))
#accuracy = 1.0*predictAndLabel.filter(lambda (x, v): x == v).count()/valid.count()
#accuracy
# Title both panels, then draw one ROC curve per model.
for _axis, _title in ((ax0, 'First Model'), (ax1, 'Second Model')):
    _axis.set_title(_title, color='#999999')
for _panel, _scored in ((axList[0], labelsAndScores), (axList[1], labelsAndScores2)):
    generateROC(_panel, _scored)
display(fig)

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate both models with the same multiclass metric.
metric = 'precision'
multiclassEval = MulticlassClassificationEvaluator()
multiclassEval.setMetricName(metric)

model_one_score = multiclassEval.evaluate(irisTestPredictions)
print('Model one {0}: {1:.3f}'.format(metric, model_one_score))
model_two_score = multiclassEval.evaluate(irisTestPredictions2)
print('Model two {0}: {1:.3f}\n'.format(metric, model_two_score))

# COMMAND ----------

# Print the evaluator's source code for inspection.
import inspect
print(inspect.getsource(MulticlassClassificationEvaluator))

# COMMAND ----------

# MAGIC %md
# MAGIC #### Using MLlib instead of ML
# MAGIC
# MAGIC We've been using `ml` transformers, estimators, pipelines, and evaluators.  How can we accomplish the same things with MLlib?

# COMMAND ----------
# COMMAND ---------- # MAGIC %md Run `CrossValidator`. `CrossValidator` checks to see if an MLflow tracking server is available. If so, it log runs within MLflow: # MAGIC # MAGIC * Under the current active run, log info for `CrossValidator`. (Create a new run if none are active.) # MAGIC * For each submodel (number of folds of cross-validation x number of ParamMaps tested) # MAGIC * Log a run for this submodel, along with the evaluation metric on the held-out data. # COMMAND ---------- # Explicitly create a new run. # This allows this cell to be run multiple times. # If you omit mlflow.start_run(), then this cell could run once, # but a second run would hit conflicts when attempting to overwrite the first run. import mlflow with mlflow.start_run(): cvModel = cv.fit(training) test_metric = evaluator.evaluate(cvModel.transform(test)) mlflow.log_metric('test_' + evaluator.getMetricName(), test_metric) # Logs additional metrics # COMMAND ---------- # MAGIC %md To view the MLflow experiment associated with the notebook, click the **Runs** icon in the notebook context bar on the upper right. There, you can view all runs. To more easily compare their results, click the button on the upper right that reads "View Experiment UI" when you hover over it. # MAGIC # MAGIC To understand the effect of tuning `maxDepth`: # MAGIC # MAGIC 1. Filter by `params.maxBins = "8"`. # MAGIC 1. Select the resulting runs and click **Compare**. # MAGIC 1. In the Scatter Plot, select X-axis **maxDepth** and Y-axis **avg_weightedPrecision**.
# Index the training labels, fit the model and report the elapsed time
# (t0 is set before this section).
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)
tt = time() - t0
print("Done in {} second".format(round(tt, 3)))


# In[25]:

print("Testing precision of the model")
t0 = time()
# Vectorise the validation reviews with the two broadcast dictionaries.
vectorize_valid = partial(vectorize, dicoUni=dict_broad.value, dicoTri=dictTri_broad.value)
dfValidSelect = dfValid.map(vectorize_valid).toDF(['selectedFeatures', 'label']).cache()
dfValidIndexed = string_indexer_model.transform(dfValidSelect)
df_valid_pred = lrModel.transform(dfValidIndexed).cache()
res = evaluator.evaluate(df_valid_pred)
print(res)
tt = time() - t0
print("Done in {} second".format(round(tt, 3)))


# In[28]:

import loadFiles as lf
print("Start loading and preprocessing test data ")
t0 = time()
test, names = lf.loadUknown('./data/test')
# Only items 1..1999 are used; the name is stored in the 'label' column.
text_name = zip(test[1:2000], names[1:2000])
dfTest = sc.parallelize(text_name).toDF(['review', 'label']).cache()
def main(base_path): APP_NAME = "train_spark_mllib_model.py" # If there is no SparkSession, create the environment try: sc and spark except NameError as e: import findspark findspark.init() import pyspark import pyspark.sql sc = pyspark.SparkContext() spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate() # # { # "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00", # "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0, # "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS" # } # from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType from pyspark.sql.types import StructType, StructField from pyspark.sql.functions import udf schema = StructType([ StructField("ArrDelay", DoubleType(), True), StructField("CRSArrTime", TimestampType(), True), StructField("CRSDepTime", TimestampType(), True), StructField("Carrier", StringType(), True), StructField("DayOfMonth", IntegerType(), True), StructField("DayOfWeek", IntegerType(), True), StructField("DayOfYear", IntegerType(), True), StructField("DepDelay", DoubleType(), True), StructField("Dest", StringType(), True), StructField("Distance", DoubleType(), True), StructField("FlightDate", DateType(), True), StructField("FlightNum", StringType(), True), StructField("Origin", StringType(), True), StructField("Route", StringType(), True), StructField("TailNum", StringType(), True), StructField("EngineManufacturer", StringType(), True), StructField("EngineModel", StringType(), True), StructField("Manufacturer", StringType(), True), StructField("ManufacturerYear", StringType(), True), StructField("OwnerState", StringType(), True), ]) input_path = "{}/data/simple_flight_delay_features_airplanes.json".format( base_path ) features = spark.read.json(input_path, schema=schema) features.first() # # Add the hour of day of scheduled 
# arrival/departure
#
from pyspark.sql.functions import hour

# Derive the hour-of-day of the scheduled (CRS) departure and arrival times.
features_with_hour = features.withColumn(
    "CRSDepHourOfDay",
    hour(features.CRSDepTime)
)
features_with_hour = features_with_hour.withColumn(
    "CRSArrHourOfDay",
    hour(features.CRSArrTime)
)
features_with_hour.select(
    "CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay"
).show()

#
# Check for nulls in features before using Spark ML
#
null_counts = [
    (column, features_with_hour.where(features_with_hour[column].isNull()).count())
    for column in features_with_hour.columns
]
# Materialize a list: on Python 3 `filter` returns a one-shot iterator that
# would be empty after the first pass over it.
cols_with_nulls = [entry for entry in null_counts if entry[1] > 0]
print("\nNull Value Report")
print("-----------------")
print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

#
# Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay. The five split
# points below define four buckets (0-3):
# (-inf, -15), [-15, 0), [0, 30), [30, +inf)
#
from pyspark.ml.feature import Bucketizer

# Setup the Bucketizer
splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
)

# Save the model
arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

# Apply the model
ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

#
# Extract features with the tools in pyspark.ml.feature
#
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Turn category fields into indexes
string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
for column in string_columns:
    string_indexer = StringIndexer(
        inputCol=column,
        outputCol=column + "_index"
    )

    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)

    # Save the fitted indexer, one file per column
    string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
        base_path,
        column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)

# Combine continuous, numeric fields with indexes of nominal ones
# ...into one feature vector
numeric_columns = [
    "DepDelay", "Distance",
    "DayOfYear", "CRSDepHourOfDay",
    "CRSArrHourOfDay"]
index_columns = [column + "_index" for column in string_columns]
vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
)
final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

# Save the numeric vector assembler
vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
vector_assembler.write().overwrite().save(vector_assembler_path)

# Drop the index columns
for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)

# Inspect the finalized features
final_vectorized_features.show()

#
# Cross validate, train and evaluate classifier: loop `split_count` times
# for 4 metrics
#
from collections import defaultdict

# Imported once here rather than on every iteration of the loop below.
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

scores = defaultdict(list)
feature_importances = defaultdict(list)
metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
split_count = 3

# Every split overwrites the same saved model, so the path is loop-invariant.
model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
    base_path
)

for i in range(1, split_count + 1):
    print("\nRun {} out of {} of test/train splits in cross validation...".format(
        i,
        split_count,
    )
    )

    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])

    # Instantiate and fit random forest classifier on the training split
    rfc = RandomForestClassifier(
        featuresCol="Features_vec",
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        maxBins=4896,
    )
    model = rfc.fit(training_data)

    # Save the new model over the old one
    model.write().overwrite().save(model_output_path)

    # Evaluate model using test data
    predictions = model.transform(test_data)

    # Evaluate this split's results for each metric
    for metric_name in metric_names:
        evaluator = MulticlassClassificationEvaluator(
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            metricName=metric_name
        )
        score = evaluator.evaluate(predictions)

        scores[metric_name].append(score)
        print("{} = {}".format(metric_name, score))

    #
    # Collect feature importances
    #
    feature_names = vector_assembler.getInputCols()
    feature_importance_list = model.featureImportances
    for feature_name, feature_importance in zip(feature_names, feature_importance_list):
        feature_importances[feature_name].append(feature_importance)

#
# Evaluate average and STD of each metric and print a table
#
import numpy as np

score_averages = defaultdict(float)

# Compute the table data: one (metric, average, std) row per metric
average_stds = []
for metric_name in metric_names:
    metric_scores = scores[metric_name]

    average_accuracy = sum(metric_scores) / len(metric_scores)
    score_averages[metric_name] = average_accuracy

    std_accuracy = np.std(metric_scores)

    average_stds.append((metric_name, average_accuracy, std_accuracy))

# Print the table
print("\nExperiment Log")
print("--------------")
print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

#
# Persist the score to a score log that exists between runs
#
import pickle

# NOTE: pickle executes code on load; these log files must only ever be the
# ones this script itself wrote under base_path.
score_log_filename = "{}/models/score_log.pickle".format(base_path)

# Load the score log or initialize an empty one. A missing file (IOError) and
# an empty/truncated file left by an interrupted run (EOFError) both mean
# "no usable history".
try:
    with open(score_log_filename, "rb") as score_log_file:
        score_log = pickle.load(score_log_file)
    if not isinstance(score_log, list):
        score_log = []
except (IOError, EOFError):
    score_log = []

# Compute this run's score log entry
score_log_entry = {
    metric_name: score_averages[metric_name] for metric_name in metric_names
}

# Compute and display the change in score for each metric. With no previous
# entry the run is compared against itself, so every delta is zero.
try:
    last_log = score_log[-1]
except (IndexError, TypeError, AttributeError):
    last_log = score_log_entry

experiment_report = []
for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    experiment_report.append((metric_name, run_delta))

print("\nExperiment Report")
print("-----------------")
print(tabulate(experiment_report, headers=["Metric", "Score"]))

# Append this run's average scores to the log
score_log.append(score_log_entry)

# Persist the log for next run
with open(score_log_filename, "wb") as score_log_file:
    pickle.dump(score_log, score_log_file)

#
# Analyze and report feature importance changes
#

# Compute averages for each feature
feature_importance_entry = defaultdict(float)
for feature_name, value_list in feature_importances.items():
    average_importance = sum(value_list) / len(value_list)
    feature_importance_entry[feature_name] = average_importance

# Sort the feature importances in descending order and print
import operator

sorted_feature_importances = sorted(
    feature_importance_entry.items(),
    key=operator.itemgetter(1),
    reverse=True
)

print("\nFeature Importances")
print("-------------------")
print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

#
# Compare this run's feature importances with the previous run's
#

feature_log_filename = "{}/models/feature_log.pickle".format(base_path)

# Load the feature importance log or initialize an empty one (same handling
# as the score log above in this block).
try:
    with open(feature_log_filename, "rb") as feature_log_file:
        feature_log = pickle.load(feature_log_file)
    if not isinstance(feature_log, list):
        feature_log = []
except (IOError, EOFError):
    feature_log = []

# Compute and display the change in score for each feature. With no previous
# entry, a copy of this run's values is used, so every delta is zero.
try:
    last_feature_log = feature_log[-1]
except (IndexError, TypeError, AttributeError):
    last_feature_log = defaultdict(float)
    for feature_name, importance in feature_importance_entry.items():
        last_feature_log[feature_name] = importance

# Compute the deltas
feature_deltas = {}
for feature_name in feature_importances.keys():
    run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
    feature_deltas[feature_name] = run_delta

# Sort feature deltas, biggest change first
sorted_feature_deltas = sorted(
    feature_deltas.items(),
    key=operator.itemgetter(1),
    reverse=True
)

# Display sorted feature deltas
print("\nFeature Importance Delta Report")
print("-------------------------------")
print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

# Append this run's average importances to the log
feature_log.append(feature_importance_entry)

# Persist the log for next run
with open(feature_log_filename, "wb") as feature_log_file:
    pickle.dump(feature_log, feature_log_file)
stderr=sys.stderr, prefix_output_with_timestamp=True) keras_estimator = hvd.KerasEstimator( backend=backend, store=store, model=model, optimizer=optimizer, loss=loss, metrics=['accuracy'], feature_cols=['features'], label_cols=['label_vec'], batch_size=args.batch_size, epochs=args.epochs, random_seed=1, inmemory_cache_all=True, verbose=1, callbacks=[keras.callbacks.TensorBoard(profile_batch=5)]) keras_model = keras_estimator.fit(train_df).setOutputCols(['label_prob']) # Evaluate the model on the held-out test DataFrame pred_df = keras_model.transform(test_df) argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType()) pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob)) evaluator = MulticlassClassificationEvaluator(predictionCol='label_pred', labelCol='label', metricName='accuracy') print('Test accuracy:', evaluator.evaluate(pred_df)) spark.stop()
bst_model_path = model_save_path + "_bst_model"

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
train_df, test_df = train_df.randomSplit([0.8, 0.2], seed=12345)

# Train with tuning and persist the resulting best model.
bst_model = train_with_tune(train_df)
bst_model.write().overwrite().save(bst_model_path)

# Use the best model obtained from training to predict on the test data.
# Each prediction row has a structure similar to:
#   features = Vectors.dense(...)
#   label=0,
#   rawPrediction=DenseVector([0.048, -0.048]),
#   probability=DenseVector([0.512, 0.488]),
#   prediction=0.0
loaded_bst_model = PipelineModel.load(bst_model_path)
# NOTE(review): `loaded_model` is not defined anywhere in this excerpt and
# `result` is not used below -- presumably it comes from earlier code;
# confirm it is not a typo for `loaded_bst_model`.
result = loaded_model.transform(train_df)
predict_result = loaded_bst_model.transform(test_df)
print("predicted sample :", predict_result.take(3))

# Evaluate the trained binary classifier (area under the ROC curve).
bin_eval = BinaryClassificationEvaluator()
auc_override = {bin_eval.metricName: "areaUnderROC"}
predict_metric = bin_eval.evaluate(predict_result, auc_override)
print("trained model test auc metric", predict_metric)

# Multiclass metrics for a closer look at classification quality; with no
# override the evaluator computes f1.
mm = MulticlassClassificationEvaluator()
f1 = mm.evaluate(predict_result)
accuracy = mm.evaluate(predict_result, {mm.metricName: "accuracy"})
precision = mm.evaluate(predict_result, {mm.metricName: "weightedPrecision"})
recall = mm.evaluate(predict_result, {mm.metricName: "weightedRecall"})

summary_values = (precision, recall, accuracy, f1)
print("predict trained model precision: %f, recall: %f, acc: %s, f1: %f " % summary_values)
"predictionLabel").setLabels(label_indexer.labels) """把数据集分成训练集和测试集""" training_data, test_data = df.randomSplit([0.7, 0.3]) """4 构建决策树分类模型""" # 导入需要的包 from pyspark.ml.classification import DecisionTreeClassificationModel, DecisionTreeClassifier from pyspark.ml.evaluation import MulticlassClassificationEvaluator # 训练决策树模型,这里我们可以通过setter的方法来设置决策树的参数,也可以用ParamMap来设置(具体的可以查看spark mllib官网)。 # 具体的可以设置的参数可以通过explainParams()来获取 dt_classifier = DecisionTreeClassifier().setLabelCol( "indexedLabel").setFeaturesCol("indexedFeatures") # 在pipeline中进行设置 pipeline_classifier = Pipeline().setStages( [label_indexer, feature_indexer, dt_classifier, label_converter]) # 进行决策树模型 model_classifier = pipeline_classifier.fit(training_data) # 进行预测 predictions_classifier = model_classifier.transform(test_data) # 查看部分预测的结果 predictions_classifier.select("predictionLabel", "label", "features").show(20) """5 评估决策树分类模型""" evaluator_classifier = MulticlassClassificationEvaluator().setLabelCol( "indexedLabel").setPredictionCol("prediction").setMetricName("accuracy") accuracy = evaluator_classifier.evaluate(predictions_classifier) print("Test Error: ", str(1.0 - accuracy)) tree_model_classifier = model_classifier.stages[2] print("Learned classification tree model:\n", str(tree_model_classifier.toDebugString)) spark.stop()
pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])

# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

# NOTE(review): metricName="precision" is only accepted by older Spark
# releases -- confirm against the Spark version this script targets.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="target_indexed",
    metricName="precision"
)

# Grid: the evaluator metric is pinned, and the tree depth is searched over
# the two values 10 and 20.
grid = (
    ParamGridBuilder()
    .baseOn([evaluator.metricName, "precision"])
    .addGrid(dt.maxDepth, [10, 20])
    .build()
)

# Progress messages use single-argument print(...) calls, which produce the
# same output on Python 2 and Python 3 (the bare `print "..."` statement is a
# syntax error on Python 3).
print("Grid is build")

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator)
print("CV Estimator is defined")

cv_model = cv.fit(dfTrain)
print("Model is fitted")

df_test_pred = cv_model.transform(dfTest)
print("Labels are predicted")

# Score the held-out predictions with the same evaluator.
print(evaluator.evaluate(df_test_pred))
data = vector_assembler.transform(data)
# data.show()

# Encode the `city` column as a numeric index, and prepare the inverse
# mapping that turns predicted indexes back into city names.
city_indexer = ft.StringIndexer(inputCol="city", outputCol="city_int")
label_indexer = city_indexer.fit(data)
label_converter = ft.IndexToString(
    inputCol="pred_int",
    outputCol="pred",
    labels=label_indexer.labels,
)

# 70% of the rows for training, 30% for testing.
train, test = data.randomSplit([0.7, 0.3])

# Define the random forest classifier.
classifier = RandomForestClassifier(
    labelCol="city_int",
    featuresCol="features",
    predictionCol="pred_int",
    maxDepth=8,
    maxBins=128,
    maxMemoryInMB=512,
    numTrees=50,
)

# Train the model and predict.
pipeline = Pipeline(stages=[label_indexer, classifier, label_converter])
model = pipeline.fit(train)
prediction = model.transform(test)

shown_columns = ["city", "city_int", "pred", "pred_int", "features"]
prediction.select(*shown_columns).show(100)

# Evaluation: accuracy of the predicted index against the true index.
evaluator = MulticlassClassificationEvaluator(
    predictionCol='pred_int',
    labelCol='city_int',
    metricName='accuracy',
)
score = evaluator.evaluate(prediction)
print("准确率: ", score)
# $example on$
# load data file.
inputData = spark.read.format("libsvm") \
    .load("data/mllib/sample_multiclass_classification_data.txt")

# generate the train/test split.
(train, test) = inputData.randomSplit([0.8, 0.2])

# instantiate the base classifier.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(train)

# score the model on test data.
predictions = ovrModel.transform(test)

# obtain evaluator.
# "accuracy" replaces the former "precision" metric name: this script uses a
# `spark` session (Spark 2.0+), where the evaluator no longer accepts
# "precision". For multiclass data the old metric was the same overall
# fraction of correct predictions.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test Error : " + str(1 - accuracy))
# $example off$

spark.stop()
"clicks_in_15", "clicks_in_16", "clicks_in_17", "clicks_in_18", "clicks_in_19" ], outputCol="features") vd = assembler.transform(df_4) training, test = vd.randomSplit([0.6, 0.4], 1234138471039) layers = [19, 5, 3] # create the trainer and set its parameters trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=1000, seed=1234138471039) # train the model model = trainer.fit(training) result = model.transform(test) predictionAndLabels = result.select("prediction", "label") evaluator = MulticlassClassificationEvaluator(metricName="accuracy") print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels))) params = model.explainParams() print(params) mlflow.log_param("seed", 1234138471039) mlflow.log_param("maxIter", 100) mlflow.log_param("blockSize", 1000) mlflow.log_metric("Test set accuracy", evaluator.evaluate(predictionAndLabels))
data_rdd = data_rdd.map(lambda x: convertToDataFrame(x)).cache()
data_df = spark.createDataFrame(data_rdd)

# Split the data into train and test
splits = data_df.randomSplit([0.6, 0.4])
train = splits[0]
test = splits[1]

# 3. training
# Single-argument print(...) gives the same output on Python 2 and Python 3;
# the bare `print '...'` statement is a syntax error on Python 3.
print('>>>>> training ')
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Three hidden layers of n_features units between the n_features-wide input
# and the n_classes-wide output.
mlp = MultilayerPerceptronClassifier(
    maxIter=1000,
    tol=1e-4,
    seed=1,
    layers=[n_features, n_features, n_features, n_features, n_classes],
    blockSize=100,
    stepSize=0.03,
    solver="l-bfgs",
    initialWeights=None)
model = mlp.fit(train)

# 4. compute accuracy on the test set
print('>>>>> testing ')
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Spark Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
train_data_indexed = si_model_training.transform(train_data) si_model_test = stringIndexer.fit(train_data) test_data_indexed = si_model_test.transform(test_data) # Random Forest model random_forest_class = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexed", seed=42) random_forest_model = random_forest_class.fit(train_data_indexed) # Prediction using Random Forest predicted_result = random_forest_model.transform(test_data_indexed) combined_result = predicted_result.select("label", "features", "prediction") combined_result.show(80, False) print 'Random Forest :: Weighted Pricision of model : ', weighted_precision.evaluate( predicted_result) print 'Random Forest :: Weighted Recall of model : ', weighted_recall.evaluate( predicted_result) ''' Random Forest :: Weighted Pricision of model : 0.776882205236 Random Forest :: Weighted Recall of model : 0.799712452997 ''' # Decision Tree model decision_tree_class = DecisionTreeClassifier(maxDepth=4, labelCol="indexed") decision_tree_model = decision_tree_class.fit(train_data_indexed) # Prediction using Decision Tree predicted_result = decision_tree_model.transform(test_data_indexed) combined_result = predicted_result.select("label", "features", "prediction") combined_result.show(80, False) print 'Decision Tree :: Weighted Pricision of model : ', weighted_precision.evaluate(
# Map predicted label indexes back to the original label values.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

# Pipeline order: label indexer, feature indexer, forest, label converter.
pipeline_stages = [labelIndexer, featureIndexer, rf, labelConverter]
pipeline = Pipeline(stages=pipeline_stages)

# Fitting the pipeline trains the forest and also runs the indexers.
model = pipeline.fit(trainingData)

# Score the held-out data.
predictions = model.transform(testData)

# Display a few example rows.
predictions.select("predictedLabel", "label", "features").show(5)

# Compare predictions with the true (indexed) labels; test error is
# 1 - accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# Index 2 is the position of `rf` in the stage list above.
rfModel = model.stages[2]
print(rfModel)  # summary only
# $example off$

spark.stop()
'LateAircraftDelay'], outputCol="features") output = assembler.transform(df2) # Spark's mllib directly cannot deal with "Yes" or "No" values so we used StringIndexer method. # isDelayIndex is created after transforming string values to O and 1. indexer = StringIndexer(inputCol="isDelay", outputCol="isDelayIndex") output_fixed = indexer.fit(output).transform(output) final_data = output_fixed.select("features",'isDelayIndex') train_data,test_data = final_data.randomSplit([0.3,0.7]) rfc = RandomForestClassifier(labelCol='isDelayIndex',featuresCol='features') rfc_model = rfc.fit(train_data) rfc_predictions = rfc_model.transform(test_data) acc_evaluator = MulticlassClassificationEvaluator(labelCol="isDelayIndex", predictionCol="prediction", metricName="accuracy") rfc_acc = acc_evaluator.evaluate(rfc_predictions) print("Here are the results!") print('A random forest ensemble had an accuracy of: {0:2.2f}%'.format(rfc_acc*100))
if __name__ == "__main__":
    # Obtain (or create) the session for this example.
    spark = (
        SparkSession
        .builder
        .appName("NaiveBayesExample")
        .getOrCreate()
    )

    # $example on$
    # Read the sample data set in libsvm format.
    libsvm_reader = spark.read.format("libsvm")
    data = libsvm_reader.load("data/mllib/sample_libsvm_data.txt")

    # 60/40 train/test split with a fixed seed (1234).
    splits = data.randomSplit([0.6, 0.4], 1234)
    train, test = splits[0], splits[1]

    # Configure the trainer: multinomial model with smoothing 1.0.
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # Fit on the training split.
    model = nb.fit(train)

    # Score the test split and report accuracy.
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    spark.stop()
#estima = NaiveBayes()
#grid = ParamGridBuilder().addGrid(5, [0, 2]).build()

# Choose the model.
lr = LogisticRegression(featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=20)

grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
# Original author's notes (translated from French):
#   the grid is built to find the best 'alpha' parameter for the model's
#   regularisation term: it is an 'elastic net'
#   max.iter is 30 by default; its value could be changed
#   so 30 values between 0 and 1 will be tried
#   alpha=0 is an L2 regularisation,
#   alpha=1 is an L1 regularisation
# NOTE(review): these notes do not match the code -- the grid above varies
# lr.maxIter over [0, 1], and `grid` is never passed to the CrossValidator
# below. Confirm the intended parameter before relying on it.

# Single-argument print(...) produces the same output on Python 2 and
# Python 3; the bare `print "..."` statement is a syntax error on Python 3.
print("Cross validation debut")

# Choose the evaluator.
# NOTE(review): metricName='precision' is only accepted by older Spark
# releases -- confirm against the Spark version this script targets.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName='precision')

# Cross validator intended to keep the best value of maxIter; it is
# constructed here but its fit() call is commented out below.
cv = CrossValidator(estimator=lr, evaluator=evaluator)
#cvModel = cv.fit(dfTrain)

# Train the model on the whole training set.
model = lr.fit(dfTrain)

# Compute the success rate on the test set.
resultat = evaluator.evaluate(model.transform(dfTest))

# Formatted into one string so the output matches the original two-item
# print statement (label, a space, then the value) on both interpreters.
print("{} {}".format("Pourcentage de bonne classification(0-1): ", resultat))

##Train NaiveBayes
#model=NaiveBayes.train(labeledRDD)
##broadcast the model
#mb=sc.broadcast(model)
#
#test,names=lf.loadUknown('./data/test')
#name_text=zip(names,test)
##for each doc :(name,text):
##apply the model on the vector representation of the text
##return the name and the class
#predictions=sc.parallelize(name_text).map(partial(Predict,dictionary=dict_broad.value,model=mb.value)).collect()
#
#output=file('./classifications.txt','w')