def testLogisticMLPipeline1(self):
    """Tokenize -> hashing-TF -> logistic-regression pipeline on a tiny
    two-class text corpus; the fitted model must classify a small
    held-out set perfectly (score == 1.0).
    """
    training = sqlCtx.createDataFrame([
        ("a b c d e spark", 1.0),
        ("b d", 2.0),
        ("spark f g h", 1.0),
        ("hadoop mapreduce", 2.0),
        ("b spark who", 1.0),
        ("g d a y", 2.0),
        ("spark fly", 1.0),
        ("was mapreduce", 2.0),
        ("e spark program", 1.0),
        ("a e c l", 2.0),
        ("spark compile", 1.0),
        ("hadoop software", 2.0)
    ], ["text", "label"])
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
    # NOTE(review): passing sqlCtx positionally matches the SystemML
    # mllearn LogisticRegression wrapper rather than pyspark.ml's — confirm.
    lr = LogisticRegression(sqlCtx)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    model = pipeline.fit(training)
    test = sqlCtx.createDataFrame([
        ("spark i j k", 1.0),
        ("l m n", 2.0),
        ("mapreduce spark", 1.0),
        ("apache hadoop", 2.0)], ["text", "label"])
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator()
    score = evaluator.evaluate(predictionAndLabels)
    # BUG FIX: failUnless is a long-deprecated unittest alias (removed in
    # Python 3.12); assertEqual also reports the actual score on failure.
    self.assertEqual(score, 1.0)
def main(sc, spark):
    """Train and evaluate a multinomial logistic-regression text classifier.

    :param sc: SparkContext passed through to load_corpus.
    :param spark: SparkSession passed through to load_corpus.
    """
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)
    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)
    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])
    # Create the classifier.
    # NOTE(review): featuresCol="tfidf" assumes make_vectorizer() writes a
    # "tfidf" column — TODO confirm.
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial",
        labelCol="indexedLabel", featuresCol="tfidf")
    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)
    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)
    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel",
        predictionCol="prediction",
        metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
    # NOTE(review): despite the name, stages[2] is the fitted
    # LogisticRegression stage, not a GBT model.
    gbtModel = model.stages[2]
    print(gbtModel)  # summary only
def RunRandomForest(tf, ctx): sqlContext = SQLContext(ctx) rdd = tf.map(parseForRandomForest) # The schema is encoded in a string. schema = ['genre', 'track_id', 'features'] # Apply the schema to the RDD. songDF = sqlContext.createDataFrame(rdd, schema) # Register the DataFrame as a table. songDF.registerTempTable("genclass") labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF) trainingData, testData = songDF.randomSplit([0.8, 0.2]) labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels) rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features") #rfc = SVMModel([.5, 10, 20], 5) #rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features") pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter]) model = pipeline.fit(trainingData) predictions = model.transform(testData) predictions.show() evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision") accuracy = evaluator.evaluate(predictions) print 'Accuracy of RandomForest = ', accuracy * 100 print "Test Error = ", (1.0 - accuracy) * 100
def textPredict(request):
    """Text classification / popularity prediction (endpoint 6).

    Trains a decision-tree classifier over a news-title corpus, then
    classifies the single (label, title) pair posted by the user.
    """
    label = request.POST['label']
    title = request.POST['title']
    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # Prepare the dataset and build TF-IDF feature vectors.
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    # Train the decision-tree model.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    # Evaluate on the held-out split.
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    # Score the single user-submitted news item, reusing the fitted IDF.
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)
    # Model evaluation.
    # NOTE(review): metricName="precision" is only accepted by Spark 1.x
    # evaluators — confirm the deployed Spark version.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))
    treeModel = model.stages[2]
    print(treeModel)
    sc.stop()
    # NOTE(review): django.shortcuts.render expects a template name as its
    # second argument — this call looks incomplete; verify.
    return render(request,{'resultList':resultList})
def sparking_your_interest():
    """Build n-gram and vocabulary features over the speeches corpus and
    train a RandomForest speaker classifier, printing its test error.
    """
    # NOTE(review): SQLContext.read is accessed on the class, not an
    # instance — confirm an active SQLContext is intended here.
    df = SQLContext.read.json('speeches_dataset.json')
    df_fillna = df.fillna("")
    print(df_fillna.count())
    print(df_fillna.printSchema())
    df_utf = call_utf_encoder(df)
    df_cleaned = call_para_cleanup(df_utf)
    print(df_cleaned)
    df_with_bigrams = call_ngrams(df_cleaned, 2)
    df_with_trigrams = call_ngrams(df_with_bigrams, 3)
    df_with_4grams = call_ngrams(df_with_trigrams, 4)
    # BUG FIX: the 5-gram and 6-gram stages were both built with n=4.
    df_with_5grams = call_ngrams(df_with_4grams, 5)
    df_with_6grams = call_ngrams(df_with_5grams, 6)
    df_with_vocab_score = call_speech_vocab(df_with_6grams)
    df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score, 100, '2grams')
    df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors, 100, '3grams')
    df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors, 100, '4grams')
    # BUG FIX: the assembler listed "2gramsfeatures" three times; use the
    # 2/3/4-gram feature columns (assumes tf_feature_vectorizer names its
    # output "<prefix>features" — TODO confirm).
    assembler = VectorAssembler(
        inputCols=["2gramsfeatures", "3gramsfeatures", "4gramsfeatures", "vocab_score"],
        outputCol="features")
    assembler_output = assembler.transform(df_with_4grams_idf_vectors)
    output = assembler_output.selectExpr('speaker', 'speech_id', 'para_cleaned_text', 'features')
    print(output.show())
    print(output.count())
    # Split via the underlying RDD, then rebuild DataFrames.
    output_tordd = output.rdd
    train_rdd, test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
    train_df = train_rdd.toDF()
    test_df = test_rdd.toDF()
    print(train_df)
    print(test_df)
    print('Train DF - Count: ')
    print(train_df.count())
    print('Test DF - Count: ')
    print(test_df.count())
    print("Initializing RF Model")
    labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=1000,
                                featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
    pipeline = Pipeline(stages=[labelIndexer, rf])
    # BUG FIX: the pipeline was fit on the *full* dataset ("output"),
    # leaking the test split into training; fit on train_df, matching the
    # labelIndexer above.
    model = pipeline.fit(train_df)
    print("Completed RF Model")
    predictions = model.transform(test_df)
    # NOTE(review): metricName="precision" is Spark 1.x only; Spark >= 2.0
    # expects e.g. "accuracy" or "weightedPrecision".
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
    rfModel = model.stages[1]
    print(rfModel)  # summary only
    print("Predictions: ")
    print(predictions.show())
def model(classifier, ftrain, fvalid, fprediction):
    """Train the named classifier on ftrain, predict fvalid, and write the
    predictions plus a timing line to text files.

    :param classifier: key into the classifiers dict below
        (currently only "RandomForestClassifier").
    :param ftrain: path to the training data.
    :param fvalid: path to the validation data.
    :param fprediction: NOTE(review): currently unused — confirm intent.
    """
    startTime = time.time()
    ctx = SparkContext(appName="model_on_Spark")
    sqlContext = SQLContext(ctx)
    logger = SparkLogger(ctx)
    logger.set_level('ERROR')
    # load and prepare training and validation data
    rawTrain, train = prepData(sqlContext, ctx, ftrain)
    rawValid, valid = prepData(sqlContext, ctx, fvalid)
    # is needed to join columns
    valid = indexData(valid)
    rawValid = indexData(rawValid)
    classifiers = {
        "RandomForestClassifier" : RFC
    }
    clf = classifiers[classifier]()
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")
    # train and predict
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
    model = pipeline.fit(train)
    predictions = model.transform(valid)
    # write to file:
    subsetPrediction = predictions.select("prediction", "index")
    subsetValidData = rawValid.select("dataset", "index")
    # Both join sides contribute an "index" column, hence the double drop.
    output = (subsetValidData
               .join(subsetPrediction, subsetPrediction.index == subsetValidData.index)
               .drop("index")
               .drop("index"))
    # NOTE(review): DataFrame.map exists only on Spark 1.x; Spark 2.x
    # would need output.rdd.map(toCSVLine).
    lines = output.map(toCSVLine)
    lines.saveAsTextFile('output')
    # NOTE(review): metricName="precision" is Spark 1.x only.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print "Test Error = %g" % (1.0 - accuracy)
    # Persist wall-clock timing for this run.
    executionTime = time.time() - startTime
    row = classifier+','+str(executionTime)
    ctx.parallelize([row]).saveAsTextFile("timing")
def build_decision_tree(sqlContext, features, interested):
    """Train a tree-ensemble classifier over the given feature rows.

    :param sqlContext: SQLContext used to build the DataFrame.
    :param features: list of per-sample feature vectors.
    :param interested: list of labels, parallel to features.
    :return: (test error, fitted PipelineModel) tuple.
    """
    print '-----------------------------------------'
    data = sqlContext.createDataFrame(
        [Row(label=interested[i],features=Vectors.dense(features[i])) for i in xrange(len(features))])
    data.printSchema()
    data.show(5)
    print 'created data frame'
    # Index the label column & adding metadata.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
    print 'created label indexer'
    # Mark the features with < 4 distinct values as categorical
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)
    # Split the data into training and test sets
    (trainingData, testData) = data.randomSplit([0.8, 0.2])
    # Train a DecisionTree model
    # NOTE(review): despite the function name, a RandomForestClassifier is
    # used; single-tree and GBT variants are kept commented for reference.
    dt = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    # dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    # dt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)
    # Chain the indexers together with DecisionTree
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    # Train the model
    model = pipeline.fit(trainingData)
    # Make predictions
    predictions = model.transform(testData)
    predictions.select("prediction", "indexedLabel", "features").show(5)
    # Select (prediction, true label) & compute test error
    # NOTE(review): metricName="precision" is Spark 1.x only.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    precision = evaluator.evaluate(predictions)
    # treeModel is assigned but unused; kept as in the original.
    treeModel = model.stages[2]
    return (1 - precision, model)
def naiveBayeseian():
    """Train a classifier on the two-class CSV feature set and print its
    test precision.

    NOTE(review): despite the name, this trains a multilayer perceptron,
    not Naive Bayes.
    """
    def parseLine(line):
        # Parse a CSV row into a list of floats (label assumed first).
        keys = [float(x) for x in line.split(",")]
        #return LabeledPoint(keys[0],keys[1:])
        return keys
    scdata1 = sc.textFile("/home/ubantu/TwoClassfeatureSet.csv")
    data= scdata1.map(parseLine)
    splits = data.randomSplit([0.8, 0.2], 1234)
    train = splits[0]
    test = splits[1]
    # layers: 30 inputs, two hidden layers of 20 units, 2 output classes.
    layers = [30, 20, 20, 2]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model
    # NOTE(review): train/test are RDDs of plain float lists, but
    # pyspark.ml estimators require a DataFrame with label/features
    # columns — this likely fails as written; confirm.
    model = trainer.fit(train)
    # compute precision on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    # NOTE(review): metricName="precision" is Spark 1.x only.
    evaluator = MulticlassClassificationEvaluator(metricName="precision")
    print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
def print_evaluation_metrics(model, test_df, labelCol="label", featuresCol="features"): """ Prints evaluation metrics. :param model: Used model. :param test_df: dataframe containing test data. :param labelCol: label column. :param featuresCol: features column. :return: A DataFrame. """ predictions = model.transform(test_df) # Select (prediction, true label) and compute test error evaluator = MulticlassClassificationEvaluator( labelCol=labelCol, predictionCol="prediction",) accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}) f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"}) weighted_precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"}) weighted_recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}) print "Accuracy:", accuracy print "f1:", f1 print "Precision:", weighted_precision print "Recall:", weighted_recall
def price_predict(path, windows=5, spark_contest=None, sql_context=None):
    """Train MLP classifiers on n-day open/close price history and print
    their evaluation scores.

    :param path: input data file path, forwarded to DataParser.
    :param windows: history window size in days.
    :param spark_contest: SparkContext (created via load_spark_context if None).
    :param sql_context: SQLContext (created alongside spark_contest).
    """
    if spark_contest is None:
        spark_contest, sql_context = load_spark_context()
    input_data = DataParser(path=path, window_size=windows)
    close_train_df, close_test_df, open_train_df, open_test_df = input_data.get_n_days_history_data(
        data_type=DATA_FRAME, spark_context=spark_contest, sql_context=sql_context)
    # NOTE(review): metricName=PREDICTION looks wrong — metricName expects
    # an evaluator metric such as "accuracy"/"f1", and the prints below
    # say "Precision"; verify what the PREDICTION constant holds.
    evaluator = MulticlassClassificationEvaluator(metricName=PREDICTION)

    # handle open data
    # NOTE(review): maxIter=1 (vs. 100 for the close model below) looks
    # like a debugging leftover — confirm.
    open_trainer = MultilayerPerceptronClassifier(maxIter=1, layers=[4, 5, 4, 3], blockSize=128,
                                                  featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    open_model = open_trainer.fit(open_train_df)
    open_result = open_model.transform(open_test_df)
    open_prediction_labels = open_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(open_prediction_labels)))

    # handle close data
    close_trainer = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3], blockSize=128,
                                                   featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    close_model = close_trainer.fit(close_train_df)
    close_result = close_model.transform(close_test_df)
    close_prediction_labels = close_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(close_prediction_labels)))
def calculate_accuracy_metrics(predictions):
    """
    Calculates accuracy metrics for a Prediction DataFrame

    :param predictions: DataFrame with binary "indexedLabel" (1.0/0.0) and
        "prediction" columns.
    :return: [accuracy, recall, #positives, #negatives,
        #false positives, #false negatives]
    """
    # NOTE(review): "precision"/"recall" metric names exist only on
    # Spark 1.x; Spark >= 2.0 expects "weightedPrecision"/"weightedRecall".
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction")
    accuracy = round(evaluator.evaluate(predictions, {evaluator.metricName: "precision"}), 2)
    recall = round(evaluator.evaluate(predictions, {evaluator.metricName: "recall"}), 2)
    positive_cases = predictions.filter(predictions["indexedLabel"] == 1.0)
    negative_cases = predictions.filter(predictions["indexedLabel"] == 0.0)
    # BUG FIX: the false-positive filter referenced positive_cases'
    # "prediction" column while filtering negative_cases, mixing columns
    # from two different DataFrames; each subset must filter on its own
    # column.
    false_positive_cases = negative_cases.filter(negative_cases["prediction"] == 1.0)
    false_negative_cases = positive_cases.filter(positive_cases["prediction"] == 0.0)
    return [accuracy, recall, positive_cases.count(), negative_cases.count(),
            false_positive_cases.count(), false_negative_cases.count()]
##################### Preprocessing #####################
# PCA: project "features" down to d principal components.
pca = PCA(k=d, inputCol="features", outputCol="pca")

##################### Random Forest #####################
# Train a Random Forest model on the PCA output.
# BUG FIX: the keyword was misspelled "minInstancePerNode"; the pyspark
# parameter is "minInstancesPerNode", so the old call raised a TypeError.
rf = RandomForestClassifier(labelCol="label", featuresCol="pca", \
                            numTrees=n, seed=1234, maxDepth=30, \
                            minInstancesPerNode=5)

##################### Pipelined Model #####################
# build pipelined model with train data
pipeline_rf = Pipeline(stages=[pca, rf])
model_rf = pipeline_rf.fit(train_df)

##################### Prediction #####################
# make predictions
result_rf = model_rf.transform(test_df)

##################### Evaluation #####################
# compute accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(result_rf)
print("\n+-------------------+")
print("| Accuracy = %.2f%% |" % (100 * accuracy))
print("+-------------------+\n")
# Parallelize the training matrix and split 60/40.
data = sc.parallelize(Xtrain)
print("\nSplitting data into 60","%"," training and 40","%","testing")
training_data, testing_data = data.randomSplit([0.6, 0.4], seed=0)
vectorizedData = training_data.toDF()
print("Creating MultilayerPerceptronClassifier...")
MLP = MultilayerPerceptronClassifier(labelCol='indexedLabel', featuresCol='indexedFeatures')
labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(vectorizedData)
# NOTE(review): the feature indexer is fit on the *full* dataset
# (data.toDF()), not just the training split — mild train/test leakage;
# confirm intended.
featureIndexer = VectorIndexer(inputCol='features', outputCol='indexedFeatures',
                               maxCategories=2).fit(data.toDF())
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, MLP])
# Grid over a single hidden layer of 200 or 500 units
# (3072 inputs -> hidden -> 10 classes).
paramGrid_MLP = ParamGridBuilder().addGrid(MLP.layers,[[3072, neuron, 10] for neuron in [200, 500]]).build()
evaluator = MulticlassClassificationEvaluator(labelCol='indexedLabel', predictionCol='prediction', metricName='f1')
print("Processing crossvalidation with 3-fold & 200/500 hidden layer units")
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid_MLP,
                          evaluator=evaluator,
                          numFolds=3)
starttime = datetime.datetime.now()
CV_model = crossval.fit(vectorizedData)
# NOTE: Python 2 print statement below.
print CV_model.bestModel.stages[2]
print('Done on fitting model:%s'%(datetime.datetime.now()-starttime))
print("Transforming testing data...")
vectorized_test_data = testing_data.toDF()
#transformed_data1 = CV_model.transform(vectorizedData)
#print evaluator.getMetricName(), 'accuracy:', evaluator.evaluate(transformed_data1)
# NOTE(review): the line below is the tail of a select(...) call whose
# start is truncated above.
"hours-per-week")
data.show()
# Assemble every column after the first (assumed to be the label) into a
# single "features" vector.
assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features")
data = assembler.transform(data)
data.show()
# Splitting the data into training and data set
training, test = data.select("label", "features").randomSplit([0.70, 0.30])
# Create Random Forest model and fit the model with training dataset
rf = RandomForestClassifier()
model = rf.fit(training)
# Generate prediction from test dataset
pred = model.transform(test)
# Evaluate the accuracy of the model
# NOTE(review): the evaluator's default metric is f1, though the print
# below labels it "Accuracy" — confirm intended.
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(pred)
# Show model accuracy
print("Accuracy:", accuracy)
# Report
predictionAndLabels = pred.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)
print("Confusion Matrix:", metrics.confusionMatrix())
# NOTE(review): no-argument precision()/recall()/fMeasure() are deprecated
# on Spark 2.x in favour of MulticlassMetrics.accuracy.
print("Precision:", metrics.precision())
print("Recall:", metrics.recall())
print("F-measure:", metrics.fMeasure())
# Feature engineering: index the loan "purpose", one-hot it, and assemble
# all model inputs into a single "features" vector.
si = StringIndexer(inputCol="purpose", outputCol="purpose_index")
hot = OneHotEncoder(inputCol="purpose_index", outputCol="purpose_features")
va = VectorAssembler(inputCols=["loan_amnt", "interest_rate", "employment_length", "home_owner",
                                "income", "verified", "open_accts", "credit_debt", "purpose_features"],
                     outputCol="features")
# NOTE(review): dtr and gbr are constructed but never added to the
# pipeline (only gbc is used) — confirm they are intentional leftovers.
dtr = DecisionTreeRegressor(featuresCol="features", labelCol="default", predictionCol="prediction",
                            maxDepth=2, varianceCol="variance")
gbr = GBTRegressor(featuresCol="features", labelCol="default", predictionCol="prediction",
                   maxDepth=5, maxBins=32, maxIter=20, seed=12345)
gbc = GBTClassifier(featuresCol="features", labelCol="default", predictionCol="prediction",
                    maxDepth=5, maxIter=20, seed=12345)
pipeline = Pipeline(stages=[si, hot, va, gbc])
model = pipeline.fit(training)
# Persist the fitted pipeline to HDFS.
model.write().overwrite().save('hdfs:///tmp/spark_model')
predictions = model.transform(testing)
predictions.select(['default','prediction']).sort(col('prediction').desc()).show(25,False)
#evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="default")
#rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
#r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
#evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="default")
#evaluator.evaluate(predictions)
#evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
# NOTE(review): the evaluate() result below is discarded — it only
# displays in a notebook/REPL.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="default")
evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
#ZEND
# $example on$
# load data file.
inputData = spark.read.format("libsvm") \
    .load("data/mllib/sample_multiclass_classification_data.txt")

# generate the train/test split.
(train, test) = inputData.randomSplit([0.8, 0.2])

# instantiate the base classifier.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(train)

# score the model on test data.
predictions = ovrModel.transform(test)

# obtain evaluator.
# BUG FIX: "precision" was removed from MulticlassClassificationEvaluator
# in Spark 2.0 (and the Python OneVsRest API used above is 2.0+), so
# request "accuracy" instead.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test Error : " + str(1 - accuracy))
# $example off$

spark.stop()
# Fit the pipeline pipelined_data = pipeline.fit(df) transformed_data = pipelined_data.transform(df) training_set, test_set = transformed_data.randomSplit([0.8, 0.2], seed=10) # Create the model, train and predict nb = NaiveBayes(smoothing=1.0, modelType="multinomial", featuresCol='TF', labelCol='race') training_set.cache() model = nb.fit(training_set) predictions = model.transform(test_set) # Evaluate the results evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='race') result = predictions.select('race', 'prediction') result_rdd = result.rdd metrics = MulticlassMetrics(result_rdd) print("Naive Bayes model evaluation") print("F score: {}".format(evaluator.evaluate(result))) print(metrics.confusionMatrix()) for k, v in race_to_number.iteritems(): print("F score for {}: {}".format(k, metrics.fMeasure(v))) print("Precision: {}".format( evaluator.evaluate(result, {evaluator.metricName: 'precision'}))) print("Contigency table of the prediction results of naive bayes model") result.stat.crosstab('prediction', 'prediction').show() print("Predictions for naive bayes: {}".format(
# set seed for reproducibility and Split Data in 80-20% for Train and Test Data Set
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

#Logistic Regression Classification
lr = LogisticRegression(maxIter=25, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
# Show the 10 class-0 predictions the model is most confident about.
predictions.filter(predictions['prediction'] == 0).select("text","index","probability","label","prediction").orderBy("probability", ascending=False).show(n = 10, truncate = 30)

# BUG FIX: without metricName the evaluator's default metric is f1, but
# the prints below report the value as accuracy/test error — request
# "accuracy" explicitly.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error for Logistic Regression :" + str((1.0 - accuracy)*100)+ "%")
print("Test Accuracy for Logistic Regression :" + str((accuracy)*100)+ "%")

# One evaluator reused for the remaining metrics via setMetricName.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction" ,metricName='f1')
f1 = evaluator.setMetricName("f1").evaluate(predictions)
weightedPrecision = evaluator.setMetricName("weightedPrecision").evaluate(predictions)
weightedRecall = evaluator.setMetricName("weightedRecall").evaluate(predictions)
accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)
print("Test weightedRecall for Logistic Regression :" + str(weightedRecall))
print("Test weightedPrecision for Logistic Regression :" + str(weightedPrecision))
print("Test f1 score for Logistic Regression :" + str(f1))
bst_model_path = model_save_path + "_bst_model"
train_df, test_df = train_df.randomSplit([0.8, 0.2], seed=12345)
bst_model = train_with_tune(train_df)
bst_model.write().overwrite().save(bst_model_path)

# Use the best tuned model to predict the test data.
# Each prediction row looks roughly like:
# features = Vectors.dense(...)
# label=0,
# rawPrediction=DenseVector([0.048, -0.048]),
# probability=DenseVector([0.512, 0.488]),
# prediction=0.0
loaded_bst_model = PipelineModel.load(bst_model_path)
# BUG FIX: this line referenced the undefined name "loaded_model"
# (NameError at runtime); the reloaded pipeline is "loaded_bst_model".
result = loaded_bst_model.transform(train_df)
predict_result = loaded_bst_model.transform(test_df)
print("predicted sample :", predict_result.take(3))

# Evaluate the trained binary-classification model.
bin_eval = BinaryClassificationEvaluator()
predict_metric = bin_eval.evaluate(predict_result, {bin_eval.metricName: "areaUnderROC"})
print("trained model test auc metric", predict_metric)

# Detailed multiclass metrics; the evaluator default is f1.
mm = MulticlassClassificationEvaluator()
f1 = mm.evaluate(predict_result)
accuracy = mm.evaluate(predict_result, {mm.metricName: "accuracy"})
precision = mm.evaluate(predict_result, {mm.metricName: "weightedPrecision"})
recall = mm.evaluate(predict_result, {mm.metricName: "weightedRecall"})
print("predict trained model precision: %f, recall: %f, acc: %s, f1: %f " \
      % (precision, recall, accuracy, f1))
# Decision-tree *regression* on the "engine-type" column.
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'engine-type')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    labelCol="engine-type", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# Random-forest classification on the same label.
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
rf = RandomForestClassifier(labelCol="engine-type",featuresCol="features",numTrees = 100,maxDepth = 4,maxBins = 32)
# Train model with Training Data
rfModel = rf.fit(train_df)
predictions = rfModel.transform(test_df)
# NOTE(review): the evaluate() results below are neither stored nor
# printed — they only display in a notebook/REPL.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="engine-type")
evaluator.evaluate(predictions)

# Naive Bayes on the same label.
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1,labelCol="engine-type",featuresCol="features")
model = nb.fit(train_df)
predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="engine-type")
evaluator.evaluate(predictions)
def transform_predictions(dataframe, spark):
    """Train four classifiers (RF, DT, LR, GBT) on the COVID exam dataset
    and return a one-column DataFrame with their test accuracies.

    :param dataframe: raw patient DataFrame.
    :param spark: SparkSession used to build the result DataFrame.
    :return: DataFrame of four FloatType rows, in RF/DT/LR/GBT order.
    """
    # Drop the admission-outcome columns; only the exam result is predicted.
    df_transformed = dataframe.drop("Patient addmited to regular ward (1=yes, 0=no)",
                                    "Patient addmited to semi-intensive unit (1=yes, 0=no)",
                                    "Patient addmited to intensive care unit (1=yes, 0=no)")
    df_transformed_no_missing = dismiss_missing_values(df_transformed)
    # build the dataset to be used as a rf_model base
    outcome_features = ["SARS-Cov-2 exam result"]
    required_features = ['Hemoglobin', 'Hematocrit', 'Platelets', 'Eosinophils', 'Red blood Cells',
                         'Lymphocytes', 'Leukocytes', 'Basophils', 'Monocytes']
    assembler = VectorAssembler(inputCols=required_features, outputCol='features')
    model_data = assembler.transform(df_transformed_no_missing)
    # split the dataset into train/test subgroups
    (training_data, test_data) = model_data.randomSplit([0.8, 0.2], seed=2020)
    # Random Forest classifier
    # NOTE(review): the label column feeds classifiers directly, so it is
    # assumed numeric after dismiss_missing_values — confirm.
    rf = RandomForestClassifier(labelCol='SARS-Cov-2 exam result', featuresCol='features', maxDepth=5)
    rf_model = rf.fit(training_data)
    rf_predictions = rf_model.transform(test_data)
    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    rf_accuracy = multi_evaluator.evaluate(rf_predictions)
    # Decision Tree Classifier
    dt = DecisionTreeClassifier(featuresCol='features', labelCol='SARS-Cov-2 exam result', maxDepth=3)
    dt_model = dt.fit(training_data)
    dt_predictions = dt_model.transform(test_data)
    dt_predictions.select(outcome_features + required_features).show(10)
    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    dt_accuracy = multi_evaluator.evaluate(dt_predictions)
    # Logistic Regression Model
    lr = LogisticRegression(featuresCol='features', labelCol='SARS-Cov-2 exam result', maxIter=10)
    lr_model = lr.fit(training_data)
    lr_predictions = lr_model.transform(test_data)
    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    lr_accuracy = multi_evaluator.evaluate(lr_predictions)
    # Gradient-boosted Tree classifier Model
    gb = GBTClassifier(labelCol='SARS-Cov-2 exam result', featuresCol='features')
    gb_model = gb.fit(training_data)
    gb_predictions = gb_model.transform(test_data)
    multi_evaluator = MulticlassClassificationEvaluator(labelCol='SARS-Cov-2 exam result', metricName='accuracy')
    gb_accuracy = multi_evaluator.evaluate(gb_predictions)
    # Collect the four accuracies into a one-column float DataFrame.
    rdd = spark.sparkContext.parallelize([rf_accuracy, dt_accuracy, lr_accuracy, gb_accuracy])
    predictions_dataframe = spark.createDataFrame(rdd, FloatType())
    return predictions_dataframe
# NOTE(review): the line below is the tail of a StringIndexer(...) call
# whose start is truncated above.
handleInvalid='error')
indexer = stringIndexer.fit(df_tf_idf)
df_tf_idf_lab = indexer.transform(df_tf_idf).select('features', 'indexed')
df_tf_idf_lab.show()
# Split into training and test sets.
splits = df_tf_idf_lab.randomSplit([0.7, 0.3], 123)
train = splits[0]
test = splits[1]
# Define the model.
nb = NaiveBayes(featuresCol='features', labelCol='indexed', predictionCol='prediction',
                probabilityCol='probability', rawPredictionCol='rawPrediction',
                smoothing=1.0, modelType='multinomial')
# Train the model.
model = nb.fit(train)
# Predict on the test set.
predictions = model.transform(test)
predictions.show()
# Compute accuracy.
evaluator = MulticlassClassificationEvaluator(labelCol='indexed', predictionCol='prediction',
                                              metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy =" + str(accuracy))
# Index the categorical "Embarked" column.
indexer = StringIndexer(inputCol="Embarked", outputCol="EmbarkedInd")
dataset = indexer.fit(dataset).transform(dataset)
#assemble features
assembler = VectorAssembler(inputCols=[
    "Age", "Pclass", "SexInd", "SibSp", "Parch", "Fare", "EmbarkedInd"
], outputCol="features")
dataset = assembler.transform(dataset)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2])
#MLP
layers = [7, 8, 4, 2] #input: 7 features; output: 2 classes
mlp = MultilayerPerceptronClassifier(maxIter=100, layers=layers, labelCol="Survived",
                                     featuresCol="features", blockSize=128, seed=0)
model = mlp.fit(trainingData)
result = model.transform(testData)
prediction_label = result.select("prediction", "Survived")
# Accuracy on the held-out split.
evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction",
                                              metricName="accuracy")
print("MLP test accuracy: " + str(evaluator.evaluate(prediction_label)))
data = data.withColumn('length', length(data['question_text'])) #選取建模需要的所有特徵,做成一個向量 assembler = VectorAssembler(inputCols=['question_tfidf', 'length'], outputCol='features') lgr = LogisticRegression(labelCol="target", featuresCol="features", maxIter=100) pipeline = Pipeline( stages=[tokenizer, remover, ngram, hashingTF, idf, assembler, lgr]) paramGrid = ParamGridBuilder().build() evaluator = MulticlassClassificationEvaluator(labelCol="target", metricName='f1') cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5) train = data.filter(data['target'].isNotNull()) (trainX, validation) = train.randomSplit([0.7, 0.3]) test = data.filter(data['target'].isNull()) model = cv.fit(trainX) results = model.transform(validation).select("qid", "target", "prediction") f1 = evaluator.evaluate(results) ### f1 = 0.90 ????
# MAGIC Automated MLflow tracking is enabled by default for: # MAGIC # MAGIC - Databricks Runtime 5.4 ML or above # MAGIC - Databricks Runtime 5.4 or above # MAGIC # MAGIC To enable it for earlier versions, set the `SparkSession` configuration flag `"spark.databricks.mlflow.trackMLlib.enabled"` to `"true"`. # COMMAND ---------- #spark.conf.set("spark.databricks.mlflow.trackMLlib.enabled", "true") # COMMAND ---------- # Define an evaluation metric. In this case, use "weightedPrecision", which is equivalent to 0-1 accuracy. from pyspark.ml.evaluation import MulticlassClassificationEvaluator evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", metricName="weightedPrecision") # COMMAND ---------- from pyspark.ml.tuning import CrossValidator, ParamGridBuilder # COMMAND ---------- grid = ParamGridBuilder() \ .addGrid(dtc.maxDepth, [2, 3, 4, 5, 6, 7, 8]) \ .addGrid(dtc.maxBins, [2, 4, 8]) \ .build() # COMMAND ---------- cv = CrossValidator(estimator=pipeline,
def model_accuracy(test_results):
    """Show a sample of scored rows, then return the evaluator's default
    metric (f1) computed over the predictions.
    """
    print("control entered model_accuracy ")
    preview_cols = ('Sentiment', 'length', 'stop_tokens', 'hash_token',
                    'idf_token', 'probability', 'prediction')
    # show() prints the rows itself and returns None, which is what the
    # outer print displays.
    print(test_results.select(*preview_cols).show(20))
    evaluator = MulticlassClassificationEvaluator()
    return evaluator.evaluate(test_results)
def main(base_path):
    """Train and evaluate a flight-delay random-forest classifier.

    Reads flight features from ``{base_path}/data/...json``, bucketizes
    arrival delay into classes, indexes/assembles features, cross-validates
    a RandomForestClassifier over several random splits, and persists the
    fitted transformers, the model, and score/feature-importance logs under
    ``{base_path}/models``.

    Args:
        base_path: root directory for data, models and logs.

    Note:
        Relies on a module-level ``tabulate`` import (not visible here).
    """
    APP_NAME = "train_spark_mllib_model.py"
    # If there is no SparkSession, create the environment
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql
        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
    #
    # Example input record:
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    # Explicit schema: avoids type inference and documents the input contract.
    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
    ])
    input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
        base_path
    )
    features = spark.read.json(input_path, schema=schema)
    features.first()  # force an eager read to fail fast on bad input

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn(
        "CRSDepHourOfDay", hour(features.CRSDepTime)
    )
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay", hour(features.CRSArrTime)
    )
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into on-time, slightly late, very late (0, 1, 2)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(
        splits=splits, inputCol="ArrDelay", outputCol="ArrDelayBucket"
    )
    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Extract features tools in with pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(
            inputCol=column, outputCol=column + "_index"
        )
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
        # Save the pipeline model
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column
        )
        string_indexer_model.write().overwrite().save(string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance",
        "DayOfYear",
        "CRSDepHourOfDay",
        "CRSArrHourOfDay"]
    index_columns = [column + "_index" for column in string_columns]
    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns,
        outputCol="Features_vec"
    )
    final_vectorized_features = vector_assembler.transform(ml_bucketized_features)
    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)
    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)
    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train and evaluate classifier: repeat split_count times
    # for 4 metrics
    #
    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3
    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".format(
            i,
            split_count,
        )
        )
        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])
        # Instantiate and fit random forest classifier on all the data
        from pyspark.ml.classification import RandomForestClassifier
        # maxBins must cover the cardinality of the largest indexed column.
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)
        # Save the new model over the old one
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
            base_path
        )
        model.write().overwrite().save(model_output_path)
        # Evaluate model using test data
        predictions = model.transform(test_data)
        # Evaluate this split's results for each metric
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name
            )
            score = evaluator.evaluate(predictions)
            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))
        #
        # Collect feature importances
        #
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names, feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Evaluate average and STD of each metric and print a table
    #
    import numpy as np
    score_averages = defaultdict(float)
    # Compute the table data
    average_stds = []  # (metric, mean, std) rows for the report
    for metric_name in metric_names:
        metric_scores = scores[metric_name]
        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy
        std_accuracy = np.std(metric_scores)
        average_stds.append((metric_name, average_accuracy, std_accuracy))
    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the score to a score log that exists between runs
    #
    import pickle
    # Load the score log or initialize an empty one
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []
    # Compute the existing score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name] for metric_name in metric_names
    }
    # Compute and display the change in score for each metric
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        # First run: compare against itself so all deltas are zero.
        last_log = score_log_entry
    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))
    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))
    # Append the existing average scores to the log
    score_log.append(score_log_entry)
    # Persist the log for next run
    pickle.dump(score_log, open(score_log_filename, "wb"))

    #
    # Analyze and report feature importance changes
    #
    # Compute averages for each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance
    # Sort the feature importances in descending order and print
    import operator
    sorted_feature_importances = sorted(
        feature_importance_entry.items(),
        key=operator.itemgetter(1),
        reverse=True
    )
    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with the previous run's
    #
    # Load the feature importance log or initialize an empty one
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []
    # Compute and display the change in score for each feature
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        # First run: seed the "previous" log with current values (zero deltas).
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance
    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
        feature_deltas[feature_name] = run_delta
    # Sort feature deltas, biggest change first
    import operator
    sorted_feature_deltas = sorted(
        feature_deltas.items(),
        key=operator.itemgetter(1),
        reverse=True
    )
    # Display sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))
    # Append the existing average deltas to the log
    feature_log.append(feature_importance_entry)
    # Persist the log for next run
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
# COMMAND ---------- display([list('0123456789'),truePositives].toDF()) # COMMAND ---------- testDF=spark.read.parquet('/mnt/adls/testset.parquet') # COMMAND ---------- predictions=lrModel.transform(testDF) # COMMAND ---------- from pyspark.ml.evaluation import MulticlassClassificationEvaluator evaluator = MulticlassClassificationEvaluator(predictionCol="prediction") evaluator.evaluate(predictions) # COMMAND ---------- !pip install Keras #!pip install tensorflow from __future__ import print_function import keras from keras.datasets import mnist from keras.models import Sequential from keras.layers import Dense, Dropout, Flatten from keras.layers import Conv2D, MaxPooling2D from keras import backend as K # COMMAND ----------
# NOTE(review): Python 2 fragment; the leading ")" closes a call whose start
# is outside this chunk.
)
# TF-IDF features over the stop-word-filtered tokens.
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)
pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])

# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************
# NOTE(review): metricName="precision" only exists in Spark 1.x; it was
# removed in Spark 2.0 — confirm the target Spark version.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="target_indexed",
    metricName="precision"
)
# Grid fixes the metric and searches only over tree depth.
grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()
print "Grid is build"
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator)
print "CV Estimator is defined"
cv_model = cv.fit(dfTrain)
print "Model is fitted"
df_test_pred = cv_model.transform(dfTest)
# Map numeric predictions back to the original string labels.
labelConverter = IndexToString().\
    setInputCol('prediction').\
    setOutputCol('predictedLabel').\
    setLabels(labelIndexer.labels)
trainingData, testData = data.randomSplit([0.7, 0.3])
# Build the decision-tree classifier and set its parameters.
dtClassifier = DecisionTreeClassifier().\
    setLabelCol('indexedLabel').\
    setFeaturesCol('indexedFeatures')
# Build the ML Pipeline and call fit() to train the model.
dtPipeline = Pipeline().\
    setStages([labelIndexer, featureIndexer, dtClassifier, labelConverter])
dtPipelineModel = dtPipeline.fit(trainingData)
dtPredictions = dtPipelineModel.transform(testData)
dtPredictions.select('predictedLabel', 'label', 'features').show(20)
# Default metric is "f1" since no metricName is set.
evaluator = MulticlassClassificationEvaluator().\
    setLabelCol('indexedLabel').\
    setPredictionCol('prediction')
dtAccuracy = evaluator.evaluate(dtPredictions)
print('决策树模型准确率:{}'.format(dtAccuracy))  # prediction accuracy of the model
# Inspect the trained decision-tree structure via toDebugString.
treeModelClassifier = dtPipelineModel.stages[2]
print('Learned classification tree model:\n' + str(treeModelClassifier.toDebugString))
# NOTE(review): Python 2 Databricks notebook fragment; prepareSubplot,
# generateROC, labelsAndScores* and irisTestPredictions* come from earlier
# (out-of-view) cells.
fig, axList = prepareSubplot(np.arange(0., 1.1, 0.1), np.arange(0., 1.1, 0.1),
                             figsize=(12., 5.), subplots=(1,2))
ax0, ax1 = axList
ax0.set_title('First Model', color='#999999')
ax1.set_title('Second Model', color='#999999')
generateROC(axList[0], labelsAndScores)
generateROC(axList[1], labelsAndScores2)
display(fig)

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# NOTE(review): the 'precision' metric only exists in Spark 1.x
# MulticlassClassificationEvaluator; it was removed in Spark 2.0.
metric = 'precision'

multiclassEval = MulticlassClassificationEvaluator()

multiclassEval.setMetricName(metric)
print 'Model one {0}: {1:.3f}'.format(metric, multiclassEval.evaluate(irisTestPredictions))
print 'Model two {0}: {1:.3f}\n'.format(metric, multiclassEval.evaluate(irisTestPredictions2))

# COMMAND ----------

# Show the evaluator's implementation for reference.
import inspect
print inspect.getsource(MulticlassClassificationEvaluator)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Using MLlib instead of ML
# MAGIC
# $example on$
# Read the multiclass sample data in LIBSVM format.
dataset = spark.read.format("libsvm")\
    .load("data/mllib/sample_multiclass_classification_data.txt")

# 60/40 train/test split with a fixed seed for reproducibility.
train_df, test_df = dataset.randomSplit([0.6, 0.4], 1234)

# Network topology: 4 inputs (features), two hidden layers of 5 and 4 units,
# and 3 outputs (classes).
layer_sizes = [4, 5, 4, 3]

# Build the multilayer perceptron trainer and fit it on the training split.
mlp = MultilayerPerceptronClassifier(maxIter=100, layers=layer_sizes, blockSize=128, seed=1234)
fitted_model = mlp.fit(train_df)

# Score the held-out split and report test-set accuracy.
scored = fitted_model.transform(test_df)
pred_and_labels = scored.select("prediction", "label")
acc_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(acc_evaluator.evaluate(pred_and_labels)))
# $example off$

spark.stop()
# NOTE(review): fragment — `idf`, `featurizedData` and the `time` import come
# from earlier in the original file.
# Finish the TF-IDF transform.
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()
# Encode string labels into the numeric "indexed" column.
forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit(
    rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
print(trainingData.take(1))
nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="indexed")
# Time the fit for a rough training-cost measurement.
start_time = time.time()
modelClassifier = nb.fit(trainingData)
end_time = time.time()
print(end_time - start_time)
predictionsClassifier = modelClassifier.transform(testData)
# One evaluator reused for all four metrics via per-call param overrides.
evaluator = MulticlassClassificationEvaluator().setLabelCol(
    "indexed").setPredictionCol("prediction")
print(
    "accuracy = ",
    evaluator.evaluate(predictionsClassifier,
                       {evaluator.metricName: "accuracy"}))
print(
    "weightedPrecision = ",
    evaluator.evaluate(predictionsClassifier,
                       {evaluator.metricName: "weightedPrecision"}))
print(
    "weightedRecall = ",
    evaluator.evaluate(predictionsClassifier,
                       {evaluator.metricName: "weightedRecall"}))
print("f1 = ",
      evaluator.evaluate(predictionsClassifier,
                         {evaluator.metricName: "f1"}))
# NOTE(review): fragment — `td` is built earlier in the original file.
(trainingData, testData) = td.randomSplit([0.7, 0.3])
trainingData.count()
testData.count()
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create the model
rmClassifer = RandomForestClassifier(labelCol="indexed",
                                     featuresCol="pcaFeatures", numTrees=100)
rmModel = rmClassifer.fit(trainingData)

# Predict on the test data
predictions = rmModel.transform(testData)
predictions.select("prediction", "indexed", "label", "pcaFeatures").collect()
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="indexed", metricName="accuracy")
evaluator.evaluate(predictions)

# Draw a confusion matrix (as a grouped count table)
predictions.groupBy("indexed", "prediction").count().show()

# Balance data set
from numpy.random import randint
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

RATIO_ADJUST = 2.0  ## ratio of pos to neg in the df_subsample
counts = trainingData.select('indexed').groupBy('indexed').count().collect()
higherBound = counts[0][1]
# NOTE(review): this statement continues past the end of this chunk.
TRESHOLD_TO_FILTER = int(RATIO_ADJUST * float(counts[1][1]) / counts[0][1] *
from pyspark.sql.types import FloatType #Extracting only the column with probability with column with 1's probability secondelement=udf(lambda v:float(v[1]),FloatType()) transformed.select(secondelement('probability')).show() #Dataframe column to list mvv_count_df.select('mvv').collect() #-------- creating saving loading model ------------------ rf = RandomForestClassifier(labelCol='label', featuresCol='features',numTrees=20) paramGrid = ParamGridBuilder().build()#ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01, 0.001, 0.0001]).build() #lr = LinearRegression() #paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [500]).addGrid(lr.regParam, [0]).addGrid(lr.elasticNetParam, [1]).build() pipeline_new = Pipeline(stages=[rf]) evaluator = MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("f1") #/setMetricName/ "f1" (default), "weightedPrecision", "weightedRecall", "accuracy" #evaluator = RegressionEvaluator(metricName="mae") crossval = CrossValidator(estimator=pipeline_new, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=10) model_new_rf = crossval.fit(trainingData) model_new_rf.bestModel model_new_rf.bestModel.save('rf_pipeline_model_saved') model_new_rf.avgMetrics #loading a saved model from pyspark.ml import PipelineModel loadedModel = PipelineModel.load("rf_pipeline_model_saved") #Checkpointing is a process of truncating RDD lineage graph and saving it to a reliable distributed (HDFS) or local file system. sc.setCheckpointDir("hdfs://hadoop-master:9000/data/checkpoint") df.repartition(100)
# Split the data into training and test sets (20% held out for testing).
# Alternative featurizer configurations kept from earlier experiments:
# multi_feat = MultiFeaturizer(spark, [wv, wv_tweet])
# feat_df = multi_feat.featurize(converted_df)
# converted_df2 = shape_df(spark, df, 'nagisa', ['補助記号']).drop("age")
# tfidf = TfidfFeaturizer(spark)
# feat_df = tfidf.featurize(converted_df2)
# onehot = OneHotFeaturizer(spark)
# feat_df = onehot.featurize(converted_df)
# multi_feat = MultiFeaturizer(spark, [wv_tweet, tfidf], [converted_df, converted_df2])
# feat_df = multi_feat.featurize()
(trainingData, testData) = feat_df.randomSplit([0.8, 0.2], seed=3)

# 3. Call `fit` (just pass in the data-frame prepared beforehand).
clf = model.fit(trainingData)

# BUG FIX: predictions must come from the *fitted* model (`clf`), not from
# the unfitted estimator `model` — estimators expose fit(), not transform().
predict_train = clf.transform(trainingData)
predict_test = clf.transform(testData)

# Select (prediction, true label) and compute training accuracy.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predict_train)
print("train accuracy: " + str(accuracy))

# Select (prediction, true label) and compute test accuracy.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predict_test)
print("test accuracy: " + str(accuracy))
# Train/evaluate a gradient-boosted-trees classifier on the assembled
# label/features columns, then cross-validate over maxIter.
data1 = output.select("label", "features")
(training, test) = data1.randomSplit([0.8, 0.2], seed = 12345)

# Earlier experiments kept for reference:
#gbt = GBTClassifier(numTrees = 10, maxDepth = 3, maxBins = 64)
#gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)
##rf = RandomForestClassifier(numTrees = 25, maxDepth = 4, maxBins = 64)

# BUG FIX: the original passed `impurityType = gini`, which raised a
# NameError (`gini` is an undefined name) and is not a GBTClassifier
# parameter in any case — Spark's GBT does not expose a gini impurity.
gbt = GBTClassifier(maxIter = 30, maxDepth = 2)

pipeline = Pipeline(stages=[gbt])
pipelineModel = pipeline.fit(training)
testPredictions = pipelineModel.transform(test)
testPredictions.select("prediction", "label", "features").show(5)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")#.setMetricName("accuracy")
# Override the metric per evaluate() call via a param map.
evaluatorParaMap = {evaluator.metricName: "f1"}
# NOTE(review): despite the variable names, these values are f1 scores,
# not AUC.
aucTest = evaluator.evaluate(testPredictions, evaluatorParaMap)

from pyspark.ml.tuning import *
paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [1,5]).build()
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid).setNumFolds(3)
cvModel = cv.fit(training)
cvPredictions = cvModel.transform(test)
cvAUCTest = evaluator.evaluate(cvPredictions, evaluatorParaMap)
print("pipeline Test AUC: %g" % aucTest)
# Train a multilayer perceptron on the iris dataset and report test accuracy.
import findspark
# Raw string avoids the invalid "\S" escape-sequence warning; same value.
findspark.init(r'D:\Spark')

from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('MultiLayer').getOrCreate()

# Iris CSV with no header: _c0.._c3 are the measurements, _c4 is the label.
data = spark.read.csv('iris.data', inferSchema=True, header=False)

from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['_c0', '_c1', '_c2', '_c3'],
                            outputCol='features')
final_data = assembler.transform(data)

# 60/40 train/test split.
splits = final_data.randomSplit([0.6, 0.4])
train = splits[0]
test = splits[1]

# 4 inputs -> hidden layers of 5 and 4 -> 3 classes.
layers = [4, 5, 4, 3]
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers,
                                         labelCol='_c4')
# BUG FIX: the model was fit on final_data (the full dataset), leaking the
# test rows into training; fit on the training split instead.
model = trainer.fit(train)
# NOTE(review): _c4 comes straight from the CSV; if it holds string class
# names it must be StringIndexer-encoded before fitting — confirm the file.
result = model.transform(test)
predictionAndLabels = result.select("prediction", "_c4")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy",
                                              labelCol='_c4')
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
def test_save_load_pipeline_estimator(self):
    """Round-trip a TrainValidationSplitModel through save/load — for both a
    flat and a nested Pipeline estimator — and verify stage UIDs survive."""
    temp_path = tempfile.mkdtemp()
    training = self.spark.createDataFrame([
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
        (4, "b spark who", 1.0),
        (5, "g d a y", 0.0),
        (6, "spark fly", 1.0),
        (7, "was mapreduce", 0.0),
    ], ["id", "text", "label"])
    # Configure an ML pipeline, which consists of three stages: tokenizer,
    # hashingTF, and lr (wrapped in OneVsRest).
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    ova = OneVsRest(classifier=LogisticRegression())
    # Two candidate classifiers for the grid to choose between.
    lr1 = LogisticRegression().setMaxIter(5)
    lr2 = LogisticRegression().setMaxIter(10)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100]) \
        .addGrid(ova.classifier, [lr1, lr2]) \
        .build()
    tvs = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator())
    # Run train validation split, and choose the best set of parameters.
    tvsModel = tvs.fit(training)
    # test save/load of the fitted model
    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
    self.assertEqual(len(loadedModel.bestModel.stages), len(tvsModel.bestModel.stages))
    for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                          tvsModel.bestModel.stages):
        self.assertEqual(loadedStage.uid, originalStage.uid)

    # Test nested pipeline
    nested_pipeline = Pipeline(
        stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
    tvs2 = TrainValidationSplit(
        estimator=nested_pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator())
    # Run train validation split, and choose the best set of parameters.
    tvsModel2 = tvs2.fit(training)
    # test save/load of the nested-pipeline model
    tvsModelPath2 = temp_path + "/tvsModel2"
    tvsModel2.save(tvsModelPath2)
    loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
    self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)
    # The inner pipeline is stage 1 of the outer one.
    loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
    original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
    self.assertEqual(loaded_nested_pipeline_model.uid,
                     original_nested_pipeline_model.uid)
    self.assertEqual(len(loaded_nested_pipeline_model.stages),
                     len(original_nested_pipeline_model.stages))
    for loadedStage, originalStage in zip(
            loaded_nested_pipeline_model.stages,
            original_nested_pipeline_model.stages):
        self.assertEqual(loadedStage.uid, originalStage.uid)
# Train a DecisionTree model. dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures") # Chain indexers and tree in a Pipeline pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt]) # Train model. This also runs the indexers. model = pipeline.fit(trainingData) # Make predictions. predictions = model.transform(testData) # Select (prediction, true label) and compute test error evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy") treeModel = model.stages[2] # summary only print(treeModel) hashingTF = HashingTF(inputCol="features", outputCol="features") regParam = 0.3 paramGrid = ParamGridBuilder() \ .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \ .addGrid(regParam, [0.1, 0.01]) \ .build() crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator,
# NOTE(review): Python 2 fragment; the first two statements were the body of
# an out-of-view argument-count check (usage message, then exit).
print >> sys.stderr, "%s <input> <model_path> <stop_file> class_num appname" % sys.argv[0]
sys.exit(1)
# Positional CLI arguments.
input_path = sys.argv[1]
model_path = sys.argv[2]
stop_file = sys.argv[3]
class_num = int(sys.argv[4])
appname = sys.argv[5]
conf = SparkConf().setAppName(appname)
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
data_df = text_to_df(sc, sqlContext, input_path)
print "*** create data frame ***"
# 80/20 train/test split with a fixed seed.
splits = data_df.randomSplit([0.8, 0.2], 1234)
training = splits[0].cache()
test = splits[1].cache()
stopwords = get_stopwords(stop_file)
print "*** load %s stopwords ***" % len(stopwords)
pipeline = get_pipeline(vector_size=50, class_num=class_num, stopwords=stopwords)
model = pipeline.fit(training)
result = model.transform(test)
pred_label = result.select("prediction", "indexLabel")
# NOTE(review): metricName="precision" only exists in Spark 1.x; removed in
# Spark 2.0 — consistent with the Python 2 prints above.
evaluator = MulticlassClassificationEvaluator(metricName="precision", predictionCol="prediction", labelCol="indexLabel")
print("Precision: " + str(evaluator.evaluate(pred_label)))
print(f"Test set length: {test.count()} records") # Cross validation # Cross-validation is a model validation technique for assessing how the results of a statistical analysis will generalize to an independent data set. It is mainly used in settings where the goal is prediction, and one wants to estimate how accurately a predictive model will perform in practice. from pyspark.ml.classification import RandomForestClassifier from pyspark.ml.tuning import ParamGridBuilder, CrossValidator from pyspark.ml.evaluation import MulticlassClassificationEvaluator # Create an initial RandomForest model. rf = RandomForestClassifier(labelCol="label", featuresCol="features") # Evaluate model rfevaluator = MulticlassClassificationEvaluator(metricName="f1") # Create ParamGrid for Cross Validation rfparamGrid = (ParamGridBuilder() .addGrid(rf.maxDepth, [2, 5, 10, 20, 30]) .addGrid(rf.maxBins, [10, 20, 40, 80, 100]) .addGrid(rf.numTrees, [5, 20, 50, 100, 500]) .build()) # Create 5-fold CrossValidator rfcv = CrossValidator(estimator = rf, estimatorParamMaps = rfparamGrid, evaluator = rfevaluator, numFolds = 5) # Run cross validations.
# NOTE(review): fragment — the first line closes an evaluator constructed in
# the out-of-view preceding chunk.
metricName="rmse")
logger.info('Regression train RMSE: %g' % lr_evaluator.evaluate(lr_pred_train))
logger.info('Regression test RMSE: %g' % lr_evaluator.evaluate(lr_pred_test))

# save pipeline
lr_model.save('/app/saved_models/lr_model')

# binary classification
# predict if post belongs to AskReddit
bc_pipeline = get_binary_classification_pipeline()
bc_model = bc_pipeline.fit(train_data)
bc_pred = bc_model.transform(test_data)
# Two evaluators so both accuracy and f1 can be logged.
bc_evaluator_acc = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="label", metricName="accuracy")
bc_evaluator_f1 = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                    labelCol="label",
                                                    metricName="f1")
logger.info('Binary classification test Accuracy: %g' % bc_evaluator_acc.evaluate(bc_pred))
logger.info('Binary classification test F1: %g' % bc_evaluator_f1.evaluate(bc_pred))

# save pipeline
bc_model.save('/app/saved_models/bc_model')

# multi-class classification
# predict post's subreddit
# Generated task script: load a test parquet file, apply a saved feature
# PipelineModel and then a saved CrossValidatorModel, score accuracy against
# "indexedSurvived", and write the single-row score out as CSV.
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark import SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf, lag, date_add, explode, lit, concat, unix_timestamp, sum, abs
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

sc = SparkContext(appName="MyFirstApp4_Task_task2")
spark = SparkSession(sc)

# Input data and the two previously trained models.
df_node18=spark.read.format("parquet").load(path="hdfs://namenode:9000/example4/test.parquet")
model_node21=CrossValidatorModel.load("hdfs://namenode:9000/example4/model_2/")
model_node19=PipelineModel.load("hdfs://namenode:9000/example4/model_1/")

# Feature pipeline first, then the tuned classifier.
df_node20=model_node19.transform(df_node18)
df_node22=model_node21.transform(df_node20)

# Accuracy of predictions against the indexed label column.
evaluator_node23 = MulticlassClassificationEvaluator(labelCol="indexedSurvived", predictionCol="prediction", metricName="accuracy")
score_node23=evaluator_node23.evaluate(df_node22)

# Persist the score; note save() treats this path as a directory name.
df_node23= spark.createDataFrame([(score_node23,)], ["score"])
df_node23.write.format("csv").save(path="hdfs://namenode:9000/example4/EvalResult3.csv")
#applying logistic regression using the "Text" to predict "Sentiment" lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0) lrModel = lr.fit(trainingData) predictions = lrModel.transform(testData) predictions.filter(predictions['prediction'] == 0) \ .select("Text","Sentiment","probability","label","prediction") \ .orderBy("probability", ascending=False) \ .show(n = 10, truncate = 30) # COMMAND ---------- #finding the accuracy from pyspark.ml.evaluation import MulticlassClassificationEvaluator evaluator = MulticlassClassificationEvaluator(predictionCol="prediction") evaluator.evaluate(predictions) # COMMAND ---------- #applying cross validation pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx]) pipelineFit = pipeline.fit(df) dataset = pipelineFit.transform(df) (trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100) lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0) from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Spark 1.x multilayer-perceptron example (uses sqlContext and the
# now-removed "precision" metric).
data = sqlContext.read.format("libsvm")\
    .load("data/mllib/sample_multiclass_classification_data.txt")
# Split the data into train and test
data.show()
data.printSchema()
data.select('features').show()
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
print (train.count())
train.show()
test = splits[1]
# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [4, 5, 4, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers,
                                         blockSize=128, seed=1234)
# train the model
model = trainer.fit(train)
# compute precision on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
# NOTE(review): metricName="precision" is valid only in Spark 1.x; Spark 2+
# would require "accuracy" (or "weightedPrecision") here.
evaluator = MulticlassClassificationEvaluator(metricName="precision")
print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
# $example off$
sc.stop()
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures") # Chain indexers and tree in a Pipeline pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt]) # Train model. This also runs the indexers. model = pipeline.fit(trainingData) # Make predictions. predictions = model.transform(testData) # Select example rows to display. predictions.select("prediction", "indexedLabel", "features").show(5) # Select (prediction, true label) and compute test error evaluator = MulticlassClassificationEvaluator( labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy") accuracy = evaluator.evaluate(predictions) print("Test Error = %g " % (1.0 - accuracy)) treeModel = model.stages[2] # summary only print(treeModel) # see for more: https://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-classifier # Churn - which customers (of a telecommunications company) are likely to stop using their service # Churn dataset provided by the UC Irvine machine-learning repository hosted by SGI # Data from https://www.sgi.com/tech/mlc/db/churn.all $ wget https://www.sgi.com/tech/mlc/db/churn.all # Classification - Random Forest
# Notebook cells: index the string labels, define a decision tree, an
# evaluator, and a cross-validated grid search over tree depth.
# Assumes `dfTrainSelect` (with 'label' and 'bigramVectors' columns) is
# built in earlier cells -- TODO confirm.
from pyspark.ml.feature import StringIndexer

# Map the string 'label' column onto a numeric 'target_indexed' column.
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect)

# In[329]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol='bigramVectors', labelCol='target_indexed', maxDepth=10)

# In[330]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# NOTE(review): metricName='precision' was removed from
# MulticlassClassificationEvaluator in Spark 2.0 (use 'accuracy' or
# 'weightedPrecision' there) -- confirm the Spark version this targets.
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

# In[331]:
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

# Grid over maxDepth; baseOn pins the evaluator metric for every grid point.
grid=(ParamGridBuilder()
      .baseOn([evaluator.metricName,'precision'])
      .addGrid(dt.maxDepth, [10,20])
      .build())

cv = CrossValidator(estimator=dt, estimatorParamMaps=grid,evaluator=evaluator)

# In[332]:
# Fit a logistic-regression classifier on the selected features and time
# each phase.
# Fix: Python 2 `print` statements are a SyntaxError on Python 3 (Python 2
# is EOL); converted to print() calls -- behaviour otherwise unchanged.
print("Done in {} second".format(round(tt,3)))

# In[18]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

print("Fitting the classifier on selected features")
t0 = time()

# Index the string labels, then fit LR on the selected feature vectors.
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='selectedFeatures',labelCol='target_indexed',maxIter=30, regParam=0.01)
# NOTE(review): metricName='precision' was removed from
# MulticlassClassificationEvaluator in Spark 2.0 -- confirm this targets
# Spark 1.x, otherwise use 'accuracy'.
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)

tt = time() - t0
print("Done in {} second".format(round(tt,3)))

# In[19]:
print("Testing precision of the model")
t0 = time()

# Vectorize the validation set with the same selected-bigram dictionary.
dfValidSelect=dfValid.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(['selectedFeatures','label']).cache()
spark = getSpark() df = spark.read.load('../dataset/merged/publisher/') df = df.withColumn('label', df._hyperpartisan.cast('integer')) testSet = spark.read.load('../dataset/merged/article/') testSet = testSet.withColumn('label', testSet._hyperpartisan.cast('integer')) hashingTF = HashingTF(inputCol="words", outputCol="rawfeatures", numFeatures=1000) idf = IDF(inputCol="rawfeatures", outputCol="features") #pca = PCA(k=1000, inputCol="rfeatures", outputCol="features") #lr = LogisticRegression(regParam=0.1, maxIter=20) lr = RandomForestClassifier(numTrees=20, maxDepth=5, seed=42) pipeline = Pipeline(stages=[hashingTF, idf, lr]) filename = "TFIDF-RF20-5" ev = MulticlassClassificationEvaluator(metricName='accuracy') crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=ParamGridBuilder().build(), evaluator=ev, numFolds=10, seed=42) model = crossval.fit(df) with open(filename,"a") as f: f.write(f"accuracy crossValidation: {max(model.avgMetrics)}\n") f.write(f"accuracy testSet : {ev.evaluate(model.transform(testSet))}\n") ev = MulticlassClassificationEvaluator(metricName='weightedPrecision') crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=ParamGridBuilder().build(),
# instantiate the base classifier. lr = LogisticRegression(featuresCol='tfidf', weightCol='weight', maxIter=10, tol=1E-6, fitIntercept=True) # train the multiclass model. model = lr.fit(train) # score the model on test data. predictions = model.transform(test) # obtain evaluator. evaluator = MulticlassClassificationEvaluator(metricName="f1") # compute the classification error on test data. f1 = evaluator.evaluate(predictions) print("f1 score = %g" % (f1)) # to show dataframe with predictions and probabilities #display(predictions) # to save predictions -- on Azure Databricks #predictions.write.save("/FileStore/lr_output.parquet") #################################### # One-vs-All (Logistic Regression) # ####################################
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Boot a Spark session for this standalone example.
    session = SparkSession.builder.appName("naive_bayes_example").getOrCreate()

    # $example on$
    # Load training data in LIBSVM format.
    data = session.read.format("libsvm") \
        .load("sample_libsvm_data.txt")

    # Split the data into train and test (seeded for reproducibility).
    train, test = data.randomSplit([0.6, 0.4], 1234)

    # Create the trainer and set its parameters.
    trainer = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # Train the model.
    nb_model = trainer.fit(train)

    # Compute accuracy on the test set.
    scored = nb_model.transform(test)
    pred_and_labels = scored.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Accuracy: " + str(evaluator.evaluate(pred_and_labels)))
    # $example off$

    session.stop()
# Select results to view
display(predictions.select("label", "prediction", "probability"))

# COMMAND ----------

# MAGIC %md
# MAGIC #### Model Evaluation
# MAGIC
# MAGIC To evaluate our model, we will be making use of the Evaluator in MulticlassClassification. Note that f1-score is the default metric for the MulticlassClassificationEvaluator.

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Fix: the metric was "precision" (removed from the evaluator in Spark 2.0)
# while both the variable name and the printed label say "accuracy";
# metricName="accuracy" makes the reported number match its label.
# Also converted the Python 2 print statement to a print() call.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Model Accuracy: ", accuracy)

# COMMAND ----------

# MAGIC %md
# MAGIC The Evaluator is able to use a few metrics such as f1-score, precision, recall, weightedPrecision and weightedRecall.
# MAGIC
# MAGIC evaluator.setMetricName("insert_metric_here") can be used to change the metric used to evaluate models.

# COMMAND ----------

evaluator.explainParam("metricName")

# COMMAND ----------
def run_MLA(XX,XXpredict,yy,yypredict,unique_IDS_tr,unique_IDS_pr,uniquetarget_tr,uniquetarget_pr,n_feat,ind_run_name,n_run):
    """Fit and score one machine-learning run, via pyspark or sklearn.

    XX/yy are the training array and answers; XXpredict/yypredict the
    prediction array and its true answers (the last column of each array
    carries the target).  Returns (result, feat_importance, probs, bias,
    contributions, accuracy, recall, precision, score, clf,
    train_contributions).

    NOTE(review): relies on module-level names defined elsewhere in the
    file: settings, logger, numpy, time, metrics, tree, ti, os,
    get_function, run_opts, temp_train/temp_pred, feat_names,
    traindatanum/predictdatanum -- confirm they are in scope at call time.
    """
    logger.info('Starting MLA run')
    logger.info('------------')
    if settings.pyspark_on == 1:  # Use pyspark or not? Pyspark makes cross node (HPC) calculation possible.
        from pyspark import SparkContext            # It's slower, manages resources between nodes using HTTP.
        from pyspark.sql import SQLContext          # So far, it does not include feature importance outputs.
        from pyspark.ml import Pipeline             # I would have to program feature importances myself. May be time consuming.
        from pyspark.ml.feature import VectorAssembler
        from pyspark.ml.classification import RandomForestClassifier
        from pyspark.ml.feature import StringIndexer, VectorIndexer
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        # pyspark go
        if settings.pyspark_remake_csv == 1:  # Making the csv files for the pyspark MLA to read in is time consuming, turn off the file generation?
            logger.info('Remaking csvs for pysparks...')
            numpy.savetxt(temp_train, XX, delimiter=",")
            logger.info('Training csv saved')
            numpy.savetxt(temp_pred, XXpredict, delimiter=",")
            logger.info('Predict csv saved')
        sc = SparkContext(appName="ML_RF")  # Initiate spark
        sclogger=sc._jvm.org.apache.log4j   # Initiate spark logging; quieten the JVM loggers to ERROR below
        sclogger.LogManager.getLogger("org").setLevel(sclogger.Level.ERROR)
        sclogger.LogManager.getLogger("akka").setLevel(sclogger.Level.ERROR)
        sqlContext=SQLContext(sc)
        # Read in data
        data_tr = sqlContext.read.format("com.databricks.spark.csv").options(header='false',inferSchema='true').load(temp_train)
        data_pr = sqlContext.read.format("com.databricks.spark.csv").options(header='false',inferSchema='true').load(temp_pred)
        data_tr=data_tr.withColumnRenamed(data_tr.columns[-1],"label")  # rename last column (answers), to label
        data_pr=data_pr.withColumnRenamed(data_pr.columns[-1],"label")
        assembler=VectorAssembler(inputCols=data_tr.columns[:-1],outputCol="features")
        reduced=assembler.transform(data_tr.select('*'))  # Assemble feature vectos for spark MLA
        assembler_pr=VectorAssembler(inputCols=data_pr.columns[:-1],outputCol="features")
        reduced_pr=assembler_pr.transform(data_pr.select('*'))
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(reduced)  # Index vectors
        featureIndexer =VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(reduced)
        # Initiate MLA alg
        rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",numTrees=100,maxDepth=5,maxBins=200)
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])  # Set up fitting pipeline
        start, end=[],[]  # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        model=pipeline.fit(reduced)  # Fit
        end = time.time()
        logger.info('Fit ended in %s seconds' %(end-start))
        logger.info('------------')
        start, end=[],[]
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        predictions = model.transform(reduced_pr)  # Predict
        # NOTE(review): metricName="precision" was removed from
        # MulticlassClassificationEvaluator in Spark 2.0 -- confirm this
        # targets Spark 1.x, otherwise use "accuracy".
        evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",predictionCol="prediction",metricName="precision")
        accuracy = evaluator.evaluate(predictions)
        logger.info("Test Error = %g" %(1.0-accuracy))
        logger.info('------------')
        logger.info('Pulling results ...')
        # Pulls all results into numpy arrays to continue program
        yypredict=numpy.array(predictions.select("indexedLabel").collect())
        yypredict=yypredict[:,0]
        result=numpy.array(predictions.select("prediction").collect())
        result=result[:,0]
        XXpredict=numpy.array(predictions.select("indexedFeatures").collect())
        XXpredict=XXpredict[:,0]
        probs=numpy.array(predictions.select("probability").collect())
        probs=probs[:,0]
        XXpredict=numpy.column_stack((XXpredict,yypredict))
        end=time.time()
        logger.info('Predict ended in %s seconds' %(end-start))
        logger.info('------------')
        # NOTE(review): in this branch feat_importance, bias, contributions,
        # train_contributions, recall, precision and score are never
        # assigned, so the return below would raise NameError -- confirm
        # whether the pyspark path is actually exercised.
    else:
        # Run sklearn MLA switch
        MLA = get_function(settings.MLA)  # Pulls in machine learning algorithm from settings
        clf = MLA().set_params(**settings.MLAset)
        logger.info('MLA settings')
        logger.info(clf)
        logger.info('------------')
        start, end=[],[]  # Timer
        logger.info('Fit start')
        logger.info('------------')
        start = time.time()
        clf = clf.fit(XX[:,0:n_feat],yy)  # XX is train array, yy is training answers
        end = time.time()
        logger.info('Fit ended in %s seconds' %(end-start))
        logger.info('------------')
        score = clf.score
        if 'OvsA' not in ind_run_name:
            # Optionally export fitted trees as graphviz .dot files and
            # render them to PNG with the external `dot` tool.
            if settings.output_all_trees == 1:
                i_tree = 0
                for tree_in_forest in clf.estimators_:
                    with open('plots/tree_' + str(i_tree) + '.dot', 'w') as my_file:
                        my_file = tree.export_graphviz(tree_in_forest, out_file = my_file,feature_names=feat_names,class_names=uniquetarget_tr[0], filled=True)
                    os.system('dot -Tpng plots/tree_%s.dot -o plots/tree_%s.png' %(i_tree,i_tree))
                    os.remove('plots/tree_%s.dot' %i_tree)
                    i_tree = i_tree + 1
            else:
                with open('plots/tree_example.dot', 'w') as my_file:
                    my_file = tree.export_graphviz(clf.estimators_[0], out_file = my_file,feature_names=feat_names,class_names=uniquetarget_tr[0], filled=True)
                os.system('dot -Tpng plots/tree_example.dot -o plots/tree_example.png')
                os.remove('plots/tree_example.dot')
        start, end=[],[]
        # Split cats for RAM management
        numcats = numpy.int64((2*(XXpredict.size/1024/1024)*clf.n_jobs))
        if settings.get_contributions ==1: numcats=100
        if numcats < 1: numcats = 1
        logger.info('Predict start')
        logger.info('------------')
        start = time.time()
        result,probs,bias,contributions,train_contributions=[],[],[],[],[]
        XXpredict_cats=numpy.array_split(XXpredict,numcats)
        logger.info('Splitting predict array into %s' %numcats)
        logger.info('------------')
        for i in range(len(XXpredict_cats)):
            logger.info('Predicting cat %s/%s' %(i,len(XXpredict_cats)))
            result.extend(clf.predict(XXpredict_cats[i][:,0:n_feat]))  # XX is predict array.
            probs.extend(clf.predict_proba(XXpredict_cats[i][:,0:n_feat]))  # Only take from 0:n_feat because answers are tacked on end
            if 'OvsA' not in ind_run_name:
                if (settings.get_contributions == 1) | (settings.get_perfect_contributions==1):
                    logger.info('Getting contributions from predict catalogue %s' %i)
                    tiresult = ti.predict(clf,XXpredict_cats[i][:,0:n_feat])
                    contributions.extend(tiresult[2])
                    bias = tiresult[1][0]
        feat_importance = clf.feature_importances_
        result=numpy.float32(result)
        probs=numpy.float32(probs)
        if 'OvsA' not in ind_run_name:
            if settings.get_contributions == 1: numpy.save('contributions',contributions)
            if settings.get_perfect_contributions == 1: numpy.save('perfect_contributions',contributions)
            if settings.compute_contribution_mic == 1:
                logger.info('Getting contributions from train catalogue (for plot_mic_cont)')
                tiresult_train = ti.predict(clf,XX[:,0:n_feat])
                train_contributions=tiresult_train[2]
                bias_train = tiresult_train[1][0]
        # Score the predictions against the supplied true answers.
        accuracy = metrics.accuracy_score(result,yypredict)
        recall = metrics.recall_score(result,yypredict,average=None)
        precision = metrics.precision_score(result,yypredict,average=None)
        score = metrics.f1_score(result, yypredict,average=None)
        end = time.time()
        logger.info('Predict ended in %s seconds' %(end-start))
        logger.info('------------')
        logger.info('Recall Score: %s' %recall)
        logger.info('Precision Score: %s' %precision)
        logger.info('Accuracy Score: %s' %accuracy)
        logger.info('F1 Score: %s' %score)
    # NOTE(review): `n` is not defined in this function -- presumably a
    # module-level counter (the parameter is `n_run`); confirm.
    percentage=(n/predictdatanum)*100
    run_opts.diagnostics([result,yypredict,unique_IDS_tr, unique_IDS_pr,uniquetarget_tr,uniquetarget_pr],'result')
    # stats=numpy.array([])
    # stats=numpy.column_stack((clf.n_estimators,traindatanum,predictdatanum,percentage))
    # SAVE
    if settings.saveresults == 1:
        logger.info('Saving results')
        logger.info('------------')
        numpy.savetxt(settings.result_outfile+('_%s' %ind_run_name)+'.txt',numpy.column_stack((yypredict,result)),header="True_target Predicted_target")
        numpy.savetxt(settings.prob_outfile+('_%s' %ind_run_name)+'.txt',probs)
        numpy.savetxt(settings.feat_outfile+('_%s' %ind_run_name)+'.txt',feat_importance)
        numpy.savetxt(settings.stats_outfile+('_%s' %ind_run_name)+'.txt',numpy.column_stack((clf.n_estimators,traindatanum,predictdatanum,percentage,clf.max_depth)),header="n_est traindatanum predictdatanum percentage max_depth",fmt="%s")
    return result,feat_importance,probs,bias,contributions,accuracy,recall,precision,score,clf,train_contributions
# Join the training polygons with the global attribute table and fit a
# random forest on the geometric + spectral band statistics.
dfE = dfEntrenamiento.select("cat", "tipo").join(dfGlobal, "cat", "inner")

# Feature columns for the assembler.
# Fix: 'perimeter' and 'fd' were accidentally listed twice, which duplicated
# those features in the assembled vector; the duplicates are removed.
columnas = [
    'area', 'perimeter', 'fd', 'compact_circle',
    'B1_sum', 'B1_mean', 'B1_median', 'B1_stdev', 'B1_min', 'B1_max', 'B1_variance',
    'B2_sum', 'B2_mean', 'B2_median', 'B2_stdev', 'B2_min', 'B2_max', 'B2_variance',
    'B3_sum', 'B3_mean', 'B3_median', 'B3_stdev', 'B3_min', 'B3_max', 'B3_variance'
]

constructor = VectorAssembler(inputCols=columnas, outputCol="features")
dfEF = constructor.transform(dfE).select("cat", "features", "tipo")

# Hold out 20% to estimate accuracy.
entrena, evalua = dfEF.randomSplit([0.8, 0.2])
rf = RandomForestClassifier(labelCol="tipo")
modelo = rf.fit(entrena)
pred = modelo.transform(evalua)
evaluador = MulticlassClassificationEvaluator(labelCol="tipo", metricName="accuracy")
evaluador.evaluate(pred)

# Refit on the full training table and persist the model.
modeloOk = rf.fit(dfEF)
modeloOk.write().overwrite().save(
    "modelorf")  # remove overwrite() if the saved model must not be replaced
# Naive Bayes with cross-validated smoothing (notebook cells).
# Assumes a `training` DataFrame with 'features' and 'label' columns is
# built in earlier cells -- TODO confirm.
from pyspark.ml.classification import NaiveBayes

naivebayes = NaiveBayes(featuresCol="features", labelCol="label")

# %% [markdown]
# ### Parameter grid

# %%
from pyspark.ml.tuning import ParamGridBuilder

# Grid over the additive-smoothing parameter (0 disables smoothing).
param_grid = ParamGridBuilder().addGrid(naivebayes.smoothing, [0, 1, 2, 4, 8]).build()

# %% [markdown]
# ### Evaluator

# %%
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# No metricName given, so the evaluator uses its default metric.
evaluator = MulticlassClassificationEvaluator()

# %% [markdown]
# ## Build cross-validation model

# %%
from pyspark.ml.tuning import CrossValidator

crossvalidator = CrossValidator(estimator=naivebayes, estimatorParamMaps=param_grid, evaluator=evaluator)

# %% [markdown]
# ## Fit cross-validation model

# %%
# NOTE(review): name looks like a typo for `crossvalidation_model`; kept
# as-is because later cells (outside this view) may reference it.
crossvalidation_mode = crossvalidator.fit(training)

# %% [markdown]
# Convert indexed predictions back to the original string labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Chain indexers, forest and converter in a Pipeline; fitting it also
# fits the indexers.
stages = [labelIndexer, featureIndexer, rf, labelConverter]
forest_model = Pipeline(stages=stages).fit(trainingData)

# Score the held-out data and peek at a few rows.
predictions = forest_model.transform(testData)
predictions.select("predictedLabel", "label", "features").show(5)

# Compare predictions against the true (indexed) labels.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# stages[2] is the fitted RandomForestClassificationModel.
rfModel = forest_model.stages[2]
print(rfModel)  # summary only
# $example off$

spark.stop()
if __name__ == '__main__':
    # initialize spark session
    spark = SparkSession\
        .builder\
        .appName("Test")\
        .getOrCreate()
    sc = spark.sparkContext

    # reading the train dataframes
    trainingDF = spark.read.load("../data/train_small.parquet")
    #train = trainingDF.withColumn('features',trainingDF.features.cast(VectorUDT()))

    # Split the data into train and test (seeded for reproducibility)
    splits = trainingDF.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # train the model
    model = nb.fit(train)

    # compute accuracy on the test set.
    # Fix: metricName="precision" was removed from
    # MulticlassClassificationEvaluator in Spark 2.0 and raises an
    # IllegalArgumentException; "accuracy" is the equivalent micro-averaged
    # metric, and the printed label now matches the metric computed.
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels)))
if __name__ == "__main__":
    # Spark session for this standalone example.
    session = SparkSession.builder.appName("NaiveBayesExample").getOrCreate()

    # $example on$
    # Load training data in LIBSVM format.
    data = session.read.format("libsvm") \
        .load("data/mllib/sample_libsvm_data.txt")

    # Split the data into train and test (seeded for reproducibility).
    train, test = data.randomSplit([0.6, 0.4], 1234)

    # Create the trainer and set its parameters.
    trainer = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # Train the model.
    nb_model = trainer.fit(train)

    # Compute accuracy on the test set.
    scored = nb_model.transform(test)
    pred_and_labels = scored.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Accuracy: " + str(evaluator.evaluate(pred_and_labels)))
    # $example off$

    session.stop()
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create the model: a shallow decision tree on the assembled features.
# Assumes `trainingData`/`testData` come from an earlier split -- TODO confirm.
dtClassifer = DecisionTreeClassifier(maxDepth=2, labelCol="label",
                                     featuresCol="features")
dtModel = dtClassifer.fit(trainingData)

# Predict on the test data
predictions = dtModel.transform(testData)
predictions.select("prediction","species","label").show()

# Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="label",metricName="accuracy")
evaluator.evaluate(predictions)

# Draw a confusion matrix (counts per true-label/prediction pair)
predictions.groupBy("label","prediction").count().show()

###################################### INSULT as the output

# Split into training and testing data
(trainingData, testData) = INSULTDf.randomSplit([0.75, 0.25])
trainingData.count()
testData.count()
testData.show()

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator