def naive_bayes():
    conf = SparkConf().setAppName('RF')
    sc = SparkContext(conf=conf)
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.createDataFrame([
        Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])),
        Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])),
        Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))
    ])
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight")
    model = nb.fit(df)
    # model.pi
    # # DenseVector([-0.81..., -0.58...])
    # model.theta
    # # DenseMatrix(2, 2, [-0.91..., -0.51..., -0.40..., -1.09...], 1)
    test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF()
    result = model.transform(test0).head()
    # result.prediction
    # # 1.0
    # result.probability
    # # DenseVector([0.32..., 0.67...])
    # result.rawPrediction
    # # DenseVector([-1.72..., -0.99...])
    test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
    # model.transform(test1).head().prediction
    # # 1.0
    temp_path = "."
    nb_path = temp_path + "/nb"
    nb.save(nb_path)
    nb2 = NaiveBayes.load(nb_path)
    # nb2.getSmoothing()
    # # 1.0
    model_path = temp_path + "/nb_model"
    model.save(model_path)
    model2 = NaiveBayesModel.load(model_path)
    # model.pi == model2.pi
    # # True
    # model.theta == model2.theta
    # # True
    nb = nb.setThresholds([0.01, 10.00])
    model3 = nb.fit(df)
    result = model3.transform(test0).head()
def NaiveBayesCl(train):
    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
    # train the model
    nbModel = nb.fit(train)
    return nbModel
def exec_naive_bayes(self, featuresCol1="features", labelCol1="label",
                     predictionCol1="prediction", smoothing1=1, numClass1=2):
    '''
    Creates the Naive Bayes model pipeline.
    Input: featuresCol1: feature column name, labelCol1: label column name,
           predictionCol1: prediction column name,
           model parameters: {smoothing}, numClass1: number of class labels
    Output: None
    '''
    # Initialize the NaiveBayes model with the parameters passed
    nb = NaiveBayes(featuresCol=featuresCol1, labelCol=labelCol1,
                    predictionCol=predictionCol1, smoothing=smoothing1)
    # Fit the NB model on the training data
    nbModel = nb.fit(self.trainingData)
    # Make NB model predictions on the test data
    predictions = nbModel.transform(self.testData)
    # Evaluate the results generated by the model predictions
    self.model_evaluator(predictions, modelType="NaiveBayes Model",
                         modelParams=str({'smoothing': smoothing1}),
                         numClass=numClass1)
def main():
    spark = SparkSession.builder.appName('nlp').getOrCreate()
    data = spark.read.csv("./data/smsspamcollection/SMSSpamCollection",
                          inferSchema=True, sep='\t')
    data = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')
    data.show()
    data = data.withColumn('length', length(data['text']))
    data.show()
    data.groupby('class').mean().show()
    tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
    stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
    count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
    idf = IDF(inputCol="c_vec", outputCol="tf_idf")
    ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')
    clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')
    nb = NaiveBayes()
    data_prep_pipe = Pipeline(stages=[ham_spam_to_num, tokenizer, stopremove,
                                      count_vec, idf, clean_up])
    cleaner = data_prep_pipe.fit(data)
    clean_data = cleaner.transform(data)
    clean_data = clean_data.select(['label', 'features'])
    clean_data.show()
    (training, testing) = clean_data.randomSplit([0.7, 0.3])
    spam_predictor = nb.fit(training)
    data.printSchema()
    test_results = spam_predictor.transform(testing)
    test_results.show()
    acc_eval = MulticlassClassificationEvaluator()
    acc = acc_eval.evaluate(test_results)
    print("Accuracy of model at predicting spam was: {}".format(acc))
def naive_bayes(training, test):
    # Feature vectors must contain non-negative values
    testing = test.select("features")
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
    model = nb.fit(training)
    result = model.transform(test)
    accuracy = 1.0 * result.rdd.filter(
        lambda l: l.label == l.prediction).count() / test.count()
    print("Naive Bayes model accuracy:", accuracy)
def training(self, transformed_ddf):
    train_ddf, test_ddf = transformed_ddf.randomSplit([0.7, 0.3])
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial",
                    featuresCol="tfidf_vector", labelCol=self.target_col,
                    predictionCol=self.prediction_col)
    self.model = nb.fit(train_ddf)
    self.evaluation(train_ddf, 'Train')
    self.evaluation(test_ddf, 'Test')
def NaiveBayesEvaluation(TransformedDataset):
    nb = NaiveBayes()
    nb.setLabelCol("LabelIndex")
    nb.setPredictionCol("Label_Prediction")
    training, test = TransformedDataset.randomSplit([0.8, 0.2], seed=11)
    nvModel = nb.fit(training)
    prediction = nvModel.transform(test)
    # selected = prediction.select("body", "LabelIndex", "label", "Label_Prediction")
    # for row in selected.collect():
    #     print(row)
    from pyspark.mllib.evaluation import MulticlassMetrics
    predictionAndLabels = prediction.select(
        "Label_Prediction", "LabelIndex").rdd.map(lambda r: (float(r[0]), float(r[1])))
    # predictionAndLabels = test.rdd.map(lambda lp: (float(nvModel.predict(lp.features)), lp.label))
    metrics = MulticlassMetrics(predictionAndLabels)
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    # Statistics by class
    labels = prediction.rdd.map(lambda lp: lp.label).distinct().collect()
    labelIndices = prediction.rdd.map(
        lambda lp: lp.LabelIndex).distinct().collect()
    labelIndicesPairs = prediction.rdd.map(
        lambda lp: (lp.label, lp.LabelIndex)).distinct().collect()
    print("Labels", labels)
    print("Label Indices", labelIndices)
    print("Label Index Pairs", labelIndicesPairs)
    for label, labelIndex in sorted(labelIndicesPairs):
        print("\n Class %s precision = %s" % (label, metrics.precision(labelIndex)))
        print("Class %s recall = %s" % (label, metrics.recall(labelIndex)))
        print("Class %s F1 Measure = %s" %
              (label, metrics.fMeasure(labelIndex, beta=1.0)), "\n")
    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
def naive_bayes_classifier(training_df, testing_df):
    """
    Apply a Naive Bayes classifier to test data for predicting the sentiment of tweets.

    :param training_df: Labelled training data
    :param testing_df: Test data
    :return: Transformed dataframe of predicted labels for tweets
    """
    nb = NaiveBayes()
    model = nb.fit(training_df)
    return model.transform(testing_df).select(["label", "words", "prediction"])
def naive_bayes(trainingDataFrame, smoothing=1.0, modelType="multinomial",
                weightCol="weight"):
    nb = NaiveBayes(smoothing=smoothing, modelType=modelType, weightCol=weightCol)
    nbModel = nb.fit(trainingDataFrame)
    result = {}
    result["model"] = nbModel
    return result
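# A minimal usage sketch for the wrapper above (not part of the original source): it
# assumes a `testDataFrame` with "features" and "label" columns matching the training
# layout, and simply applies the returned model to it.
def apply_naive_bayes(result, testDataFrame):
    # `result` is the dict returned by naive_bayes(); transform() adds
    # rawPrediction, probability and prediction columns.
    predictions = result["model"].transform(testDataFrame)
    predictions.select("label", "prediction", "probability").show(5)
    return predictions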
def pipeline_bayes(file, name):
    print("control entered pipeline_bayes")
    cleaner = file.fit(name)
    cleaned = cleaner.transform(name)
    training, testing = cleaned.randomSplit([0.7, 0.3])
    # Create a Naive Bayes model and fit the training data
    nb = NaiveBayes()
    predictor = nb.fit(training)
    test_results = predictor.transform(testing)
    return test_results
def train2(self):
    print("Training Model\n")
    train_df = self.load_train()
    test_df = self.load_test()
    nb = NaiveBayes()
    nb.setPredictionCol("predict_")
    nb.setFeaturesCol("features")
    nb.setLabelCol("label")
    self.__model = nb.fit(train_df)
    print("Complete\n")
    self.saveModel()
    self.testModel_df(test_df)
def naiveBayes_predict(trainingData, testData):
    print('\n************************ Training the NaiveBayes ************************\n')
    nb = NaiveBayes()
    nbModel = nb.fit(trainingData)
    print('\n**************************** Saving the model ****************************\n')
    nbModel.save("./models/myNaiveBayesModel")
    return nbModel.transform(testData)
def train_naive_bayes(self, smoothing=1.0):
    '''
    Train the dataset with the Naive Bayes algorithm.
    --------
    Parameters
        smoothing: float
    --------
    Returns
        None
    '''
    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=smoothing, modelType="multinomial")
    self.model = nb.fit(self.train)
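# A hedged companion sketch for the trainer above (not in the original source): it assumes
# the class also keeps a test split in `self.test` (hypothetical attribute) and scores the
# fitted model with the standard MulticlassClassificationEvaluator.
def evaluate_naive_bayes(self):
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    # Apply the model trained by train_naive_bayes and report accuracy
    predictions = self.model.transform(self.test)
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    return evaluator.evaluate(predictions)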
def naive_bayes(train, test, smoothing, modelType):
    nb = NaiveBayes(smoothing=smoothing, modelType=modelType)
    # Train the model
    model = nb.fit(train)
    predictions = model.transform(test)
    # evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    # Note: BinaryClassificationEvaluator reports areaUnderROC by default, not accuracy
    evaluator = BinaryClassificationEvaluator()
    accuracy = evaluator.evaluate(predictions)
    return accuracy
def naive_bayes(trainingData, testData):
    from pyspark.ml.classification import NaiveBayes
    nb = NaiveBayes(smoothing=1)
    model = nb.fit(trainingData)
    predictions = model.transform(testData)
    predictions.filter(predictions['prediction'] == 0) \
        .select("Descript", "Category", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=30)
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    evaluator.evaluate(predictions)
def main(spark, filename):
    df = spark.read.csv(filename, header=False, inferSchema=True)
    # >>> df.show(4)
    # +---+---+---+---+-----------+
    # |_c0|_c1|_c2|_c3|        _c4|
    # +---+---+---+---+-----------+
    # |5.1|3.5|1.4|0.2|Iris-setosa|
    # |4.9|3.0|1.4|0.2|Iris-setosa|
    # |4.7|3.2|1.3|0.2|Iris-setosa|
    # |4.6|3.1|1.5|0.2|Iris-setosa|
    # +---+---+---+---+-----------+
    vector_assembler = VectorAssembler(inputCols=['_c0', '_c1', '_c2', '_c3'],
                                       outputCol='features')
    v_df = vector_assembler.transform(df)
    # >>> v_df.show(4)
    # +---+---+---+---+-----------+-----------------+
    # |_c0|_c1|_c2|_c3|        _c4|         features|
    # +---+---+---+---+-----------+-----------------+
    # |5.1|3.5|1.4|0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
    # |4.9|3.0|1.4|0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
    # |4.7|3.2|1.3|0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
    # |4.6|3.1|1.5|0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
    # +---+---+---+---+-----------+-----------------+
    # only showing top 4 rows
    indexer = StringIndexer(inputCol='_c4', outputCol='label')
    i_df = indexer.fit(v_df).transform(v_df)
    # >>> i_df.show(4)
    # +---+---+---+---+-----------+-----------------+-----+
    # |_c0|_c1|_c2|_c3|        _c4|         features|label|
    # +---+---+---+---+-----------+-----------------+-----+
    # |5.1|3.5|1.4|0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|  0.0|
    # |4.9|3.0|1.4|0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|  0.0|
    # |4.7|3.2|1.3|0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|  0.0|
    # |4.6|3.1|1.5|0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|  0.0|
    # +---+---+---+---+-----------+-----------------+-----+
    # only showing top 4 rows
    splits = i_df.randomSplit([0.6, 0.4], 1)
    train_df = splits[0]
    test_df = splits[1]
    nb = NaiveBayes(modelType='multinomial')
    nbmodel = nb.fit(train_df)
    predictions = nbmodel.transform(test_df)
    evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                                  predictionCol='prediction',
                                                  metricName='accuracy')
    nbaccuracy = evaluator.evaluate(predictions)
    print(nbaccuracy)
def naive_bayes_classify(comment_preprocessed):
    sc = SparkContext(appName="Classification")
    sql_context = SQLContext(sc)
    data = sql_context.createDataFrame(comment_preprocessed)
    train, test = data.randomSplit([0.7, 0.3], 1234)
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
    model = nb.fit(train)
    predictions = model.transform(test)
    evaluate_classification(predictions)
    time.sleep(1)
    # predict_comment(sql_context, model)
    compare_classification_with_tool(sql_context, model)
def bayes_classifier(training_data, test_data, validation_data):
    dt = NaiveBayes(featuresCol='scaled_features', labelCol='label',
                    smoothing=0.00001)  # ROC 0.43
    dtModel = dt.fit(training_data)
    predict_valid = dtModel.transform(validation_data)
    predict_valid.show(10)
    evaluate_metrics(predict_valid)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                              labelCol='label',
                                              metricName="areaUnderROC")
    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC",
                    data=predict_valid, data_type="valid_data")
def nb_classifier(training, testing):
    # MODEL: NAIVE BAYES CLASSIFIER
    from pyspark.ml.classification import NaiveBayes
    # Initialize model
    nb = NaiveBayes(modelType='multinomial')
    # Fit training data to the model
    nb_model = nb.fit(training)
    # Test model
    nb_predictions = nb_model.transform(testing)
    # Evaluate model
    nb_evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    nb_accuracy = nb_evaluator.evaluate(nb_predictions)
    return nb_accuracy
def test_naive_bayes():
    df = spark.createDataFrame([
        Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])),
        Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])),
        Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))])
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight")
    model = nb.fit(df)
    model.setFeaturesCol("features")
    test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF()
    features = test0.head().features
    result = model.predict(features)
    assert result == 1.0
def load_classifier_model():
    # model = PipelineModel.load("./movie-robot-model")
    # print(model)
    # if model != None:
    #     return model
    data_set = ModelProcessUtil.create_train_vectors()
    df = spark.createDataFrame(data_set)
    df.show()
    nb = NaiveBayes(modelType="bernoulli")
    nb_model = nb.fit(df)
    nb_model.setFeaturesCol("features")
    # nb_model.save("./movie-robot-model")
    nb_model.write().overwrite().save("./movie-robot-model")
    return nb_model
def naive_bayes_generator(training_data, deal_id):
    #### In:
    # A training data set, as generated by data_prep()
    # The deal_id you want to generate a model for
    #### Out:
    # The model is saved
    # An update message is returned
    training_data = training_data.withColumnRenamed(deal_id, 'label')
    model = NaiveBayes(smoothing=10, modelType="bernoulli")
    model = model.fit(training_data)
    model.write().overwrite().save(
        f"s3://rtl-databricks-datascience/lpater/naive_bayes/{deal_id}/")
    output_message = "Saved a Naive Bayes model for " + deal_id + "."
    # see also: https://spark.apache.org/docs/latest/ml-classification-regression.html
    return output_message
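# A minimal loading sketch matching the generator above (an assumption, not part of the
# original source): NaiveBayesModel.load reads back the model written to the same S3 prefix.
def naive_bayes_loader(deal_id):
    from pyspark.ml.classification import NaiveBayesModel
    # Load the persisted model for this deal so it can be applied to new data
    return NaiveBayesModel.load(
        f"s3://rtl-databricks-datascience/lpater/naive_bayes/{deal_id}/")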
def bayes_cv(business_id):
    """Cross-validation of the Bayes model."""
    spark = yelp_lib.spark
    review = yelp_lib.get_parq('review')
    business_df = review.filter(review['business_id'] == business_id)
    regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
    wordsDataFrame = regexTokenizer.transform(business_df)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    cleaned = remover.transform(wordsDataFrame)
    star_mapping = {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 1.0, 5: 1.0}
    cleaned = cleaned.replace(star_mapping, 'stars')
    cleaned = cleaned.withColumn("stars", cleaned["stars"].cast("double"))
    cv = CountVectorizer(inputCol="filtered", outputCol="features")
    model = cv.fit(cleaned)
    vectorized = model.transform(cleaned)
    vectorized = vectorized.select(col('stars').alias('label'), col('features'))
    splits = vectorized.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]
    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0)
    # train the model
    nb_model = nb.fit(train)
    # compute accuracy on the test set
    result = nb_model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    return "Accuracy: " + str(evaluator.evaluate(predictionAndLabels))
def filter_detections(spark, resources_folder):
    messages = spark.read.csv(resources_folder + 'SMSSpamCollection',
                              inferSchema=True, sep='\t')
    messages.printSchema()
    messages.show()
    messages = messages.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')
    messages.show()
    messages = messages.withColumn('length', length(messages['text']))
    messages.show()
    messages.groupBy('class').mean().show()
    tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
    stop_remover = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
    count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='count_vec')
    # idf = inverse document frequency
    # tf = term frequency
    idf = IDF(inputCol='count_vec', outputCol='tf_idf')
    ham_spam_to_numeric = StringIndexer(inputCol='class', outputCol='label')
    assembler = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')
    nb = NaiveBayes()
    data_pre_pipeline = Pipeline(stages=[
        ham_spam_to_numeric, tokenizer, stop_remover, count_vec, idf, assembler
    ])
    clean_data = data_pre_pipeline.fit(messages).transform(messages)
    clean_data.show()
    clean_data = clean_data.select('label', 'features')
    training_messages, test_messages = clean_data.randomSplit([0.7, 0.3])
    spam_detector = nb.fit(training_messages)
    test_results = spam_detector.transform(test_messages)
    test_results.show()
    acc_eval = MulticlassClassificationEvaluator()
    acc = acc_eval.evaluate(test_results)
    print("ACC of NB Model")
    print(acc)
def fit(self):
    # Build a Spark DataFrame-style training set
    pkl_file = open('data.pkl', 'rb')
    train_data = pickle.load(pkl_file)
    df = self.spark.createDataFrame([
        Row(label=train_data[j][0], weight=0.1,
            features=Vectors.dense(train_data[j][1][i]))
        for j in range(14) for i in range(len(train_data[j][1]))
    ])
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight")
    # nb = DecisionTreeClassifier()
    print("Training is starting -------------->")
    model = nb.fit(df)
    model.save(self.model_path)
def naive_bayes(train, test):
    """Naive Bayes model. It uses cross validation to calculate the best
    smoothing value to train the model."""
    nb = NaiveBayes(modelType="multinomial", featuresCol='scaledFeatures')
    grid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 0.5, 1.0]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=nb, estimatorParamMaps=grid, evaluator=evaluator)
    cv_model = cv.fit(train)
    best_model = cv_model.bestModel
    best_smooth = best_model._java_obj.getSmoothing()
    # Training with the best smoothing value
    best_nb = NaiveBayes(smoothing=best_smooth, modelType="multinomial",
                         featuresCol='scaledFeatures')
    nb_model = best_nb.fit(train)
    predictions = nb_model.transform(test)
    return predictions
def naive_bayes(df, seed):
    # Drop preferred_foot because it's the only categorical column; the others are all numerical.
    # Use preferred_foot if we have time to implement it.
    df = df.drop("preferred_foot")
    labelIndexer = StringIndexer(inputCol="team_position", outputCol="label").fit(df)
    df = labelIndexer.transform(df)
    df = df.drop("team_position")
    list_of_features = df.drop("label").columns  # Get list of all features
    assembler = VectorAssembler(inputCols=list_of_features, outputCol="features")
    df = assembler.transform(df)
    (train_data, test_data) = df.randomSplit([0.8, 0.2], seed)
    n_bayes = NaiveBayes(smoothing=1.0, modelType="multinomial")
    model = n_bayes.fit(train_data)  # Training happens here
    predictions = model.transform(test_data)
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    y_true = predictions.select(['label']).collect()
    y_pred = predictions.select(['prediction']).collect()
    print("Classification report and confusion matrix for Naive Bayes:")
    print(classification_report(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    confusion_matrix_corrected = [[cm[1][1], cm[1][2], cm[1][0]],
                                  [cm[2][1], cm[2][2], cm[2][0]],
                                  [cm[0][1], cm[0][2], cm[0][0]]]
    print("")
    print(confusion_matrix_corrected[0])
    print(confusion_matrix_corrected[1])
    print(confusion_matrix_corrected[2])
    cm = np.array([confusion_matrix_corrected[0],
                   confusion_matrix_corrected[1],
                   confusion_matrix_corrected[2]])
    return accuracy, cm
def train(spark):
    sc = spark.sparkContext
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=8000)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    srcdf = sc.textFile('part.csv').map(parse_line)
    srcdf = srcdf.toDF()
    training, testing = srcdf.randomSplit([0.9, 0.1])
    wordsData = tokenizer.transform(training)
    featurizedData = hashingTF.transform(wordsData)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.persist()
    trainDF = rescaledData.select("features", "label").rdd.map(
        lambda x: Row(label=float(x['label']),
                      features=Vectors.dense(x['features']))).toDF()
    naivebayes = NaiveBayes()
    model = naivebayes.fit(trainDF)
    testWordsData = tokenizer.transform(testing)
    testFeaturizedData = hashingTF.transform(testWordsData)
    # Reuse the IDF model fitted on the training split instead of refitting it on test data
    testRescaledData = idfModel.transform(testFeaturizedData)
    testRescaledData.persist()
    testDF = testRescaledData.select("features", "label").rdd.map(
        lambda x: Row(label=float(x['label']),
                      features=Vectors.dense(x['features']))).toDF()
    predictions = model.transform(testDF)
    predictions.show()
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("The accuracy on test-set is " + str(accuracy))
    model.save('Bayes20000')
def nbModel(self, dfTrain, dfTest, seed):
    client = mlflow.tracking.MlflowClient()
    mlflow.set_experiment("gML NB")
    mlflow.end_run()
    mlflow.start_run()
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
    model = nb.fit(dfTrain)
    predictions = model.transform(dfTest)
    metrics = ["accuracy", "f1"]
    result = []
    for metric in metrics:
        evaluator = MulticlassClassificationEvaluator(
            labelCol="label", predictionCol="prediction", metricName=metric)
        v = evaluator.evaluate(predictions)
        mlflow.log_metric(metric, v)
        # print(" {}: {}".format(metric, v))
        temp = [metric, v]
        result.append(temp)
    mlflow.spark.log_model(model, "nbModel")
    return result
def driver(takeSample=False):
    data_df, features = feature_eng.preprocess_features2(takeSample=takeSample)
    data_df.cache()
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data_df.randomSplit([0.7, 0.3])
    trainingData = sampling.undersample(trainingData, class_ratio=0.6)
    # create the trainer and set its parameters
    nb = NaiveBayes(labelCol='TARGET', featuresCol='OCCUPATION_TYPE',
                    smoothing=1.0, modelType="multinomial")
    # Train model. This also runs the indexers.
    model = nb.fit(trainingData)
    # Make predictions.
    predictions = model.transform(testData)
    predictions.select('TARGET', 'rawPrediction', 'prediction', 'probability').show(20)
    return multiple_evaluator(predictions)
if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("NaiveBayesExample")\ .getOrCreate() # $example on$ # Load training data data = spark.read.format("libsvm") \ .load("data/mllib/sample_libsvm_data.txt") # Split the data into train and test splits = data.randomSplit([0.6, 0.4], 1234) train = splits[0] test = splits[1] # create the trainer and set its parameters nb = NaiveBayes(smoothing=1.0, modelType="multinomial") # train the model model = nb.fit(train) # compute accuracy on the test set result = model.transform(test) predictionAndLabels = result.select("prediction", "label") evaluator = MulticlassClassificationEvaluator(metricName="accuracy") print("Accuracy: " + str(evaluator.evaluate(predictionAndLabels))) # $example off$ spark.stop()
.select("Text","Sentiment","probability","label","prediction") \ .orderBy("probability", ascending=False) \ .show(n = 10, truncate = 30) # COMMAND ---------- #finding the accuracy evaluator = MulticlassClassificationEvaluator(predictionCol="prediction") evaluator.evaluate(predictions) # COMMAND ---------- ##applying naive bayes using the "Text" to predict "Sentiment" from pyspark.ml.classification import NaiveBayes nb = NaiveBayes(smoothing=1) model = nb.fit(trainingData) predictions = model.transform(testData) predictions.filter(predictions['prediction'] == 0) \ .select("Text","Sentiment","probability","label","prediction") \ .orderBy("probability", ascending=False) \ .show(n = 10, truncate = 30) # COMMAND ---------- #finding the accuracy evaluator = MulticlassClassificationEvaluator(predictionCol="prediction") evaluator.evaluate(predictions)
# ENCODING LABEL
stage_string = StringIndexer(inputCol="verified_purchase", outputCol="class_res")
ppl = Pipeline(stages=[stage_string])
df1 = ppl.fit(df01).transform(df01)

# CREATING TF_IDF
tokenizer = Tokenizer(inputCol="review_body", outputCol="words")
wordsData = tokenizer.transform(df1)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# NAIVE BAYES
nb = NaiveBayes(featuresCol="features", labelCol="class_res")
# Model training
model = nb.fit(rescaledData)
# Model saving
model.write().overwrite().save("./NB_model")
# Predictions
pred = model.transform(rescaledData)
# Displaying top 5 prediction values
pred.select('prediction').show(5)
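# A hedged follow-up sketch (not in the original source): reload the model saved above and
# score a hypothetical `new_reviews_df` with the same "review_body" column, reusing the
# fitted tokenizer/hashingTF/idfModel from the snippet above.
from pyspark.ml.classification import NaiveBayesModel

nb_model = NaiveBayesModel.load("./NB_model")
new_features = idfModel.transform(hashingTF.transform(tokenizer.transform(new_reviews_df)))
nb_model.transform(new_features).select("prediction").show(5)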