def load_NaiveB_Model(dataset):
    print("Accuracy of best NB Model with CrossValidation:")
    # Note: BinaryClassificationEvaluator reports areaUnderROC by default,
    # so the value printed below is AUC rather than raw accuracy.
    evaluator = BinaryClassificationEvaluator()
    best_NBModel = NaiveBayesModel.load("model/NB1/")
    predictions = best_NBModel.transform(dataset)
    accuracy = evaluator.evaluate(predictions)
    print("The accuracy = %g" % accuracy)
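# Companion sketch (not part of the original source): one way the "model/NB1/"
# artifact loaded above could have been produced -- a NaiveBayes estimator
# tuned with CrossValidator, with the best model saved. The param grid, fold
# count and column names are illustrative assumptions.
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

def train_and_save_NB_Model(train_dataset):
    nb = NaiveBayes(featuresCol="features", labelCol="label")
    grid = ParamGridBuilder().addGrid(nb.smoothing, [0.5, 1.0, 2.0]).build()
    cv = CrossValidator(estimator=nb,
                        estimatorParamMaps=grid,
                        evaluator=BinaryClassificationEvaluator(),
                        numFolds=3)
    best_model = cv.fit(train_dataset).bestModel
    best_model.write().overwrite().save("model/NB1/")
    return best_model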
def getOrCreateNB(self):
    try:
        if self.nbModel is None:
            self.nbModel = NaiveBayesModel.load(CONST_NB_FILE)
    except Exception:
        print("Creating NB Model")
        self.nbModel = self.createNB()
    return self.nbModel
def predictTweetCategNB(testtf, sc):
    modelTweetCategoryNB = NaiveBayesModel.load("NaiveBayes_model/")
    # Wrap the feature vectors into a DataFrame with a 'features' column
    tt = sc.parallelize(testtf).map(lambda x: Row(features=x)).toDF()
    tt.show()
    predictions = modelTweetCategoryNB.transform(tt)
    # predictions.show()
    # Map the numeric predictions back to category names
    labels = predictions.select("prediction").rdd.map(
        lambda x: category[int(x.prediction)]).collect()
    return labels
def classify_tweets(inbound_dataset):
    # Run the cleansing UDF over the tweet column
    udf_cleansing = functions.udf(cleansing)
    inbound_dataset = inbound_dataset.withColumn(
        "tweet_cleansed", udf_cleansing(functions.col("tweet")))

    # Tokenizing
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer(inputCol="tweet_cleansed", outputCol="words")
    inbound_dataset = tokenizer.transform(inbound_dataset)

    # Generating features
    from pyspark.ml.feature import HashingTF
    features_generator = HashingTF(inputCol="words", outputCol="features")
    inbound_dataset = features_generator.transform(inbound_dataset)

    model_folder = os.path.join(os.getcwd(), "saved_models")
    model_full_path = os.path.join(model_folder, "twitter_sentiment_spark")
    if not os.path.exists(model_folder):
        print("model does not exist")

    from pyspark.ml.classification import NaiveBayesModel
    loaded_model = NaiveBayesModel.load(model_full_path)

    # Classifying using the saved model
    classified = loaded_model.transform(inbound_dataset)

    spark = getSparkSessionInstance(inbound_dataset.rdd.context.getConf())
    if files_source == "hdfs":
        labels = spark.read.load(os.path.join("file://" + model_folder, "labels.csv"),
                                 format="csv", header=True)
    else:
        labels = spark.read.load(os.path.join(model_folder, "labels.csv"),
                                 format="csv", header=True)
    classified = classified.join(labels,
                                 classified["NB_pred"] == labels["label_id"])

    udf_get_probability = functions.udf(get_probability)
    classified = classified.withColumn(
        "probability",
        udf_get_probability(functions.col("NB_prob"), functions.col("NB_pred")))

    # Override the predicted label with "2" when confidence is below the threshold
    classified = classified.withColumn(
        "label_predicted",
        functions.when(classified.probability < probability_threshold, "2")
        .otherwise(classified.label_predicted))
    return classified
def index():
    # Return the template with the prediction passed in
    prediction = ''
    if request.method == 'POST':
        headline = request.form.get('headline')

        # Use the headline to build a one-row PySpark DataFrame
        df = spark.createDataFrame([(0, headline)], ['ID', 'text'])

        # Create a length column to be used as an additional feature
        df = df.withColumn('length', length(df['text']))
        df.show()

        # Create all the features for the data set
        tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
        stopremove = StopWordsRemover(inputCol='token_text',
                                      outputCol='stop_tokens')
        hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
        idf = IDF(inputCol='hash_token', outputCol='idf_token')

        # Create feature vectors
        clean_up = VectorAssembler(inputCols=['idf_token', 'length'],
                                   outputCol='features')

        # Create and run a data processing pipeline
        data_prep_pipeline = Pipeline(
            stages=[tokenizer, stopremove, hashingTF, idf, clean_up])

        # Fit and transform the pipeline
        cleaner = data_prep_pipeline.fit(df)
        cleaned = cleaner.transform(df)

        # Load the model and make a prediction
        model = NaiveBayesModel.load('Trade_Predictor_Model')
        prediction = model.transform(cleaned).select('prediction').toPandas()
        prediction = prediction['prediction'].values[0]
        print(prediction)

        # Map 0 / 1 / 2 to Hold / Sell / Buy
        if prediction == 0:
            prediction = 'Hold'
        elif prediction == 1:
            prediction = 'Sell'
        else:
            prediction = 'Buy'
        print(prediction)
    return render_template('index.html', action=prediction)  # , teams=teams)
def predict():
    blob_account_name = os.environ.get('ds_blob_account')
    blob_account_key = os.environ.get('ds_blob_key')
    mycontainer = os.environ.get('ds_container')
    filename = os.environ.get('ds_model_filename')

    dirname = os.getcwd()
    dirname1 = "model/data"
    dirname2 = "model/metadata"
    filename1 = "part-00000-a1f9ca3a-3bec-4451-849f-546af11b14ab.snappy.parquet"
    filename2 = "part-00000"
    blobfilename = ("HdiSamples/HdiSamples/sentimentfinal/stages/"
                    "3_NaiveBayes_471fad31e436e6de3ade")

    blob_service = BlockBlobService(account_name=blob_account_name,
                                    account_key=blob_account_key,
                                    endpoint_suffix='core.usgovcloudapi.net')
    generator = blob_service.list_blobs(mycontainer)

    if not os.path.exists(os.path.join(dirname, dirname1)):
        os.makedirs(os.path.join(dirname, dirname1))
    if not os.path.exists(os.path.join(dirname, dirname2)):
        os.makedirs(os.path.join(dirname, dirname2))

    # Download the saved model's data and metadata parts from blob storage
    blob_service.get_blob_to_path(mycontainer,
                                  blobfilename + "/data/" + filename1,
                                  os.path.join(dirname, dirname1, filename1))
    blob_service.get_blob_to_path(mycontainer,
                                  blobfilename + "/metadata/" + filename2,
                                  os.path.join(dirname, dirname2, filename2))

    localmodel = os.path.join(dirname, "model")
    model = NaiveBayesModel.load(localmodel)

    # Parse the request body into an RDD of text rows
    # (renamed from `input` to avoid shadowing the builtin)
    body = request.get_data().decode("utf-8")
    inputformat = ast.literal_eval(body)
    testrdd = sc.parallelize(inputformat)
    temp = testrdd.map(lambda x: Row(text=x))
    tempdf = spark.createDataFrame(temp)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    stopremover = StopWordsRemover().setInputCol("words").setOutputCol(
        "removed").setCaseSensitive(False)
    newhashingTF = HashingTF(inputCol="removed", outputCol="features",
                             numFeatures=2000)
    nb_pipeline = Pipeline(stages=[tokenizer, stopremover, newhashingTF])
    temp1df = nb_pipeline.fit(tempdf).transform(tempdf)
    testpred = model.transform(temp1df)
    return str(testpred.take(1))
def naive_bayes():
    conf = SparkConf().setAppName('RF')
    sc = SparkContext(conf=conf)
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    df = spark.createDataFrame([
        Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])),
        Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])),
        Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))
    ])
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight")
    model = nb.fit(df)
    # model.pi
    # DenseVector([-0.81..., -0.58...])
    # model.theta
    # DenseMatrix(2, 2, [-0.91..., -0.51..., -0.40..., -1.09...], 1)

    test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF()
    result = model.transform(test0).head()
    # result.prediction
    # 1.0
    # result.probability
    # DenseVector([0.32..., 0.67...])
    # result.rawPrediction
    # DenseVector([-1.72..., -0.99...])

    test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
    # model.transform(test1).head().prediction
    # 1.0

    # Round-trip the estimator and the fitted model through disk
    temp_path = "."
    nb_path = temp_path + "/nb"
    nb.save(nb_path)
    nb2 = NaiveBayes.load(nb_path)
    # nb2.getSmoothing()
    # 1.0
    model_path = temp_path + "/nb_model"
    model.save(model_path)
    model2 = NaiveBayesModel.load(model_path)
    # model.pi == model2.pi
    # True
    # model.theta == model2.theta
    # True

    nb = nb.setThresholds([0.01, 10.00])
    model3 = nb.fit(df)
    result = model3.transform(test0).head()
def test(self, sentence, vocabularys):
    # sentence = ' '.join(jieba.cut(sentence))
    sentence = sentence.split(" ")
    print('Sentence after abstraction: {}'.format(sentence))

    # Build a bag-of-words indicator vector over the vocabulary
    vector = [0 for x in range(len(vocabularys))]
    for word in sentence:
        if word in vocabularys:
            index = vocabularys.index(word)
            vector[index] = 1

    model = NaiveBayesModel.load(self.model_path)
    # model = DecisionTreeClassificationModel.load(self.model_path)
    test0 = self.spark.createDataFrame([Row(features=Vectors.dense(vector))])
    result = model.transform(test0).head()
    print('The predicted index is: {}'.format(result.prediction))
    return int(result.prediction)
def update_models():
    # Reload the tokenizer, hashing_tf, idf_model and nb_model from disk
    logger.debug(
        '===================================================Starting load models==================================================='
    )
    # Without this declaration the assignments below would only rebind
    # function-local names instead of the module-level models
    global tokenizer, hashing_tf, idf_model, nb_model
    try:
        logger.debug('Loading tokenizer model')
        new_tokenizer = Tokenizer.load(tokenizer_file)
        logger.debug('Loaded tokenizer model successfully')
    except Exception:
        logger.debug('Failed to load tokenizer')
    try:
        logger.debug('Loading hashing_tf model')
        new_hashing_tf = HashingTF.load(hashing_tf_file)
        logger.debug('Loaded hashing_tf model successfully')
    except Exception:
        logger.debug('Failed to load hashing_tf')
    try:
        logger.debug('Loading idf_model')
        new_idf_model = IDFModel.load(idf_model_file)
        logger.debug('Loaded IDFModel successfully')
    except Exception:
        logger.debug('Failed to load IDFModel')
    try:
        logger.debug('Loading nb_model')
        new_nb_model = NaiveBayesModel.load(nb_model_file)
        logger.debug('Loaded NaiveBayesModel successfully')
    except Exception:
        logger.debug('Failed to load NaiveBayesModel')
    try:
        logger.debug('Updating models')
        tokenizer = new_tokenizer
        hashing_tf = new_hashing_tf
        idf_model = new_idf_model
        nb_model = new_nb_model
        logger.debug('Updated models successfully')
    except Exception:
        # A NameError here means one of the loads above failed
        logger.debug('Failed to update models')
    logger.debug(
        '===================================================Stopped load models==================================================='
    )
def sendRecord(df):
    hashingTF = HashingTF(inputCol="filteredWords", outputCol="features")
    rescaledData = hashingTF.transform(df)

    from pyspark.ml.classification import NaiveBayesModel
    sameModel = NaiveBayesModel.load("TwitterSentimentNB.model")

    predictions = sameModel.transform(rescaledData)
    pr = predictions.select("prediction").rdd
    # Compare the collected prediction directly instead of via its string form
    if pr.collect() == [Row(prediction=1.0)]:
        print("Positive tweet")
    else:
        print("Negative tweet")
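# A sketch, under assumptions, of how sendRecord() might be driven from a
# Spark Streaming job -- the surrounding driver code is not shown in the
# source. Each micro-batch RDD of pre-filtered token lists is wrapped into a
# DataFrame with the "filteredWords" column sendRecord() expects; the DStream
# name is hypothetical.
from pyspark.sql import Row, SparkSession

def handle_batch(time, rdd):
    if rdd.isEmpty():
        return
    spark = SparkSession.builder.getOrCreate()
    batch_df = spark.createDataFrame(
        rdd.map(lambda words: Row(filteredWords=words)))
    sendRecord(batch_df)

# filteredWordsStream.foreachRDD(handle_batch)  # hypothetical DStream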
def naive_bayes_evaluator(test_data, deal_id):
    #### In:
    # A testing data set, as generated by data_prep()
    # The deal_id you want to test a model for
    # NB: the model must already have been saved to the cloud
    #### Out:
    # An update message is printed
    # An evaluator is returned
    model = NaiveBayesModel.load(f"/mnt/lotte/naive_bayes/{deal_id}/")
    predictions = model.transform(
        test_data.withColumnRenamed(deal_id, 'label'))

    # Compute accuracy on the test set
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    # Alternatively, use areaUnderPR on a BinaryClassificationEvaluator to
    # summarize the precision-recall curve instead of accuracy
    accuracy = evaluator.evaluate(predictions)
    print("Naive Bayes test accuracy for " + deal_id + " = " + str(accuracy))
    return evaluator
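# Hypothetical training counterpart, not in the source: how a model could end
# up under /mnt/lotte/naive_bayes/{deal_id}/ for the evaluator above to load.
# Assumes the same data_prep() output schema, with one label column per deal_id.
from pyspark.ml.classification import NaiveBayes

def naive_bayes_trainer(train_data, deal_id):
    nb = NaiveBayes(labelCol="label", featuresCol="features")
    model = nb.fit(train_data.withColumnRenamed(deal_id, 'label'))
    model.write().overwrite().save(f"/mnt/lotte/naive_bayes/{deal_id}/")
    return model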
def main():
    logging.config.fileConfig('%s/../logging.conf' %
                              os.path.dirname(os.path.abspath(__file__)))
    logger = logging.getLogger(name="simpleExample")

    parser = arg_parser()
    args = parser.parse_args()

    categories = [
        'alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'
    ]
    fe_pipeline_save_path = args.fe_pipeline_save_path
    classifier_save_path = args.classifier_save_path

    logger.info("Load Data")
    newsgroup_data_loader = DataLoader(categories=categories)
    _, twenty_test = newsgroup_data_loader.load_data()

    logger.info("Transform raw data into Spark DataFrame")
    spark_df_converter = SparkDataFrameConverter()
    twenty_test_df = spark_df_converter.convert(twenty_test.data,
                                                twenty_test.target)

    logger.info(
        "Load Feature Engineering Pipeline and apply transformations on test set"
    )
    fe_pipeline_model = PipelineModel.load(fe_pipeline_save_path)
    twenty_test_counts_df = fe_pipeline_model.transform(twenty_test_df)

    logger.info("Load classifier and apply predictions")
    nb_model = NaiveBayesModel.load(classifier_save_path)
    predicted = nb_model.transform(twenty_test_counts_df)

    logger.info("Evaluate Results")
    evaluator = MulticlassClassificationEvaluator(labelCol="label_indexed",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    logger.info("Accuracy on test set : {}".format(
        evaluator.evaluate(predicted)))
def loadModelFromDisk(self):
    self.logger.log("info", "Loading pretrained model from disk")
    self.__model = NaiveBayesModel.load(self.__modelPath)
    self.logger.log("info", "Complete..")
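# A hypothetical save-side counterpart (not in the source), assuming the same
# self.__model / self.__modelPath attributes; included only to show the round
# trip that loadModelFromDisk() completes.
def saveModelToDisk(self):
    self.logger.log("info", "Saving trained model to disk")
    self.__model.write().overwrite().save(self.__modelPath)
    self.logger.log("info", "Complete..")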
def apply_naive_bayes_classifier(tweets):
    model_path = 'hdfs://spark01.ctweb.inweb.org.br:9000/limonero/models/' \
                 'Sentiment_Analysis_-_Naive_Bayes.0000'
    model = NaiveBayesModel.load(model_path)
    return model.transform(tweets)
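# Usage sketch under assumptions: the DataFrame passed in must already carry
# the 'features' vector column the saved model was trained on. The pipeline
# path and input DataFrame below are hypothetical.
from pyspark.ml import PipelineModel

feature_pipeline = PipelineModel.load('./tweet_feature_pipeline')  # hypothetical path
tweets_featurized = feature_pipeline.transform(raw_tweets_df)      # hypothetical input
scored = apply_naive_bayes_classifier(tweets_featurized)
scored.select('prediction', 'probability').show(5)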
    '', words_with_url)
    return url_less_words


extract_words_udf = udf(extract_words, StringType())
data_filtered = data_filtered.withColumn('text_words',
                                         extract_words_udf('body'))
# data_filtered.show()

regexTokenizer = RegexTokenizer(inputCol="text_words", outputCol="words",
                                pattern="\\W")

## Stop words: read a custom Twitter stopword list from disk
f = open("./stopwords_twitter.txt", "r")
model = NaiveBayesModel.load('./NB_model_without_pipeline')
add_stopwords = []
for l in f.readlines():
    add_stopwords.append(l.strip())
# print(add_stopwords[:5])

stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

## Bag-of-words counts
countVectors = CountVectorizer(inputCol="filtered", outputCol="features",
                               binary=True, vocabSize=10000, minDF=1)
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

schema = StructType([StructField('label', FloatType(), True),
                     StructField('sentences', StringType(), True)])
tokenizer = Tokenizer(inputCol="sentences", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures",
                      numFeatures=500)
idf = IDF(inputCol="rawFeatures", outputCol="features")

################ Unknown data ###########################
model1 = LogisticRegressionModel.load('./model/logisticRegModel5')
nb_model = NaiveBayesModel.load('./model/naiveBayesModel5')

documents1 = sc.wholeTextFiles('../lab3_data/unknown_data/*')
# Tuple-parameter lambdas are Python 2 only; unpack explicitly instead
documents1 = documents1.map(
    lambda doc: (doc[0].encode('ascii', 'ignore').decode('ascii'),
                 doc[1].encode('ascii', 'ignore').decode('ascii')))
documents1 = documents1.map(
    lambda doc: (getCategory(doc[0]), cleanSentences(doc[1])))
documentsDF = sqlContext.createDataFrame(documents1, schema)
wordsData = tokenizer.transform(documentsDF)
try:
    # Create a dstream from the kafka topic (note: kafkaParams is a dict,
    # so the key/value pair uses ':', not '=')
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, kafka_topic, {'metadata.broker.list': broker_ip})
    logger.debug('Created direct dstream from kafka successfully')
except Exception:
    logger.debug('Unable to create dstream from kafka')

atexit.register(shutdown_hook, kafka_producer, spark)

# Load the tokenizer, hashing_tf, idf_model, nb_model and the tag<->catId maps
try:
    logger.debug('Loading models')
    tokenizer = Tokenizer.load(tokenizer_file)
    hashing_tf = HashingTF.load(hashing_tf_file)
    idf_model = IDFModel.load(idf_model_file)
    nb_model = NaiveBayesModel.load(nb_model_file)
    selected_tags = pd.read_csv(selected_tags_file, header=None)
    local_catId_to_tags = dict(zip(list(selected_tags.index),
                                   selected_tags[0]))
    local_tags_to_catId = dict(zip(selected_tags[0],
                                   list(selected_tags.index)))
    # Broadcast the lookup tables so executors can use them inside UDFs
    catId_to_tags = sc.broadcast(local_catId_to_tags)
    tags_to_catId = sc.broadcast(local_tags_to_catId)
    tags_to_catId_transform = udf(lambda tag: float(tags_to_catId.value[tag]),
                                  FloatType())
    catId_to_tags_transform = udf(lambda catId: catId_to_tags.value[catId],
                                  StringType())
    logger.debug('Loaded models successfully')
except Exception:
    logger.debug('Failed to load models')

logger.debug('Start to process data')
process_data(directKafkaStream, kafka_producer)
ssc.start()
remover = StopWordsRemover(inputCol="words", outputCol="filtered") filteredDataFrame = remover.transform(tokenized).select( "label", "filtered", "time_stamp_ms") ngram = NGram(n=1, inputCol="filtered", outputCol="ngrams") ngramDataFrame = ngram.transform(filteredDataFrame) ngramDataFrame.show() ngramData = ngramDataFrame.select("label", "ngrams", "time_stamp_ms") hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=3000) featurizedData = hashingTF.transform(ngramData) # alternatively, CountVectorizer can also be used to get term frequency vectors featurizedData.show() idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) rescaledData.show() model = NaiveBayesModel.load("hdfs://localhost:9000/model_naive_bayes") predictions = model.transform(rescaledData) predictions.show() print(type(predictions)) res = predictions.toPandas() res.to_pickle("prediction_panda_df.pck") print(predictions.count())
# Create Ngrams
ngram = NGram(n=2, inputCol="words", outputCol="bigrams")
wordsData = ngram.transform(wordsData)

# Note: the term-frequency step still hashes the unigram "words" column,
# not the bigrams
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures",
                      numFeatures=3600)
featurizedData = hashingTF.transform(wordsData)

# Obtain the TF-IDF score
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Load Naive Bayes model
print("Loading Naive Bayes Model...")
model = NaiveBayesModel.load('/models/tmp/myNaiveBayesModel')

# Make predictions and test accuracy
print("Naive Bayes Model loaded. Begin testing...")
predictions = model.transform(rescaledData)
print("Testing completed.")

# Compute metrics
print("Computing accuracy...")
accuracyEval = MulticlassClassificationEvaluator(labelCol="label",
                                                 predictionCol="prediction",
                                                 metricName="accuracy")
accuracy = accuracyEval.evaluate(predictions)
print("Accuracy = " + str(accuracy))

# Format output data
# resultsDF = predictions.drop("predictions", "rawFeatures", "features",
#                              "rawPrediction", "probability", "words",
#                              "bigrams", "label")
# resultsDF.show(2)
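# Optional extra metric (an addition, not in the source): the same evaluator
# API can report weighted F1 by switching metricName.
f1Eval = MulticlassClassificationEvaluator(labelCol="label",
                                           predictionCol="prediction",
                                           metricName="f1")
print("F1 = " + str(f1Eval.evaluate(predictions)))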
                        regParam=6.969697, labelCol='score',
                        featuresCol='X')
LR_model = LR.fit(X_train_large)
LR_model.save(LR_model_path)

# Random Forest
RF = RandomForestClassifier(numTrees=100, maxDepth=15, labelCol="score",
                            featuresCol="X")
RF_model = RF.fit(X_train_large)
RF_model.save(RF_model_path)

# Loading all trained models
NB_Model = NaiveBayesModel.load(NB_model_path)
LR_Model = LogisticRegressionModel.load(LR_model_path)
RF_Model = RandomForestClassificationModel.load(RF_model_path)

voteClassifier = VoteClassifier(NB_Model, LR_Model, RF_Model)
evaluate(voteClassifier.transform_vote(X_test_large), confusion=False,
         predictionCol='prediction_vote')
evaluate(voteClassifier.transform_vote(X_test_imbd), confusion=False,
         predictionCol='prediction_vote')
voteClassifier.transform_vote(X_test_imbd).show()

# Accuracy: (TP+TN)/N
# Positive Predictive Value: TP/(TP+FP)
# Negative Predictive Value: TN/(TN+FN)
idfModel = idf.fit(test_data)
test_data = idfModel.transform(test_data)
test_data.show(5)

from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="label", outputCol="labelIndex")
model = stringIndexer.fit(test_data)
test_data = model.transform(test_data)
test_data.show(5)

predicted = test_data.select("tfidf", "labelIndex")
predicted.show(5)

model_folder = os.path.join(os.getcwd(), 'saved_models')
model_full_path = os.path.join(model_folder, "twitter_sentiment_spark")
if not os.path.exists(model_folder):
    print("model does not exist")

from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
loadModel = NaiveBayesModel.load(model_full_path)
predicted = loadModel.transform(predicted)
predicted.show()

total = predicted.count()
correct = predicted.where(
    predicted['labelIndex'] == predicted['NB_pred']).count()
accuracy = correct / total
print("\nTotal:", total, "\nCorrect:", correct, "\nAccuracy:", accuracy)
def login():
    message = ''
    e_result = ''
    s_result = ''
    t_result = ''
    j_result = ''
    if request.method == 'POST':
        post = request.form.get('text')  # access the submitted text
        if len(post) >= 100:
            test = pd.DataFrame([post], columns=['post'])
            newrows = []

            def filter_text(post):
                """Decide whether or not we want to use the post."""
                # should remove link-only posts here
                return len(post) > 0

            reg_punc = re.compile('[%s]' % re.escape(string.punctuation))

            def preprocess_text(post):
                """Remove any junk we don't want to use in the post."""
                # Remove links
                post = re.sub(r'http\S+', '', post, flags=re.MULTILINE)
                # All lowercase
                post = post.lower()
                # Remove punctuation
                post = reg_punc.sub('', post)
                return post

            def create_new_rows(row):
                posts = row['post']
                rows = []
                p = preprocess_text(posts)
                rows.append({'post': p})
                return rows

            for index, row in test.iterrows():
                newrows += create_new_rows(row)
            test = pd.DataFrame(newrows)
            df = spark.createDataFrame(test)

            # Create a length column to be used as an additional feature
            df = df.withColumn('length', length(df['post']))

            types = [
                'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP',
                'ENFP', 'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP',
                'ESTJ', 'ESFJ'
            ]
            types = [x.lower() for x in types]

            tokenizer = Tokenizer(inputCol="post", outputCol="words")
            tokenized = tokenizer.transform(df)

            # Remove stop words, treating the MBTI type names themselves
            # as stop words
            stopwordList = types
            stopwordList.extend(StopWordsRemover().getStopWords())
            stopwordList = list(set(stopwordList))  # optional
            remover = StopWordsRemover(inputCol="words", outputCol="filtered",
                                       stopWords=stopwordList)
            newFrame = remover.transform(tokenized)

            # Run the hashing term frequency
            hashing = HashingTF(inputCol="filtered", outputCol="hashedValues")
            hashed_df = hashing.transform(newFrame)

            # Fit the IDF on the data set
            idf = IDF(inputCol="hashedValues", outputCol="idf_token")
            idfModel = idf.fit(hashed_df)
            rescaledData = idfModel.transform(hashed_df)

            # Create feature vectors
            clean_up = VectorAssembler(inputCols=['idf_token', 'length'],
                                       outputCol='features')
            output = clean_up.transform(rescaledData)

            # One binary Naive Bayes model per MBTI axis
            ei_model = NaiveBayesModel.load("static/models/EI_Predictor.h5")
            sn_model = NaiveBayesModel.load("static/models/SN_Predictor.h5")
            tf_model = NaiveBayesModel.load("static/models/TF_Predictor.h5")
            jp_model = NaiveBayesModel.load("static/models/JP_Predictor.h5")

            test_e = ei_model.transform(output)
            e = test_e.toPandas()["prediction"].values[0]
            e_result = "I" if e == 0 else "E"

            test_s = sn_model.transform(output)
            s = test_s.toPandas()["prediction"].values[0]
            s_result = "N" if s == 0 else "S"

            test_t = tf_model.transform(output)
            t = test_t.toPandas()["prediction"].values[0]
            t_result = "F" if t == 0 else "T"

            test_j = jp_model.transform(output)
            j = test_j.toPandas()["prediction"].values[0]
            j_result = "P" if j == 0 else "J"
        else:
            message = "Please tell us more about yourself!"
    return render_template('index.html', message=message, test_e=e_result,
                           test_s=s_result, test_t=t_result, test_j=j_result)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import functions as F
import time

## Kafka VM IP
kafka_topic = 'from-pubsub'
zk = '10.182.0.2:2181'
app_name = "from-pubsub"

sc = SparkContext(appName="KafkaPubsub")
ssc = StreamingContext(sc, 0.1)
kafkaStream = KafkaUtils.createStream(ssc, zk, app_name, {kafka_topic: 1})

### Load saved models
pipelineFit = PipelineModel.load('gs://wcs_word/NB_pipeline')
print("1")
model = NaiveBayesModel.load('gs://wcs_word/NB_FullTrainedModel')
print("2")
spark = SparkSession(sc)

## Global variables used to calculate latency
init_time = None
count = 0

## Parsing function: map a record onto a Row (stars are shifted to 0-based)
def row_generate(r):
    return Row(star=float(r[0]) - 1,
               useful=float(r[1]),
               funny=float(r[2]),
               cool=float(r[3]),
               text=str(r[4]))
import sys

import pandas as pd
from kafka import KafkaConsumer
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegressionModel, NaiveBayesModel
from sklearn.metrics import accuracy_score, recall_score, precision_score

sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession.builder.appName('consumer').getOrCreate()

brokers, topic = sys.argv[1:]
consumer = KafkaConsumer(topic, bootstrap_servers=['localhost:9092'])

pip = PipelineModel.load('/Users/aditya/PycharmProjects/BigDataHW3/pipeline')
model_nb = NaiveBayesModel.load(
    '/Users/aditya/PycharmProjects/BigDataHW3/nbModel')
model_lr = LogisticRegressionModel.load(
    '/Users/aditya/PycharmProjects/BigDataHW3/lrModel')

columns = ['actual', 'predicted']
result_df_lr = pd.DataFrame(columns=columns)
result_df_nb = pd.DataFrame(columns=columns)

feed = 0
for msg in consumer:
    article = msg.value
    data = article.split("||")
    label = data[0]
    text = data[1]
    df = sc.parallelize([{"label": label, "text": text}]).toDF()
    df = pip.transform(df)
label = json_data["response"]['results'][ind]['sectionName'] temp = list() temp.append(label) temp.append(headline) d.append(temp) return d if __name__ == "__main__": sc = SparkContext() sqlContext = SQLContext(sc) lr_model = LogisticRegressionModel.load("lrm.model") model = NaiveBayesModel.load("model.model") key = "00254a08-1426-4547-b54f-bc0137d9d547" from_date = "2018-02-01" to_date = "2018-02-12" url = 'http://content.guardianapis.com/search?from-date=' + from_date + '&to-date=' + to_date + \ '&order-by=newest&show-fields=all&page-size=200&%20num_per_section=10000&api-key=' + key data = get_data(url) df = sqlContext.createDataFrame(data, schema=["category", "text"]) pipeline_fit = PipelineModel.load("pipelining") dataset = pipeline_fit.transform(df) predictions = lr_model.transform(dataset) predictions1 = model.transform(dataset)
from flask import Flask, jsonify, render_template, request
from flask_sqlalchemy import SQLAlchemy
from pyspark.sql import SparkSession
from pyspark import SparkFiles

spark = SparkSession.builder.appName('prez').getOrCreate()

from pyspark.ml.classification import NaiveBayesModel
from pyspark.ml.feature import (Tokenizer, StopWordsRemover, HashingTF, IDF,
                                StringIndexer)
from pyspark.sql.functions import length
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['DEBUG'] = True

model = NaiveBayesModel.load('models/naivebayes.h5')


def pipeline(df):
    print(df.head())
    df = df.withColumn("length", length(df['Speech']))
    # Create the data processing pipeline functions here (note: StringIndexer
    # will be used to encode your target variable column. This column should
    # be named 'label' so our model will recognize it later)
    review_data = Tokenizer(inputCol="Speech", outputCol="Words")
    reviewed = review_data.transform(df)
    # reviewed.show()
    remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
    newFrame = remover.transform(reviewed)
    # newFrame.show()
    hashing = HashingTF(inputCol="filtered", outputCol="hashedValues",
## Filter for those rows where topic is null
data_filter_with_null_topic = data_modified_tweet.where(
    col("topic").isNull()).select('trend', 'creation_time', 'twid',
                                  'text_words')
data_filter_with_null_topic.show(5)
print(data_filter_with_null_topic.count())
df_for_topic = data_filter_with_null_topic

## Fit pipeline to the filtered dataframe
pipelineFit = pipeline.fit(df_for_topic)
dataset_for_topic = pipelineFit.transform(df_for_topic)
dataset_for_topic.show(5)

## Load saved model for topic classification
model_for_topic_classification = NaiveBayesModel.load(
    '/Users/saumya/Desktop/Big_data_project/NB_model_without_pipeline')
print(model_for_topic_classification)

## Predict topics for unlabelled tweets
predictions = model_for_topic_classification.transform(dataset_for_topic)

## Convert the numeric labels back to text labels
labeler = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                        labels=['event', 'sports', 'politics', 'news',
                                'technology', 'business', 'entertainment',
                                'health'])
# print(predictions)
prediction_with_label = labeler.transform(predictions)
prediction_with_label.show(5)
total_count_naive_bayes_classification = 0

sc = SparkContext(appName="PythonStreamingKafkaWordCount")
sc.setLogLevel("ERROR")
spark = SparkSession.builder.getOrCreate()
ssc = StreamingContext(sc, 1)

# Setting model paths
save_pipeline_path = output_folder_path + "pipeline"
saved_logistic_model_path = output_folder_path + "LogisticClassificationModel"
saved_naive_bayes_model_path = output_folder_path + "NaiveBayesClassificationModel"

# Loading Pipeline and Models
loaded_pipeline = PipelineModel.load(save_pipeline_path)
saved_logistic_model = LogisticRegressionModel.load(saved_logistic_model_path)
saved_naive_bayes_model = NaiveBayesModel.load(saved_naive_bayes_model_path)

# Creating Kafka Stream
kvs = KafkaUtils.createDirectStream(
    ssc, topics=['guardian2stream'],
    kafkaParams={"metadata.broker.list": 'localhost:9092'})
document_tuple = kvs.map(lambda line: (
    int(line[1].split("||")[0].strip().encode("ascii", "ignore")),
    line[1].split("||")[1].encode("ascii", "ignore")))
document_tuple.pprint()
document_tuple.foreachRDD(process)

ssc.start()
ssc.awaitTermination()
ssc.stop(stopGraceFully=True)
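# The process() callback registered above is not shown in the source; this is
# a minimal sketch of what it plausibly does with the loaded pipeline and the
# two classifiers (in the real job it would have to be defined before the
# foreachRDD call). The schema is an assumption based on the "label||text"
# split above.
def process(rdd):
    if rdd.isEmpty():
        return
    batch_df = spark.createDataFrame(rdd, schema=["label", "text"])
    features = loaded_pipeline.transform(batch_df)
    saved_naive_bayes_model.transform(features).select("prediction").show()
    saved_logistic_model.transform(features).select("prediction").show()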