def load_parameters(self):
    """Restore the persisted prediction settings and load both models.

    The method names, feature list, stock symbol and data parser are read
    through ``self.load_data_from_file`` and stored on ``self``. Models whose
    method matches one of the Spark MLlib constants are loaded from
    ``<model_path>/amount_model`` or ``<model_path>/trend_model``; any other
    method falls back to ``self.load_data_from_file``.

    :return: ``(trend_model, amount_model)`` -- trend first, amount second.
    """
    def read_saved(name):
        # Every persisted item uses the same SAVE_TYPE_MODEL data type.
        return self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name=name)

    self.amount_prediction_method = read_saved('amount_method')
    self.trend_prediction_method = read_saved('trend_method')
    self.data_features = read_saved('features')
    self.stock_symbol = read_saved('symbol')
    self.data_parser = read_saved('data_parser')

    model_root = os.path.abspath(self.model_path)
    amount_model_path = os.path.join(model_root, 'amount_model')
    trend_model_path = os.path.join(model_root, 'trend_model')

    def load_amount_model():
        method = self.amount_prediction_method
        if method == self.RANDOM_FOREST:
            return RandomForestModel.load(sc=self.sc, path=amount_model_path)
        if method == self.LINEAR_REGRESSION:
            return LinearRegressionModel.load(sc=self.sc, path=amount_model_path)
        return read_saved('amount_model')

    def load_trend_model():
        method = self.trend_prediction_method
        if method == self.RANDOM_FOREST:
            return RandomForestModel.load(sc=self.sc, path=trend_model_path)
        if method == self.LOGISTIC_REGRESSION:
            return LogisticRegressionModel.load(sc=self.sc, path=trend_model_path)
        if method == self.NAIVE_BAYES:
            return NaiveBayesModel.load(sc=self.sc, path=trend_model_path)
        if method == self.SVM:
            return SVMModel.load(sc=self.sc, path=trend_model_path)
        return read_saved('trend_model')

    # The amount model is loaded before the trend model, as in the original.
    amount_model = load_amount_model()
    trend_model = load_trend_model()
    return trend_model, amount_model
def predict():
    """Load the saved Naive Bayes model and print its accuracy on a sample split.

    Creates its own SparkContext, scores the 90% partition of a seeded
    10%/90% random split of the sample data, prints the test rows, the
    (prediction, label) pairs and the resulting accuracy, then stops the
    context even when a step fails.
    """
    sc = SparkContext(appName= 'nb_test')
    try:
        sameModel = NaiveBayesModel.load(sc, "../../target/myNaiveBayesModel")
        data = sc.textFile('../../data/mllib/sample_naive_bayes_data.txt').map(parseLine)
        # The weights are 10% / 90%; only the larger partition is evaluated.
        _, test = data.randomSplit([0.1, 0.9], seed=0)
        print(test.collect())
        predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
        print(predictionAndLabel.collect())
        # Index access instead of a tuple-parameter lambda, which is Python 2 only.
        accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()
        print(accuracy)
    finally:
        # Release the context so a later SparkContext can be created.
        sc.stop()
def main(): sc = SparkContext(appName="BayesClassifer") htf = HashingTF(50000) data = sc.textFile('/home/varshav/work/PycharmProjects/Sentiment/cleaned_bayes_labels.csv') data_cleaned = data.map(lambda line : line.split(",")) # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors data_hashed = data_cleaned.map(lambda (label, text): LabeledPoint(label, htf.transform(text))) data_hashed.persist() # data = sc.textFile('/home/admin/work/spark-1.4.1-bin-hadoop2.4/data/mllib/sample_naive_bayes_data.txt').map(parseLine) #print data # Split data aproximately into training (60%) and test (40%) training, test = data_hashed.randomSplit([0.70, 0.30], seed=0) sameModel = NaiveBayesModel.load(sc, "/home/varshav/work/PycharmProjects/StockAnalysis/myModel") print "----------" print sameModel.predict(htf.transform("posts jump in net profit")) predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label)) predictionAndLabel1 = training.map(lambda p: (sameModel.predict(p.features), p.label)) prediction = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count() prediction1 = 1.0 * predictionAndLabel1.filter(lambda (x, v): x == v).count() / training.count() buy_buy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == 1 and v ==1).count() # Instantiate metrics object # Instantiate metrics object metrics = MulticlassMetrics(predictionAndLabel) # Overall statistics precision = metrics.precision() precision = normalize(precision) recall = metrics.recall() recall = normalize(recall) f1Score = metrics.fMeasure() f1Score = normalize(f1Score) print("Summary Stats") print("Precision = %s" % precision) print("Recall = %s" % recall) print("F1 Score = %s" % f1Score) ''' # Statistics by class labels = data_hashed.map(lambda lp: lp.label).distinct().collect() for label in sorted(labels): print("Class %s precision = %s" % (label, metrics.precision(label))) print("Class %s recall = %s" % (label, 
metrics.recall(label))) print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0))) ''' '''
def predict_row(sentence, spark_context, model_folder):
    """
    Classify one sentence with the Naive Bayes model saved in ``model_folder``.

    The sentence is tokenized, hashed into a 50000-feature term-frequency
    vector and passed to the model. The model is loaded on every call.

    :param sentence: a sentence to be analysed
    :type sentence: basestring
    :param spark_context: the current spark context
    :type spark_context: SparkContext
    :param model_folder: location of the saved model, passed to ``NaiveBayesModel.load``
    :type model_folder: basestring
    :return: 0.0 if the sentence is negative, 1 if the sentence is neutral and 2 if the sentence is positive
    :rtype: float
    """
    htf = HashingTF(50000)
    sentence_features = htf.transform(tokenize(sentence))
    model = NaiveBayesModel.load(spark_context, model_folder)
    prediction = model.predict(sentence_features)
    # One formatted argument prints the same text under Python 2 and 3.
    print('prediction : %s' % (prediction,))
    return prediction
def get_naive_bayes_model(spark_context, train_hashed, model_folder):
    """
    Return a Naive Bayes model, reusing the one saved in ``model_folder`` if present.

    When ``model_folder`` does not exist, a model is trained on
    ``train_hashed`` and saved there before being returned.

    :param spark_context: the current spark context
    :type spark_context: SparkContext
    :param train_hashed: training data handed unchanged to ``NaiveBayes.train``
        (documented as a DataFrame; presumably an RDD of LabeledPoint -- TODO confirm)
    :param model_folder: folder the model is loaded from or saved to
    :type model_folder: basestring
    :return: a trained Naive Bayes model
    :rtype: NaiveBayesModel
    """
    if path.exists(model_folder):
        # A model was persisted earlier: load it instead of re-training.
        return NaiveBayesModel.load(spark_context, model_folder)

    trained = NaiveBayes.train(train_hashed)
    # Persist the fresh model so later calls take the branch above.
    trained.save(spark_context, model_folder)
    return trained
def main():
    """Train, save, reload and score a Naive Bayes text classifier.

    Reads ``label,text`` rows from a CSV, hashes the text into a
    50000-feature vector, trains on a seeded 70%/30% split, saves the model,
    loads it back and prints the accuracy of the reloaded model on the test
    split and then on the training split. The SparkContext is stopped even
    when a step fails.
    """
    sc = SparkContext(appName="BayesClassifer")
    try:
        htf = HashingTF(50000)
        data = sc.textFile('/home/varshav/work/PycharmProjects/Sentiment/1.csv')
        # Split on the first comma only, so commas inside the text stay in the text.
        data_cleaned = data.map(lambda line: line.split(",", 1))
        # Build LabeledPoints: row[0] is the label, row[1] the text.
        # NOTE(review): htf.transform receives a raw string, which HashingTF
        # iterates character by character, so the features are character
        # counts rather than word counts -- confirm this is intended.
        data_hashed = data_cleaned.map(
            lambda row: LabeledPoint(row[0], htf.transform(row[1])))
        data_hashed.persist()
        # Seeded 70% training / 30% test split.
        training, test = data_hashed.randomSplit([0.70, 0.30], seed=0)
        # Train a naive Bayes model.
        model = NaiveBayes.train(training, 1.0)
        # Save and load model
        model.save(sc, "/home/varshav/Desktop/Bangalore")
        sameModel = NaiveBayesModel.load(sc, "/home/varshav/Desktop/Bangalore")
        print("----------")
        print(model.predict(htf.transform("posts jump in net profit")))
        # Make prediction and test accuracy with the reloaded model.
        predictionAndLabel = test.map(
            lambda p: (sameModel.predict(p.features), p.label))
        predictionAndLabel1 = training.map(
            lambda p: (sameModel.predict(p.features), p.label))
        # Index access instead of tuple-parameter lambdas, which are Python 2 only.
        prediction = 1.0 * predictionAndLabel.filter(
            lambda pl: pl[0] == pl[1]).count() / test.count()
        prediction1 = 1.0 * predictionAndLabel1.filter(
            lambda pl: pl[0] == pl[1]).count() / training.count()
        print(prediction)
        print(prediction1)
    finally:
        sc.stop()
def test_Model():
    """Score the saved Naive Bayes model on ``testdata.manual.csv`` and print the accuracy.

    Uses the module-level ``sc`` and the helpers ``transformData``,
    ``preProcess``, ``getFeatures`` and ``createLabeledData``.
    """
    model = NaiveBayesModel.load(sc, "finalproject/model/NaiveBayesModel")
    testFile = sc.textFile("testdata.manual.csv")
    testData = testFile.map(transformData)
    # Drop rows whose first field is the quoted string "2".
    testData = testData.filter(lambda x: x[0] != '\"2\"')
    features = testData.map(lambda x: x[1])
    # Map every distinct first-field value to the index zipWithIndex gives it.
    label = testData.map(
        lambda x: x[0]).distinct().zipWithIndex().collectAsMap()
    print(label)
    preprocessedData = preProcess(features)
    featuresData = getFeatures(preprocessedData)
    print("train", featuresData.take(5))
    testingLabeled = createLabeledData(featuresData, testData)
    print("train", testingLabeled.take(5))
    labeledTestingData = testingLabeled.map(
        lambda record: LabeledPoint(label[record[0][0]], record[1]))
    predictionAndLabel = labeledTestingData.map(
        lambda p: (model.predict(p.features), p.label))
    # Index access instead of a tuple-parameter lambda, which is Python 2 only.
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda pl: pl[0] == pl[1]).count() / labeledTestingData.count()
    print("Accuracy: ", accuracy)
from pyspark import SparkContext from pyspark.mllib.feature import HashingTF from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.classification import NaiveBayes from pyspark import SparkConf from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel from pyspark.mllib.linalg import Vectors print("Successfully imported Spark Modules") except ImportError as e: print("Can not import Spark Modules", e) sys.exit(1) sc = SparkContext(appName="Test") sameModel = NaiveBayesModel.load( sc, "/home/sparkCluster/work/PycharmProjects/StockAnalysis/myModel") htf = HashingTF(50000) pg_no = 1 company_name = [] company_list = [] companies = [] codes_list = [] comp = [] experts = [] stopwords = [] exclude = [] def getCompanyName(): global comp
# ************************************************************************************************************** # 옷 카테고리 예측 # ************************************************************************************************************** from pyspark.mllib.feature import HashingTF, IDF from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel from pyspark.mllib.linalg import Vectors pdf = pd.read_csv('file:///home/ec2-user/data/parseData.csv',encoding='utf-8') df = sqlContext.createDataFrame(pdf) #df.show() htf = HashingTF(10000) categoryModel = NaiveBayesModel.load(sc, "target/tmp/parseModel") # ************************************************************************************************************** # 분류 # ************************************************************************************************************** # labelDf.show() def getGenderLabelCode(rdd, label): GenderRdd = rdd.map(lambda row: row.gender).distinct() def getGenderCode(rdd): dic = {'etc': 'e'} for feature in rdd.collect(): uniToStr = str(feature) dic[uniToStr] = uniToStr[0]
average = numpy.average(score) deviation = numpy.std(score) return 50 + 10 * ((score - average) / deviation) # mix-in NaiveBayesModel.likelihood = likelihood conf = SparkConf().setAppName("sample").setMaster("local") sc = SparkContext(conf=conf) path = os.path.abspath(os.path.dirname(__file__)) texts = pickle.load(open("%s/model/texts.pick" % path)) labels = pickle.load(open("%s/model/labels.pick" % path)) texts = sc.parallelize(texts) htf = HashingTF(1000) # Warning!! default value is 2^20 htf.transform(texts) words = sys.argv[1].split() test_tf = htf.transform(words) model = NaiveBayesModel.load(sc, "%s/model" % path) test = model.predict(test_tf) likelihoods = model.likelihood(test_tf) print "likelihoods: %s" % likelihoods print "standard scores: %s" % standard_score(likelihoods) print "label: %s" % labels[int(test)].encode('utf-8') # json_data = {"likelihood": likelihoods[int(test)], "label": labels[int(test)].encode('utf-8')} # print json.dumps(json_data)
def load_model(cls, path="$SPARK_HOME/NaiveBayes"):
    """Load a saved NaiveBayesModel from *path* using the module-level ``sc``.

    Environment variable references such as ``$SPARK_HOME`` are expanded
    before loading; the default used to be passed to Spark as a literal
    string. Variables that are not set are left unchanged.

    :param path: model location, may contain ``$VAR`` / ``${VAR}`` references
    :return: the loaded NaiveBayesModel
    """
    import os  # function-scope import so the block does not depend on file-level imports

    return NaiveBayesModel.load(sc, os.path.expandvars(path))
no_stopwords = [w for w in no_punctuation if not w in STOPWORDS] stemmed = [STEMMER.stem(w) for w in no_stopwords] result = [w for w in stemmed if w] if not result: return [""] return result folderpath='hdfs://ec2-54-213-170-202.us-west-2.compute.amazonaws.com:9000/user/root/crawled_data' sc = SparkContext() data_raw = sc.wholeTextFiles(folderpath) data_cleaned = data_raw.map(lambda (filename, text): (filename, tokenize(text))) htf = HashingTF(50000) data_hashed = data_cleaned.map(lambda (filename, text): (filename, htf.transform(text))) data_hashed.persist() sameModel = NaiveBayesModel.load(sc, 'hdfs://ec2-54-213-170-202.us-west-2.compute.amazonaws.com:9000/user/root/bbcmodel') predictedLabel = data_hashed.map(lambda (filename, text): (filename.split("/")[-1][:-4],sameModel.predict(text))) preds = predictedLabel.collect() conn = psycopg2.connect(database="NewsSource", user="******", password="******", host="newdb.cnceaogjppz8.us-west-2.rds.amazonaws.com", port="5432") cur = conn.cursor() update = 'update articlestable set classifiedcategory=%s where id=%s' newscategory = {hash('entertainment'):'entertainment',hash('sports'):'sports',hash('politics'):'politics',hash('technology'):'technology',hash('business'):'business'} for pred in preds: if(hash('entertainment')== pred[1]): category = 'entertainment' elif(hash('sports')== pred[1]): category = 'sports'
sorted_dict = sorted(dictionary_RDD_IDFs_Weights.items(), key=operator.itemgetter(1)) # Set to max of N words for corresponding number of features for which the model is trained Dictionary = [] for key, value in sorted_dict: Dictionary.append(key) print(len(Dictionary)) # Create a broadcast variable for the Dictionary Dictionary_BV = sc.broadcast(sorted(Dictionary)) # Load Naive Bayes Model model_path = "/Users/path/to/twitter_analytics/NB_model" sameModel = NaiveBayesModel.load(sc, model_path) # Start intro Video - make sure to first run "chmod a+x play.sh" otherwise --> permission denied exception video = "Users:path:to:vids:intro.mp4" video_1 = subprocess.Popen("osascript runner.scpt " + "'" + video + "'", shell=True) # Get user twitter-handle x = int( input( "Do you have a twitter account? \n(1) Yes \n(2) No \nYour choice: " )) if x == 1: user_handle = input("Please provide user twitter handle: ") friends = get_friends(user_handle, api)
terms = tags.split() # filter words that not exist in the vocabulary terms = [x for x in list(set(terms)) if x in list(set(vocabulary))] indices = list(map(lambda x: vocabulary.index(x), list(set(terms)))) indices.sort() occurrences = list( map(lambda x: float(terms.count(vocabulary[x])), indices)) return [len(vocabulary), indices, occurrences] conf = SparkConf() conf.setAppName("NaiveBaye") conf.set('spark.driver.memory', '6g') conf.set('spark.executor.memory', '6g') conf.set('spark.cores.max', 156) #load tags passed as parameter tags = sys.argv[1] bow = bow(tags) #bag of words of that tags sc = SparkContext(conf=conf) # SparkContext model = NaiveBayesModel.load(sc, "model") result = model.predict(SparseVector(bow[0], bow[1], bow[2])) print str(classValues[result])
def load_model(self, context, path):
    """Return the NaiveBayesModel stored at *path*, loaded through *context*."""
    loaded = NaiveBayesModel.load(context, path)
    return loaded
def save_model(model, model_name):
    """Save *model* under the directory ``model_name``, replacing any existing one.

    Uses the module-level ``sc``. The old directory is removed first,
    ignoring errors.
    """
    output_dir = model_name
    shutil.rmtree(output_dir, ignore_errors=True)
    model.save(sc, output_dir)


print('*' * 50, 'MODELS_TRAIN', '*' * 50)
iris = datasets.load_iris()
data_set = iris.data
Y = iris.target
data_set = pd.DataFrame(data_set)
data_set['labels'] = Y
print(data_set.head(5))
print(data_set.shape)

s_df = sqlContext.createDataFrame(data_set)
# Last column is the label, the first four columns are the features.
train_dataset = s_df.rdd.map(lambda x: LabeledPoint(x[-1], x[:4]))
training, test = train_dataset.randomSplit([0.6, 0.4])
model = NaiveBayes.train(training, 0.7)
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy(predictionAndLabel)

################################################SAVE_LOAD###############################################################
print('*' * 50, 'SAVE_LOAD', '*' * 50)
save_model(model, 'myNaiveBayesModel')
sameModel = NaiveBayesModel.load(sc, 'myNaiveBayesModel')
# Score the reloaded model; using `model` here never tested the save/load round trip.
predictionAndLabel_1 = test.map(lambda p: (sameModel.predict(p.features), p.label))
accuracy(predictionAndLabel_1)
__author__ = 'ruben'

from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint


def parseLine(line):
    """Parse ``"<label>,<f1> <f2> ..."`` into a LabeledPoint with dense features."""
    parts = line.split(',')
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
    return LabeledPoint(label, features)


# `sc` is not created here; it must already exist when this script runs.
data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)

# Split data approximately into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4], seed = 0)

# Train a naive Bayes model.
model = NaiveBayes.train(training, 1.0)

# Make prediction and test accuracy.
predictionAndLabel = test.map(lambda p : (model.predict(p.features), p.label))
# Index access instead of a tuple-parameter lambda, which is Python 2 only.
accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()

# Save and load model
model.save(sc, "myModelPath")
sameModel = NaiveBayesModel.load(sc, "myModelPath")
def loadModelFromDisk(self, sc):
    """Load the Naive Bayes model from its fixed HDFS URI and return it.

    Prints a progress message before and after the load.
    """
    print("Loading pretrained model from disk \n")
    model_uri = "hdfs://192.168.1.33:9000//NaiveBayes.model"
    loaded = NaiveBayesModel.load(sc, model_uri)
    print("Complate \n")
    return loaded
if ascontext: if ascontext.isComputeDataModelOnly(): ascontext.setSparkOutputSchema(output_schema) sys.exit(0) else: modelpath = ascontext.getModelContentToPath("model") model_metadata = json.loads(ascontext.getModelContentToString("model.metadata")) # create a DataModelTools to handle data model and data conversions datamodel = model_metadata["datamodel"] dmt = DataModelTools(datamodel) predictors = model_metadata["predictors"] DataModelTools.checkPredictors(datamodel,predictors,df) from pyspark.mllib.classification import NaiveBayesModel model = NaiveBayesModel.load(sc, modelpath); # to score the model, we need an RDD of DenseVector (the numeric encoded values of the predictors), use DataModelTools to do this dv = dmt.extractDenseVector(df,predictors).map(lambda x:x[1]) # scoring generates an RDD of predictions (but not the original features) predictions = model.predict(dv) # now we need to zip together the original rows from the DataFrame and the RDD of predictions # we end up with an RDD containing the list of values from the original dataframe plus the predicted class, converted from the encoded number to the original string def rowToList(row): result = [] for idx in range(0, len(row)): result.append(row[idx]) return result
# $example on$
# Read the LibSVM-formatted sample data.
data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

# Random partition, roughly 60% for training and 40% for testing (unseeded).
training, test = data.randomSplit([0.6, 0.4])

# Fit the naive Bayes classifier on the training partition.
model = NaiveBayes.train(training, 1.0)

# Pair each prediction with the true label, then count the matching pairs.
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
matching = predictionAndLabel.filter(lambda pl: pl[0] == pl[1])
accuracy = 1.0 * matching.count() / test.count()
print('model accuracy {}'.format(accuracy))

# Round-trip the model through disk; clear any earlier copy first.
output_dir = 'target/tmp/myNaiveBayesModel'
shutil.rmtree(output_dir, ignore_errors=True)
model.save(sc, output_dir)
sameModel = NaiveBayesModel.load(sc, output_dir)

# Repeat the measurement with the reloaded model.
predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
matching = predictionAndLabel.filter(lambda pl: pl[0] == pl[1])
accuracy = 1.0 * matching.count() / test.count()
print('sameModel accuracy {}'.format(accuracy))
# $example off$
def getModel(self, path):
    """Load the saved model at *path* according to ``self.type``.

    :param path: location of the saved model
    :return: a NaiveBayesModel when ``self.type`` is ``'NaiveBayes'``, a
        DecisionTreeModel when it is ``'DecisionTree'``
    :raises ValueError: for any other ``self.type``; this used to return
        ``None`` silently, which only failed later at the first use
    """
    if self.type == 'NaiveBayes':
        return NaiveBayesModel.load(self.sc, path)
    elif self.type == 'DecisionTree':
        return DecisionTreeModel.load(self.sc, path)
    raise ValueError("Unsupported model type: %r" % (self.type,))
def main():
    """Score the saved "myModel" Naive Bayes classifier on a labelled CSV.

    Reads ``label,text`` rows, hashes the text into a 50000-feature vector,
    makes a seeded 70%/30% split, and prints: a prediction for one sample
    headline, the number of test rows where prediction and label are both 1,
    both 2 and both 3, the distinct labels, and the type of the first label.
    The SparkContext is stopped even when a step fails.
    """
    sc = SparkContext(appName="BayesClassifer")
    try:
        htf = HashingTF(50000)
        data = sc.textFile(
            '/home/varshav/work/PycharmProjects/Sentiment/cleaned_bayes_labels1.csv'
        )
        # Split on the first comma only, so commas inside the text stay in the text.
        data_cleaned = data.map(lambda line: line.split(",", 1))
        # Build LabeledPoints: row[0] is the label, row[1] the text.
        # NOTE(review): htf.transform receives a raw string, which HashingTF
        # iterates character by character -- confirm the saved model was
        # trained on the same featurization.
        data_hashed = data_cleaned.map(
            lambda row: LabeledPoint(row[0], htf.transform(row[1])))
        data_hashed.persist()
        # Seeded 70% training / 30% test split.
        training, test = data_hashed.randomSplit([0.70, 0.30], seed=0)
        sameModel = NaiveBayesModel.load(
            sc, "/home/varshav/work/PycharmProjects/StockAnalysis/myModel")
        print("----------")
        print(sameModel.predict(htf.transform("posts jump in net profit")))
        predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features),
                                                 p.label))
        predictionAndLabel1 = training.map(
            lambda p: (sameModel.predict(p.features), p.label))
        # Index access instead of tuple-parameter lambdas, which are Python 2 only.
        # NOTE(review): both accuracies are computed but never printed.
        prediction = 1.0 * predictionAndLabel.filter(
            lambda pl: pl[0] == pl[1]).count() / test.count()
        prediction1 = 1.0 * predictionAndLabel1.filter(
            lambda pl: pl[0] == pl[1]).count() / training.count()
        # Counts of test rows where prediction and label agree on 1, 2 and 3.
        buy_buy = 1.0 * predictionAndLabel.filter(
            lambda pl: pl[0] == 1 and pl[1] == 1).count()
        sell_sell = 1.0 * predictionAndLabel.filter(
            lambda pl: pl[0] == 2 and pl[1] == 2).count()
        hold_hold = 1.0 * predictionAndLabel.filter(
            lambda pl: pl[0] == 3 and pl[1] == 3).count()
        print(buy_buy)
        print(sell_sell)
        print(hold_hold)
        # Statistics by class
        labels = data_hashed.map(lambda lp: lp.label).distinct().collect()
        print(labels)
        print(type(labels[0]))
        # The per-class and weighted precision/recall/F-measure reports that
        # followed were disabled inside string literals and referenced a
        # `metrics` object this function never creates; they were removed.
    finally:
        sc.stop()
kafka_configuration_params = { "topic": ["BigData"], "connectionstring": "localhost:9092" } from pyspark.streaming.kafka import KafkaUtils directKafkaStream = KafkaUtils.createDirectStream( ssc, kafka_configuration_params["topic"], {"metadata.broker.list": kafka_configuration_params["connectionstring"]}) from pyspark.mllib.classification import SVMModel, LogisticRegressionModel, NaiveBayesModel LR_model = LogisticRegressionModel.load(sc, "../../notebooks/LR_model") SVM_model = SVMModel.load(sc, "../../notebooks/SVM_model") NB_model = NaiveBayesModel.load(sc, "../../notebooks/NB_model") import nltk import random from nltk.tokenize import word_tokenize allowed_word_types = ["JJ"] rdd_all_words = sc.textFile("../../notebooks/all_words/part-00000") rdd_broadcast_all_words = sc.broadcast(rdd_all_words.collect()) def convert_tweet_to_instance(tweets): rdd_tweets = tweets.map( \ lambda tweet: [word[0] for word in nltk.pos_tag(word_tokenize(tweet)) if word[1] in allowed_word_types])
def init_spark_components(self):
    """Load the model from ``<base_path>/model`` and create a default HashingTF.

    Uses the module-level ``sc``; stores the results on ``self.model`` and
    ``self.tf``.
    """
    print("Loading Model")
    model_dir = path.join(self.base_path, 'model')
    self.model = NaiveBayesModel.load(sc, model_dir)
    self.tf = HashingTF()
from pyspark import SparkConf, SparkContext from pyspark.sql import SparkSession import time as tm from threading import Thread import numpy as np conf = SparkConf().setAppName("appName").setMaster("local") conf.set("spark.executor.memory", "2g") sc = SparkContext(conf=conf) spark = SparkSession(sc) #Load pretrained models output_dir1 = '/home/emmittxu/Desktop/Stock-Sentiment-alalysis/Models/myNaiveBayesModel' output_dir2 = '/home/emmittxu/Desktop/Stock-Sentiment-alalysis/Models/sent_stockModel' print("Loading model.......") model1 = NaiveBayesModel.load(sc, output_dir1) model2 = NaiveBayesModel.load(sc, output_dir2) print("Models successfully loaded......") #Global variables to record the number of positive and negative sentiments negative = 0.0 neutral = 0.0 positive = 0.0 #Do feature extraction using TF-IDF and feed feature vectors to the sentiment classifier def vectorize_feature(training): try: global positive global negative positive = 0
def loadModel(self, sc):
    """Load the Naive Bayes model from its fixed HDFS URI into the global ``model``."""
    global model
    model_uri = "hdfs://192.168.1.33:9000/NaiveBayes.model"
    model = NaiveBayesModel.load(sc, model_uri)
for word in lowercased: punct_removed = ''.join([letter for letter in word if not letter in PUNCTUATION]) no_punctuation.append(punct_removed) no_stopwords = [w for w in no_punctuation if not w in STOPWORDS] stemmed = [STEMMER.stem(w) for w in no_stopwords] return [w for w in stemmed if w] def parseLine(line): parts = line.split(',') label = float(parts[0]) features = Vectors.dense([float(x) for x in parts[1].split(' ')]) return LabeledPoint(label, features) data = sc.textFile('C:\Users\SigurdLap\PycharmProjects\sparkTwitter\naiveBayes.txt').map(parseLine) # Split data approximately into training (60%) and test (40%) training, test = data.randomSplit([0.6, 0.4], seed=0) # Prøve split i 5 deler, cross validation # Train a naive Bayes model. model = NaiveBayes.train(training, 1.0) # Make prediction and test accuracy. predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label)) accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count() # Save and load model model.save(sc, "target/tmp/myNaiveBayesModel") sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")
analysis_type = 'hashtag_analysis' send_df_to_dashboard(hashtag_counts_df, analysis_type) except: e = sys.exc_info()[0] print("There is an error: %s" % e) conf = SparkConf() conf.setAppName("TwitterStreamApp") sc = SparkContext(conf=conf) sc.setLogLevel("ERROR") ssc = StreamingContext(sc, 3) ssc.checkpoint("checkpoint_models") htf = HashingTF(50000) NB_output_dir = '/spark/NaiveBayes' NB_load_model = NaiveBayesModel.load(sc, NB_output_dir) # Sentiment Analysis # ## 01 read tweets from stream ## dataStream = ssc.socketTextStream("localhost", 9009) ## 02 split the text into words # words = dataStream.map(lambda x: x.split(" ")) ## 03 transformed the words into features ## features = words.map(lambda x: htf.transform(x)) ## 04 predict the sentiment ## prediction = features.map(lambda x: classify(x)) ## 05 label the sentiments ## label_sentiments = prediction.map(lambda x: ('positive', 1) if x == 1 else ('negative', 1)) ## 06 aggregate the results using sentiment as key ##
# Value used in the saved TF and Naive Bayes model names (1048576 == 2**20).
tf_val = 1048576

# LOADING AND COMPUTING TF's TRAINING MODEL
print('Loading TRAINING_TF_MODEL...', end="")
tf_training = sc.pickleFile(os.getcwd() + "/Desktop/MODEL/TF/TF_MODEL_" +
                            str(tf_val))
print('done!')

print('Computing TF-IDF MODEL...', end="")
idf_training = IDF(minDocFreq=5).fit(tf_training)
print('done!')

print('Loading Naive Bayes Model...', end="")
NBM = NaiveBayesModel.load(
    sc,
    os.getcwd() + "/Desktop/MODEL/NBM/NaiveBayesModel_" + str(tf_val))
print('done!')

print('READY TO PROCESS DATA...')

# The key previously carried a stray double quote ('metadata.broker.list"')
# and the dict was never used; it is now the single source of the broker list.
kafkaParams = {'metadata.broker.list': kafka_brokers}

# CREATE DIRECT KAFKA STREAM WITH BROKERS AND TOPICS
streamData = KafkaUtils.createDirectStream(ssc, [kafka_topic], kafkaParams)

######### FROM NOW ON, EACH ACTION OR TRANSFORMATION IS DONE ON A SINGLE INCOMING BATCH OF TWEETS #########

# PRE-PROCESSING TWEETS DATA (TESTING)
obj1 = TweetPreProcessing()
def parseLine(line):
    """Parse ``"<label>,<f1> <f2> ..."`` into a LabeledPoint with dense features."""
    parts = line.split(',')
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
    return LabeledPoint(label, features)
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="Jay")

    # $example on$
    data = sc.textFile('data/mllib/naive_bayes_data.txt').map(parseLine)

    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=0)

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    # Index access instead of a tuple-parameter lambda, which is Python 2 only.
    accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()

    # Save and load model
    model.save(sc, "jay/myNaiveBayesModel")
    sameModel = NaiveBayesModel.load(sc, "jay/myNaiveBayesModel")
    # $example off$
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonNaiveBayesExample")

    # $example on$
    data = sc.textFile("data/mllib/sample_naive_bayes_data.txt").map(parseLine)

    # Split data approximately into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=0)

    # Train a naive Bayes model.
    model = NaiveBayes.train(training, 1.0)

    # Make prediction and test accuracy.
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    # Index access instead of a tuple-parameter lambda, which is Python 2 only.
    accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()
    print("model accuracy {}".format(accuracy))

    # Save and load model; remove any earlier copy first.
    output_dir = "target/tmp/myNaiveBayesModel"
    shutil.rmtree(output_dir, ignore_errors=True)
    model.save(sc, output_dir)
    sameModel = NaiveBayesModel.load(sc, output_dir)
    predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()
    print("sameModel accuracy {}".format(accuracy))
    # $example off$
def classify(self, transformer): votes = [] for c in self._classifiers: v = c.predict(transformer) votes.append(v) return mode(votes) conf = SparkConf() conf.setAppName("TA") sc = SparkContext(conf=conf) tre = StreamingContext(sc, 10) htf = HashingTF(50000) NB_directory = 'hdfs://master:9000/user/hadoop/NaiveBayes' NB_model = NaiveBayesModel.load(sc, NB_directory) LR_directory = 'hdfs://master:9000/user/hadoop/LogisticRegression' LR_model = LogisticRegressionModel.load(sc, LR_directory) DT_output_dir = 'hdfs://master:9000/user/hadoop/DT' DT_model = DecisionTreeModel.load(sc, DT_output_dir) voted_classifier = VoteClassifier(NB_model, LR_model, DT_model) def sentiment(test_sample): sample_data_test = test_sample.split(" ") cli = htf.transform(sample_data_test) return voted_classifier.classify(cli)
def salary_pre(request):
    """Django view: predict a salary class from the posted job profile and render it.

    Reads ``city``, ``education``, ``introduce``, ``job`` and ``exp`` from
    ``request.POST``, builds hashed TF-IDF features with the saved IDF
    models, classifies them with the saved Naive Bayes model, and renders
    ``salary.html`` with the prediction plus the stored salary for the city
    and for the position.

    The Spark session and context are stopped in a ``finally`` block, so a
    failing request no longer leaves a live SparkContext behind.

    :param request: Django request carrying the POST fields listed above
    :return: the rendered ``salary.html`` response
    """
    sc=SparkContext('local','test')
    spark = None
    try:
        spark = SparkSession.builder.getOrCreate()
        # Only the disabled Hive test path used this; kept so context setup is unchanged.
        hive_con=HiveContext(sc)
        nd_idf=IDFModel.load('hdfs://localhost:9000/ndidf')
        agg_idf=IDFModel.load('hdfs://localhost:9000/aggidf')
        model=NaiveBayesModel.load(sc,'hdfs://localhost:9000/nymodel')

        city=request.POST.get('city')
        edu=request.POST.get('education')
        introduce=request.POST.get('introduce')
        position=request.POST.get('job')
        exp=request.POST.get('exp')

        # One-row RDD: [education, city, position, experience, description].
        dataRDD=sc.parallelize([[edu,city,position,exp,introduce]])
        dataDF=dataRDD.map(lambda i:Row(**{
            'education':i[0],
            'work_area':i[1],
            'work_lable':i[2],
            'work_exp':i[3],
            'work_desp':i[4]
        })).map(lambda i:Row(**{
            'education':str(new_edu_trans(i.education)),
            'city':[i.work_area],
            'work_desp':i.work_desp,
            'work_lable':[i.work_lable],
            'work_exp':[i.work_exp]
        })).map(lambda i:Row(**{
            'agg':[i.education] + i.city + i.work_lable + i.work_exp,
            'name_and_desp':desp_text_division(i.work_desp)
        })).toDF()
        dataDF.show()

        # Hash the description tokens and the categorical fields separately.
        ndtf = HashingTF(inputCol='name_and_desp', outputCol='ndFeatures', numFeatures=10240)
        aggtf = HashingTF(inputCol='agg', outputCol='Features_agg', numFeatures=256)
        data = ndtf.transform(dataDF)
        data = aggtf.transform(data)
        idfdata = nd_idf.transform(data)
        idfdata = agg_idf.transform(idfdata)
        RDD = idfdata.rdd
        # Concatenate both feature vectors into one dense vector per row.
        # NOTE(review): `ndfeatures` / `features_agg` are presumably the
        # output columns of the loaded IDF models -- confirm.
        featuresRDD = RDD.map(lambda i: i.ndfeatures.toArray().tolist() + i.features_agg.toArray().tolist())
        featuresRDD=featuresRDD.map(lambda i:DenseVector(i))
        result=featuresRDD.map(lambda i:model.predict(i)).collect()
    finally:
        if spark is not None:
            spark.stop()
        sc.stop()

    city_mean=models.CSR.objects.using('db2').filter(city__contains=city)
    city_mean=city_mean[0].salary
    salary=result_trans(result[0])
    pos_mean=models.ITS.objects.using('db2').get(name=position)
    pos_mean=pos_mean.salary
    data_lst=[city_mean,pos_mean,salary]
    data_lst=json.dumps(data_lst)
    return render(request,'salary.html',{'result':result,'position':position,'city':city,'edu':edu,'exp':exp,'data':data_lst})
def calculateSentiment(self, sc, query):
    """Classify freshly fetched tweets for ``query`` and summarise them.

    Fetches tweets through ``TwitterDataGenerator``, reads them back from
    ``finalproject/tweets.csv``, scores each one with the stored naive Bayes
    model and counts the most frequent words.

    Args:
        sc: the SparkContext used to load the model and the tweet file.
        query: the search term; it is also excluded from the word ranking.

    Returns:
        dict with ``"tweets"`` (JSON list of ``{"value", "sentiment"}``),
        ``"positiveTweetCount"``, ``"negativeTweetCount"`` and
        ``"frequency"`` (JSON list of the ten most frequent
        ``{"word", "count"}`` entries).

    Rows whose prediction is neither 0.0 nor 1.0 are skipped; the original
    code would have reused the previous tweet's sentiment text for them (or
    failed on the first row).
    """
    model = NaiveBayesModel.load(sc, "finalproject/model/NaiveBayesModel")
    print (query)
    twitDG = TwitterDataGenerator()
    twitDG.getData(query)
    # Cached because the de-duplicated input feeds several actions below.
    inputFile = sc.textFile("finalproject/tweets.csv").distinct().cache()
    preprocessedData = self.preProcess(inputFile)
    inputFileProcessed = self.processInputFile(inputFile)
    print("#################################################################################################")
    print(preprocessedData.take(5))
    print("--------------------------------------------------------------------------------------------------")
    print(inputFileProcessed.take(5))
    print("input file processed ",inputFileProcessed.count())
    print("preprocessed count",preprocessedData.count())

    hashingTF = HashingTF()
    tfData = preprocessedData.map(lambda tup: hashingTF.transform(tup))
    idfData = IDF().fit(tfData)
    tfidfData = idfData.transform(tfData)
    output = tfidfData.map(lambda rec: model.predict(rec))

    # Pair every tweet with its prediction by joining on the row position.
    i_I = inputFileProcessed.map(lambda l: l[0]).zipWithIndex().map(lambda l: (l[1], l[0]))
    print("input file count",inputFile.count())
    print ("output file count",output.count())
    o_I = output.zipWithIndex().map(lambda l: (l[1], l[0]))
    i_o = i_I.join(o_I).map(lambda l: l[1])

    # One collect() replaces the repeated take(count()) / count() jobs.
    scored = i_o.collect()
    print(scored)
    print(len(scored))

    outputJson = {}
    tweetList = []
    positiveCount = 0
    negativeCount = 0
    for i in scored:
        print(i)
        if i[1] == 0.0:
            negativeCount = negativeCount + 1
            text = "This is a negative Tweet"
        elif i[1] == 1.0:
            positiveCount = positiveCount + 1
            text = "This is a positive Tweet"
        else:
            # Unknown label: do not attach a stale or undefined sentiment.
            continue
        if len(i[0]) > 4:
            tweet = {}
            tweet['value'] = i[0].encode("ascii","ignore")
            tweet['sentiment'] = text
            tweetList.append(tweet)
            print(i[0].encode("ascii","ignore"))
            print(text)
            print("-------------------------------------")
    print (positiveCount)
    print (negativeCount)
    outputJson["tweets"] = json.dumps(tweetList)
    outputJson["positiveTweetCount"] = positiveCount
    outputJson["negativeTweetCount"] = negativeCount

    # Word frequencies as (count, word), highest count first, without the
    # retweet marker and the query itself.
    wordflatMap = (preprocessedData
                   .flatMap(lambda xs: [x for x in xs])
                   .map(lambda x: x.encode("ascii","ignore"))
                   .map(lambda x: (x, 1))
                   .reduceByKey(add))
    wordFlatMap_reversed = (wordflatMap
                            .map(lambda l: (l[1], l[0]))
                            .filter(lambda l: (l[1] != "rt" and l[1] != query)))
    wordFlatMap_sorted = wordFlatMap_reversed.sortByKey(False)
    topWords = wordFlatMap_sorted.take(10)
    print (topWords)

    mostFrequentWordList = []
    for i in topWords:
        wordCount = {}
        wordCount['word'] = i[1]
        wordCount['count'] = i[0]
        mostFrequentWordList.append(wordCount)
    outputJson["frequency"] = json.dumps(mostFrequentWordList)
    return outputJson
# Score the tokens of `a` with the stored naive Bayes model and report the
# TF-IDF time and the accuracy against the placeholder label "0".
# `a`, `html_dict` and `start_tfidf` must already be defined by earlier code.
tokenhtml = tokenize(a)
print(tokenhtml)
# Every token becomes its own document, labelled "0".
for token in tokenhtml:
    body = token + ' '
    html_dict.append({"label": "0", "text": body})

sc = SparkContext()
htmldata = sc.parallelize(html_dict)
labels = htmldata.map(lambda doc: doc["label"], preservesPartitioning=True)
tf = HashingTF().transform(
    htmldata.map(lambda doc: doc["text"], preservesPartitioning=True))
idf = IDF().fit(tf)
tfidf = idf.transform(tf)
end_tfidf = datetime.now()
tfidf_time = format(end_tfidf - start_tfidf)

dataset = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
sameModel = NaiveBayesModel.load(
    sc, "/Users/apple/Dropbox/2016Spring/COSC526/MacHW1/mymodel")

start_predict = datetime.now()
# Keep the true label next to each prediction so the two can be compared.
predictionAndLabel = dataset.map(
    lambda p: (sameModel.predict(p.features), p.label))
# Index-based lambda: tuple-parameter lambdas are Python 2-only syntax.
accuracy = 1.0 * predictionAndLabel.filter(
    lambda pl: pl[0] == pl[1]).count() / dataset.count()
# The counts above are what actually run the predictions, so the clock is
# stopped only after them.
end_predict = datetime.now()
predict_time = format(end_predict - start_predict)

print(tfidf_time)
print(accuracy)
# Load the stored naive Bayes model from "spark/nbm" and print its
# prediction for one sample sentence.
import os  # needed for os.environ below; it was not imported in this script
import sys

# Point at the bundled Spark distribution before importing pyspark.
os.environ['SPARK_HOME'] = 'spark/spark'
sys.path.append('spark/spark/python/')

try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)

config = SparkConf().setMaster('local[*]').setAppName('SparkService')
sc = SparkContext(conf=config)
sc.setLogLevel("ERROR")

# Imported only after the path setup and import check above.
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import NaiveBayesModel

hashingTF = HashingTF()
sameModel = NaiveBayesModel.load(sc, "spark/nbm")
print(sameModel.predict(hashingTF.transform("This is good place".split(" "))))
def main(sc, sqlContext):
    """Build product suggestions for the user's posts and store them.

    Tokenises and stems the posts of user ``iduser``, computes TF-IDF over
    them together with the stored ``wordsDataDF.parquet`` corpus, keeps the
    posts the SVM model labels ``1``, groups them by the category predicted
    by the naive Bayes model, and records every corpus item of that category
    whose ``cossine`` similarity to the post reaches ``threshold``.

    Args:
        sc: SparkContext used for the RDDs and for loading the models.
        sqlContext: SQLContext used to build and read the data frames.

    ``iduser`` and ``threshold`` are read from module scope. Nothing is
    returned; results are passed to ``insertSuggestions`` when any exist.

    NOTE(review): ``DataFrame.map`` and ``unionAll`` look like Spark 1.x-era
    APIs -- confirm the Spark version this is deployed against.
    """
    # Fetch the user, their posts, and the token/category tables.
    user = findUserById(iduser)
    posts = findPosts(user)
    tokens, category, categoryAndSubcategory = getTokensAndCategories()

    # Lower-case and tokenise each post, keeping only known tokens.
    postsRDD = (sc.parallelize(posts)
                .map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3]))
                .map(lambda p: (p[0], [x for x in p[1] if x in tokens], p[2], p[3]))
                .cache())

    # A set gives constant-time stop-word checks inside the per-token loop.
    stpwrds = set(stopwords.words('portuguese'))

    def stem_record(s):
        # One stemmer per record instead of one per token.
        stemmer = PorterStemmer()
        return (s[0],
                [stemmer.stem(x) for x in s[1] if x not in stpwrds],
                s[2],
                s[3])

    # Keep records with at least 20 stems, or any non-empty 'Post'.
    corpusRDD = (postsRDD.map(stem_record)
                 .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1]) > 0))
                 .cache())

    # TF-IDF over the new records plus the stored corpus.
    wordsData = corpusRDD.map(
        lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(
        sqlContext.read.parquet(
            "/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))

    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures",
                          numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = tfIDF.filter(tfIDF.type == u'Post').cache()

    # Load the category (naive Bayes) and SVM models.
    NB = NaiveBayesModel.load(
        sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")

    # (predicted category, [post labels]) for posts the SVM labels 1.
    predictions = (postTFIDF
                   .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features)))
                   .filter(lambda p: p[2] == 1)
                   .map(lambda p: (p[0], p[1]))
                   .groupByKey()
                   .mapValues(list)
                   .collect())

    # Compare each kept post with every corpus item of its predicted category.
    suggestions = []
    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        tf = tfIDF.filter(tfIDF.type == category_to_use).cache()
        for post in prediction[1]:
            postVector = (postTFIDF
                          .filter(postTFIDF.label == post)
                          .map(lambda x: x.features)
                          .collect()[0])
            sim = (tf
                   .map(lambda x: (post, x.label, cossine(x.features, postVector)))
                   .filter(lambda x: x[2] >= threshold)
                   .collect())
            if sim:
                suggestions.append(sim)

    if suggestions:
        insertSuggestions(suggestions, iduser, posts)
label = float(parts[0]) features = Vectors.dense([float(x) for x in parts[1].split(' ')]) return LabeledPoint(label, features) # $example off$ if __name__ == "__main__": sc = SparkContext(appName="PythonNaiveBayesExample") # $example on$ data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine) # Split data aproximately into training (60%) and test (40%) training, test = data.randomSplit([0.6, 0.4], seed=0) # Train a naive Bayes model. model = NaiveBayes.train(training, 1.0) # Make prediction and test accuracy. predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label)) accuracy = 1.0 * predictionAndLabel.filter( lambda (x, v): x == v).count() / test.count() # Save and load model model.save(sc, "target/tmp/myNaiveBayesModel") sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel") # $example off$
def main(sc, sqlContext):
    """Build product suggestions for the user's posts and store them.

    Tokenises and stems the posts of user ``iduser``, computes TF-IDF over
    them together with the stored ``wordsDataDF.parquet`` corpus, keeps the
    posts the SVM model labels ``1``, groups them by the category predicted
    by the naive Bayes model, and records every corpus item of that category
    whose ``cossine`` similarity to the post reaches ``threshold``.

    Args:
        sc: SparkContext used for the RDDs and for loading the models.
        sqlContext: SQLContext used to build and read the data frames.

    ``iduser`` and ``threshold`` are read from module scope. Nothing is
    returned; results are passed to ``insertSuggestions`` when any exist.

    NOTE(review): ``DataFrame.map`` and ``unionAll`` look like Spark 1.x-era
    APIs -- confirm the Spark version this is deployed against.
    """
    # Fetch the user, their posts, and the token/category tables.
    user = findUserById(iduser)
    posts = findPosts(user)
    tokens, category, categoryAndSubcategory = getTokensAndCategories()

    # Lower-case and tokenise each post, keeping only known tokens.
    postsRDD = (sc.parallelize(posts)
                .map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3]))
                .map(lambda p: (p[0], [x for x in p[1] if x in tokens], p[2], p[3]))
                .cache())

    # A set gives constant-time stop-word checks inside the per-token loop.
    stpwrds = set(stopwords.words('portuguese'))

    def stem_record(s):
        # One stemmer per record instead of one per token.
        stemmer = PorterStemmer()
        return (s[0],
                [stemmer.stem(x) for x in s[1] if x not in stpwrds],
                s[2],
                s[3])

    # Keep records with at least 20 stems, or any non-empty 'Post'.
    corpusRDD = (postsRDD.map(stem_record)
                 .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1]) > 0))
                 .cache())

    # TF-IDF over the new records plus the stored corpus.
    wordsData = corpusRDD.map(
        lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(
        sqlContext.read.parquet(
            "/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))

    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures",
                          numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = tfIDF.filter(tfIDF.type == u'Post').cache()

    # Load the category (naive Bayes) and SVM models.
    NB = NaiveBayesModel.load(
        sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")

    # (predicted category, [post labels]) for posts the SVM labels 1.
    predictions = (postTFIDF
                   .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features)))
                   .filter(lambda p: p[2] == 1)
                   .map(lambda p: (p[0], p[1]))
                   .groupByKey()
                   .mapValues(list)
                   .collect())

    # Compare each kept post with every corpus item of its predicted category.
    suggestions = []
    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        tf = tfIDF.filter(tfIDF.type == category_to_use).cache()
        for post in prediction[1]:
            postVector = (postTFIDF
                          .filter(postTFIDF.label == post)
                          .map(lambda x: x.features)
                          .collect()[0])
            sim = (tf
                   .map(lambda x: (post, x.label, cossine(x.features, postVector)))
                   .filter(lambda x: x[2] >= threshold)
                   .collect())
            if sim:
                suggestions.append(sim)

    if suggestions:
        insertSuggestions(suggestions, iduser, posts)
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint


def parseLine(line):
    """Parse one ``label,f1 f2 f3 ...`` text line into a LabeledPoint.

    The part before the first comma is the numeric label; the part after it
    is a space-separated list of numeric feature values.
    """
    parts = line.split(',')
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
    return LabeledPoint(label, features)


# NOTE: `sc` is not created here; it must already exist when this runs
# (presumably supplied by the pyspark shell -- confirm).
data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)

# Split data approximately into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4], seed=0)

# Train a naive Bayes model.
model = NaiveBayes.train(training, 1.0)

# Make prediction and test accuracy.
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
# Index-based lambda: tuple-parameter lambdas are Python 2-only syntax.
accuracy = 1.0 * predictionAndLabel.filter(
    lambda pl: pl[0] == pl[1]).count() / test.count()

# Save and load model
model.save(sc, "myModelPath")
sameModel = NaiveBayesModel.load(sc, "myModelPath")