def computerTrainError(self):
    """Count how many training documents the naive Bayes model misclassifies.

    The positive and negative training folders are cleaned and used as the
    training data, then the very same folders are classified.  Every result
    entry whose label (index 1) disagrees with the folder it came from is
    counted as an error.

    Returns:
        The total number of misclassified documents over both folders.
    """
    positiveDir = '/Users/sunxinzi/Documents/Machine_Learning/Project/Sentiment Prediction/train/pos/'
    negativeDir = '/Users/sunxinzi/Documents/Machine_Learning/Project/Sentiment Prediction/train/neg/'

    cleaner = preprocess()
    posTrain = cleaner.getCleanTxt(positiveDir, 'pos')
    negTrain = cleaner.getCleanTxt(negativeDir, 'neg')
    positiveDocs = cleaner.getCleanTestData(positiveDir)
    model = naiveBayes()

    def countLabelled(documents, label):
        # Number of documents the model tags with `label`.
        predictions = model.test(posTrain, negTrain, documents)
        return sum(1 for entry in predictions if entry[1] == label)

    # Positive documents predicted 'neg' are errors ...
    mistakes = countLabelled(positiveDocs, 'neg')
    # ... and so are negative documents predicted 'pos'.
    mistakes += countLabelled(cleaner.getCleanTestData(negativeDir), 'pos')
    return mistakes
def semTratamento(textos, polaridades, classes):
    """Run ``naiveBayes`` on the given inputs and save its two outputs.

    The two values returned by ``naiveBayes`` are written, separated by a
    single newline, to ``pasta + 'resultados sem tratamento.txt'``
    (``pasta`` is a module-level name defined outside this function).

    Args:
        textos: Forwarded unchanged as the first argument of ``naiveBayes``.
        polaridades: Forwarded unchanged as the second argument.
        classes: Forwarded unchanged as the third argument.

    Returns:
        None.  The results are only written to disk.
    """
    s1, s2 = naiveBayes(textos, polaridades, classes)
    # The context manager closes the file even when a write raises, which
    # the previous open()/close() pair did not guarantee.
    with open(pasta + 'resultados sem tratamento.txt', 'w') as arq:
        arq.writelines(s1)
        arq.write('\n')
        arq.writelines(s2)
from modifyInput import modifier
from naiveBayes import naiveBayes
import pandas as pd

# Fit the classifier on the training file, then measure how often it
# reproduces the label stored in column 0 of the test file.
link = "Medical_data.csv"
testframe = pd.read_csv("test_medical.csv")
totalTestpoints = testframe.shape[0]
classifier = naiveBayes(link)
classifier.fit()


def _isCorrect(rowIndex):
    """Return whether the prediction for test row ``rowIndex`` matches its label."""
    expected = testframe.iat[rowIndex, 0]
    # Columns 1..3 of the row are handed to the classifier as a plain list.
    features = testframe.iloc[rowIndex, 1:4].tolist()
    guess = classifier.predict(features)
    return guess == expected


count = sum(1 for rowIndex in range(totalTestpoints) if _isCorrect(rowIndex))
print("accuracy=", 100 * count / totalTestpoints)
# Generate 365 days' worth of data.
# Each day is a list of binary variables, e.g. [0, 1, 0, 1, 1, 0].
# `randint` and `dailyMood` are defined outside this excerpt.


def randList(n):
    """Return a list of ``n`` random bits (each 0 or 1).

    Recipe from
    http://code.activestate.com/recipes/577944-random-binary-list/
    """
    return [randint(0, 1) for _ in range(n)]


def randMood(n):
    """Return a list of ``n`` random mood class ids, each in 0..3."""
    return [randint(0, 3) for _ in range(n)]


days = 365
for _ in range(days):
    oneDay = randList(20)
    dailyMood.append(oneDay)

# Assume 4 mood classes: angry, happy, sad, neutral.
target = randMood(days)

# Test naive Bayes on the generated array.
clf = naiveBayes(dailyMood, target)
# Use 70% of the data as training.
data_train, target_train, data_test, target_test = clf.split_train_test(
    dailyMood, target, 0.7)

# Group the training instances by class: the result maps each target class
# to the list of instances that fall in that class.
splitClass = clf.classSplitter(data_train, target_train)

# Probability of each class appearing.  Not expected to be meaningful here
# because the data set is random.
classProbs = clf.classProbabilities(target_train)
condiProbs = clf.get_conditional_probs(splitClass)

# The model is trained at this point; measure how well it does.
accuracy = clf.test_errors(data_train, target_train, data_test, target_test)
# Parenthesised so the script runs on Python 3 as well as Python 2; the
# output for a single value is the same on both.
print(accuracy)
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 14 21:54:46 2018

@author: ssheh

Test of the naiveBayes.py algorithm.
"""
import csv

from naiveBayes import naiveBayes

# The context manager makes sure the test file is closed once every row has
# been processed; it used to be left open for the life of the process.
with open('TestOrigins.csv') as testFile:
    reader = csv.DictReader(testFile)

    k = naiveBayes(shteguDataBaze="Origins.csv", AtributiKlase="Buys_Computer")
    k.kalkulo_propabilitetin_AtributitKlase()

    # Each CSV row (a dict keyed by column name) is used as one hypothesis.
    for row in reader:
        k.hipoteza = row
        k.Kalkulo_propabilitetin_kushtezues(k.hipoteza)
        k.klasifiko()
        # Blank lines between the output of consecutive rows.
        print("\n\n")
from naiveBayes import naiveBayes
from knn import knn_string
from svm import runMachine

test_messages = [
    "hey dude what's up",
    "HEY BRO, WHATSUP! REPLY NOW AT 83147661764. WIN WIN WIN",
    "i Like pie",
    "TEXT ME to win a FREE car free 4 lyfe yo 374237432"
]


def _show_naive_bayes(message):
    """Print the value ``naiveBayes`` returns for ``message``."""
    print(naiveBayes(message))


# One (title, underline, handler) entry per demo, in display order.
# knn_string and runMachine are called for their own effects; their return
# values are not printed here.
_DEMOS = (
    ("Naive Bayes", "------------", _show_naive_bayes),
    ("K-Nearest Neighbors", "--------------------", knn_string),
    ("Support Vector Machine", "----------------------", runMachine),
)

for _title, _underline, _handler in _DEMOS:
    print(_title)
    print(_underline)
    for msg in test_messages:
        _handler(msg)
def _makeVectorizer(ngram, stopEnglish):
    """Build the CountVectorizer configuration shared by every corpus."""
    return CountVectorizer(lowercase=True, stop_words=stopEnglish,
                           analyzer='word', ngram_range=(ngram, ngram),
                           max_df=1.0, min_df=1, max_features=None)


def _featureNames(vectorizer):
    """Return the fitted vocabulary terms as a list on any scikit-learn version."""
    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when the installed version provides it.
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        return vectorizer.get_feature_names()
    return list(getter())


def _countWords(lines, ngram, stopEnglish):
    """Vectorize ``lines`` and return (word -> total count, count matrix, terms)."""
    vectorizer = _makeVectorizer(ngram, stopEnglish)
    vectorizer.fit(lines)
    wordIndex = vectorizer.vocabulary_          # term -> column index
    countMatrix = vectorizer.transform(lines).toarray()
    totals = np.sum(countMatrix, axis=0)        # per-term count over all lines
    wordCounts = {word: totals[column] for word, column in wordIndex.items()}
    return wordCounts, countMatrix, _featureNames(vectorizer)


def bagOfWords(linesOfReal, linesOfFake, ngram, stopEnglish, stemed):
    """Build bag-of-words counts for both classes and classify the test file.

    Word counts are computed separately for the real and the fake lines.
    Every entry read from ``test.csv`` is then vectorized on its own and
    classified with ``naiveBayes``.  Each prediction is written to
    ``output.csv`` (columns ``Id`` and ``Category``), and the number of
    correct predictions and the accuracy are printed.

    Args:
        linesOfReal: Sequence of text lines of the "real" class.
        linesOfFake: Sequence of text lines of the "fake" class.
        ngram: n-gram size; used as both bounds of ``ngram_range``.
        stopEnglish: Value passed as ``stop_words`` to ``CountVectorizer``.
        stemed: Forwarded unchanged to ``readCSV``.

    Returns:
        ``[countOfRealsDict, countOfFakesDict, uniqlistOfRealWords,
        uniqlistOfFakeWords]``: the per-word counts of each class followed
        by the vocabulary term list of each class.
    """
    countOfRealsDict, BoWOfReal, uniqlistOfRealWords = _countWords(
        linesOfReal, ngram, stopEnglish)
    countOfFakesDict, BoWOfFake, uniqlistOfFakeWords = _countWords(
        linesOfFake, ngram, stopEnglish)

    testHeadLines = readCSV("test.csv", ngram, stemed)

    correctnessCount = 0
    # newline='' is what the csv module asks for on files it writes to;
    # without it every row is followed by an empty line on Windows.
    with open('output.csv', mode='w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=['Id', 'Category'])
        writer.writeheader()

        for line in testHeadLines.keys():
            entry = testHeadLines[line]
            # NOTE(review): the text being vectorized is the "Id" field,
            # exactly as before -- presumably it holds the headline text;
            # confirm against readCSV.
            sample = [entry["Id"]]
            vectorizerTest = _makeVectorizer(ngram, stopEnglish)
            vectorizerTest.fit(sample)
            testindexDictOfWord = vectorizerTest.vocabulary_
            test_BoW = vectorizerTest.transform(sample).toarray()

            result = naiveBayes(countOfRealsDict, BoWOfReal,
                                countOfFakesDict, BoWOfFake,
                                testindexDictOfWord, test_BoW)
            writer.writerow({'Id': entry["Id"], 'Category': result})
            if result == entry["Category"]:
                correctnessCount += 1

    print("correctness count: ", correctnessCount)
    accuracy = calculationofAccuracy(correctnessCount, len(testHeadLines.keys()))
    print("Accuracy = ", accuracy)
    return [countOfRealsDict, countOfFakesDict, uniqlistOfRealWords, uniqlistOfFakeWords]
def _splitFeaturesAndClasses(examples):
    """Separate examples into parallel lists of feature vectors and class labels."""
    features = []
    classes = []
    for example in examples:
        features.append(example.features)
        classes.append(example.klass)
    return features, classes


def _evaluateOnFolds(title, makeClassifier, splits):
    """Train and test a fresh classifier on each fold; print per-fold and mean accuracy."""
    print(title)
    total = 0
    foldCount = 0
    for split in splits:
        classifier = makeClassifier()
        trainFeatures, trainClasses = _splitFeaturesAndClasses(split.train)
        testFeatures, testClasses = _splitFeaturesAndClasses(split.test)
        classifier.train(trainFeatures, trainClasses)
        classifier.test(testFeatures, testClasses)
        accuracy = classifier.getCorrectCount() / len(testClasses)
        total = total + accuracy
        print("[INFO]\tFold ", str(foldCount), " Accuracy:", str(accuracy))
        foldCount = foldCount + 1
    # Average over the folds actually evaluated rather than a hard-coded 10,
    # so the figure stays correct if the number of splits ever changes.
    meanAccuracy = total / foldCount if foldCount else 0.0
    print("[INFO]\tAccuracy:", str(meanAccuracy))


def test10Fold():
    """Cross-validate every classifier on the same folds and print accuracies.

    The folds come from ``tenFoldCrossValidation()``.  For each classifier
    (naive Bayes, random forest with argument 100, neural networks built
    with ``(5, )`` and ``(3, )`` and argument 1000, and SVM) a new instance
    is trained and tested on every fold.  The accuracy of each fold and the
    mean accuracy over all folds are printed.

    Returns:
        None.
    """
    # Declaration kept from the original; the name is not used in this body.
    global allWords
    splits = tenFoldCrossValidation()

    # (heading, zero-argument factory) for each experiment, in print order.
    experiments = (
        ("Naive Bayes", lambda: naiveBayes()),
        ("Random Forest", lambda: RandomForest(100)),
        ("Neural 5", lambda: neuralNetwork((5, ), 1000)),
        ("Neural 3", lambda: neuralNetwork((3, ), 1000)),
        ("SVM", lambda: svm()),
    )
    for title, makeClassifier in experiments:
        _evaluateOnFolds(title, makeClassifier, splits)
def naiveBayesClassification(data):
    """Train and evaluate the naive Bayes classifier on a 0/1-labelled data set.

    The last column of ``data`` is the class label (0 = not spam, 1 = spam).
    The rows of each class are cut in half: the first half of each class
    forms the test set and the second half forms the training set.  The
    classifier is trained on the training rows, ``trainingOutput()`` is
    called, and the confusion matrix of the test-set predictions is printed.

    Args:
        data: 2-D NumPy array with one example per row and the class label
            in the last column.

    Returns:
        None.

    Raises:
        AssertionError: If the two sets together do not contain every row
            of ``data`` (for example when a label is neither 0 nor 1).
    """
    columnCount = len(data[0])
    labelColumn = columnCount - 1

    # Row indices of each class, then private copies of those rows.
    hamRows = np.where(data[:, labelColumn] == 0)[0]
    spamRows = np.where(data[:, labelColumn] == 1)[0]
    ham = copy.deepcopy(data[hamRows, :])
    spam = copy.deepcopy(data[spamRows, :])

    hamHalf = len(ham) // 2
    spamHalf = len(spam) // 2

    # Stacking onto an empty float64 block keeps the result at least float64.
    emptyRows = np.empty((0, columnCount), dtype=np.float64)
    # First half of each class -> test set, remainder -> training set.
    testSet = np.vstack((emptyRows, ham[:hamHalf, :], spam[:spamHalf, :]))
    trainSet = np.vstack((emptyRows, ham[hamHalf:, :], spam[spamHalf:, :]))

    # The label column is deliberately left inside both sets; the labels
    # are only read out here, not removed.
    trainLabels = trainSet[:, -1]
    testLabels = testSet[:, -1]

    # Together the two sets must account for every input row.
    assert (len(trainSet) + len(testSet) == len(data))

    # Every column except the label is an attribute.
    numAttributes = len(trainSet[0]) - 1
    # Distinct class values present in the training set.
    classes = set(trainLabels)

    NB = naiveBayes(len(classes), numAttributes, classes)
    NB.train(trainSet)
    NB.trainingOutput()
    predictions = NB.classify(testSet)

    print(confusion_matrix(testLabels, predictions))
from naiveBayes import naiveBayes
from data import data

folds = 8

# ---- Iris -----------------------------------------------------------------
naiveBayesIris = naiveBayes()
print("Iris Dataset\n===================================\n")

# Load the Iris data, then train and test the model with cross-validation.
irisData = data.read('./db/Iris/iris.csv')
irisAccuracyScores = naiveBayesIris.crossval_predict(
    irisData["data"], irisData["labels"], folds)

# Mean of the accuracy scores returned by the cross-validation.
irisTotalAccuracy = sum(irisAccuracyScores, 0.0) / len(irisAccuracyScores)

print("\n\nTotal accuracy: " + str(round(irisTotalAccuracy * 100, 2)) + "%\n\n")

# ---- Banknote authentication ----------------------------------------------
naiveBayesbanknote = naiveBayes()
print("Banknote Dataset\n===================================\n")

# Load the banknote data, then train and test the model the same way.
banknoteData = data.read('./db/banknote_authentication/banknote_authentication.csv')
banknoteAccuracyScores = naiveBayesbanknote.crossval_predict(
    banknoteData["data"], banknoteData["labels"], folds)

# Mean of the accuracy scores returned by the cross-validation.
banknoteTotalAccuracy = sum(banknoteAccuracyScores, 0.0) / len(banknoteAccuracyScores)
#plt.plot(cArr, supportVectors, label='C vs Support Vectors')
#plt.plot(epochs, devArr, label='Dev Error')
#plt.title('Error Rates for Unaveraged and Averaged Perceptron')
#plt.legend()
#plt.xlabel('C')
#plt.ylabel('Support Vectors')
#plt.show()

# Run the routine selected by `algorithm` on the prepared data.  The ids are
# mutually exclusive, so at most one branch executes.
if algorithm == 2:  # Run on Python 3
    knn_fit(npData, newTarget)
elif algorithm == 3:  # Run on Python 3
    m = gradient_booster(npData, newTarget)
elif algorithm == 4:  # Run on Python 3
    gnb = naiveBayes(npData, newTarget)
elif algorithm == 5:  # Run on Python 3
    kbest = kbestfeatures(npData, newTarget)
elif algorithm == 6:  # Run on Python 2
    model, prediction = logisticRegression(npData, newTarget)
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 14 21:54:46 2018

@author: ssheh

Test of the naiveBayes.py algorithm.
"""
from naiveBayes import naiveBayes

k = naiveBayes(
    shteguDataBaze="tabele_shembull.csv",
    AtributiKlase="play",
)
k.kalkulo_propabilitetin_AtributitKlase()

# The single hypothesis to classify: one value per attribute column.
k.hipoteza = dict(
    outlook="sunny",
    temp="high",
    humidity="high",
    windy="true",
)
k.Kalkulo_propabilitetin_kushtezues(k.hipoteza)
k.klasifiko()
import pandas as pd
from naiveBayes import naiveBayes

# Load the data, drop incomplete rows and the unused ACTIVITY column.
luis = pd.read_csv("haha.csv")
luis.dropna(axis=0, inplace=True)
luis.drop(['ACTIVITY'], axis=1, inplace=True)

data = luis.values.tolist()
# The last value of every row is its target; pop() removes it from the row
# so that `data` is left holding only the features.
target = [row.pop() for row in data]

clf = naiveBayes(data, target)
# Use 70% of the data as training.
data_train, target_train, data_test, target_test = clf.split_train_test(
    data, target, 0.7)

# Group the training instances by class: the result maps each target class
# to the list of instances that fall in that class.
splitClass = clf.classSplitter(data_train, target_train)

# Probability of each class appearing, and the conditional probabilities.
classProbs = clf.classProbabilities(target_train)
condiProbs = clf.get_conditional_probs(splitClass)

# The model is trained at this point; measure how well it does.
accuracy = clf.test_errors(data_train, target_train, data_test, target_test)
# Parenthesised so the script runs on Python 3 as well as Python 2; the
# output for a single value is the same on both.
print(accuracy)
# Timed evaluation of the spam/ham classifier.  `time`, `bagOfWords`,
# `preprocess`, `naiveBayes` and the *SentencesList inputs are defined
# outside this excerpt.
# NOTE(review): the source arrived without indentation; the nesting of the
# three counters below is reconstructed -- confirm against the original file.
programStart = time()

# One bag-of-words per corpus: everything, spam only, ham only.
bowAll = bagOfWords(allSentencesList)
bowSpam = bagOfWords(spamSentencesList)
bowHam = bagOfWords(hamSentencesList)
# print(bowAll, bowSpam, bowHam, sep = "\n")
print()

count = 0       # correctly classified test sentences
spamCount = 0   # correct "spam" examples printed so far
hamCount = 0    # correct "ham" examples printed so far
for testList in testSentencesList:
    # testList[0] is compared with the prediction (expected label);
    # testList[1] is the text that gets classified.
    res = naiveBayes(preprocess(testList[1]), bowAll, bowSpam, bowHam)
    # print(testList[0], res)
    if (res == testList[0]):
        # Show at most two correct examples of each class.
        if (((res == "ham") and (hamCount < 2)) or ((res == "spam") and (spamCount < 2))):
            # end="" -- presumably the text already ends with a newline; verify.
            print("Input:", testList[1], end="")
            print("Expected output:", testList[0])
            print("Predicted output:", res)
            print()
            if (res == "ham"):
                hamCount += 1
            else:
                spamCount += 1
        count += 1

programEnd = time()
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 14 21:54:46 2018

@author: ssheh

Test of the naiveBayes.py algorithm.
"""
import csv

from naiveBayes import naiveBayes

# The context manager makes sure the test file is closed once every row has
# been processed; it used to be left open for the life of the process.
with open('TeDhenatTestuese.csv') as testFile:
    reader = csv.DictReader(testFile)

    # Alternative data sets that can be swapped in:
    #k = naiveBayes(shteguDataBaze = "Iris.csv", AtributiKlase = "Species" )
    #k = naiveBayes(shteguDataBaze = "WaterBears.csv", AtributiKlase = "Species" )
    k = naiveBayes(shteguDataBaze="TeDhenatTrajnuese.csv", AtributiKlase="Klasa")
    k.kalkulo_propabilitetin_AtributitKlase()

    # Example hand-written hypotheses for the data sets above:
    #k.hipoteza = {"SepalLengthCm":"5.1","SepalWidthCm":"3.5","PetalLengthCm":"1.4","PetalWidthCm":"0.2"}
    #k.hipoteza = {"SSI":"30.16","BTWa":"26.05","BTWs":"20.87","BTWp":"19.98","BTWL":"0.5","BTPA":"0.77"}
    #k.hipoteza = {"Instanca1":"0","Instanca2":"1","Instanca3":"1","Instanca4":"2","Instanca5":"3","Instanca6":"5"}

    # Each CSV row (a dict keyed by column name) is used as one hypothesis.
    for row in reader:
        k.hipoteza = row
        k.Kalkulo_propabilitetin_kushtezues(k.hipoteza)
        k.klasifiko()
        # Blank lines between the output of consecutive rows.
        print("\n\n")
import sys

# The project's utility modules live outside the default import path, so
# this must run before the imports of dataManipulation and naiveBayes.
sys.path.append('/home/patcha/Dropbox/Doutorado/Codigos/Python/utils/')

import numpy as np
from dataManipulation import data
from naiveBayes import naiveBayes

# Load the data set.  The print calls are parenthesised so the script runs
# on Python 3 as well as Python 2; the output is the same on both.
print('Loading the dataset...')
irisAll = np.genfromtxt('/home/patcha/Datasets/Iris/iris.csv', delimiter=',')

# 70% training / 30% test split, shuffled, no validation part and no
# normalisation; the output column is the last one.
iris = data(dataset=irisAll, percTrain=0.7, percVal=0, percTest=0.3,
            normType=None, shuf=True, posOut='last', outBin=False)
print(iris)

# Fit on the training part, then report results on the test part.
nb = naiveBayes(iris.trainIn, iris.trainOut, iris.nClass)
splited = nb.splitByLabel()
stats = nb.statsByLabel(splited)
nb.getResult(stats, iris.testIn, iris.testOut)
# Training data: everything before index dtrNum (90% of the whole set
# according to the original note; dtrNum is set outside this excerpt).
Xtr, Ytr = myData.X[:dtrNum], myData.Y[:dtrNum]

# Evaluation data: everything from index dtrNum onwards (the remaining 10%).
Xte, Yte = myData.X[dtrNum:], myData.Y[dtrNum:]
#-------------------

#-------------------
# 3. Training the naive Bayes model
# Prior probabilities: 0.5 for each of the two classes.
priors = np.array([[0.5, 0.5]])
myModel = naiveBayes.naiveBayes(Xtr, Ytr, priors)
myModel.train()
#-------------------

#-------------------
# 4. Evaluating the naive Bayes model
# Accuracy on the training data, rounded to two decimals.
trainAccuracy = np.round(myModel.accuracy(Xtr, Ytr), decimals=2)
print(f"学習データの正解率:{trainAccuracy}")
# Accuracy on the evaluation data, rounded to two decimals.
testAccuracy = np.round(myModel.accuracy(Xte, Yte), decimals=2)
print(f"評価データの正解率:{testAccuracy}")
#-------------------

#-------------------
# 5. Writing the prediction results to a CSV file
trainResultPath = f"../results/naiveBayes_result_train_{myData.dataType}.csv"
myModel.writeResult2CSV(Xtr, Ytr, fName=trainResultPath)