Example #1
	def computeTrainError(self):
		# count the training reviews the classifier gets wrong: positive
		# reviews predicted 'neg', then negative reviews predicted 'pos'
		posPath = '/Users/sunxinzi/Documents/Machine_Learning/Project/Sentiment Prediction/train/pos/'
		negPath = '/Users/sunxinzi/Documents/Machine_Learning/Project/Sentiment Prediction/train/neg/'
		testPath = '/Users/sunxinzi/Documents/Machine_Learning/Project/Sentiment Prediction/train/pos/'

		process = preprocess()

		posTrain = process.getCleanTxt(posPath, 'pos')
		negTrain = process.getCleanTxt(negPath, 'neg')
		testData = process.getCleanTestData(testPath)

		classifier = naiveBayes()
		result = classifier.test(posTrain, negTrain, testData)

		errorCount = 0
		for i in result:
			if i[1] == 'neg':
				errorCount = errorCount + 1

		testPath = '/Users/sunxinzi/Documents/Machine_Learning/Project/Sentiment Prediction/train/neg/'
		testData = process.getCleanTestData(testPath)
		result = classifier.test(posTrain, negTrain, testData)

		for i in result:
			if i[1] == 'pos':
				errorCount = errorCount + 1

		return errorCount
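
Since the method tests on the same directories it trained on, the returned value is a raw training-error count. Assuming getCleanTxt yields one entry per review, a rate could be derived from it (an illustrative line, not part of the original class):

		trainErrorRate = errorCount / (len(posTrain) + len(negTrain))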
Example #2
def semTratamento(textos, polaridades, classes):
    # "without preprocessing": run naive Bayes on the raw texts and save both
    # result strings; 'pasta' (the output folder) is assumed to be defined
    # elsewhere in the module
    s1, s2 = naiveBayes(textos, polaridades, classes)
    with open(pasta + 'resultados sem tratamento.txt', 'w') as arq:
        arq.writelines(s1)
        arq.write('\n')
        arq.writelines(s2)
Example #3
from modifyInput import modifier
from naiveBayes import naiveBayes
import pandas as pd

link = "Medical_data.csv"
testframe = pd.read_csv("test_medical.csv")
totalTestpoints = testframe.shape[0]
classifier = naiveBayes(link)
classifier.fit()
count = 0
for i in range(totalTestpoints):
    original = testframe.iat[i, 0]
    predicted = classifier.predict((testframe.iloc[i, 1:4]).tolist())
    if predicted == original:
        count += 1

print("accuracy=", 100 * count / totalTestpoints)
Example #4
# assumed setup for this fragment: randint for the random data, the project's
# naiveBayes class (imported the same way as in the other examples), and an
# empty list to collect the days
from random import randint

from naiveBayes import naiveBayes

dailyMood = []

# generate 365 days worth of data
# each day will take the format [0, 1, 0, 1, 1, 0, ...] - these are all the data variables
days = 365
while days > 0:
    randList = lambda n: [randint(0, 1) for b in range(1, n+1)] # code attained at http://code.activestate.com/recipes/577944-random-binary-list/
    oneDay = randList(20)
    dailyMood.append(oneDay)
    days -= 1

# let's assume there'll be 4 classes for our moods - angry, happy, sad, neutral
randMood = lambda n: [randint(0, 3) for b in range(1, n+1)]
target = randMood(365)

# test naive bayes on generated array
# instantiate object
clf = naiveBayes(dailyMood, target)

data_train, target_train, data_test, target_test = clf.split_train_test(dailyMood, target, 0.7) # use 70% of the data as training
# split class
# basically generates a dictionary where the key is a target class, and the values are the list of instances that fall in that class
splitClass = clf.classSplitter(data_train, target_train)

# get probabilities of each class appearing. This will probably be fumbled since we have a random set
classProbs = clf.classProbabilities(target_train)
condiProbs = clf.get_conditional_probs(splitClass)

# alright, now we have trained the set.
# let's test the effectiveness
accuracy = clf.test_errors(data_train, target_train, data_test, target_test)
print(accuracy)
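
The classSplitter helper is not shown in these examples, but the comment above describes its output: a dictionary keyed by target class whose values are the instances in that class. A minimal sketch consistent with that description (assumed, not the original implementation):

def classSplitter(data_train, target_train):
    # group training instances by their target class
    split = {}
    for features, label in zip(data_train, target_train):
        split.setdefault(label, []).append(features)
    return split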
Example #5
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 14 21:54:46 2018

@author: ssheh

Testing of the naiveBayes.py algorithm.
"""
import csv

from naiveBayes import naiveBayes

k = naiveBayes(shteguDataBaze="Origins.csv", AtributiKlase="Buys_Computer")

k.kalkulo_propabilitetin_AtributitKlase()

# classify every test row, closing the file when done
with open('TestOrigins.csv') as testFile:
    reader = csv.DictReader(testFile)
    for row in reader:
        k.hipoteza = row
        k.Kalkulo_propabilitetin_kushtezues(k.hipoteza)
        k.klasifiko()
        print("\n\n")
Example #6
from naiveBayes import naiveBayes
from knn import knn_string
from svm import runMachine

test_messages = [
    "hey dude what's up",
    "HEY BRO, WHATSUP! REPLY NOW AT 83147661764. WIN WIN WIN",
    "i Like pie",
    "TEXT ME to win a FREE car free 4 lyfe yo 374237432"
]

# Demo Naive Bayes
print("Naive Bayes")
print("------------")
for msg in test_messages:
    print(naiveBayes(msg))

print("K-Nearest Neighbors")
print("--------------------")
for msg in test_messages:
    knn_string(msg)

print("Support Vector Machine")
print("----------------------")
for msg in test_messages:
    runMachine(msg)
Example #7
# assumed imports for this snippet: CountVectorizer matching scikit-learn's
# API; readCSV, naiveBayes and calculationofAccuracy are defined elsewhere
# in this project
import csv

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer


def bagOfWords(linesOfReal, linesOfFake, ngram, stopEnglish, stemed):
    # create the transform / stop_words="english"
    vectorizerReal = CountVectorizer(lowercase=True, stop_words=stopEnglish, analyzer='word',
                                     ngram_range=(ngram, ngram), max_df=1.0, min_df=1,max_features=None)

    trainReal = linesOfReal[:]


    vectorizerReal.fit(trainReal)  # learn the vocabulary
    indexDictOfWordReal = vectorizerReal.vocabulary_  # word -> column index
    bag_of_words_real = vectorizerReal.transform(trainReal)
    BoWOfReal = bag_of_words_real.toarray()  # bag-of-words count matrix
    uniqlistOfRealWords = vectorizerReal.get_feature_names()  # unique word list
    #print("uniq real: ",len(uniqlistOfRealWords))
    #print("uniq real: ",uniqlistOfRealWords)

    # create the transform / stop_words="english"
    vectorizerFake = CountVectorizer(lowercase=True, stop_words=stopEnglish, analyzer='word',
                                     ngram_range=(ngram, ngram), max_df=1.0, min_df=1,max_features=None)

    trainFake = linesOfFake[:]

    vectorizerFake.fit(trainFake)  # learn the vocabulary
    indexDictOfWordFake = vectorizerFake.vocabulary_  # word -> column index
    bag_of_words_fake = vectorizerFake.transform(trainFake)
    BoWOfFake = bag_of_words_fake.toarray()  # bag-of-words count matrix
    uniqlistOfFakeWords = vectorizerFake.get_feature_names()  # unique word list

    countOfRealsDict = {}
    countOfFakesDict = {}

    frequenciesOfReal = np.sum(BoWOfReal, axis=0)  # sum the counts of each word
    for word in indexDictOfWordReal.keys():
        countOfRealsDict[word] = frequenciesOfReal[indexDictOfWordReal[word]]
    frequenciesOfFake = np.sum(BoWOfFake, axis=0)  # sum the counts of each word
    for word in indexDictOfWordFake.keys():
        countOfFakesDict[word] = frequenciesOfFake[indexDictOfWordFake[word]]

    testHeadLines = readCSV("test.csv", ngram, stemed)

    #write results to the csv file
    with open('output.csv', mode='w', newline='') as csv_file:  # newline='' avoids blank rows on Windows
        fieldnames = ['Id', 'Category']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        correctnessCount = 0
        for line in testHeadLines.keys():
            #stop_words="english"
            vectorizerTest = CountVectorizer(lowercase=True, stop_words=stopEnglish, analyzer='word',
                                             ngram_range=(ngram, ngram), max_df=1.0, min_df=1, max_features=None)
            temp = []
            temp.append(testHeadLines[line]["Id"])
            #print(temp)
            vectorizerTest.fit(temp)  # learn this headline's vocabulary
            testindexDictOfWord = vectorizerTest.vocabulary_  # word -> column index
            test_bag_of_words = vectorizerTest.transform(temp)
            test_BoW = test_bag_of_words.toarray()  # bag-of-words count matrix
            test_uniqlistOfWords = vectorizerTest.get_feature_names()  # unique word list


            #print(testHeadLines[line]["Id"])
            result = naiveBayes(countOfRealsDict, BoWOfReal, countOfFakesDict, BoWOfFake, testindexDictOfWord, test_BoW)

            ## write results to csv
            writer.writerow({'Id': testHeadLines[line]["Id"], 'Category': result})


            #print(result)
            if result == testHeadLines[line]["Category"]:
                correctnessCount += 1
            #print("--------------------------------------")

        uniqWordsofFiles = list(set(list(countOfRealsDict.keys()) + list(countOfFakesDict.keys())))  # for Bayes calculations
        #print("uniq word Count: ", len(uniqWordsofFiles))  # feature names
        print("correctness count: ", correctnessCount)
        accuracy = calculationofAccuracy(correctnessCount, len(testHeadLines.keys()))
        print("Accuracy = ", accuracy)
        return [countOfRealsDict, countOfFakesDict, uniqlistOfRealWords, uniqlistOfFakeWords]
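
The naiveBayes function called above receives per-class word-count dictionaries; a standard way to turn such counts into class-conditional word probabilities is Laplace (add-one) smoothing. A hedged sketch of that calculation, not this project's actual naiveBayes:

def wordLikelihood(word, countsDict, vocabularySize):
    # Laplace-smoothed P(word | class): (count + 1) / (total words + |V|)
    totalWords = sum(countsDict.values())
    return (countsDict.get(word, 0) + 1) / (totalWords + vocabularySize)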
Example #8
def test10Fold():
    global allWords
    splits = tenFoldCrossValidation()

    # all five classifiers share the same train/test interface, so run the
    # ten folds once per model instead of repeating the same block five times
    models = [
        ("Naive Bayes", lambda: naiveBayes()),
        ("Random Forest", lambda: RandomForest(100)),
        ("Neural 5", lambda: neuralNetwork((5, ), 1000)),
        ("Neural 3", lambda: neuralNetwork((3, ), 1000)),
        ("SVM", lambda: svm()),
    ]

    for name, makeModel in models:
        count = 0
        total = 0
        print(name)
        for split in splits:
            model = makeModel()
            trainFeatures = [example.features for example in split.train]
            trainClasses = [example.klass for example in split.train]
            testFeatures = [example.features for example in split.test]
            testClasses = [example.klass for example in split.test]

            model.train(trainFeatures, trainClasses)
            model.test(testFeatures, testClasses)
            accuracy = model.getCorrectCount() / len(testClasses)
            total += accuracy
            print("[INFO]\tFold ", str(count), " Accuracy:", str(accuracy))
            count += 1

        print("[INFO]\tAccuracy:", str(total / 10))
Example #9
# assumed imports for this snippet: confusion_matrix matching scikit-learn's
# API, and this project's own naiveBayes class
import copy

import numpy as np
from sklearn.metrics import confusion_matrix

from naiveBayes import naiveBayes


def naiveBayesClassification(data):

    # setup the data

    # total length of dataset
    totalLen = len(data)

    # index of last element
    lastElement = len(data[0]) - 1

    # get the indices of all rows that are not spam (label 0)
    zeroesIndices = np.where(data[:, lastElement] == 0)[0]

    # get the indices of all spam rows (label 1)
    onesIndices = np.where(data[:, lastElement] == 1)[0]

    # store all non spam rows in a separate numpy array
    zeroes = copy.deepcopy(data[zeroesIndices, :])

    # store all spam rows in a separate numpy array
    ones = copy.deepcopy(data[onesIndices, :])

    # split train and test set 50% 50%
    # with equal amount of 0s and 1s

    # initialize test set
    testSet = np.empty((0, len(data[0])), dtype=np.float64)

    # stack half of the 0s and half of the 1s into the test set
    testSet = np.vstack((testSet, zeroes[0:len(zeroes) // 2, :]))
    testSet = np.vstack((testSet, ones[0:len(ones) // 2, :]))

    # initialize train set
    trainSet = np.empty((0, len(data[0])), dtype=np.float64)

    # stack the remaining 0s and 1s into the train set
    trainSet = np.vstack((trainSet, zeroes[len(zeroes) // 2:len(zeroes), :]))
    trainSet = np.vstack((trainSet, ones[len(ones) // 2:len(ones), :]))

    # extract labels for both sets
    trainLabels = trainSet[:, -1]
    testLabels = testSet[:, -1]

    # # both sets ignore the class column
    # trainSet= trainSet[:, :-1]
    # testSet= testSet[:, :-1]

    # total length of train and test set should
    # be equal to total dataset length
    assert (len(trainSet) + len(testSet) == len(data))

    # uncomment to test 40% 60% distribution
    # z = 0
    # o = 0
    # t = len(trainSet)
    # for i in trainSet:
    # 	print(i[len(i) - 1])
    # 	if i[len(i) - 1] == 0:
    # 		z+=1
    # 	if i[len(i) - 1] == 1:
    # 		o+=1
    # print('z/t', z/t)
    # print('o/t', o/t)
    # return None

    # get how many attributes there are in the given train set
    numAttributes = len(trainSet[0]) - 1

    # get the last column (class values) from the train set
    lastColumn = trainSet[:, len(trainSet[0]) - 1]

    # get only the unique values (eliminate duplicates)
    classes = set(lastColumn)

    # get how many classes there are for the given dataset
    # and create a naive bayes class with that number of classes,
    # classes, and number of attributes in the given set
    NB = naiveBayes(len(classes), numAttributes, classes)

    # train and predict using provided naive bayes code
    NB.train(trainSet)
    NB.trainingOutput()
    predictions = NB.classify(testSet)

    # compute confusion matrix and print it
    confusionMatrix = confusion_matrix(testLabels, predictions)
    print(confusionMatrix)
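
For reference, the overall accuracy can be read directly off the printed confusion matrix, since its diagonal holds the correctly classified counts:

    accuracy = np.trace(confusionMatrix) / confusionMatrix.sum()
    print('accuracy =', accuracy)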
Example #10
from naiveBayes import naiveBayes
from data import data

folds = 8

naiveBayesIris = naiveBayes()
print("Iris Dataset\n===================================\n")
# Get Iris data
irisData = data.read('./db/Iris/iris.csv')
# Train and test the model with Iris data
irisAccuracyScores = naiveBayesIris.crossval_predict(irisData["data"], irisData["labels"], folds)
# Calculate accuracy score
irisTotalAccuracy = 0.0
for i in irisAccuracyScores:
    irisTotalAccuracy += i
irisTotalAccuracy = irisTotalAccuracy / len(irisAccuracyScores)
print("\n\nTotal accuracy: " +
      str(round(irisTotalAccuracy * 100, 2)) + "%\n\n")


naiveBayesbanknote = naiveBayes()
print("Banknote Dataset\n===================================\n")
# Get banknote data
banknoteData = data.read('./db/banknote_authentication/banknote_authentication.csv')
# Train and test the model with banknote data
banknoteAccuracyScores = naiveBayesbanknote.crossval_predict(banknoteData["data"], banknoteData["labels"], folds)
# Calculate accuracy score
banknoteTotalAccuracy = 0.0
for i in banknoteAccuracyScores:
    banknoteTotalAccuracy += i
banknoteTotalAccuracy = banknoteTotalAccuracy / len(banknoteAccuracyScores)
print("\n\nTotal accuracy: " +
      str(round(banknoteTotalAccuracy * 100, 2)) + "%\n\n")
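
The per-fold averaging loops above can also be written in one line each, e.g.:

irisTotalAccuracy = sum(irisAccuracyScores) / len(irisAccuracyScores)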
Example #11
        #plt.plot(cArr, supportVectors, label='C vs Support Vectors')
        #plt.plot(epochs, devArr, label='Dev Error')
        #plt.title('Error Rates for Unaveraged and Averaged Perceptron')
        #plt.legend()
        #plt.xlabel('C')
        #plt.ylabel('Support Vectors')
        #plt.show()


    if algorithm == 2:
        # Run on Python 3
        knn_fit(npData, newTarget)

    if algorithm == 3:
        # Run on Python 3
        m = gradient_booster(npData, newTarget)

    if algorithm == 4:
        # Run on Python 3
        gnb = naiveBayes(npData, newTarget)

    if algorithm == 5:
        # Run on Python 3
        kbest = kbestfeatures(npData, newTarget)

    if algorithm == 6:
        # Run on Python 2
        model, prediction = logisticRegression(npData, newTarget)
Example #12
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 14 21:54:46 2018

@author: ssheh

Testing of the naiveBayes.py algorithm.
"""
from naiveBayes import naiveBayes

k = naiveBayes(shteguDataBaze="tabele_shembull.csv", AtributiKlase="play")
k.kalkulo_propabilitetin_AtributitKlase()
k.hipoteza = {
    "outlook": "sunny",
    "temp": "high",
    "humidity": "high",
    "windy": "true"
}

k.Kalkulo_propabilitetin_kushtezues(k.hipoteza)
k.klasifiko()
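
The klasifiko() call presumably applies the standard naive Bayes decision rule: score each class by its prior times the product of the conditional probabilities of the observed attribute values, then pick the argmax. A minimal sketch with hypothetical dictionary shapes (not this class's internals):

def classify(priors, conditionals, hypothesis):
    # priors: {class: P(class)}
    # conditionals: {class: {attribute: {value: P(value | class)}}}
    scores = {}
    for c, prior in priors.items():
        score = prior
        for attribute, value in hypothesis.items():
            score *= conditionals[c][attribute][value]
        scores[c] = score
    return max(scores, key=scores.get)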
Example #13
import pandas as pd
from naiveBayes import naiveBayes

luis = pd.read_csv("haha.csv")

luis.dropna(axis=0, inplace=True)
luis.drop(['ACTIVITY'], axis=1, inplace=True)

data = luis.values.tolist()

target = [row[-1] for row in data]

for row in data:
    del row[-1]

clf = naiveBayes(data, target)

data_train, target_train, data_test, target_test = clf.split_train_test(
    data, target, 0.7)  # use 70% of the data as training
# split class
# basically generates a dictionary where the key is a target class, and the values are the list of instances that fall in that class
splitClass = clf.classSplitter(data_train, target_train)

# get probabilities of each class appearing in the training targets
classProbs = clf.classProbabilities(target_train)
condiProbs = clf.get_conditional_probs(splitClass)

# alright, now we have trained the set.
# let's test the effectiveness
accuracy = clf.test_errors(data_train, target_train, data_test, target_test)
print(accuracy)
Example #14
# assumed setup for this fragment: bagOfWords, naiveBayes, preprocess and the
# sentence lists are defined earlier in the script
from time import time

programStart = time()

bowAll = bagOfWords(allSentencesList)
bowSpam = bagOfWords(spamSentencesList)
bowHam = bagOfWords(hamSentencesList)

# print(bowAll, bowSpam, bowHam, sep = "\n")

print()

count = 0
spamCount = 0
hamCount = 0

for testList in testSentencesList:
    res = naiveBayes(preprocess(testList[1]), bowAll, bowSpam, bowHam)
    # print(testList[0], res)
    if res == testList[0]:
        # show at most two correct ham and two correct spam examples
        if (res == "ham" and hamCount < 2) or (res == "spam" and spamCount < 2):
            print("Input:", testList[1], end="")
            print("Expected output:", testList[0])
            print("Predicted output:", res)
            print()
            if res == "ham":
                hamCount += 1
            else:
                spamCount += 1
        count += 1

programEnd = time()
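
Since count tracks correct predictions and the time() calls bracket the run, the script could finish by reporting accuracy and runtime, e.g.:

print("Accuracy:", count / len(testSentencesList))
print("Elapsed seconds:", programEnd - programStart)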
Example #15
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 14 21:54:46 2018

@author: ssheh

Testing of the naiveBayes.py algorithm.
"""
import csv
from naiveBayes import naiveBayes

#k = naiveBayes(shteguDataBaze = "Iris.csv", AtributiKlase = "Species" )
#k = naiveBayes(shteguDataBaze = "WaterBears.csv", AtributiKlase = "Species" )
k = naiveBayes(shteguDataBaze="TeDhenatTrajnuese.csv", AtributiKlase="Klasa")
k.kalkulo_propabilitetin_AtributitKlase()
#k.hipoteza = {"SepalLengthCm":"5.1","SepalWidthCm":"3.5","PetalLengthCm":"1.4","PetalWidthCm":"0.2"}
#k.hipoteza = {"SSI":"30.16","BTWa":"26.05","BTWs":"20.87","BTWp":"19.98","BTWL":"0.5","BTPA":"0.77"}
#k.hipoteza = {"Instanca1":"0","Instanca2":"1","Instanca3":"1","Instanca4":"2","Instanca5":"3","Instanca6":"5"}
with open('TeDhenatTestuese.csv') as testFile:
    reader = csv.DictReader(testFile)
    for row in reader:
        k.hipoteza = row
        k.Kalkulo_propabilitetin_kushtezues(k.hipoteza)
        k.klasifiko()
        print("\n\n")
Example #16
import sys

sys.path.append('/home/patcha/Dropbox/Doutorado/Codigos/Python/utils/')

import numpy as np
from dataManipulation import data
from naiveBayes import naiveBayes

# loading the data set
print('Loading the dataset...')

irisAll = np.genfromtxt('/home/patcha/Datasets/Iris/iris.csv', delimiter=',')

iris = data(dataset=irisAll,
            percTrain=0.7,
            percVal=0,
            percTest=0.3,
            normType=None,
            shuf=True,
            posOut='last',
            outBin=False)

print(iris)

nb = naiveBayes(iris.trainIn, iris.trainOut, iris.nClass)
splited = nb.splitByLabel()

stats = nb.statsByLabel(splited)

nb.getResult(stats, iris.testIn, iris.testOut)
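
splitByLabel and statsByLabel suggest a Gaussian naive Bayes over the continuous iris features: per-class means and standard deviations feeding a normal density. The standard likelihood such statistics support (the general formula, assumed rather than taken from this class):

import math

def gaussianLikelihood(x, mean, std):
    # P(x | class) under a normal distribution with the class's mean and std
    exponent = math.exp(-((x - mean) ** 2) / (2 * std ** 2))
    return exponent / (math.sqrt(2 * math.pi) * std)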
Example #17
# training data (90% of the whole set)
Xtr = myData.X[:dtrNum]
Ytr = myData.Y[:dtrNum]

# evaluation data (10% of the whole set)
Xte = myData.X[dtrNum:]
Yte = myData.Y[dtrNum:]
#-------------------

#-------------------
# 3. Train the naive Bayes model

# set the prior probabilities (uniform over the two classes)
priors = np.array([[0.5, 0.5]])

myModel = naiveBayes.naiveBayes(Xtr, Ytr, priors)
myModel.train()
#-------------------

#-------------------
# 4. Evaluate the naive Bayes model
print(f"Training accuracy: {np.round(myModel.accuracy(Xtr,Ytr),decimals=2)}")
print(f"Evaluation accuracy: {np.round(myModel.accuracy(Xte,Yte),decimals=2)}")
#-------------------

#-------------------
# 5. Write the prediction results to a CSV file
myModel.writeResult2CSV(
    Xtr,
    Ytr,
    fName=f"../results/naiveBayes_result_train_{myData.dataType}.csv")