Exemplo n.º 1
0
 def test_pred(self):
     """Check predict() on a partially configured and a fully configured model."""
     print("test_pred")
     classifier = NaiveBayesClassifier()

     # Only mean/variance are assigned here (no classes, no priors); the
     # test expects predict() to hand back None for every input.
     classifier.mean = [1, 2]
     classifier.variance = [3, 1]
     untrained_inputs = (
         [1.4, 12, 3, 9],
         [2, 3, 4, 5],
         [1],
         [2, -3, 4, -5, -7, -7],
     )
     for sample in untrained_inputs:
         self.assertEqual(classifier.predict(sample), None)

     # Assign every model attribute by hand to simulate a trained model.
     classifier.mean = [1, 1]
     classifier.variance = [1, 5]
     classifier.in_classes = [1, 0]
     classifier.prior_probs = [0.3, 0.2]
     pred = classifier.predict([5, 3, 4, 7])

     # Each of the first four entries of the prediction must equal [0].
     for position in range(4):
         self.assertEqual(pred[position], [0])
Exemplo n.º 2
0
from NaiveBayes import NaiveBayesClassifier
#importing the class

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt


def accuracy(y_actual, yhat):
    """Return the fraction of predictions that match the true labels.

    Args:
        y_actual: Sequence or array of true labels.
        yhat: Sequence or array of predicted labels, aligned with y_actual.

    Returns:
        The number of positions where both agree divided by len(y_actual).
    """
    # Coerce to arrays so `==` compares element by element; on plain lists
    # it would collapse to a single bool for the whole sequence.
    matches = np.asarray(y_actual) == np.asarray(yhat)
    return np.sum(matches) / len(y_actual)


# Generate a synthetic two-class problem: 1000 samples with 10 features each.
X, y = datasets.make_classification(
    n_samples=1000, n_features=10, n_classes=2, random_state=123
)

# Keep 20% of the samples aside for evaluation; the fixed random_state makes
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Fit on the training split, then predict labels for the held-out split.
nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

# Report the share of held-out samples that were labelled correctly.
test_accuracy = accuracy(y_test, predictions)
print("Accuracy", test_accuracy)
Exemplo n.º 3
0
class SentimentAnalyzer:
    """Bag-of-words emotion classifier for headlines.

    Headlines are tokenized with nltk, turned into binary presence vectors
    over the learned vocabulary (``self.features``) and passed to a
    ``NaiveBayesClassifier`` with six classes, one per emotion column.
    """

    # A token becomes a feature only if it occurs more than `threshold`
    # times across all training headlines.
    threshold = 1

    # CSV columns holding the emotion scores (the leading space is part of
    # the header names). The position in this tuple is the class index.
    _EMOTION_COLUMNS = (' anger', ' disgust', ' fear', ' joy', ' sadness',
                        ' surprise')

    def __init__(self, file_location):
        """Build the vocabulary from a CSV file and train the classifier.

        Args:
            file_location: Path to a CSV file with a 'headline' column and
                the six emotion columns listed in ``_EMOTION_COLUMNS``.
        """
        self.features = set([])
        headlines = []  # one (token set, [six weights]) pair per CSV row
        word_freq = {}

        # 'rb' is what the Python 2 csv module expects.
        # NOTE(review): under Python 3 csv needs a text-mode file
        # (newline='') -- confirm the target interpreter before changing.
        with open(file_location, 'rb') as data:
            for row in csv.DictReader(data):
                h_tokens = nltk.word_tokenize(row['headline'].lower())
                for token in h_tokens:
                    word_freq[token] = word_freq.get(token, 0) + 1
                # Scores are divided by 100 before being used as weights;
                # presumably they are on a 0-100 scale -- TODO confirm.
                weights = [float(row[column]) / 100
                           for column in self._EMOTION_COLUMNS]
                headlines.append((set(h_tokens), weights))

        for token, count in word_freq.items():
            if count > self.threshold:
                self.features.add(token)

        print("F-vec size: " + str(len(self.features)))

        # One training sample per (headline, emotion) pair. The vector is
        # computed once per headline and copied so samples share no list.
        training_data = []
        for token_set, weights in headlines:
            f_vector = self._vectorize(token_set)
            for label, weight in enumerate(weights):
                training_data.append((list(f_vector), label, weight))

        self.classifier = NaiveBayesClassifier(len(self._EMOTION_COLUMNS),
                                               len(self.features))
        self.classifier.train(training_data)

    def _vectorize(self, tokens):
        """Return a 0/1 list marking which features occur in `tokens`."""
        # Iterating an unmodified set always yields the same order within a
        # process, so vectors built here line up with each other.
        return [1 if f in tokens else 0 for f in self.features]

    def _text_to_vector(self, text):
        """Tokenize lower-cased `text` and return its feature vector."""
        return self._vectorize(set(nltk.word_tokenize(text.lower())))

    def predict(self, text):
        """Return ``self.classifier.predict`` for the vector of `text`."""
        return self.classifier.predict(self._text_to_vector(text))

    def predict_all(self, text):
        """Return ``self.classifier.predict_all`` for the vector of `text`."""
        return self.classifier.predict_all(self._text_to_vector(text))

    def test(self, test_file_location):
        """Score the classifier against a labelled CSV file.

        A row counts as correct when the first element of ``predict()`` is
        among the row's (at most) three highest-scoring emotions whose score
        is strictly greater than 1.

        Args:
            test_file_location: Path to a CSV file with the same columns as
                the training file.

        Returns:
            The fraction of rows counted as correct, or 0.0 when the file
            contains no rows.
        """
        total = 0
        correct = 0

        # See the note in __init__ about the 'rb' mode.
        with open(test_file_location, 'rb') as test_data:
            for row in csv.DictReader(test_data):
                total += 1
                emotions = [float(row[column])
                            for column in self._EMOTION_COLUMNS]
                candidates = [index for index, score in enumerate(emotions)
                              if score > 1]
                acceptable_emotions = sorted(candidates,
                                             reverse=True,
                                             key=lambda x: emotions[x])[:3]

                prediction = self.predict(row['headline'])[0]
                if prediction in acceptable_emotions:
                    correct += 1

        # Avoid dividing by zero when the file had no data rows.
        if total == 0:
            return 0.0
        return float(correct) / total