Example #1
def train( data_dir, outfile, test_pct = 0.3, verbose = True ):
    '''
    Train a naive Bayes classifier using maximum-likelihood estimates.
    If test_pct > 0, hold out roughly that fraction of the data files
    from each class and use them to report evaluation statistics.
    '''
    labels = os.listdir( data_dir )
    if verbose:
        print(labels)
    nbc = NaiveBayesClassifier( labels )

    def load_label_dir( ddir ):
        '''
        load all datafiles within single directory
        '''
        fnames = os.listdir( os.path.join( data_dir, ddir ) )
        def w0(f):
            ''' worker function (full path) '''
            return load_words( os.path.join( data_dir, ddir, f ) )
        return [ (fname, w0(fname) ) for fname in fnames ]

    te_dl = []
    n_tr = 0
    for label in labels:
        for (f, ws) in load_label_dir( label ):
            if random.random() < test_pct:
                te_dl.append( (f, ws, label) )
            else:
                nbc.add_example( label, ws )
                n_tr += 1
    
    #if we've picked a test set, use it
    def show_stats(s, lbl = None):
        ''' display F1 stats '''
        f1 = s['F1'] * 100.0
        pr = s['precision'] * 100.0
        rc = s['recall'] * 100.0
        if lbl is not None:
            print('%s : F1=%f precision=%f recall=%f' % ( lbl, f1, pr, rc) )
        else:
            print('F1=%f precision=%f recall=%f' % ( f1, pr, rc) )
    if test_pct > 0.0:
        preds = []
        obs = []
        for (f, ws, l) in te_dl:
            obs.append(l)
            c = nbc.classify( ws )
            preds.append(c)
        sts = calc_F1( preds, obs )
        show_stats( sts['overall'] )
        for l in labels:
            show_stats( sts[l], l )
    #store trained classifier
    store_classifier( outfile, nbc )
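A typical invocation, assuming data_dir holds one subdirectory per class label, each containing that class's text files (the paths below are illustrative only):

# 'data/' would contain e.g. data/spam/... and data/ham/... text files
train('data/', 'model.json', test_pct=0.2)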
Example #2
    def test_fit(self):
        print("test_fit")
        nb = NaiveBayesClassifier()
        Xis = np.array([[3, 4], [2, 3]])
        yis = [0, 1]
        nb.prior_probs = np.zeros(2, dtype=np.float64)
        # Verify the priors start at zero and that fit() updates them to the
        # known hand-computed values (one sample per class -> 0.5 each).
        self.assertEqual(nb.prior_probs[0], 0)
        self.assertEqual(nb.prior_probs[1], 0)
        self.assertEqual(nb.fit(Xis, yis), None)
        self.assertEqual(nb.prior_probs[0], 0.5)
        self.assertEqual(nb.prior_probs[1], 0.5)
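For context, a minimal fit that would pass this test just stores the class priors as relative frequencies; this is a sketch inferred from the tests (the attribute names in_classes, mean, and variance come from the other test cases, not from a shown implementation):

import numpy as np

class NaiveBayesClassifier:
    def fit(self, X, y):
        y = np.asarray(y)
        # priors: relative frequency of each class label in y
        self.in_classes, counts = np.unique(y, return_counts=True)
        self.prior_probs = counts / counts.sum()
        # per-class feature statistics for a Gaussian likelihood
        self.mean = [X[y == c].mean(axis=0) for c in self.in_classes]
        self.variance = [X[y == c].var(axis=0) for c in self.in_classes]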
Example #4
    def test_prob_density_function(self):
        print("test_prob_density_function")
        # Create a classifier instance for unit testing
        nb = NaiveBayesClassifier()
        nb.mean = [1]
        nb.variance = [3]
        # Check the probability density against hand-computed values.
        self.assertAlmostEqual(nb.prob_den_func(0, 3), 0.11825507)
        nb.mean = [1, 2]
        nb.variance = [3, 1]
        self.assertAlmostEqual(nb.prob_den_func(1, 0.1), 0.06561581)
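Both asserted values match a univariate Gaussian density evaluated with the mean and variance of the class selected by the first argument. A sketch that reproduces them (inferred from the test, not the shown source):

import numpy as np

class NaiveBayesClassifier:
    def prob_den_func(self, class_idx, x):
        # Gaussian pdf: exp(-(x - mean)^2 / (2*var)) / sqrt(2*pi*var)
        mean = self.mean[class_idx]
        var = self.variance[class_idx]
        return np.exp(-((x - mean) ** 2) / (2 * var)) / np.sqrt(2 * np.pi * var)

For instance, with mean 1 and variance 3 at x = 3 this gives exp(-4/6) / sqrt(6*pi) ≈ 0.118255, matching the first assertion.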
Example #5
    def test_pred(self):
        print("test_pred")
        nb = NaiveBayesClassifier()
        nb.mean = [1, 2]
        nb.variance = [3, 1]
        # Untrained model (no classes/priors set): predict should return None.
        self.assertEqual(nb.predict([1.4, 12, 3, 9]), None)
        self.assertEqual(nb.predict([2, 3, 4, 5]), None)
        self.assertEqual(nb.predict([1]), None)
        self.assertEqual(nb.predict([2, -3, 4, -5, -7, -7]), None)
        # Simulate a trained model and check the per-sample predictions.
        nb.mean = [1, 1]
        nb.variance = [1, 5]
        nb.in_classes = [1, 0]
        nb.prior_probs = [0.3, 0.2]
        pred = nb.predict([5, 3, 4, 7])
        self.assertEqual(pred[0], [0])
        self.assertEqual(pred[1], [0])
        self.assertEqual(pred[2], [0])
        self.assertEqual(pred[3], [0])
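The assertions imply that each scalar input is scored as a one-feature sample and that predict returns, per sample, a one-element list holding the class that maximizes prior times Gaussian likelihood; with the values above, the broad-variance class 0 wins every time. A sketch consistent with that behavior (inferred, not the tested source):

import numpy as np

class NaiveBayesClassifier:
    def predict(self, samples):
        # untrained model: class list and priors are missing
        if getattr(self, 'in_classes', None) is None:
            return None
        predictions = []
        for x in samples:
            # unnormalized posterior per class: prior * N(x; mean, variance)
            posteriors = [prior * self.prob_den_func(i, x)
                          for i, prior in enumerate(self.prior_probs)]
            predictions.append([self.in_classes[int(np.argmax(posteriors))]])
        return predictions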
Example #6
from NaiveBayes import NaiveBayesClassifier  # the class under test

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt  # unused in this snippet


# Accuracy: fraction of predictions that match the actual labels
def accuracy(y_actual, yhat):
    return np.sum(y_actual == yhat) / len(y_actual)


X, y = datasets.make_classification(n_samples=1000,
                                    n_features=10,
                                    n_classes=2,
                                    random_state=123)
# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123)

nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)  #fitting the model
predictions = nb.predict(X_test)  #predicting on the test set

print("Accuracy", accuracy(y_test, predictions))  #calculating the accuracy
Example #7
x_train, x_test, y_train, y_test = train_test_split(car_data, car_targets, test_size=0.3)

classifier = GaussianNB()
classifier.fit(x_train, y_train)
predicted = classifier.predict(x_test)

total_correct = 0
total = 0
for i in range(len(predicted)):
    if np.array_equal(predicted[i], y_test[i]):
        total_correct += 1
    total += 1
print("Accuracy of existing classifier: " + str(total_correct/total))


classifier = NaiveBayesClassifier()
# x_train = [["comedy", "deep", "yes"],
#            ["comedy", "shallow", "yes"],
#            ["drama", "deep", "yes"],
#            ["drama", "shallow", "no"],
#            ["comedy", "deep", "no"],
#            ["comedy", "shallow", "no"],
#            ["drama", "deep", "no"]]
# y_train = ["low", "high", "high", "low", "high", "high", "low"]

classifier.fit(x_train, y_train)
predicted = classifier.predict(x_test)

total_correct = 0
total = 0
for i in range(len(predicted)):
    if np.array_equal(predicted[i], y_test[i]):
        total_correct += 1
    total += 1
print("Accuracy of custom classifier: " + str(total_correct / total))
Example #8
arff.entropy_discretize_numerics("class", gain_threshold=GAIN_THRESHOLD)
print("DONE PREPARING DATA")

validation_results = []
for run_num in range(1, NUMBER_OF_TRIALS + 1):
    print("SELECTING TRAINING DATA")
    # We just need to remove a random 10% of records from arff.data
    ten_percent = len(arff.data) // 10
    training_records = []
    for i in range(ten_percent):
        index = random.randrange(len(arff.data))
        training_records.append(arff.data.pop(index))
    print("DONE SELECTING TRAINING DATA")

    print("BUILDING MODEL")
    nb = NaiveBayesClassifier(arff)
    nb.build_model("class")
    print("DONE BUILDING MODEL")

    # one confusion matrix (tp/tn/fp/fn counts) per class value
    confusion_matrices = {}
    for core_value in arff.attributes[arff.attr_position["class"]][1]:
        confusion_matrices[core_value] = {"tp": 0, "tn": 0, "fp": 0, "fn": 0}

    for record in test_records:
        classification = nb.classify_record(record)
        actual = record[arff.attr_position["class"]]
        if classification == actual:
            # correct: tp for the actual class, tn for every other class
            for core_value in confusion_matrices:
                if core_value == actual:
                    confusion_matrices[core_value]["tp"] += 1
                else:
                    confusion_matrices[core_value]["tn"] += 1
        else:
            # wrong: fp for the predicted class, fn for the actual class
            confusion_matrices[classification]["fp"] += 1
            confusion_matrices[actual]["fn"] += 1
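The snippet is cut off here; a plausible continuation (a sketch, not the original code) reduces each class's counts to precision and recall and records them in validation_results:

    run_stats = {}
    for core_value, cm in confusion_matrices.items():
        # guard against zero denominators for classes never predicted/present
        p_den = cm["tp"] + cm["fp"]
        r_den = cm["tp"] + cm["fn"]
        run_stats[core_value] = {
            "precision": cm["tp"] / p_den if p_den else 0.0,
            "recall": cm["tp"] / r_den if r_den else 0.0,
        }
    validation_results.append(run_stats)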
Example #9
class SentimentAnalyzer:
    threshold = 1

    def __init__(self, file_location):
        self.features = set()
        raw_data = []
        training_data = []
        word_freq = {}
        with open(file_location, 'r', newline='') as data:
            data_reader = csv.DictReader(data)
            for row in data_reader:
                h_tokens = nltk.word_tokenize(row['headline'].lower())

                # count token frequencies across the whole corpus
                for token in h_tokens:
                    word_freq[token] = word_freq.get(token, 0) + 1

                # one (tokens, emotion index, intensity) example per emotion
                emotion_columns = [' anger', ' disgust', ' fear',
                                   ' joy', ' sadness', ' surprise']
                for idx, col in enumerate(emotion_columns):
                    raw_data.append((h_tokens, idx, float(row[col]) / 100))

        # keep only tokens seen more often than the frequency threshold
        for key, freq in word_freq.items():
            if freq > self.threshold:
                self.features.add(key)

        print("F-vec size: " + str(len(self.features)))

        # binary bag-of-words feature vector for each example
        for data in raw_data:
            f_vector = [1 if f in data[0] else 0 for f in self.features]
            training_data.append((f_vector, data[1], data[2]))

        self.classifier = NaiveBayesClassifier(6, len(self.features))
        self.classifier.train(training_data)

    def predict(self, text):
        token_set = set(nltk.word_tokenize(text.lower()))
        f_vector = [1 if f in token_set else 0 for f in self.features]
        return self.classifier.predict(f_vector)

    def predict_all(self, text):
        token_set = set(nltk.word_tokenize(text.lower()))
        f_vector = [1 if f in token_set else 0 for f in self.features]
        return self.classifier.predict_all(f_vector)

    def test(self, test_file_location):
        with open(test_file_location, 'r', newline='') as test_data:
            test_reader = csv.DictReader(test_data)
            total = 0
            correct = 0

            for row in test_reader:
                total += 1
                emotions = [float(row[col]) for col in
                            (' anger', ' disgust', ' fear',
                             ' joy', ' sadness', ' surprise')]
                # an emotion is acceptable if its score exceeds 1;
                # keep the top three such emotions by score
                acceptable_emotions = [i for i, e in enumerate(emotions) if e > 1]
                acceptable_emotions = sorted(acceptable_emotions,
                                             reverse=True,
                                             key=lambda x: emotions[x])[:3]

                prediction = self.predict(row['headline'])[0]
                if prediction in acceptable_emotions:
                    correct += 1

        return correct / total
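Typical use of the class, assuming training and test CSVs with the 'headline' and per-emotion columns shown above (the file names are illustrative):

analyzer = SentimentAnalyzer('train_headlines.csv')
print("test accuracy:", analyzer.test('test_headlines.csv'))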
Example #10

# (This snippet starts mid-script: `data` is a list of (subject, is_spam)
# pairs being collected while looping over message files, with `fn` the
# current file's path and `is_spam` its label.)
    with open(fn, 'r') as file:
        for line in file:
            if line.startswith("Subject:"):
                subject = re.sub(r"^Subject:", "", line).strip()
                data.append((subject, is_spam))


def split_data(data, p):
    '''split data into two lists: the first fraction p and the rest'''
    cut = int(len(data) * p)
    return data[:cut], data[cut:]


def in_random_order(data):
    '''return a shuffled copy of data, leaving the original list untouched'''
    shuffled = list(data)
    random.shuffle(shuffled)
    return shuffled


random.seed(0)
train_data, test_data = split_data(in_random_order(data), 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)

classified = [(message, is_spam, classifier.classify(message))
              for message, is_spam in test_data]

counts = Counter(
    (is_spam, spam_prob > 0.5) for (_, is_spam, spam_prob) in classified)
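Since counts maps (actually spam, flagged as spam) pairs to message counts, precision and recall fall out directly; a short follow-up sketch (not part of the original snippet):

tp = counts[(True, True)]     # spam correctly flagged
fp = counts[(False, True)]    # ham wrongly flagged as spam
fn = counts[(True, False)]    # spam that slipped through

precision = tp / (tp + fp)
recall = tp / (tp + fn)
print("precision=%.3f recall=%.3f" % (precision, recall))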
Example #11
def load_classifier( infile ):
    # read the stored JSON representation and rebuild the classifier
    with open(infile, 'r') as inp:
        jd = json.load(inp)
    return NaiveBayesClassifier.unfold_classifier( jd )
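Example #1 stores the trained classifier with a store_classifier helper that is not shown. A minimal counterpart to load_classifier, assuming the class exposes a fold_classifier method inverse to unfold_classifier (that method name is an assumption):

def store_classifier( outfile, nbc ):
    # fold_classifier is assumed to serialize the classifier's state
    # into a JSON-compatible dict (the inverse of unfold_classifier)
    jd = nbc.fold_classifier()
    with open(outfile, 'w') as outp:
        json.dump(jd, outp)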