Example #1
def predict(self, s):
    # Clean the input: keep letters and spaces only, then collapse terms
    obb = prepro.preprocess()
    s = re.sub('[^A-Za-z ]', '', s)
    s = obb.collapse_terms(s)
    # Tokenize and keep only in-vocabulary words
    input_ = word_tokenize(s)
    input_ = [w.lower() for w in input_ if w.lower() in self.vocab]
    # Naive Bayes: multiply the per-word likelihoods for each class
    pre_c1 = 1.0
    pre_c2 = 1.0
    for word in input_:
        pre_c1 *= self.prob_c1[word]
        pre_c2 *= self.prob_c2[word]
    # Compare posteriors (likelihood x class prior); return 1 if class 2 wins
    total = float(self.count_c1 + self.count_c2)
    if pre_c1 * (self.count_c1 / total) < pre_c2 * (self.count_c2 / total):
        return 1
    else:
        return 0
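
Multiplying many small per-word probabilities can underflow to zero on long inputs. A minimal sketch of the same decision in log space, assuming the same prob_c1/prob_c2 likelihood dicts and count_c1/count_c2 class counts (the names mirror the example above; the log-space variant itself is an assumption, not the author's code):

import math

def predict_log(self, tokens):
    # tokens: the filtered, lower-cased token list from above
    total = float(self.count_c1 + self.count_c2)
    log_c1 = math.log(self.count_c1 / total)
    log_c2 = math.log(self.count_c2 / total)
    for w in tokens:
        # Summing logs is equivalent to multiplying the raw probabilities
        log_c1 += math.log(self.prob_c1[w])
        log_c2 += math.log(self.prob_c2[w])
    return 1 if log_c1 < log_c2 else 0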
Example #2
def on_post(self, req, resp):
    # Expect a JSON body of the form {"texts": [{"id": ..., "text": ...}, ...]}
    items = req.media.get('texts')
    results = []
    for item in items:
        tid = item.get('id')
        text = item.get('text')
        try:
            prediction = self.classifier.predict(preprocess(text), self.k)
        except Exception as e:
            self.logger.error('Error occurred during prediction: {}'.format(e))
            raise falcon.HTTPInternalServerError(
                title='Internal server error',
                description='The service was unavailable. Please try again later.')
        # Strip the 9-character label prefix (likely fastText's '__label__')
        scores = {}
        for label, score in prediction:
            scores[label[9:]] = score
        results.append({'id': tid, 'scores': scores})
    resp.media = results
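
A request to this handler might look like the following. The route path and port are assumptions, since the snippet only shows the resource class, and the label names in the response are illustrative:

import requests  # hypothetical client; the URL and route are not part of the example

payload = {'texts': [{'id': 1, 'text': 'win a free prize now'},
                     {'id': 2, 'text': 'meeting at noon tomorrow'}]}
resp = requests.post('http://localhost:8000/predict', json=payload)
# Expected shape: [{'id': 1, 'scores': {'spam': 0.97}}, {'id': 2, ...}]
print(resp.json())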
Example #3
from prepro import preprocess
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize

# Global data shared by the functions below
data_train, data_test, features = np.array(preprocess()).ravel()

# Prior probabilities of spam and ham
pspam = 0.0
pham = 0.0

# Per-word counts (one slot per class) used to estimate the likelihoods
counter_words = dict((key, [0, 0]) for key in features)

# Per-word likelihoods P(word | spam) and P(word | ham)
prob_spam = dict((key, 0.0) for key in features)
prob_ham = dict((key, 0.0) for key in features)


def prior_prob():
    # Estimate the priors P(spam) and P(ham) from the training labels.
    # Without the global declaration the assignments below would only
    # create locals, leaving the module-level pspam/pham at 0.0.
    global pspam, pham
    total = data_train.shape[0]
    spamcounter = 0
    hamcounter = 0
    for value in data_train['CLASS']:
        if value:
            spamcounter += 1
        else:
            hamcounter += 1
    pspam = spamcounter / float(total)
    pham = hamcounter / float(total)
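
The module declares counter_words, prob_spam, and prob_ham, but the excerpt ends before they are filled. A minimal sketch of how the likelihoods could be estimated, assuming the 'CLASS'/'CONTENT' columns seen in the other examples, that slot 0 counts spam, and add-one (Laplace) smoothing (the original project may smooth differently):

def word_likelihoods():
    # Count each feature word once per occurrence, per class
    for _, row in data_train.iterrows():
        slot = 0 if row['CLASS'] else 1  # assumed: slot 0 = spam, slot 1 = ham
        for w in word_tokenize(row['CONTENT']):
            if w in counter_words:
                counter_words[w][slot] += 1
    total_spam = sum(c[0] for c in counter_words.values())
    total_ham = sum(c[1] for c in counter_words.values())
    v = len(features)  # vocabulary size for the smoothing denominator
    for w, (sc, hc) in counter_words.items():
        prob_spam[w] = (sc + 1) / float(total_spam + v)
        prob_ham[w] = (hc + 1) / float(total_ham + v)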
Example #4
        print("Confusion matrix : ")
        mat = confusion_matrix(self.test_data['CLASS'], op)
        acc = accuracy_score(self.test_data['CLASS'], op)
        fs = f1_score(self.test_data['CLASS'], op)
        print(mat)
        print("Accuracy : " + str(acc * 100) + "%")
        print("F-Score : " + str(fs))
        print("MCC : " + str(matthews_corrcoef(self.test_data['CLASS'], op)))

    def run(self):
        # Evaluate the trained classifier on the whole dataset
        op = []
        for idx, d in self.data.iterrows():
            op.append(self.predict(d['CONTENT']))
        print("Result on Whole Dataset : ")
        print("Confusion matrix : ")
        mat = confusion_matrix(self.data['CLASS'], op)
        acc = accuracy_score(self.data['CLASS'], op)
        fs = f1_score(self.data['CLASS'], op)
        print(mat)
        print("Accuracy : " + str(acc * 100) + "%")
        print("F-Score : " + str(fs))
        print("MCC : " + str(matthews_corrcoef(self.data['CLASS'], op)))


if __name__ == "__main__":
    # Read and clean the data, then train and evaluate the classifier
    ob1 = prepro.preprocess(['data'], ['CLASS', 'CONTENT'])
    data = ob1.read_and_clean()
    ob2 = NBC(data)
    ob2.train()
    ob2.test_run()
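
As a reading aid for the printed matrices: with binary labels, sklearn's confusion_matrix sorts the classes in ascending order, so rows are the true class and columns the predicted class. A tiny illustrative check (the numbers are made up, not results from this project):

from sklearn.metrics import confusion_matrix

# [[TN, FP],
#  [FN, TP]] for labels 0 (ham) and 1 (spam)
print(confusion_matrix([0, 0, 1, 1], [0, 1, 1, 1]))  # [[1 1]
                                                     #  [0 2]]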
Example #5
from prepro import preprocess
from NB import NBC

if __name__ == "__main__":
    file_name = 'data'

    # Preprocessing object: reads, cleans, and transforms the data
    process = preprocess(file_name)
    data = process.read_and_clean()

    # Naive Bayes classifier; split_size=0.2 holds out 20% of the data for testing
    classifier = NBC(data, split_size=0.2)
    classifier.train()
    classifier.test_run()
    classifier.run()
Example #6
from nltk.stem.porter import PorterStemmer
import prepro as p
import out as o

file_name = input(
    "Please enter the name of the Queries file (you can also enter its full path): ")
query = open(file_name, "r")

# The following snippet tokenizes the query data and computes tf-idf values for it.
query_line = query.readline()
query_data = dict()  # maps query number -> list of preprocessed tokens
count = 1

while query_line:
    # Tokenize and clean one query line at a time
    k = p.preprocess(query_line)
    query_data.update({count: k})
    query_line = query.readline()
    count += 1

qindex = dict()  # maps each term to {query number: term frequency}
for key, value in query_data.items():
    for each in value:
        if each not in qindex:
            qindex[each] = {key: 1}
        elif key not in qindex[each]:
            qindex[each][key] = 1
        else:
            qindex[each][key] += 1
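
The comment above promises tf-idf values, but the excerpt stops after the raw term-frequency counts. A minimal sketch of the remaining step, assuming the common log-scaled idf over the number of queries (the original out module may define the weighting differently):

import math

n_queries = len(query_data)
tfidf = dict()  # tfidf[term][query number] = tf * idf
for term, postings in qindex.items():
    # idf: log of total queries over the number of queries containing the term
    idf = math.log(n_queries / float(len(postings)))
    tfidf[term] = {qid: tf * idf for qid, tf in postings.items()}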