def run(self):
        # Without a dev set: evaluate each n-gram configuration directly
        # and print the results to stdout.
        if not self.usedev:
            for grams in self.allgrams:
                c = NaiveBayesClassifier(self.rawfname,
                                         grams=grams)
                c.trainClassifier()
                self.stdout = True
                self.evaluate(c)
            return

        # With a dev set: grid-search over n-gram configurations, smoothing
        # weights, and every (neg, pos) threshold pair, collecting results.
        for grams in self.allgrams:
            c = NaiveBayesClassifier(self.rawfname,
                                     grams=grams)
            c.trainClassifier()

            for w in self.allweights:
                c.setWeight(w)

                for t1 in self.allthresholds:
                    for t2 in self.allthresholds:
                        c.setThresholds(neg=t1, pos=t2)
                        cinfo, accpos, accneg, accall, corrall = self.evaluate(c)
                        self.results.append([cinfo, accpos, accneg,
                                             accall, corrall])

        if self.csvout:
            self.flushToCSV()
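The dev-set branch above is a brute-force grid search, so its cost is the product of the four loop sizes. A quick sanity check with hypothetical search-space sizes (the values below are illustrative, not from the snippet):

# evaluate() runs once per combination in the dev-set branch,
# i.e. |allgrams| * |allweights| * |allthresholds|**2 times.
allgrams = [[1], [2], [1, 2]]
allweights = [0.05, 0.005, 0.0005]
allthresholds = [1.0, 5.0, 10.0, 20.0]
print(len(allgrams) * len(allweights) * len(allthresholds) ** 2)  # 144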
Example No. 2
def assignment_e_naivebayes_2():
    import math
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryDocument, InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier

    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Replicate Example 13.1 on pages 241 and 242 in the textbook.
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0,
                                        {"body": "Chinese Beijing Chinese"}))
    china.add_document(
        InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0,
                                            {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)
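The asserted values can be checked by hand: with add-one (Laplace) smoothing, the "china" class has 8 tokens over 3 documents, the "not china" class has 3 tokens over 1 document, and the shared vocabulary has 6 terms. A minimal sketch of that arithmetic:

# P(Chinese|china) = (5+1)/(8+6) = 3/7; P(Tokyo|china) = P(Japan|china) = 1/14.
# P(Chinese|not china) = P(Tokyo|not china) = P(Japan|not china) = (1+1)/(3+6) = 2/9.
# Priors: P(china) = 3/4, P(not china) = 1/4.
p_china = (3 / 4) * (3 / 7) ** 3 * (1 / 14) * (1 / 14)
p_not_china = (1 / 4) * (2 / 9) ** 5
print(round(p_china, 5), round(p_not_china, 5))  # 0.0003 0.00014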
Example No. 3
def classify_tweets(request):
    # Placeholder credentials: supply your own Twitter API keys here rather
    # than hardcoding real secrets in source control.
    consumer_key = "YOUR_CONSUMER_KEY"
    consumer_secret = "YOUR_CONSUMER_SECRET"
    access_token = "YOUR_ACCESS_TOKEN"
    access_token_secret = "YOUR_ACCESS_TOKEN_SECRET"
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    query = request.POST.get('query')
    result = api.search(query)
    tweets = []
    classification = []
    for tweet in result:
        try:
            tweets.append(str(tweet.text))
        except UnicodeEncodeError:
            pass

    posScore = 0
    negScore = 0
    for tweet in tweets:
        # Strip noise words, @-mentions, and links before classifying.
        tokens = tweet.split()
        data_preprocess.remove_noise_words(tokens)
        data_preprocess.remove_names(tokens)
        data_preprocess.remove_links(tokens)

        # Pull per-category document and token totals from the database.
        tweet_counts = []
        token_counts = []
        category_counts = defaultdict(lambda: defaultdict(int))
        p = tweet_category_count.objects.get(id=1)
        tweet_counts.append(p.positive_count)
        tweet_counts.append(p.negative_count)
        p = token_category_count.objects.get(id=1)
        token_counts.append(p.positive_count)
        token_counts.append(p.negative_count)

        # Per-token counts for each category; unseen tokens get 0.
        for token in tokens:
            try:
                p = pos_tokens.objects.get(ptoken=token)
                category_counts[token]['pos'] = p.pcount
            except pos_tokens.DoesNotExist:
                category_counts[token]['pos'] = 0
        for token in tokens:
            try:
                p = neg_tokens.objects.get(ntoken=token)
                category_counts[token]['neg'] = p.ncount
            except neg_tokens.DoesNotExist:
                category_counts[token]['neg'] = 0

        classifier = NaiveBayesClassifier()
        result = classifier.classify(tokens, category_counts, tweet_counts, token_counts)
        if result == 'pos':
            posScore += 1
        else:
            negScore += 1
        classification.append(result)

    return render_to_response("index.html",
                              {'tweets': tweets, 'pos_neg': classification,
                               'posScore': posScore, 'negScore': negScore})
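The NaiveBayesClassifier used here is not shown in the snippet. A minimal sketch of what its classify method could look like, given the count structures passed in, is below; this is a Laplace-smoothed multinomial Naive Bayes, and every name and structural assumption (list ordering, vocabulary proxy) is an assumption, not the original implementation:

import math

class NaiveBayesClassifier:
    # Hypothetical sketch: category_counts[token][cat] is the token's count
    # in that category, tweet_counts = [pos_docs, neg_docs], and
    # token_counts = [pos_tokens, neg_tokens]. Assumes nonzero doc counts.
    def classify(self, tokens, category_counts, tweet_counts, token_counts):
        total_docs = sum(tweet_counts)
        vocab = max(len(category_counts), 1)  # rough vocabulary-size proxy
        scores = {}
        for i, cat in enumerate(('pos', 'neg')):
            score = math.log(tweet_counts[i] / total_docs)  # log prior
            for token in tokens:
                count = category_counts[token][cat]
                # Add-one (Laplace) smoothing keeps unseen tokens nonzero.
                score += math.log((count + 1) / (token_counts[i] + vocab))
            scores[cat] = score
        return max(scores, key=scores.get)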
Example No. 4
def assignment_e_naivebayes_1():
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier

    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {
        language: InMemoryCorpus("data/" + language + ".txt")
        for language in ["en", "no", "da", "de"]
    }

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    for (buffer, language) in [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.",
         "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de")
    ]:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == language
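The scores collected above are kept in log space for a reason: multiplying many small per-token probabilities underflows floating point quickly, while summing their logs stays representable. A quick illustration:

import math

# 2000 tokens at probability 1/14 each: the raw product underflows to 0.0,
# while the log-space sum remains a perfectly usable score.
p = 1 / 14
print(p ** 2000)           # 0.0 (underflow)
print(2000 * math.log(p))  # -5278.1... (usable)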
Example No. 5
 def test_china_example_from_textbook(self):
     import math
     from corpus import InMemoryDocument, InMemoryCorpus
     from naivebayesclassifier import NaiveBayesClassifier
     china = InMemoryCorpus()
     china.add_document(
         InMemoryDocument(0, {"body": "Chinese Beijing Chinese"}))
     china.add_document(
         InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
     china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
     not_china = InMemoryCorpus()
     not_china.add_document(
         InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
     training_set = {"china": china, "not china": not_china}
     classifier = NaiveBayesClassifier(training_set, ["body"],
                                       self._normalizer, self._tokenizer)
     results = []
     classifier.classify("Chinese Chinese Chinese Tokyo Japan",
                         lambda m: results.append(m))
     self.assertEqual(len(results), 2)
     self.assertEqual(results[0]["category"], "china")
     self.assertAlmostEqual(math.exp(results[0]["score"]), 0.0003, 4)
     self.assertEqual(results[1]["category"], "not china")
     self.assertAlmostEqual(math.exp(results[1]["score"]), 0.0001, 4)
Example No. 6
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier
    print("Initializing naive Bayes classifier from news corpora...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    languages = ["en", "no", "da", "de"]
    # data_path is assumed to be a module-level constant naming the corpus directory.
    training_set = {language: InMemoryCorpus(os.path.join(data_path, f"{language}.txt"))
                    for language in languages}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)
    print(f"Enter some text and classify it into {languages}.")
    print("Returned scores are log-probabilities.")

    def evaluator(text):
        results = []
        classifier.classify(text, lambda m: results.append(m))
        return results
    simple_repl("text", evaluator)
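simple_repl is not defined in this snippet. A minimal stand-in, assuming it just reads lines, runs the evaluator, and prints the matches, might look like:

def simple_repl(prompt, evaluator):
    # Hypothetical REPL loop: read a line, evaluate it, print the results.
    while True:
        try:
            text = input(f"{prompt}> ")
        except EOFError:
            break
        for match in evaluator(text):
            print(match["score"], match["category"])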
Example No. 7
 def test_language_detection_trained_on_some_news_corpora(self):
     import os.path
     from corpus import InMemoryCorpus
     from naivebayesclassifier import NaiveBayesClassifier
     training_set = {
         language: InMemoryCorpus(os.path.join(data_path,
                                               f"{language}.txt"))
         for language in ["en", "no", "da", "de"]
     }
     classifier = NaiveBayesClassifier(training_set, ["body"],
                                       self._normalizer, self._tokenizer)
     self._classify_buffer_and_verify_top_categories(
         "Vil det riktige språket identifiseres? Dette er bokmål.",
         classifier, ["no"])
     self._classify_buffer_and_verify_top_categories(
         "I don't believe that the number of tokens exceeds a billion.",
         classifier, ["en"])
     self._classify_buffer_and_verify_top_categories(
         "De danske drenge drikker snaps!", classifier, ["da"])
     self._classify_buffer_and_verify_top_categories(
         "Der Kriminalpolizei! Haben sie angst?", classifier, ["de"])
Example No. 8
import os
import unittest
from time import time

import jieba

class NaiveBayesClassifierTest(unittest.TestCase):
    def test_predict(self):
        # Stopword list, one word per line.
        with open("stopwords.dic", encoding='utf-8') as f:
            STOP_WORDS = set(line.strip() for line in f)

        def tokenize(text):
            try:
                seg_list = jieba.cut(text, cut_all=False)
                return set(
                    x.strip() for x in seg_list if x not in STOP_WORDS)
            except Exception as e:
                print(e)
                return set()

        classifier = NaiveBayesClassifier(tokenizer=tokenize)
        # classifier.fit(u'naive_train_data')
        # classifier.dump('naive_classifier.dat')
        classifier.load('naive_classifier.dat')
        classifier.reduce(400)
        start = time()
        total = 0.0
        errors = 0.0
        for root, dirs, files in os.walk('naive_test_data/', topdown=True):
            for name in files:
                if root.startswith('.') or name.startswith('.'):
                    continue
                category = root.split('/')[-1]
                with open(os.path.join(root, name), encoding='utf-8') as f:
                    text = f.read()
                predict = classifier.predict(text)
                total += 1
                if category != predict:
                    errors += 1
                    print('predict: %s, actual: %s, errors percentage: %0.2f' % (
                        predict, category, 100 * errors / total))
        print('testing completed, total: %d, errors: %d, error rate: %0.2f, costs: %0.2f' % (
            total, errors, 100 * errors / total, time() - start))
        return errors / total
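As an aside, jieba.cut returns a generator of segmented tokens; a standalone illustration of the tokenization step, using the sample sentence from jieba's own documentation:

import jieba

# Segment a Chinese sentence into tokens; cut_all=False is "accurate mode".
tokens = list(jieba.cut("我来到北京清华大学", cut_all=False))
print(tokens)  # ['我', '来到', '北京', '清华大学']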
Example No. 9

    def run(self):
        # This variant trains only the first grams configuration and returns
        # its evaluation immediately; everything after the return statement,
        # including the grid search below, is unreachable dead code.
        for grams in self.allgrams:
            c = NaiveBayesClassifier(self.rawfname,
                                     grams=grams)
            c.trainClassifier()
            self.stdout = False

            return self.evaluate(c)

        # Unreachable: the full grid search from Example No. 1.
        for grams in self.allgrams:
            c = NaiveBayesClassifier(self.rawfname,
                                     grams=grams)
            c.trainClassifier()
            
            for w in self.allweights:
                c.setWeight(w)                                
        
                for t1 in self.allthresholds:
                    for t2 in self.allthresholds:
                        c.setThresholds(neg=t1, pos=t2)
                        cinfo, accpos, accneg, accall, corrall = self.evaluate(c)
                        self.results.append([cinfo, accpos, accneg, accall, corrall])
Example No. 10
from maxentclassifier import MaximumEntropyClassifier
from naivebayesclassifier import NaiveBayesClassifier
import random
import csv

fname = 'training.csv'


nb = NaiveBayesClassifier(fname, grams=[1, 2])
nb.setThresholds(neg=1.0, pos=20.0)
nb.setWeight(0.000000000005)
nb.trainClassifier()
ment = MaximumEntropyClassifier(fname)
ment.trainClassifier()
classifiers = [nb, ment]

def csvdata_to_list(data):
    d = []
    for row in data:
        d.append(row)
    return d

def search(text, data):
    output = []
    i = 0
    for d in data:
        if d[0].lower().find(text) != -1:
            output.append([])
            output[i].append(d[0])
Example No. 11
import tornado.ioloop
import tornado.web
import urllib
import tweepy
import os


from maxentclassifier import MaximumEntropyClassifier
from naivebayesclassifier import NaiveBayesClassifier

# name of training set file
fname = 'trainingandtestdata/training.csv'

# train classifiers here first
nb = NaiveBayesClassifier(fname, grams=[1,2])
nb.setThresholds(neg=1.0, pos=20.0)
nb.setWeight(0.000000000005)
nb.trainClassifier()
ment = MaximumEntropyClassifier(fname)
ment.trainClassifier()
classifiers = [nb, ment]


class MainHandler(tornado.web.RequestHandler):
    '''
    Handles request to main page
    '''
    def get(self):
        query = self.get_argument("query", "").strip()
        cchosen = int(self.get_argument("classifier-type", 0))
Example No. 12
    # Normalize a line: replace em-dashes, pad punctuation with spaces,
    # lowercase, then split on whitespace. (The original second re.sub
    # operated on `line`, silently discarding the dash replacement.)
    processed = re.sub(r'—', r'-', line)
    processed = re.sub(r'([^\w\s\'])', r' \1 ', processed)
    processed = processed.lower()

    return processed.split()


#End def

parser = argparse.ArgumentParser()
parser.add_argument('train', help='The filename that points to training set.')
parser.add_argument('test', help='The filename that points to test set.')
args = parser.parse_args()

# Train our classifier
nbc = NaiveBayesClassifier(featurizer, classer, (AGREE_CLASS, DISAGREE_CLASS))
with open(args.train, 'r', encoding='UTF-8') as csv_train:
    train_reader = csv.reader(csv_train, delimiter=',')
    next(train_reader)

    for row in train_reader:
        rating = float(row[1])
        # Skip near-neutral ratings in [-1, 1); train only on polarized rows.
        if -1 <= rating < 1:
            continue
        nbc.add_sample(row)
#End with
nbc.smooth()

false_counts = Counter()
true_counts = Counter()
real_counts = Counter()
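The three counters suggest a per-class evaluation pass follows. A hedged sketch of how precision and recall could be derived from them, assuming true_counts holds correct predictions per class, false_counts incorrect predictions per class, and real_counts gold-label totals (the counter semantics are assumptions; only the names come from the snippet):

# Hypothetical evaluation step using the counters declared above.
for cls in (AGREE_CLASS, DISAGREE_CLASS):
    predicted = true_counts[cls] + false_counts[cls]
    precision = true_counts[cls] / predicted if predicted else 0.0
    recall = true_counts[cls] / real_counts[cls] if real_counts[cls] else 0.0
    print(f"{cls}: precision={precision:.3f} recall={recall:.3f}")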
Example No. 13
def assignment_e():
    import math
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryDocument, InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier

    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {
        language: InMemoryCorpus("data/" + language + ".txt")
        for language in ["en", "no", "da", "de"]
    }

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    for (buffer, language) in [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.",
         "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de")
    ]:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == language

    # For demonstration purposes, replicate Example 13.1 on pages 241 and 242 in the textbook.
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0,
                                        {"body": "Chinese Beijing Chinese"}))
    china.add_document(
        InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0,
                                            {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)
Example No. 14
    def run(self):
        if not self.usedev:
            for grams in self.allgrams:
                c = NaiveBayesClassifier(self.rawfname, grams=grams)
                c.trainClassifier()
                self.stdout = True
                self.evaluate(c)
            return

        for grams in self.allgrams:
            c = NaiveBayesClassifier(self.rawfname, grams=grams)
            c.trainClassifier()

            for w in self.allweights:
                c.setWeight(w)

                for t1 in self.allthresholds:
                    for t2 in self.allthresholds:
                        c.setThresholds(neg=t1, pos=t2)
                        cinfo, accpos, accneg, accall, corrall = self.evaluate(c)
                        self.results.append(
                            [cinfo, accpos, accneg, accall, corrall])

        if self.csvout:
            self.flushToCSV()
Example No. 15
from pymongo import MongoClient
from bson.objectid import ObjectId
import jieba
import re
from bs4 import BeautifulSoup
from naivebayesclassifier import NaiveBayesClassifier
from weighter.informationgain import InformationGain


if __name__ == '__main__':
    # Stopword list, one word per line.
    with open("stopwords.dic", encoding='utf-8') as f:
        STOP_WORDS = set(line.strip() for line in f)

    def tokenize(text):
        try:
            seg_list = jieba.cut(text, cut_all=False)
            # Keep only Chinese-script tokens that are not stopwords.
            zh_vocabulary = re.compile(r"[\u4E00-\u9FA5]+$")
            return [x.strip() for x in seg_list
                    if zh_vocabulary.match(x) and x not in STOP_WORDS]
        except Exception as e:
            print(e)
            return []

    client = MongoClient()
    documents = client.rss.documents
    classifier = NaiveBayesClassifier(tokenizer=tokenize)
    classifier.load('raw_features_1.dat')
    classifier.reduce(max_size=404, weighter=InformationGain)
    for x in documents.find({}, {'_id': 1, 'content': 1}):
        content = BeautifulSoup(x['content'], 'html.parser').text
        category = classifier.predict_text(content)
        print(category)
        documents.update_one({'_id': x['_id']}, {'$set': {'category': category}})
Example No. 16
#!/usr/bin/env python
# -*- coding: utf-8 -*-



from naivebayesclassifier import NaiveBayesClassifier
from cmd import Cmd

l = NaiveBayesClassifier('data')

# `status` (a mapping from class index to label) is assumed to be defined
# elsewhere in the original module; it is not shown in this snippet.
while True:
    name = input()
    if name == 'exit':
        break
    # Classify on the final character of the name and pick the best class.
    final_p = l.classify(name[-1], force_class_average=True)
    best_p = 0
    best_ans = -1
    for i in final_p:
        if final_p[i] > best_p:
            best_p = final_p[i]
            best_ans = i
    print(status[best_ans], best_p)