Exemplo n.º 1
0
def assignment_e_naivebayes_2():

    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Replicate Example 13.1 on pages 241 and 242 in the textbook.
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0,
                                        {"body": "Chinese Beijing Chinese"}))
    china.add_document(
        InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0,
                                            {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)
Exemplo n.º 2
0
def classify_tweets(request):
        consumer_key="Wb4W1n264iHhcrqcXt54bA"
        consumer_secret="2NFs7pO610XKQUOs5hPAz8wCEO4uxmP3111HPhsmgc"
        access_token="36641014-28RR3YAp6MxFxJ706gsp5a7bRy0sYDsjLCwixs2iM"
        access_token_secret="qOGQg84VvurJKX9qSF3Zgl973BxF6ryt7Yruoxtw"
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth)
        query = request.POST.get('query')
        result=api.search(query)
        tweets=[]
        classification=[]
        for tweet in result:
                    try:
                            tweets.append(str(tweet.text))
                    except:
                            pass
        posScore=0
        negScore=0
        for tweet in tweets:
                tokens=tweet.split()
                data_preprocess.remove_noise_words(tokens)
                data_preprocess.remove_names(tokens)
                data_preprocess.remove_links(tokens)
                tweet_counts=[]
                token_counts=[]
                category_counts=defaultdict(lambda:defaultdict(int))
                p=tweet_category_count.objects.get(id=1)
                tweet_counts.append(p.positive_count)
                tweet_counts.append(p.negative_count)
                p=token_category_count.objects.get(id=1)
                token_counts.append(p.positive_count)
                token_counts.append(p.negative_count)
                for token in tokens:
                        try:
                                p=pos_tokens.objects.get(ptoken=token)
                                category_counts[token]['pos']=p.pcount
                        except:
                                category_counts[token]['pos']=0
                for token in tokens:
                        try:
                                p=neg_tokens.objects.get(ntoken=token)
                                category_counts[token]['neg']=p.ncount
                        except:
                                category_counts[token]['neg']=0
		
		
                classifier=NaiveBayesClassifier()
                result=classifier.classify(tokens,category_counts,tweet_counts,token_counts)
                if(result=='pos'):
                        posScore+=1
                else:
                        negScore+=1
                classification.append(result)
        return render_to_response("index.html",{'tweets':tweets,'pos_neg':classification,'posScore':posScore,'negScore':negScore})
Exemplo n.º 3
0
def assignment_e_naivebayes_1():

    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {
        language: InMemoryCorpus("data/" + language + ".txt")
        for language in ["en", "no", "da", "de"]
    }

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    for (buffer, language) in [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.",
         "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de")
    ]:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == language
Exemplo n.º 4
0
 def test_china_example_from_textbook(self):
     import math
     from corpus import InMemoryDocument, InMemoryCorpus
     from naivebayesclassifier import NaiveBayesClassifier
     china = InMemoryCorpus()
     china.add_document(
         InMemoryDocument(0, {"body": "Chinese Beijing Chinese"}))
     china.add_document(
         InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
     china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
     not_china = InMemoryCorpus()
     not_china.add_document(
         InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
     training_set = {"china": china, "not china": not_china}
     classifier = NaiveBayesClassifier(training_set, ["body"],
                                       self._normalizer, self._tokenizer)
     results = []
     classifier.classify("Chinese Chinese Chinese Tokyo Japan",
                         lambda m: results.append(m))
     self.assertEqual(len(results), 2)
     self.assertEqual(results[0]["category"], "china")
     self.assertAlmostEqual(math.exp(results[0]["score"]), 0.0003, 4)
     self.assertEqual(results[1]["category"], "not china")
     self.assertAlmostEqual(math.exp(results[1]["score"]), 0.0001, 4)
Exemplo n.º 5
0
false_counts = Counter()
true_counts = Counter()
real_counts = Counter()

# Now evaluate the trainied classifier.
with open(args.test, 'r', encoding='UTF-8') as csv_test:
    test_reader = csv.reader(csv_test, delimiter=',')
    next(test_reader)

    for row in test_reader:
        rating = float(row[1])
        if rating >= -1 and rating < 1:
            continue

        cls = nbc.classify(row)
        actual_cls = classer(row)
        real_counts[actual_cls] += 1

        if cls == actual_cls:
            true_counts[cls] += 1
        else:
            false_counts[cls] += 1
#End with

correct = 0
for cls, count in true_counts.items():
    correct += count
#End for

incorrect = 0
Exemplo n.º 6
0
def assignment_e():

    # Use these throughout below. These are really language-specific functions, so it's a huge
    # simplification to use these for a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {
        language: InMemoryCorpus("data/" + language + ".txt")
        for language in ["en", "no", "da", "de"]
    }

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    for (buffer, language) in [
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.",
         "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de")
    ]:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == language

    # For demonstration purposes, replicate Example 13.1 on pages 241 and 242 in the textbook.
    china = InMemoryCorpus()
    china.add_document(InMemoryDocument(0,
                                        {"body": "Chinese Beijing Chinese"}))
    china.add_document(
        InMemoryDocument(1, {"body": "Chinese Chinese Shanghai"}))
    china.add_document(InMemoryDocument(2, {"body": "Chinese Macao"}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0,
                                            {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)
    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)
Exemplo n.º 7
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-



from naivebayesclassifier import NaiveBayesClassifier
from cmd import Cmd

l = NaiveBayesClassifier('data')

while True:
    name = raw_input()
    name_unicode = name.decode('utf-8')
    final_p = l.classify(name_unicode[-1], force_class_average = True)
    best_p = 0
    best_ans = -1
    for i in final_p:
        if final_p[i] > best_p:
            best_p = final_p[i]
            best_ans = i
    print status[best_ans], best_p
    if name == 'exit':
        break