def assignment_e_naivebayes_2():
    """Replicate Example 13.1 (pages 241-242 in the textbook): train a
    two-category naive Bayes classifier and verify the published scores."""
    # These helpers are language-specific, so leaning on them for a
    # language identifier is a big simplification.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    matches = []

    def match_collector(match: dict):
        # Scores delivered to this callback are log-probabilities.
        matches.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Build the two-category training set from the textbook example.
    china = InMemoryCorpus()
    for doc_id, body in enumerate(("Chinese Beijing Chinese",
                                   "Chinese Chinese Shanghai",
                                   "Chinese Macao")):
        china.add_document(InMemoryDocument(doc_id, {"body": body}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)

    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    matches.clear()
    classifier.classify(buffer, match_collector)

    # The textbook gives P(china|d) ~ 0.0003 and P(not china|d) ~ 0.0001.
    assert len(matches) == 2
    assert matches[0]["category"] == "china"
    assert matches[1]["category"] == "not china"
    assert math.isclose(math.exp(matches[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(matches[1]["score"]), 0.0001, abs_tol=0.00005)
def classify_tweets(request):
    """Django view: search Twitter for the POSTed 'query', classify each
    returned tweet as positive or negative with the trained naive Bayes
    model, and render index.html with the tweets, per-tweet labels, and
    aggregate positive/negative tallies.
    """
    import os

    # SECURITY: API credentials used to be hard-coded here.  Read them from
    # the environment, falling back to the legacy values so existing
    # deployments keep working -- but the old keys are in source-control
    # history and MUST be rotated.
    auth = tweepy.OAuthHandler(
        os.environ.get("TWITTER_CONSUMER_KEY", "Wb4W1n264iHhcrqcXt54bA"),
        os.environ.get("TWITTER_CONSUMER_SECRET",
                       "2NFs7pO610XKQUOs5hPAz8wCEO4uxmP3111HPhsmgc"))
    auth.set_access_token(
        os.environ.get("TWITTER_ACCESS_TOKEN",
                       "36641014-28RR3YAp6MxFxJ706gsp5a7bRy0sYDsjLCwixs2iM"),
        os.environ.get("TWITTER_ACCESS_TOKEN_SECRET",
                       "qOGQg84VvurJKX9qSF3Zgl973BxF6ryt7Yruoxtw"))
    api = tweepy.API(auth)

    query = request.POST.get('query')
    tweets = []
    for tweet in api.search(query):
        try:
            tweets.append(str(tweet.text))
        except (AttributeError, UnicodeEncodeError):
            # Skip tweets whose text is missing or cannot be coerced;
            # previously a bare except hid every other failure too.
            continue

    # The per-category tweet and token totals are loop-invariant, so fetch
    # them from the database once instead of once per tweet.
    p = tweet_category_count.objects.get(id=1)
    tweet_counts = [p.positive_count, p.negative_count]
    p = token_category_count.objects.get(id=1)
    token_counts = [p.positive_count, p.negative_count]

    posScore = 0
    negScore = 0
    classification = []
    classifier = NaiveBayesClassifier()
    for tweet in tweets:
        tokens = tweet.split()
        # In-place cleanup of the token list before classification.
        data_preprocess.remove_noise_words(tokens)
        data_preprocess.remove_names(tokens)
        data_preprocess.remove_links(tokens)

        # Per-token occurrence counts in each category; tokens unseen in
        # training default to zero.
        category_counts = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            try:
                category_counts[token]['pos'] = \
                    pos_tokens.objects.get(ptoken=token).pcount
            except pos_tokens.DoesNotExist:
                category_counts[token]['pos'] = 0
            try:
                category_counts[token]['neg'] = \
                    neg_tokens.objects.get(ntoken=token).ncount
            except neg_tokens.DoesNotExist:
                category_counts[token]['neg'] = 0

        result = classifier.classify(tokens, category_counts,
                                     tweet_counts, token_counts)
        if result == 'pos':
            posScore += 1
        else:
            negScore += 1
        classification.append(result)

    return render_to_response("index.html",
                              {'tweets': tweets,
                               'pos_neg': classification,
                               'posScore': posScore,
                               'negScore': negScore})
def assignment_e_naivebayes_1():
    """Train a naive Bayes language identifier on four corpora and check
    that previously unseen text fragments are labeled correctly."""
    # Language-specific normalization/tokenization: a big simplification
    # when building a language identifier.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    def match_collector(match: dict):
        # Scores received here are log-probabilities.
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # One corpus per language code forms the training set.
    print("LOADING...")
    training_set = {code: InMemoryCorpus("data/" + code + ".txt")
                    for code in ("en", "no", "da", "de")}

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"],
                                      normalizer, tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    fragments = (
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.", "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de"),
    )
    for buffer, expected in fragments:
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == expected
def test_china_example_from_textbook(self):
    """Replicate Example 13.1 (pages 241-242 in the textbook) and verify
    the published category ranking and probabilities."""
    import math
    from corpus import InMemoryDocument, InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier

    # Training set: three "china" documents, one "not china" document.
    china = InMemoryCorpus()
    for doc_id, body in enumerate(("Chinese Beijing Chinese",
                                   "Chinese Chinese Shanghai",
                                   "Chinese Macao")):
        china.add_document(InMemoryDocument(doc_id, {"body": body}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))

    classifier = NaiveBayesClassifier({"china": china, "not china": not_china},
                                      ["body"], self._normalizer, self._tokenizer)

    matches = []
    classifier.classify("Chinese Chinese Chinese Tokyo Japan", matches.append)

    # Scores are log-probabilities; the textbook gives ~0.0003 vs ~0.0001.
    self.assertEqual(len(matches), 2)
    self.assertEqual(matches[0]["category"], "china")
    self.assertAlmostEqual(math.exp(matches[0]["score"]), 0.0003, 4)
    self.assertEqual(matches[1]["category"], "not china")
    self.assertAlmostEqual(math.exp(matches[1]["score"]), 0.0001, 4)
# Confusion tallies for the evaluation below:
#   true_counts[c]  - test rows where prediction c matched the actual class
#   false_counts[c] - test rows incorrectly predicted as class c
#   real_counts[c]  - actual class frequencies in the test set
false_counts = Counter()
true_counts = Counter()
real_counts = Counter()
# Now evaluate the trained classifier.
# NOTE(review): `args`, `nbc` (the classifier) and `classer` (ground-truth
# labeler) are defined earlier in this script -- not visible here.
with open(args.test, 'r', encoding='UTF-8') as csv_test:
    test_reader = csv.reader(csv_test, delimiter=',')
    next(test_reader)  # skip the CSV header row
    for row in test_reader:
        rating = float(row[1])
        # Skip rows with a rating in [-1, 1) -- presumably treated as
        # neutral/unlabeled; confirm against the training-side filter.
        if rating >= -1 and rating < 1:
            continue
        cls = nbc.classify(row)
        actual_cls = classer(row)
        real_counts[actual_cls] += 1
        if cls == actual_cls:
            true_counts[cls] += 1
        else:
            false_counts[cls] += 1
#End with
# Total number of correctly classified rows across all classes.
correct = 0
for cls, count in true_counts.items():
    correct += count
#End for
incorrect = 0
def assignment_e():
    """Train a naive Bayes language identifier, sanity-check it on unseen
    fragments, then replicate Example 13.1 (pages 241-242, textbook)."""
    # These are language-specific helpers, so relying on them inside a
    # language identifier is a big simplification.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    results = []

    def match_collector(match: dict):
        # Incoming scores are log-probabilities.
        results.append(match)
        print("*** WINNER", match["score"], match["category"])

    # One corpus per language code forms the training set.
    print("LOADING...")
    training_set = {code: InMemoryCorpus("data/" + code + ".txt")
                    for code in ("en", "no", "da", "de")}

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"],
                                      normalizer, tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    for buffer, expected in (
            ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.", "no"),
            ("I don't believe that the number of tokens exceeds a billion.", "en"),
            ("De danske drenge drikker snaps!", "da"),
            ("Der Kriminalpolizei! Haben sie angst?", "de")):
        print(buffer)
        results.clear()
        classifier.classify(buffer, match_collector)
        assert results[0]["category"] == expected

    # For demonstration purposes, replicate Example 13.1 on pages 241 and
    # 242 in the textbook.
    china = InMemoryCorpus()
    for doc_id, body in enumerate(("Chinese Beijing Chinese",
                                   "Chinese Chinese Shanghai",
                                   "Chinese Macao")):
        china.add_document(InMemoryDocument(doc_id, {"body": body}))
    not_china = InMemoryCorpus()
    not_china.add_document(InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))
    classifier = NaiveBayesClassifier({"china": china, "not china": not_china},
                                      ["body"], normalizer, tokenizer)

    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    results.clear()
    classifier.classify(buffer, match_collector)

    # The textbook gives P(china|d) ~ 0.0003 and P(not china|d) ~ 0.0001.
    assert len(results) == 2
    assert results[0]["category"] == "china"
    assert results[1]["category"] == "not china"
    assert math.isclose(math.exp(results[0]["score"]), 0.0003, abs_tol=0.00001)
    assert math.isclose(math.exp(results[1]["score"]), 0.0001, abs_tol=0.00005)
#!/usr/bin/env python # -*- coding: utf-8 -*- from naivebayesclassifier import NaiveBayesClassifier from cmd import Cmd l = NaiveBayesClassifier('data') while True: name = raw_input() name_unicode = name.decode('utf-8') final_p = l.classify(name_unicode[-1], force_class_average = True) best_p = 0 best_ans = -1 for i in final_p: if final_p[i] > best_p: best_p = final_p[i] best_ans = i print status[best_ans], best_p if name == 'exit': break