from pattern.db import csv
from pattern.vector import Document, NB


def resolve_certainty(certainty_info):
    '''Resolve certainty with Naive Bayes'''
    if certainty_info == '':
        return 'No certainty info.'
    else:
        nb = NB()
        for observation, certainty in csv('library/templatetags/c_training_data.csv'):
            v = Document(observation, type=int(certainty), stopwords=True)
            nb.train(v)
        return nb.classify(Document(certainty_info))
from collections import defaultdict

from pattern.db import csv
from pattern.vector import Document, NB


def extractSentiment(characterSentences):
    """
    Trains a Naive Bayes classifier object with the reviews.csv file,
    analyzes the sentences, and returns the tones.
    """
    nb = NB()
    characterTones = defaultdict(list)
    for review, rating in csv("reviews.csv"):
        nb.train(Document(review, type=int(rating), stopwords=True))
    for key, value in characterSentences.items():
        for sentence in value:
            characterTones[key].append(nb.classify(str(sentence)))
    return characterTones
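# A hedged usage sketch for extractSentiment() above; the character names and
# sentences are invented for illustration, and the predicted labels depend on
# the ratings found in reviews.csv.
characterSentences = {
    "Alice": ["I loved every minute of it.", "A wonderful, warm character."],
    "Bob": ["His scenes dragged on and bored me."],
}
tones = extractSentiment(characterSentences)
print(tones)  # e.g. {'Alice': [9, 8], 'Bob': [2]} -- actual labels depend on the training data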
def set_classifier(self):
    if self.name == 'SLP':
        return SLP(train=self.train_data, iterations=self.iterations)
    elif self.name == 'NB':
        return NB(train=self.train_data)
    else:
        print "Unknown classifier name"
def classifyTweets(filename, trainingSet):
    print('Classifying {}...\n'.format(filename))
    data = open('{}/processed{}'.format(INPUT_PATH, filename.capitalize()), 'rb')
    reader = csv.reader(data)
    info = list(reader)
    classifier = NB(train=trainingSet, alpha=0.0001)
    tweets = []
    for row in info:
        tweet = row[0]
        result = classifier.classify(Document(tweet))
        tweets.append([tweet, result])
    # Write all tweets to file
    with open('{}/results.csv'.format(OUTPUT_PATH), 'wb+') as f:
        writer = csv.writer(f)
        writer.writerows(tweets)
class NBModel:

    def __init__(self):
        self.nb = NB()
        self.stats = Statistics()
        try:
            # self.nb = self.nb.load("./nb_training.p")
            self.new_nb_model = True
        except IOError:
            self.new_nb_model = False
            print("Creating new NB model")

    def naive_bayes_train(self, reviews):
        for review in reviews:
            if review.rating is not None and 1 < review.rating < 10:
                v = Document(review.text, type=int(review.rating), stopwords=True)
                self.nb.train(v)
        # self.nb.save("./nb_training.p")
        # print self.nb.classes

    def nb_test_imdb(self, reviews):
        arr = []
        for review in reviews:
            if review.rating is not None:
                v = Document(review.text, type=int(review.rating), stopwords=True)
                arr.append(v)
        print self.nb.test(arr, target=None)

    def nb_classify_tweets(self, tvshow, tweets):
        ratingSum = 0
        tweet_docs = [(self.nb.classify(Document(tweet)), tweet) for tweet in tweets]
        for tweet in tweet_docs:
            ratingSum += tweet[0]
        self.nb_stats()
        Statistics().printStats(tvshow, ratingSum, len(tweet_docs))
        print self.nb.distribution

    def nb_stats(self):
        print('----------- Classifier stats -----------')
        # print("Features: ", self.nb.features)
        print("Classes: ", self.nb.classes)
        print("Skewness: ", self.nb.skewness)
        print("Distribution: ", self.nb.distribution)
        print("Majority: ", self.nb.majority)
        print("Minority: ", self.nb.minority)
class Classifications():

    # static variables
    _category_path = os.path.join(os.path.dirname(__file__), "classifiers/category.slp")
    _rating_path = os.path.join(os.path.dirname(__file__), "classifiers/rating.slp")
    _rating_nlp_path = os.path.join(os.path.dirname(__file__), "classifiers/rating_nlp.svm")
    _sentiment_path = os.path.join(os.path.dirname(__file__), "classifiers/sentiment.nb")

    _category = SLP.load(_category_path)
    _rating = SLP.load(_rating_path)
    _rating_nlp = SVM.load(_rating_nlp_path)
    _sentiment = NB.load(_sentiment_path)

    @staticmethod
    def selectWords(review):
        '''
        a function that gets a review and selects the nouns, adjectives,
        verbs and exclamation marks
        '''
        review = parsetree(review, lemmata=True)[0]  # lemmatize the review
        # select adjectives (JJ), nouns (NN), verbs (VB) and exclamation marks
        review = [w.lemma for w in review if w.tag.startswith(('JJ', 'NN', 'VB', '!'))]
        review = count(review)  # a dictionary of (word, count) pairs
        return review

    @staticmethod
    def classify(text):
        predicted_category = Classifications._category.classify(Document(text), discrete=True)
        predicted_rate = Classifications._rating.classify(Document(text), discrete=True)
        predicted_rate_nlp = Classifications._rating_nlp.classify(
            Classifications.selectWords(text), discrete=True)
        predicted_sentiment_dict = Classifications._sentiment.classify(
            Classifications.selectWords(text), discrete=False)
        predicted_sentiment = str(
            sorted(predicted_sentiment_dict.items(),
                   key=operator.itemgetter(1),
                   reverse=True)[1][0]) in ['True', '3.0', '4.0', '5.0']
        return {
            'text': text,
            'rate': predicted_rate,
            'category': predicted_category,
            'rate_nlp': predicted_rate_nlp,
            'positivity': predicted_sentiment,
        }
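# A hedged usage sketch for Classifications.classify(); the review text is
# invented, and the call assumes the four pickled classifiers above loaded
# successfully from the classifiers/ directory.
result = Classifications.classify("Great product, works exactly as described!")
print(result['category'], result['rate'], result['rate_nlp'], result['positivity'])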
def normal_test(data, type):
    print '----------------------------------------------------'
    print 'TEST FUNCTION STARTED FOR ' + type + '!'
    total_data_size = len(data)
    training_size = int(round(total_data_size / 2))
    test_size = training_size
    print 'Total Size: ' + str(total_data_size)
    print 'Training Size: ' + str(training_size)
    print 'Test Size: ' + str(test_size)
    print 'Training Started for ' + type + '!'
    classification_methods = {
        # uncomment based on what classification algorithm you would like to test
        'NB': NB(train=data[:training_size], baseline=MAJORITY, method=MULTINOMIAL),
        'KNN2': KNN(train=data[:training_size], baseline=MAJORITY, k=2, distance=COSINE),
        'KNN3': KNN(train=data[:training_size], baseline=MAJORITY, k=3, distance=COSINE),
        'KNN4': KNN(train=data[:training_size], baseline=MAJORITY, k=4, distance=COSINE),
        'KNN5': KNN(train=data[:training_size], baseline=MAJORITY, k=5, distance=COSINE),
        'KNN6': KNN(train=data[:training_size], baseline=MAJORITY, k=6, distance=COSINE),
        'KNN7': KNN(train=data[:training_size], baseline=MAJORITY, k=7, distance=COSINE),
        'KNN8': KNN(train=data[:training_size], baseline=MAJORITY, k=8, distance=COSINE),
        'KNN9': KNN(train=data[:training_size], baseline=MAJORITY, k=9, distance=COSINE),
        'KNN10': KNN(train=data[:training_size], baseline=MAJORITY, k=10, distance=COSINE),
        'SLP1': SLP(train=data[:training_size], baseline=MAJORITY, iterations=1),
        'SLP2': SLP(train=data[:training_size], baseline=MAJORITY, iterations=2),
        'SLP3': SLP(train=data[:training_size], baseline=MAJORITY, iterations=3),
        'SVM': SVM(train=data[:training_size], type=CLASSIFICATION, kernel=POLYNOMIAL),
    }
    print 'Normal Testing Started!'
    # uncomment to start the normal test
    for classification in classification_methods.keys():
        # measure the time it takes to classify!
        start = timeit.default_timer()
        # normal test
        accuracy, precision, recall, f1 = classification_methods[classification].test(
            data[training_size:training_size + test_size])
        stop = timeit.default_timer()
        print '*' + classification + '*'
        print 'Accuracy: ' + str(accuracy)
        print 'Precision: ' + str(precision)
        print 'Recall: ' + str(recall)
        print 'F1-score: ' + str(f1)
        print 'Time: ' + str(stop - start)
        print
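# normal_test() expects `data` to be a list of labeled pattern Documents.
# A minimal preparation sketch, assuming a hypothetical reviews.csv of
# (review, rating) rows like the other snippets in this collection use:
from pattern.db import csv
from pattern.vector import Document

data = [Document(review, type=int(rating), stopwords=True)
        for review, rating in csv('reviews.csv')]
normal_test(data, 'reviews')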
def learnCategories(tn):
    nb = NB()
    for (cat, content) in csv('resource/%s.catdata' % tn, separator=';', headers=True):
        if not cat or not content:
            continue
        t = cat  # toASCII(cat)
        v = Document(content, type=t, stemmer=None, stopwords=False, language='fr')
        nb.train(v)
    # cr = csv('resource/%s.catdata' % tn, separator=';', headers=True)
    # for (i, r) in enumerate(cr):
    #     v = Document(str(i), type=r[0], stemmer=None, stopwords=False, language='fr')
    #     nb.train(v)
    logging.info('TRAINED %s on %d categories', tn, len(nb.classes))
    nb.save('resource/%s.classifier' % tn)
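# The classifier saved by learnCategories() can later be restored with
# NB.load(), as the Flask view further down does. A minimal sketch, assuming
# a hypothetical table name 'products' was trained above:
from pattern.vector import NB, Document

nb = NB.load('resource/products.classifier')
print nb.classify(Document("une nouvelle description de produit", language='fr'))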
""" views imports app, auth, and models, but none of these import views """ from flask import render_template, redirect, request, url_for, jsonify from flask.ext.classy import FlaskView from app import app from auth import auth from models import User # Classifier, CSV Loading from pattern.vector import Document, NB from pattern.db import Datasheet # Load classifier nb = NB.load("project/data/amazonClassifier") @app.route('/classify', methods=['POST']) def classify_review(): text = request.form.get('text') return jsonify(result=nb.classify(text.strip())) class BaseView(FlaskView): '''Basic views, such as the home and about page.''' route_base = '/' def index(self): return render_template('home.html')
for score, message in data:
    document = Document(message, type=int(score) > 0)
    documents.append(document)

m = Model(documents)

print("number of documents:", len(m))
print("number of words:", len(m.vector))
print("number of words (average):",
      sum(len(d.features) for d in m.documents) / float(len(m)))
print()

# Train Naive Bayes on all documents.
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails:
print(classifier.classify("win money"))  # False: most likely spam.
print(classifier.classify("fix bug"))    # True: most likely a real message.
print()
print(classifier.classify("customer"))   # False: people don't talk like this on developer lists...
print(classifier.classify("guys"))       # True: because most likely everyone knows everyone.
print()
for concept, w1 in m.lsa.vectors[d.id].items():
    for feature, w2 in m.lsa.concepts[concept].items():
        if w1 != 0 and w2 != 0:
            print(feature, w1 * w2)

# clustering
d1 = Document('Cats are independent pets.', name='cat')
d2 = Document('Dogs are trustworthy pets.', name='dog')
d3 = Document('Boxes are made of cardboard.', name='box')
m = Model((d1, d2, d3))
print m.cluster(method=HIERARCHICAL, k=2)

# hierarchical clustering
cluster = Cluster((1, Cluster((2, Cluster((3, 4))))))
print cluster.depth
print cluster.flatten(1)

# training a classifier
nb = NB()
for review, rating in csv('data/input/reviews.csv'):
    v = Document(review, type=int(rating), stopwords=True)
    nb.train(v)
print nb.classes
print nb.classify(Document('A good movie!'))

# testing a classifier
data = csv('data/input/reviews.csv')
data = [(review, int(rating)) for review, rating in data]
data = [Document(review, type=rating, stopwords=True) for review, rating in data]
nb = NB(train=data[:500])
accuracy, precision, recall, f1 = nb.test(data[500:])
print accuracy

# binary classification
class NBClassifier:
    """
    This class interfaces a pattern corpus with a pattern.vector nb classifier
    """

    def __init__(self, corpus, **kargs):
        """
        Initializes the NBClassifier class with a corpus and a NB instance
        (input): corpus = a corpus of pattern Documents constructed from Grams
        """
        self.corpus = corpus
        self.documents = self.corpus.documents
        self.model = Model(documents=self.documents, weight='TF-IDF')
        # self.documents.words = self.documents.keywords
        self.split_idx = len(self) / 4
        self.nb = NB()

    def __len__(self):
        return len(self.documents)

    def classify_document(self, document):
        """
        classify document with nb instance
        (input): document = Document instance with same format as classifier train set
        (output): classification result
        """
        return self.nb.classify(Document(document, stemmer=PORTER), discrete=True)

    def nb_train(self):
        """
        This function trains the classifier with (3/4) of the documents
        (input): None
        (output): trains self.nb
        """
        train_start = datetime.datetime.now()
        print "training with {0} docs".format(len(self) - self.split_idx)
        documents = self.documents[:-self.split_idx]
        random.shuffle(documents)
        [self.nb.train(doc) for doc in documents]
        train_end = datetime.datetime.now()
        print "training {0} docs took {1} seconds".format(
            len(documents), (train_end - train_start).seconds)

    def nb_test(self, gold_standard=None):
        """
        Evaluates the classifier on (1/4) of the documents
        (input): None
        (output): accuracy, precision, recall, f1
        """
        if not gold_standard:
            test_docs = self.documents[-self.split_idx:]
            conf_matrix = self.nb.confusion_matrix(test_docs)
            print conf_matrix.table
            return self.nb.test(test_docs)
        else:
            return self.nb.test(gold_standard)

    def get_documentlabel_distribution(self):
        """
        Gets the label distribution of the document set passed into self.nb classifier
        (input): None
        (output): distribution = dictionary representation of the label:frequency distribution
                  rankdistribution = a ranked list of label keys by frequency
        """
        distribution = self.nb.distribution
        rankdistribution = sorted(distribution, key=lambda x: distribution[x], reverse=True)
        for each, key in enumerate(rankdistribution[:10]):
            print "{0}:\t{1} ({2})".format(each, key, distribution[key])
        return distribution, rankdistribution

    def run(self):
        """
        executes training / testing of classifier
        """
        print "training and testing {0} total documents...".format(len(self))
        self.nb_train()
        print "testing"
        result = self.nb_test()
        print "finalizing"
        self.nb.finalize()
        return result
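# A hedged usage sketch for NBClassifier; `corpus` is assumed to be any object
# exposing a .documents list of labeled pattern Documents, as the constructor's
# docstring describes.
clf = NBClassifier(corpus)
accuracy, precision, recall, f1 = clf.run()
print "accuracy: {0}, f1: {1}".format(accuracy, f1)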
class Emoticon_Sentiment:
    """
    Use Emoticon sentiment to classify and predict comments.

    This class provides a binary sentiment analyzer (1 = positive,
    -1 = negative). It is automatically created using emoticons from
    thousands of comments.
    """

    def __init__(self, basefile):
        """
        Initialize the emoticon sentiment. Setup variables.

        Args:
            basefile (str): The location of the comments file that should
                serve as the basis for the classifier.
        """
        self.base = basefile
        self.comments = []
        self.nb = NB()
        self.stop = stopwords.words('english')
        self.positive = [':-)', ':)', ':D', ':o)', ':]', ':3', ':c)', ':>',
                         '=]', '8)', '=)', ':}', ':^)', ':っ)', ':-D', '8-D',
                         '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D', '=-3',
                         '=3', 'B^D', ':-))', ':*', ':^*', ';-)', ';)', '*-)',
                         '*)', ';-]', ';]', ';D', ';^)']
        self.negative = ['>:[', ':-(', ':(', ':-c', ':c', ':-<', ':っC', ':<',
                         ':-[', ':[', ':{', ':-||', ':@', '>:(', 'D:<', 'D:',
                         'D8', 'D;', 'D=', 'DX', 'v.v', '>:)', '>;)', '>:-)',
                         '}:-)', '}:)', '3:-)', '3:)', ':-###..', ':###..',
                         '>:/', ':-/', ':-.', ':/']

    def read_base_comments(self):
        """
        Read the base comments.

        Read the file that loads the comments that serve as the base for
        the classifier.

        Returns:
            list: list of comments from the text file
        """
        f = open(self.base, 'r')
        self.comments = json.load(f)
        f.close()
        return self.comments

    def preprocess(self, comment):
        """
        Preprocess comment.

        Take a comment and pre-process it (all to lowercase, remove links,
        delete punctuation, remove stopwords and numbers).

        Args:
            comment (str): The text string to be processed

        Returns:
            str: the processed text string
        """
        h = HTMLParser.HTMLParser()
        text = h.unescape(comment.lower())
        p = re.compile(r"(\b(?:(?:https?|ftp|file)://|www\.|ftp\.)"
                       r"[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*"
                       r"[-A-Za-z0-9+&@#/%=~_()|])")
        text = re.sub(p, '', text)
        exclude = set(string.punctuation)
        tokens = word_tokenize(text)
        cleaned = [token for token in tokens if token not in exclude]
        text = ' '.join([w.encode('ascii', errors='ignore') for w in cleaned
                         if w.lower() not in self.stop and not w.isdigit()])
        return text

    def parse_comments(self):
        """
        Parse comment, check for emoticon, remove neutrals.

        Parse a list of comments, by searching for emoticons. Once an
        emoticon is found, it is flagged positive or negative. With multiple
        finds, their values are summed; neutrals are thrown out.

        Returns:
            list: 2-dimensional list of parsed comments and their sentiment flags
        """
        extended_comments = [[self.comments[i], 0, False]
                             for i in range(len(self.comments))]
        # loop through comments and search for emoticons: +1 if positive, -1
        # if negative, and remove emoticons from text
        for comment in extended_comments:
            for n in self.negative:
                if comment[0].encode('utf-8').find(n) > -1:
                    comment[1] = comment[1] - 1
                    comment[0] = comment[0].replace(n, '')
                    comment[2] = True
            for p in self.positive:
                if comment[0].encode('utf-8').find(p) > -1:
                    comment[1] = comment[1] + 1
                    comment[0] = comment[0].replace(p, '')
                    comment[2] = True
        # throw out comments that have no emoticons
        parsed_comments = [[comment[0], comment[1]]
                           for comment in extended_comments if comment[2]]
        # normalize the values by converting all positives (1, 2, 3, etc.) to 1,
        # and all negatives (-1, -2, -3, etc.) to -1
        for item in parsed_comments:
            if item[1] > 1:
                item[1] = 1
            elif item[1] < -1:
                item[1] = -1
        # throw out neutrals
        parsed_comments = [[self.preprocess(i[0]), i[1]]
                           for i in parsed_comments if i[1] != 0]
        return parsed_comments

    def train_classifier(self, comments):
        """
        Take the flagged comments and train a Naive Bayes classifier.

        Args:
            comments (list): The list of flagged comments to be trained
        """
        shuffle(comments)
        for comment in comments:
            v = Document(comment[0], type=int(comment[1]))
            self.nb.train(v)

    def predict(self, text):
        """
        Take a comment and make a sentiment prediction.

        Args:
            text (str): The text string to be analyzed for sentiment

        Returns:
            int: 1 if positive sentiment, -1 if negative
        """
        return self.nb.classify(text)
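# A minimal end-to-end sketch of Emoticon_Sentiment's intended flow, assuming
# a hypothetical comments.json file containing a JSON list of comment strings.
analyzer = Emoticon_Sentiment('comments.json')
analyzer.read_base_comments()
flagged = analyzer.parse_comments()   # [[cleaned_text, 1 or -1], ...]
analyzer.train_classifier(flagged)
print analyzer.predict('this was such a great game')  # 1 (positive) or -1 (negative)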
class NBModel:

    def __init__(self):
        self.nb = NB()
        self.stats = Statistics()
        try:
            print("dir: " + os.getcwd())
            if os.getcwd().endswith("tv_ratings_frontend"):
                print("Working in django")
                self.nb = self.nb.load("ratings_frontend/backend/pattern_ml/nb_training.p")
            else:
                print("Not working in django")
                self.nb = self.nb.load("./nb_training.p")
            self.new_nb_model = True
            print("Using existing pickled model")
        except IOError:
            self.new_nb_model = False
            print("Creating new NB model")

    def nb_train_text(self, reviews):
        for review in reviews:
            if review.rating is not None:  # and review.rating < 10 and review.rating > 1:
                v = Document(review.text, type=int(review.rating), stopwords=True)
                self.nb.train(v)
        self.nb.save("./nb_training.p")
        # print self.nb.classes

    def nb_train_summary(self, reviews):
        for review in reviews:
            if review.rating is not None:  # and review.rating < 10 and review.rating > 1:
                v = Document(review.summary, type=int(review.rating), stopwords=True)
                self.nb.train(v)

    def nb_train_all_text(self, review_set):
        for review_list in review_set:
            self.nb_train_text(review_list)
        # NB has no save_model() method; use this class's own save_model().
        self.save_model()

    def save_model(self):
        self.nb.save('./nb_training.p')

    def nb_test_imdb(self, reviews):
        arr = []
        for review in reviews:
            if review.rating is not None:
                v = Document(self.review_to_words(review.text),
                             type=int(review.rating), stopwords=True)
                arr.append(v)
        print self.nb.test(arr, target=None)

    def nb_classify_tweets(self, tvshow, tweets):
        ratingSum = 0
        tweet_docs = [(self.nb.classify(Document(self.review_to_words(tweet))),
                       self.review_to_words(tweet)) for tweet in tweets]
        for tweet in tweet_docs:
            ratingSum += tweet[0]
        self.nb_stats()
        Statistics().printStats(tvshow, ratingSum, len(tweet_docs))
        print self.nb.distribution
        return Statistics().get_stats(tvshow, ratingSum, len(tweet_docs))

    def nb_stats(self):
        print('----------- Classifier stats -----------')
        # print("Features: ", self.nb.features)
        print("Classes: ", self.nb.classes)
        print("Skewness: ", self.nb.skewness)
        print("Distribution: ", self.nb.distribution)
        print("Majority: ", self.nb.majority)
        print("Minority: ", self.nb.minority)

    def review_to_words(self, raw_review):
        # Remove URLs
        no_url = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
                        "", raw_review)
        # Remove numerics
        letters_only = re.sub("[^a-zA-Z]", " ", no_url)
        # to lowercase
        words = letters_only.lower().split()
        # remove stop words - the, of, a ...
        stops = set(stopwords.words("english"))
        meaningful_words = [w for w in words if w not in stops]
        return " ".join(meaningful_words)
""" views imports app, auth, and models, but none of these import views """ from flask import render_template, redirect, request, url_for, jsonify from flask.ext.classy import FlaskView from app import app from auth import auth from models import User # Classifier, CSV Loading from pattern.vector import Document, NB from pattern.db import Datasheet # Load classifier nb = NB.load("project/data/amazonClassifier") @app.route('/classify', methods=['POST']) def classify_review(): text = request.form.get('text') return jsonify(result=nb.classify(text.strip())) class BaseView(FlaskView): '''Basic views, such as the home and about page.''' route_base = '/' def index(self): return render_template('home.html') BaseView.register(app)
import xml.etree.ElementTree as xmlTree
from pattern.vector import Document, NB, count, words
from pattern.web import plaintext
from pattern.db import csv
from collections import Counter

nb = NB()
wordStats = Counter()
opinionStats = Counter({'positive': 0, 'negative': 0, 'overall': 0})

for grade, opinion in csv('trainData.csv', separator='\t'):
    comment = Document(opinion, type=int(grade), stopwords=True)
    nb.train(comment)

tree = xmlTree.parse("Posts.xml")
root = tree.getroot()
for row in root:
    doc = Document(plaintext(row.attrib['Body']),
                   filter=lambda w: w.strip("'").isalpha() and len(w) > 1,
                   stopwords=False)
    opinion = nb.classify(doc)
    opinionStats['overall'] += 1
    if opinion > 0:
        opinionStats['positive'] += 1
    else:
        opinionStats['negative'] += 1
    wordStats += Counter(doc.words)

print wordStats.most_common(10)
print opinionStats
from pattern.vector import NB, count, words, chngrams
from pattern.vector import kfoldcv, fsel

# Statistical machine learning is a branch of AI that can be used
# to learn the "type" of unknown things, based on a "training set"
# of known things. For these known things, we already know the type,
# and we have a description of each thing, called a "vector".
# A vector is just a Python dictionary of features and feature weights.

# ------------------------------------------------------------------------------------
# The simplest classification algorithm is Naive Bayes,
# but it works quite well with text.

# A trivial example with animal features:
nb = NB()
nb.train({"swim": 1, "fin": 2, "legs": 0, "wings": 0}, type="fish")
nb.train({"swim": 0, "fin": 0, "legs": 1, "wings": 2}, type="bird")
nb.train({"swim": 1, "fin": 0, "legs": 4, "wings": 0}, type="mammal")

print nb.classify({"legs": 4})
print

# ------------------------------------------------------------------------------------
# For text, usually the word order is discarded in favor of word count.
# This is called a "bag-of-words" model, i.e., we count each word in a
# document. We can then compare it to other documents to see if they
# have frequent words in common. If so, they probably belong to the
# same class or type.
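# A short sketch of the bag-of-words idea described above, using the count()
# and words() helpers imported at the top of this snippet; the sentence is
# invented for illustration.
bow = count(words("the cat sat on the mat while the other cat slept"))
print bow  # e.g. {'the': 3, 'cat': 2, 'sat': 1, ...} -- word order is discarded, counts remain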
from pattern.vector import Document, NB, SVM
from pattern.db import Datasheet

data = Datasheet.load('reviews.csv', headers=True)
print data
datalist = [(review, int(rating)) for review, rating in data]
print datalist
datadocs = [Document(review, type=rating, stopwords=True)
            for review, rating in datalist]
print datadocs

# Naive Bayes
# training set
nb = NB(train=datadocs[:500])
print 'nb distribution = ', nb.distribution
print 'nb confusion matrix = ', nb.confusion_matrix(datadocs[500:])
print 'nb confusion matrix for each class = ', nb.confusion_matrix(datadocs[500:])(True)  # (TP, TN, FP, FN)
print 'nb features = ', nb.features

# test set
accuracy, precision, recall, f1 = nb.test(datadocs[500:])
print 'nb accuracy = ', accuracy, 'nb precision = ', precision, 'nb recall = ', recall

# test SVM
testsvm = SVM(train=datadocs[:500])
print 'svm features = ', testsvm.features
saccuracy, sprecision, srecall, sf1 = testsvm.test(datadocs[500:])
print 'svm accuracy = ', saccuracy

# classifier training example with test classification
documents = []
for score, message in data:
    document = Document(message, type=int(score) > 0)
    documents.append(document)

m = Model(documents)

print "number of documents:", len(m)
print "number of words:", len(m.vector)
print "number of words (average):", sum(len(d.features) for d in m.documents) / float(len(m))
print

# Train Naive Bayes on all documents.
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)

# We can now ask it questions about unknown e-mails:
print classifier.classify("win money")  # False: most likely spam.
print classifier.classify("fix bug")    # True: most likely a real message.
print
print classifier.classify("customer")   # False: people don't talk like this on developer lists...
print classifier.classify("guys")       # True: because most likely everyone knows everyone.
print

# To test the accuracy of a classifier,
# we typically use 10-fold cross validation.
result.close()
result = open('result.txt', "r")
documents = []
for linea in result.readlines():
    document = Document(linea.split("/")[1], type=linea.split("/")[0])
    documents.append(document)

m = Model(documents)
nb = NB()
for document in m:
    nb.train(document)

print nb.classify("opinion")
print nb.classify("noticia")

from pattern.vector import k_fold_cv
print k_fold_cv(NB, documents=m, folds=10)

# for palabra in lines:
#     print d.get(palabra)

"""
1. Get the results right