def resolve_certainty(certainty_info):
    '''Resolve certainty with Naive Bayes'''
    if certainty_info == '':
        return 'No certainty info.'
    nb = NB()
    for observation, certainty in csv('library/templatetags/c_training_data.csv'):
        v = Document(observation, type=int(certainty), stopwords=True)
        nb.train(v)
    return nb.classify(Document(certainty_info))
def extractSentiment(characterSentences):
    """
    Trains a Naive Bayes classifier object with the reviews.csv file,
    analyzes each sentence, and returns the tones per character.
    """
    nb = NB()
    characterTones = defaultdict(list)
    for review, rating in csv("reviews.csv"):
        nb.train(Document(review, type=int(rating), stopwords=True))
    for key, value in characterSentences.items():
        for x in value:
            characterTones[key].append(nb.classify(str(x)))
    return characterTones
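# A minimal usage sketch for extractSentiment(), assuming reviews.csv is
# present and that characterSentences maps character names to lists of
# sentence-like objects (anything whose str() is the sentence text):
characterSentences = {
    'Alice': ['Alice was beginning to get very tired.', 'How cheerful he seems!'],
    'Queen': ['Off with their heads!'],
}
for name, tones in extractSentiment(characterSentences).items():
    print(name, tones)  # one predicted rating per sentence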
def classifyTweets(filename, trainingSet):
    print('Classifying {}...\n'.format(filename))
    with open('{}/processed{}'.format(INPUT_PATH, filename.capitalize()), 'r') as data:
        reader = csv.reader(data)
        info = list(reader)
    classifier = NB(train=trainingSet, alpha=0.0001)
    tweets = []
    for row in info:
        tweet = row[0]
        result = classifier.classify(Document(tweet))
        tweets.append([tweet, result])
    # Write all tweets to file
    with open('{}/results.csv'.format(OUTPUT_PATH), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(tweets)
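# A hedged sketch of how trainingSet might be built for classifyTweets():
# a list of labeled pattern.vector Documents, mirroring the other training
# loops in this section. The CSV file name and its (text, label) column
# layout are assumptions:
from pattern.db import csv as pattern_csv
from pattern.vector import NB, Document

trainingSet = [Document(text, type=int(label), stopwords=True)
               for text, label in pattern_csv('labeled_tweets.csv')]
classifyTweets('monday', trainingSet)  # assumes INPUT_PATH/processedMonday exists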
class NBModel:
    def __init__(self):
        self.nb = NB()
        self.stats = Statistics()
        try:
            # self.nb = self.nb.load("./nb_training.p")
            self.new_nb_model = True
        except IOError:
            self.new_nb_model = False
            print("Creating new NB model")

    def naive_bayes_train(self, reviews):
        for review in reviews:
            if review.rating is not None and 1 < review.rating < 10:
                v = Document(review.text, type=int(review.rating), stopwords=True)
                self.nb.train(v)
        # self.nb.save("./nb_training.p")
        # print(self.nb.classes)

    def nb_test_imdb(self, reviews):
        arr = []
        for review in reviews:
            if review.rating is not None:
                v = Document(review.text, type=int(review.rating), stopwords=True)
                arr.append(v)
        print(self.nb.test(arr, target=None))

    def nb_classify_tweets(self, tvshow, tweets):
        ratingSum = 0
        tweet_docs = [(self.nb.classify(Document(tweet)), tweet) for tweet in tweets]
        for tweet in tweet_docs:
            ratingSum += tweet[0]
        self.nb_stats()
        Statistics().printStats(tvshow, ratingSum, len(tweet_docs))
        print(self.nb.distribution)

    def nb_stats(self):
        print('----------- Classifier stats -----------')
        # print("Features: ", self.nb.features)
        print("Classes: ", self.nb.classes)
        print("Skewness: ", self.nb.skewness)
        print("Distribution: ", self.nb.distribution)
        print("Majority: ", self.nb.majority)
        print("Minority: ", self.nb.minority)
from pattern.db import csv
from pattern.vector import Document, Model, Cluster, HIERARCHICAL, NB

d1 = Document('Cats are independent pets.', name='cat')
d2 = Document('Dogs are trustworthy pets.', name='dog')
d3 = Document('Boxes are made of cardboard.', name='box')
m = Model((d1, d2, d3))
print(m.cluster(method=HIERARCHICAL, k=2))

# hierarchical clustering
cluster = Cluster((1, Cluster((2, Cluster((3, 4))))))
print(cluster.depth)
print(cluster.flatten(1))

# training a classifier
nb = NB()
for review, rating in csv('data/input/reviews.csv'):
    v = Document(review, type=int(rating), stopwords=True)
    nb.train(v)
print(nb.classes)
print(nb.classify(Document('A good movie!')))

# testing a classifier
data = csv('data/input/reviews.csv')
data = [(review, int(rating)) for review, rating in data]
data = [Document(review, type=rating, stopwords=True) for review, rating in data]
nb = NB(train=data[:500])
accuracy, precision, recall, f1 = nb.test(data[500:])
print(accuracy)

# binary classification (a rating of 3 or more counts as positive)
data = csv('data/input/reviews.csv')
data = [(review, int(rating) >= 3) for review, rating in data]
data = [Document(review, type=rating, stopwords=True) for review, rating in data]
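# The binary-classification example above stops after building the
# documents; a minimal sketch of the remaining train/test step, mirroring
# the "testing a classifier" section:
nb_bin = NB(train=data[:500])
accuracy, precision, recall, f1 = nb_bin.test(data[500:])
print(accuracy)  # fraction of held-out reviews whose True/False label is predicted correctly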
import html
import json
import re
import string
from random import shuffle

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pattern.vector import NB, Document


class Emoticon_Sentiment:
    """
    Use emoticon sentiment to classify and predict comments.

    This class provides a binary sentiment analyzer (1 = positive,
    -1 = negative). It is created automatically, using emoticons from
    thousands of comments.
    """

    def __init__(self, basefile):
        """
        Initialize the emoticon sentiment analyzer and set up variables.

        Args:
            basefile (str): The location of the comments file that should
                serve as the basis for the classifier.
        """
        self.base = basefile
        self.comments = []
        self.nb = NB()
        self.stop = stopwords.words('english')
        self.positive = [':-)', ':)', ':D', ':o)', ':]', ':3', ':c)', ':>',
                         '=]', '8)', '=)', ':}', ':^)', ':っ)', ':-D', '8-D',
                         '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D', '=-3',
                         '=3', 'B^D', ':-))', ':*', ':^*', ';-)', ';)', '*-)',
                         '*)', ';-]', ';]', ';D', ';^)']
        self.negative = ['>:[', ':-(', ':(', ':-c', ':c', ':-<', ':っC', ':<',
                         ':-[', ':[', ':{', ':-||', ':@', '>:(', 'D:<', 'D:',
                         'D8', 'D;', 'D=', 'DX', 'v.v', '>:)', '>;)', '>:-)',
                         '}:-)', '}:)', '3:-)', '3:)', ':-###..', ':###..',
                         '>:/', ':-/', ':-.', ':/']

    def read_base_comments(self):
        """
        Read the file that contains the comments serving as the base for
        the classifier.

        Returns:
            list: list of comments from the text file
        """
        with open(self.base, 'r') as f:
            self.comments = json.load(f)
        return self.comments

    def preprocess(self, comment):
        """
        Preprocess a comment: convert to lowercase, remove links, delete
        punctuation, and drop stopwords and numbers.

        Args:
            comment (str): The text string to be processed

        Returns:
            str: the processed text string
        """
        text = html.unescape(comment.lower())
        p = re.compile(r"(\b(?:(?:https?|ftp|file)://|www\.|ftp\.)"
                       r"[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*"
                       r"[-A-Za-z0-9+&@#/%=~_()|])")
        text = re.sub(p, '', text)
        exclude = set(string.punctuation)
        tokens = word_tokenize(text)
        cleaned = [token for token in tokens if token not in exclude]
        text = ' '.join([w.encode('ascii', errors='ignore').decode('ascii')
                         for w in cleaned
                         if w.lower() not in self.stop and not w.isdigit()])
        return text

    def parse_comments(self):
        """
        Parse a list of comments, searching for emoticons. A found emoticon
        flags the comment positive or negative; multiple finds are summed,
        and neutral comments are thrown out.

        Returns:
            list: 2-dimensional list of parsed comments and their
                sentiment flags
        """
        extended_comments = [[comment, 0, False] for comment in self.comments]
        # Loop through comments and search for emoticons: +1 if positive,
        # -1 if negative, and remove the emoticons from the text.
        for comment in extended_comments:
            for n in self.negative:
                if n in comment[0]:
                    comment[1] -= 1
                    comment[0] = comment[0].replace(n, '')
                    comment[2] = True
            for p in self.positive:
                if p in comment[0]:
                    comment[1] += 1
                    comment[0] = comment[0].replace(p, '')
                    comment[2] = True
        # Throw out comments that have no emoticons.
        parsed_comments = [[comment[0], comment[1]]
                           for comment in extended_comments if comment[2]]
        # Normalize the scores by clamping all positives (1, 2, 3, ...) to 1
        # and all negatives (-1, -2, -3, ...) to -1.
        for item in parsed_comments:
            if item[1] > 1:
                item[1] = 1
            elif item[1] < -1:
                item[1] = -1
        # Throw out neutrals.
        parsed_comments = [[self.preprocess(i[0]), i[1]]
                           for i in parsed_comments if i[1] != 0]
        return parsed_comments

    def train_classifier(self, comments):
        """
        Take the flagged comments and train a Naive Bayes classifier.
        Args:
            comments (list): The list of flagged comments to train on
        """
        shuffle(comments)
        for comment in comments:
            v = Document(comment[0], type=int(comment[1]))
            self.nb.train(v)

    def predict(self, text):
        """
        Take a comment and make a sentiment prediction.

        Args:
            text (str): The text string to be analyzed for sentiment

        Returns:
            int: 1 if positive sentiment, -1 if negative
        """
        return self.nb.classify(text)
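# A minimal usage sketch for Emoticon_Sentiment, assuming a hypothetical
# JSON file containing a list of comment strings:
analyzer = Emoticon_Sentiment('comments.json')  # path is an assumption
analyzer.read_base_comments()
flagged = analyzer.parse_comments()
analyzer.train_classifier(flagged)
print(analyzer.predict('this video is great'))  # expected: 1 or -1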
from collections import Counter
import xml.etree.ElementTree as xmlTree

from pattern.db import csv
from pattern.vector import NB, Document
from pattern.web import plaintext

nb = NB()
wordStats = Counter()
opinionStats = Counter({'positive': 0, 'negative': 0, 'overall': 0})
for grade, opinion in csv('trainData.csv', separator='\t'):
    comment = Document(opinion, type=int(grade), stopwords=True)
    nb.train(comment)

tree = xmlTree.parse("Posts.xml")
root = tree.getroot()
for row in root:
    doc = Document(plaintext(row.attrib['Body']),
                   filter=lambda w: w.strip("'").isalpha() and len(w) > 1,
                   stopwords=False)
    opinion = nb.classify(doc)
    opinionStats['overall'] += 1
    if opinion > 0:
        opinionStats['positive'] += 1
    else:
        opinionStats['negative'] += 1
    wordStats += Counter(doc.words)

print(wordStats.most_common(10))
print(opinionStats)
# of known things. For these known things, we already know the type,
# and we have a description of each thing, called a "vector".
# A vector is just a Python dictionary of features and feature weights.
# ------------------------------------------------------------------------------------
# The simplest classification algorithm is Naive Bayes,
# but it works quite well with text.
# A trivial example with animal features:
nb = NB()
nb.train({"swim": 1, "fin": 2, "legs": 0, "wings": 0}, type="fish")
nb.train({"swim": 0, "fin": 0, "legs": 1, "wings": 2}, type="bird")
nb.train({"swim": 1, "fin": 0, "legs": 4, "wings": 0}, type="mammal")
print(nb.classify({"legs": 4}))
print()
# ------------------------------------------------------------------------------------
# For text, the word order is usually discarded in favor of word count.
# This is called a "bag-of-words" model, i.e., we count each word in a
# document. We can then compare it to other documents to see if they
# have frequent words in common. If so, they probably belong to the
# same class or type.
# For example, if we have 10,000 product reviews + star ratings,
# we can transform each review into a vector of adjective => count,
# use the star rating as the vector type (e.g., * = -1, ***** = +1),
# and use this training set to predict the star rating of other reviews.
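# A hedged sketch of the review-to-vector idea above, using pattern.en's
# tag() for part-of-speech tags (JJ = adjective); the reviews and star
# ratings below are made-up examples:
from collections import Counter
from pattern.en import tag
from pattern.vector import NB

def adjective_vector(review):
    # Bag-of-words over adjectives only: {adjective: count}.
    return Counter(w.lower() for w, pos in tag(review) if pos == 'JJ')

nb_stars = NB()
nb_stars.train(adjective_vector('Great plot, wonderful acting!'), type=+1)  # *****
nb_stars.train(adjective_vector('Dull story and terrible sound.'), type=-1)  # *
print(nb_stars.classify(adjective_vector('A wonderful film!')))  # likely +1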
print "number of words:", len(m.vector) print "number of words (average):", sum(len(d.features) for d in m.documents) / float(len(m)) print # Train Naive Bayes on all documents. # Each document has a type: True for actual e-mail, False for spam. # This results in a "binary" classifier that either answers True or False # for unknown documents. classifier = NB() for document in m: classifier.train(document) # We can now ask it questions about unknown e-mails: print classifier.classify("win money") # False: most likely spam. print classifier.classify("fix bug") # True: most likely a real message. print print classifier.classify( "customer") # False: people don't talk like this on developer lists... print classifier.classify( "guys") # True: because most likely everyone knows everyone. print # To test the accuracy of a classifier, # we typically use 10-fold cross validation. # This means that 10 individual tests are performed, # each with 90% of the corpus as training data and 10% as testing data. from pattern.vector import k_fold_cv print k_fold_cv(Bayes, documents=m, folds=10)
import datetime
import random

from pattern.vector import Document, Model, NB, PORTER, TFIDF


class NBClassifier:
    """
    This class interfaces a pattern corpus with a pattern.vector NB
    classifier.
    """

    def __init__(self, corpus, **kwargs):
        """
        Initializes the NBClassifier class with a corpus and an NB instance
        (input): corpus = a corpus of pattern Documents constructed from Grams
        """
        self.corpus = corpus
        self.documents = self.corpus.documents
        self.model = Model(documents=self.documents, weight=TFIDF)
        # self.documents.words = self.documents.keywords
        self.split_idx = len(self) // 4
        self.nb = NB()

    def __len__(self):
        return len(self.documents)

    def classify_document(self, document):
        """
        Classify a document with the NB instance
        (input): document = Document instance with same format as classifier train set
        (output): classification result
        """
        return self.nb.classify(Document(document, stemmer=PORTER), discrete=True)

    def nb_train(self):
        """
        Trains the classifier with (3/4) of the documents
        (input): None
        (output): trains self.nb
        """
        train_start = datetime.datetime.now()
        print("training with {0} docs".format(len(self) - self.split_idx))
        documents = self.documents[:-self.split_idx]
        random.shuffle(documents)
        for doc in documents:
            self.nb.train(doc)
        train_end = datetime.datetime.now()
        print("training {0} docs took {1} seconds".format(
            len(documents), (train_end - train_start).seconds))

    def nb_test(self, gold_standard=None):
        """
        Evaluates the classifier on (1/4) of the documents
        (input): None
        (output): accuracy, precision, recall, f1
        """
        if not gold_standard:
            test_docs = self.documents[-self.split_idx:]
            conf_matrix = self.nb.confusion_matrix(test_docs)
            print(conf_matrix.table)
            return self.nb.test(test_docs)
        return self.nb.test(gold_standard)

    def get_documentlabel_distribution(self):
        """
        Gets the label distribution of the document set passed into the
        self.nb classifier
        (input): None
        (output): distribution = dictionary representation of the
                  label:frequency distribution
                  rankdistribution = a ranked list of label keys by frequency
        """
        distribution = self.nb.distribution
        rankdistribution = sorted(distribution, key=lambda x: distribution[x],
                                  reverse=True)
        for each, key in enumerate(rankdistribution[:10]):
            print("{0}:\t{1} ({2})".format(each, key, distribution[key]))
        return distribution, rankdistribution

    def run(self):
        """
        Executes training / testing of the classifier
        """
        print("training and testing {0} total documents...".format(len(self)))
        self.nb_train()
        print("testing")
        result = self.nb_test()
        print("finalizing")
        self.nb.finalize()
        return result
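# A minimal usage sketch for NBClassifier, assuming a hypothetical corpus
# object that exposes .documents as a list of labeled pattern Documents
# (the real corpus is built from Grams, which is not shown here):
class FakeCorpus(object):
    def __init__(self, documents):
        self.documents = documents

docs = [Document('a good movie', type=1), Document('a poor movie', type=-1)] * 200
accuracy, precision, recall, f1 = NBClassifier(FakeCorpus(docs)).run()
print(accuracy)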
recall

# test SVM
# datadocs/data: labeled Documents built earlier (not shown in this snippet).
testsvm = SVM(train=datadocs[:500])
print('svm features =', testsvm.features)
saccuracy, sprecision, srecall, sf1 = testsvm.test(datadocs[500:])
print('svm accuracy =', saccuracy)

# classifier training example with test classification
nb2 = NB()
for review, rating in data:
    v = Document(review, type=int(rating))
    # print(v.vector)
    nb2.train(v)
print('nb2 classes', nb2.classes)
print('test classification', nb2.classify(Document('A poor movie!')))

# cosine similarity example
from pattern.vector import Vector, distance
v1 = Vector({"curiosity": 1, "kill": 1, "cat": 1})
v2 = Vector({"curiosity": 1, "explore": 1, "mars": 1})
print('cosine similarity between two vectors', 1 - distance(v1, v2))

# a model is a collection of Document objects
# todo: now that we've built the SVM, create documents out of dummy tweets (see the sketch below)
# todo: then add them to the SVM classifier as a training and test set
# todo: build db schema to save tweets and relationships to db
# todo: get some sample twitter data in db
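# A hedged sketch of the first two todos above: build Documents from dummy
# tweets and split them into an SVM training and test set. The tweet texts
# and labels are made-up placeholders:
from pattern.vector import SVM, Document

dummy_tweets = [('loving this show', 1), ('what a waste of time', -1)] * 100
tweet_docs = [Document(text, type=label, stopwords=True)
              for text, label in dummy_tweets]
tweet_svm = SVM(train=tweet_docs[:150])
print(tweet_svm.test(tweet_docs[150:]))  # (accuracy, precision, recall, f1)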
class NBModel:
    def __init__(self):
        self.nb = NB()
        self.stats = Statistics()
        try:
            print("dir: " + os.getcwd())
            if os.getcwd().endswith("tv_ratings_frontend"):
                print("Working in django")
                self.nb = self.nb.load("ratings_frontend/backend/pattern_ml/nb_training.p")
            else:
                print("Not working in django")
                self.nb = self.nb.load("./nb_training.p")
            self.new_nb_model = True
            print("Using existing pickled model")
        except IOError:
            self.new_nb_model = False
            print("Creating new NB model")

    def nb_train_text(self, reviews):
        for review in reviews:
            if review.rating is not None:  # and 1 < review.rating < 10:
                v = Document(review.text, type=int(review.rating), stopwords=True)
                self.nb.train(v)
        self.nb.save("./nb_training.p")
        # print(self.nb.classes)

    def nb_train_summary(self, reviews):
        for review in reviews:
            if review.rating is not None:  # and 1 < review.rating < 10:
                v = Document(review.summary, type=int(review.rating), stopwords=True)
                self.nb.train(v)

    def nb_train_all_text(self, review_set):
        for review_list in review_set:
            self.nb_train_text(review_list)
        self.save_model()

    def save_model(self):
        self.nb.save('./nb_training.p')

    def nb_test_imdb(self, reviews):
        arr = []
        for review in reviews:
            if review.rating is not None:
                v = Document(self.review_to_words(review.text),
                             type=int(review.rating), stopwords=True)
                arr.append(v)
        print(self.nb.test(arr, target=None))

    def nb_classify_tweets(self, tvshow, tweets):
        ratingSum = 0
        tweet_docs = [(self.nb.classify(Document(self.review_to_words(tweet))),
                       self.review_to_words(tweet)) for tweet in tweets]
        for tweet in tweet_docs:
            ratingSum += tweet[0]
        self.nb_stats()
        Statistics().printStats(tvshow, ratingSum, len(tweet_docs))
        print(self.nb.distribution)
        return Statistics().get_stats(tvshow, ratingSum, len(tweet_docs))

    def nb_stats(self):
        print('----------- Classifier stats -----------')
        # print("Features: ", self.nb.features)
        print("Classes: ", self.nb.classes)
        print("Skewness: ", self.nb.skewness)
        print("Distribution: ", self.nb.distribution)
        print("Majority: ", self.nb.majority)
        print("Minority: ", self.nb.minority)

    def review_to_words(self, raw_review):
        # Remove URLs.
        no_url = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
                        "", raw_review)
        # Remove everything but letters (including numerics).
        letters_only = re.sub("[^a-zA-Z]", " ", no_url)
        # To lowercase.
        words = letters_only.lower().split()
        # Remove stop words: the, of, a, ...
        stops = set(stopwords.words("english"))
        meaningful_words = [w for w in words if w not in stops]
        return " ".join(meaningful_words)
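# A minimal usage sketch for NBModel, assuming hypothetical Review objects
# with .text, .summary and .rating (1-10 star scale) attributes, and that
# the project's Statistics helper is importable alongside the class above:
class Review(object):
    def __init__(self, text, rating, summary=''):
        self.text, self.rating, self.summary = text, rating, summary

model = NBModel()
model.nb_train_text([Review('great pilot episode', 9),
                     Review('boring and predictable', 2)])
stats = model.nb_classify_tweets('SomeShow', ["loved tonight's episode"])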
print("number of words:", len(m.vector)) print("number of words (average):", sum(len(d.features) for d in m.documents) / float(len(m))) print() # Train Naive Bayes on all documents. # Each document has a type: True for actual e-mail, False for spam. # This results in a "binary" classifier that either answers True or False # for unknown documents. classifier = NB() for document in m: classifier.train(document) # We can now ask it questions about unknown e-mails: print(classifier.classify("win money")) # False: most likely spam. print(classifier.classify("fix bug")) # True: most likely a real message. print() # False: people don't talk like this on developer lists... print(classifier.classify("customer")) # True: because most likely everyone knows everyone. print(classifier.classify("guys")) print() # To test the accuracy of a classifier, # we typically use 10-fold cross validation. # This means that 10 individual tests are performed, # each with 90% of the corpus as training data and 10% as testing data. from pattern.vector import k_fold_cv print(k_fold_cv(NB, documents=m, folds=10))
print "number of documents:", len(m) print "number of words:", len(m.vector) print "number of words (average):", sum(len(d.features) for d in m.documents) / float(len(m)) print # Train Naive Bayes on all documents. # Each document has a type: True for actual e-mail, False for spam. # This results in a "binary" classifier that either answers True or False # for unknown documents. classifier = NB() for document in m: classifier.train(document) # We can now ask it questions about unknown e-mails: print classifier.classify("win money") # False: most likely spam. print classifier.classify("fix bug") # True: most likely a real message. print print classifier.classify("customer") # False: people don't talk like this on developer lists... print classifier.classify("guys") # True: because most likely everyone knows everyone. print # To test the accuracy of a classifier, # we typically use 10-fold cross validation. # This means that 10 individual tests are performed, # each with 90% of the corpus as training data and 10% as testing data. from pattern.vector import k_fold_cv print k_fold_cv(Bayes, documents=m, folds=10) # This yields 4 scores: (Accuracy, Precision, Recall, F-score).
print("number of words:", len(m.vector)) print("number of words (average):", sum(len(d.features) for d in m.documents) / float(len(m))) print() # Train Naive Bayes on all documents. # Each document has a type: True for actual e-mail, False for spam. # This results in a "binary" classifier that either answers True or False # for unknown documents. classifier = NB() for document in m: classifier.train(document) # We can now ask it questions about unknown e-mails: print(classifier.classify("win money")) # False: most likely spam. print(classifier.classify("fix bug")) # True: most likely a real message. print() # False: people don't talk like this on developer lists... print(classifier.classify("customer")) # True: because most likely everyone knows everyone. print(classifier.classify("guys")) print() # To test the accuracy of a classifier, # we typically use 10-fold cross validation. # This means that 10 individual tests are performed, # each with 90% of the corpus as training data and 10% as testing data. from pattern.vector import k_fold_cv
from pattern.vector import Document, Model, NB, k_fold_cv

documents = []
# result: an open file whose lines have the form "type/text".
for linea in result.readlines():
    document = Document(linea.split("/")[1], type=linea.split("/")[0])
    documents.append(document)

m = Model(documents)
nb = NB()
for document in m:
    nb.train(document)

print(nb.classify("opinion"))
print(nb.classify("noticia"))

print(k_fold_cv(NB, documents=m, folds=10))

# for palabra in lines:
#     print(d.get(palabra))

"""
1. Properly extract the results  #HuracanPatricia #AguaEnMarte
2. Update the poster according to the results and focus on development
"""