def train_classifiers(self):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg')
                for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos')
                for f in posids]
    trainfeats = negfeats + posfeats
    # train naive bayes
    self.classifier = NaiveBayesClassifier.train(trainfeats)
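word_feats() is used throughout this section but never defined in it. A minimal sketch, assuming a plain bag-of-words extractor over the NLTK movie_reviews corpus (the optional cap argument and the string-splitting branch are guesses from the call sites, not the project's actual code):

import nltk
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier

def word_feats(words, n=None):
    # Hypothetical bag-of-words feature extractor; not necessarily the
    # project's implementation. It is called both with token lists (training)
    # and with raw strings (classification), so strings are split first.
    if isinstance(words, basestring):
        words = words.split()
    if n is not None:
        # optional cap on the number of word features, as in word_feats(docwords, 2000)
        words = words[:n]
    return dict([(word, True) for word in words])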
def getCategoryProbabilityFromDocument(self, document):
    if self._nltkClassifier is None:
        raise ClassifierException(
            "The classifier must first be loaded by the loadClassifier() method")
    docwords = [word for word in document_as_words(document)
                if word not in nltk.corpus.stopwords.words('english')]
    bigrams = bigram_feats(docwords, 200)
    words = word_feats(docwords, 2000)
    featureset = dict(bigrams.items() + words.items())
    probdist = self._nltkClassifier.prob_classify(featureset)
    results = []
    for sample in probdist.samples():
        prob = probdist.prob(sample)
        if prob >= 0.001:
            results.append((prob, sample))
    return sorted(results, reverse=True)
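getCategoryProbabilityFromDocument() also relies on document_as_words(), bigram_feats() and ClassifierException, none of which appear in this section. A hedged sketch of what they could look like, using NLTK's collocation finder for the bigram features (the chi-squared scorer and the tokenization are assumptions):

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

class ClassifierException(Exception):
    # Hypothetical error type raised when the classifier has not been loaded.
    pass

def document_as_words(document):
    # Hypothetical helper: tokenize the raw document into lowercase words.
    return [w.lower() for w in nltk.word_tokenize(document)]

def bigram_feats(words, n=200):
    # Hypothetical helper: keep the n most informative bigrams as features.
    finder = BigramCollocationFinder.from_words(words)
    top = finder.nbest(BigramAssocMeasures.chi_sq, n)
    return dict([(bigram, True) for bigram in top])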
def proximity_semantic_score(self, i, sentences):
    score = 0
    # score the current sentence together with its neighbouring sentences
    for sent in sentences:
        # the sentences arrive as raw strings, so split them into words first
        for word in sent.split():
            if self.classifier.classify(word_feats(word)) == 'neg':
                score += 1
            for neg in self.negative_search_terms:
                if neg == word:
                    score += 1000
    return score
def min_proximity_query(self, verb_set, noun_set, sentence):
    pack = ('', '', len(sentence) + 1)  # (verb, noun, min_dist)
    for verb in verb_set:
        if verb not in sentence:
            continue
        for noun in noun_set:
            if noun not in sentence:
                continue
            dist = abs(sentence.index(verb) - sentence.index(noun))
            if dist < pack[2] and self.classifier.classify(
                    word_feats(verb + ' ' + noun)) == 'neg':
                pack = (verb, noun, dist)
    return pack[0].lower(), pack[1].lower()
def rocchio(self, max_num_sentences):
    # Scrape all the web pages of interest
    print 'Crawling target webpages and grabbing information...'
    html_sentences = []
    for url in self.web_links:
        print 'Grabbing info from url:', url
        html_sentences += scrap_web_page(url)
    print "\nProcessing %d sentences." % len(html_sentences)

    # Count the number of pos/neg sentences and record all negative sentences
    print 'Running Rocchio()'
    neg_sentences = []
    poscount, negcount = 0, 0
    for i in xrange(len(html_sentences)):
        sent = html_sentences[i]
        sentiment = self.classifier.classify(word_feats(sent))
        if sentiment == 'pos':
            poscount += 2.5  # positives are worth 250% that of negatives
        else:
            negcount += 1
            neg_sentences.append(sent)
    poscount = math.ceil(poscount)

    # Compute sentiment with Rocchio as the balancer
    print 'pos_count =', poscount
    print 'neg_count =', negcount
    sentiment = 'neutral'
    level = 'mildly'
    if poscount > negcount:
        sentiment = 'safe'
    elif poscount < negcount:
        sentiment = 'dangerous'
    # check the wider 4x margin first so the 'very' level is reachable
    if poscount >= 4 * negcount or negcount >= 4 * poscount:
        level = 'very'
    elif poscount >= 2 * negcount or negcount >= 2 * poscount:
        level = 'relatively'

    # Record danger level as interpreted by Rocchio using Naive Bayes as a
    # subroutine
    self.danger_r1 = level + ' ' + sentiment

    # Randomly sample from the remaining negative sentences
    samp = min(max_num_sentences, len(neg_sentences))
    html_sentences = random.sample(neg_sentences, samp)
    return self.create_results(html_sentences)
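scrap_web_page() is not defined in this section either. A rough, hypothetical stand-in (the urllib2/BeautifulSoup combination is an assumption, not necessarily what the project uses) that returns one string per sentence, as rocchio() expects:

import urllib2
import nltk
from bs4 import BeautifulSoup

def scrap_web_page(url):
    # Hypothetical scraper: fetch the page, strip markup, return sentence strings.
    html = urllib2.urlopen(url).read()
    text = BeautifulSoup(html, 'html.parser').get_text()
    return nltk.sent_tokenize(text)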
def create_results(self, orig_sentences):
    print 'Running nltk subroutines in create_results()'
    # create a list of all tokens in each sentence
    sentences = [nltk.word_tokenize(sent) for sent in orig_sentences]
    # record the tokenized sentences separately
    tokenized_sentences = sentences

    # multi-threaded pos_tag assignment
    print 'Running multi-threaded "part-of-speech" tagging of web page results'
    t1 = myThread(1, 3, tokenized_sentences)
    t2 = myThread(2, 3, tokenized_sentences)
    t3 = myThread(3, 3, tokenized_sentences)
    # TODO :: currently hard-coded to 3 threads for now
    threads = []
    t1.start()
    t2.start()
    t3.start()
    threads.append(t1)
    threads.append(t2)
    threads.append(t3)
    # Wait for all threads to be done
    for t in threads:
        t.join()
    # tag parts of speech for each word
    sentences = t1.results + t2.results + t3.results

    print "Constructing Grammars..."
    # Verb Extraction Grammar
    grammar = r"""
        VERBS: {<V.*>}
               }<VBZ>{
    """
    # Verb Regex Parser (Finds effects)
    cp_effect = nltk.RegexpParser(grammar)
    # Noun Extraction Grammar
    grammar2 = r'NOUNS: {<NN|NP>}'
    # Noun Regex Parser (Finds causes)
    cp_cause = nltk.RegexpParser(grammar2)

    verbs = []
    nouns = []
    # Gather only the most negative sentences and process them with nltk
    #
    # The reason behind this is that we want to have two lists produced:
    #
    # List verbs[] and List nouns[], both of which are lists of lists where
    # each sublist corresponds to an individual 'negative sentence'.
    print "Parsing all sentences and collecting verbs and nouns..."
    for sent in sentences:
        # Collect Negative Verbs
        some_verbs = []
        tree1 = cp_effect.parse(sent)
        for subtree in tree1.subtrees():
            if subtree.node in ['VERBS']:
                term = self.lmtzr.lemmatize(subtree[0][0], 'v')
                if self.classifier.classify(word_feats(term)) == 'neg':
                    some_verbs.append(term)
        verbs.append(some_verbs)
        # Collect Nouns
        some_nouns = []
        tree2 = cp_cause.parse(sent)
        for subtree in tree2.subtrees():
            if subtree.node in ['NOUNS']:
                term = self.lmtzr.lemmatize(subtree[0][0], 'n')
                some_nouns.append(term)
        nouns.append(some_nouns)

    # Find the most negative verb/noun pairs and produce 'good' phrases
    print "Collecting (verb,noun) pairings..."
    phrases = []
    num_sents = len(tokenized_sentences)
    for i in xrange(num_sents):
        verb_set = verbs[i]
        noun_set = nouns[i]
        if len(verb_set) == 0 or len(noun_set) == 0:
            continue
        v, n = self.min_proximity_query(
            verb_set, noun_set, tokenized_sentences[i])
        rating = self.proximity_semantic_score(
            i, orig_sentences[i - 1:i + 1])
        if len(v) > 2 and len(n) > 2:
            phrases.append((v + ' ' + n, rating, orig_sentences[i]))
    print "len phrases = ", len(phrases)
    print "len nouns = ", len(nouns)
    print "len verbs = ", len(verbs)

    # Returning 10 randomly sampled results, fix this!
    print "Removing duplicates and sorting phrases by negativity scores."
    phrases = list(set(phrases))
    phrases = sorted(phrases, key=lambda tup: tup[1], reverse=True)
    print "Returning top 10 phrases."
    return phrases[:10]
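The myThread worker used above is also undefined in this section. A minimal sketch, assuming each thread POS-tags a contiguous chunk of the tokenized sentences so that t1.results + t2.results + t3.results preserves the original sentence order:

import math
import threading
import nltk

class myThread(threading.Thread):
    # Hypothetical sketch of the POS-tagging worker used in create_results();
    # the real implementation may partition the work differently.
    def __init__(self, thread_id, num_threads, sentences):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.num_threads = num_threads
        self.sentences = sentences
        self.results = []

    def run(self):
        # Tag a contiguous chunk so concatenating the per-thread results
        # keeps the sentences in their original order.
        chunk = int(math.ceil(len(self.sentences) / float(self.num_threads)))
        start = (self.thread_id - 1) * chunk
        for sent in self.sentences[start:start + chunk]:
            self.results.append(nltk.pos_tag(sent))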