Example #1
    def train_classifiers(self):
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        # label each review's bag-of-words features with its sentiment class
        negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg')
                    for f in negids]
        posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos')
                    for f in posids]
        trainfeats = negfeats + posfeats

        # train naive bayes
        self.classifier = NaiveBayesClassifier.train(trainfeats)
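Every snippet in this listing leans on a word_feats feature extractor that none of them defines (and whose exact signature varies between projects). A minimal sketch of what the call in Example #1 appears to assume, the conventional bag-of-words presence dictionary fed to NLTK's NaiveBayesClassifier:

def word_feats(words):
    # Mark each token as present; NaiveBayesClassifier treats the dict keys
    # as binary features.
    return dict((word, True) for word in words)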
Example #2
 def getCategoryProbabilityFromDocument(self, document):

     if self._nltkClassifier is None:
         raise ClassifierException("The classifier must first be loaded by the loadClassifier() method")

     # drop English stopwords before building features
     docwords = [word for word in document_as_words(document) if word not in nltk.corpus.stopwords.words('english')]

     bigrams = bigram_feats(docwords, 200)
     words = word_feats(docwords, 2000)

     # merge bigram and unigram features into one feature set
     featureset = dict(bigrams)
     featureset.update(words)

     probdist = self._nltkClassifier.prob_classify(featureset)

     # keep only labels with non-negligible probability, most probable first
     results = []
     for label in probdist.samples():
         prob = probdist.prob(label)
         if prob >= 0.001:
             results.append((prob, label))

     results = sorted(results, reverse=True)
     return results
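Example #2 also calls a bigram_feats helper and a two-argument word_feats that are not shown. One plausible sketch of bigram_feats, assuming it keeps the n highest-scoring collocations found by NLTK's BigramCollocationFinder as presence features (the chi-squared scorer is an assumption):

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def bigram_feats(words, n=200):
    # Rank candidate bigrams by chi-squared association and keep the n best.
    finder = BigramCollocationFinder.from_words(words)
    top_bigrams = finder.nbest(BigramAssocMeasures.chi_sq, n)
    return dict((bigram, True) for bigram in top_bigrams)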
Example #3
 def proximity_semantic_score(self, i, sentences):
     score = 0
     # score every word in the sentence window supplied by the caller;
     # explicit negative search terms count far more than generic negative words
     for sent in sentences:
         for word in sent:
             if self.classifier.classify(word_feats(word)) == 'neg':
                 score += 1
                 for neg in self.negative_search_terms:
                     if neg == word:
                         score += 1000
     return score
Example #4
 def min_proximity_query(self, verb_set, noun_set, sentence):
     pack = ('', '', len(sentence) + 1)  # (verb,noun,min_dist)
     for verb in verb_set:
         if verb not in sentence:
             continue
         for noun in noun_set:
             if noun not in sentence:
                 continue
             # distance between the first occurrences of the verb and the noun
             dist = abs(sentence.index(verb) - sentence.index(noun))
             # keep the closest pair whose combination the classifier calls negative
             if dist < pack[2] and self.classifier.classify(
                     word_feats(verb + ' ' + noun)) == 'neg':
                 pack = (verb, noun, dist)
     return pack[0].lower(), pack[1].lower()
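The distance search in min_proximity_query can be exercised on its own. A standalone sketch (with the classifier gate dropped, since that needs a trained model) that returns the closest verb/noun pair in a tokenized sentence:

def closest_pair(verb_set, noun_set, sentence):
    # Same minimum-distance search as above, minus the sentiment check.
    best = ('', '', len(sentence) + 1)
    for verb in verb_set:
        if verb not in sentence:
            continue
        for noun in noun_set:
            if noun not in sentence:
                continue
            dist = abs(sentence.index(verb) - sentence.index(noun))
            if dist < best[2]:
                best = (verb, noun, dist)
    return best

print(closest_pair(['stalled', 'crashed'], ['car', 'engine'],
                   ['the', 'engine', 'stalled', 'on', 'the', 'highway']))
# ('stalled', 'engine', 1)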
Example #5
    def rocchio(self, max_num_sentences):

        # Scrape all the web pages of interest
        print 'Crawling target webpages and grabbing information...'
        html_sentences = []
        for url in self.web_links:
            print 'Grabbing info from url:', url
            html_sentences += scrap_web_page(url)
        print "\nProcessing %d sentences." % len(html_sentences)

        # Count the number of pos/neg sentences and record all negative
        # sentences
        print 'Running Rocchio()'
        neg_sentences = []
        poscount, negcount = 0, 0
        for sent in html_sentences:
            sentiment = self.classifier.classify(word_feats(sent))
            if sentiment == 'pos':
                poscount += 2.5  # positives are worth 250% that of negatives
            else:
                negcount += 1
                neg_sentences.append(sent)
        poscount = math.ceil(poscount)

        # Compute sentiment with Rocchio as the balancer
        print 'pos_count =', poscount
        print 'neg_count =', negcount
        sentiment = 'neutral'
        level = 'mildly'
        if poscount > negcount:
            sentiment = 'safe'
        elif poscount < negcount:
            sentiment = 'dangerous'
        if poscount >= 4 * negcount or negcount >= 4 * poscount:
            level = 'very'
        elif poscount >= 2 * negcount or negcount >= 2 * poscount:
            level = 'relatively'

        # Record danger level as interpreted by Rocchio using Naive Bayes as a
        # subroutine
        self.danger_r1 = level + ' ' + sentiment

        # Randomly sample from the remaining negative sentences
        samp = min(max_num_sentences, len(neg_sentences))
        html_sentences = random.sample(neg_sentences, samp)
        return self.create_results(html_sentences)
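rocchio() depends on a scrap_web_page helper that turns a URL into a list of sentences; it is not part of this listing. A rough, hypothetical sketch of such a helper, assuming requests and BeautifulSoup are acceptable dependencies:

import requests
from bs4 import BeautifulSoup
import nltk

def scrap_web_page(url):
    # Hypothetical reconstruction: fetch the page, strip the markup, and
    # split the visible text into sentences.
    html = requests.get(url, timeout=10).text
    text = BeautifulSoup(html, 'html.parser').get_text()
    return nltk.sent_tokenize(text)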
Example #6
    def create_results(self, orig_sentences):
        print 'Running nltk subroutines in create_results()'

        # create a list of all tokens in each sentence
        sentences = [nltk.word_tokenize(sent) for sent in orig_sentences]
        # keep a reference to the tokenized lists ('sentences' is rebound after tagging)
        tokenized_sentences = sentences

        # multi-threaded pos_tag assignment
        print 'Running multi-threaded "part-of-speech" tagging of web page results'
        t1 = myThread(1, 3, tokenized_sentences)
        t2 = myThread(2, 3, tokenized_sentences)
        t3 = myThread(3, 3, tokenized_sentences)

        # TODO: thread count is hard-coded to 3 for now
        threads = []
        t1.start()
        t2.start()
        t3.start()

        threads.append(t1)
        threads.append(t2)
        threads.append(t3)

        # Wait for all threads to be done
        for t in threads:
            t.join()

        # tag parts of speech for each word
        sentences = t1.results + t2.results + t3.results

        print "Constructing Grammars..."
        # Verb Extraction Grammar
        grammar = r"""
                  VERBS: {<V.*>}
                          }<VBZ>{
                  """
        # Verb Regex Parser (Finds effects)
        cp_effect = nltk.RegexpParser(grammar)

        # Noun Extraction Grammar
        grammar2 = r'NOUNS: {<NN|NP>}'
        # Noun Regex Parser (Finds causes)
        cp_cause = nltk.RegexpParser(grammar2)

        verbs = []
        nouns = []

        # Parse every sentence with both grammars to build two parallel lists
        # of lists, verbs[] and nouns[], where the i-th sublists hold the
        # negative verbs and the nouns found in the i-th sentence.

        print "Parsing all sentences and collecting verbs and nouns..."
        for sent in sentences:

            # Collect Negative Verbs
            some_verbs = []
            tree1 = cp_effect.parse(sent)
            for subtree in tree1.subtrees():
                if subtree.node in ['VERBS']:
                    term = self.lmtzr.lemmatize(subtree[0][0], 'v')
                    if self.classifier.classify(word_feats(term)) == 'neg':
                        some_verbs.append(term)
            verbs.append(some_verbs)

            # Collect Nouns
            some_nouns = []
            tree2 = cp_cause.parse(sent)
            for subtree in tree2.subtrees():
                if subtree.node in ['NOUNS']:
                    term = self.lmtzr.lemmatize(subtree[0][0], 'n')
                    some_nouns.append(term)
            nouns.append(some_nouns)

        # Find the most negative verb/noun pairs and produce 'good' phrases
        print "Collecting (verb,noun) pairings..."
        phrases = []
        num_sents = len(tokenized_sentences)
        for i in xrange(num_sents):
            verb_set = verbs[i]
            noun_set = nouns[i]
            if len(verb_set) == 0 or len(noun_set) == 0:
                continue
            v, n = self.min_proximity_query(verb_set, noun_set,
                                            tokenized_sentences[i])
            rating = self.proximity_semantic_score(i,
                                                   orig_sentences[i - 1:i + 1])
            if len(v) > 2 and len(n) > 2:
                phrases.append((v + ' ' + n, rating, orig_sentences[i]))

        print "len phrases = ", len(phrases)
        print "len nouns = ", len(nouns)
        print "len verbs = ", len(verbs)

        # Deduplicate and rank phrases by negativity score, then return the top 10
        print "Removing duplicates and sorting phrases by negativity scores."
        phrases = list(set(phrases))
        phrases = sorted(phrases, key=lambda tup: tup[1], reverse=True)
        print "Returning top 10 phrases."
        return phrases[:10]
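create_results() assumes a myThread worker class, constructed as myThread(thread_id, num_threads, sentences), that exposes a results list once join() returns. A hedged sketch of one possible implementation in which each worker POS-tags a contiguous chunk of the tokenized sentences, so that t1.results + t2.results + t3.results keeps the original sentence order that the index-based pairing above relies on:

import threading
import nltk

class myThread(threading.Thread):
    # Hypothetical reconstruction of the worker used above.
    def __init__(self, thread_id, num_threads, sentences):
        threading.Thread.__init__(self)
        # thread_id is 1-based; take this thread's contiguous slice
        chunk = (len(sentences) + num_threads - 1) // num_threads
        self.chunk_sentences = sentences[(thread_id - 1) * chunk:thread_id * chunk]
        self.results = []

    def run(self):
        # POS-tag each tokenized sentence in this thread's slice
        for sent in self.chunk_sentences:
            self.results.append(nltk.pos_tag(sent))

In CPython the GIL keeps this largely sequential; the sketch mainly mirrors the structure the snippet expects rather than promising real parallelism.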