def __init__(self, Lex, classifier_obj=None):
    '''
    Initialize with lexicon Lex and an optional sentence-level classifier.

    Lex            -- lexicon object; shared with the sentence classifier.
    classifier_obj -- optional pre-built per-sentence classifier; when
                      omitted, an AV_AggressivePottsSentiScore is created.
    '''
    # calls superclass
    # NOTE(review): passing DocSentiScore to super() skips
    # DocSentiScore.__init__ in the MRO if this method belongs to a
    # subclass of DocSentiScore -- confirm the enclosing class and intent.
    super(DocSentiScore, self).__init__()
    if not classifier_obj:
        # default classifier to be used on sentences
        self.sentence_classifier = AV_AggressivePottsSentiScore(Lex)
    else:
        self.sentence_classifier = classifier_obj
    # both the per-sentence classifier and this object share the lexicon
    self.sentence_classifier.set_lexicon(Lex)
    self.set_lexicon(Lex)
def __init__(self, Lex, classifier_obj=None):
    '''
    Initialize with lexicon Lex and an optional sentence-level classifier.

    Lex            -- lexicon object; shared with the sentence classifier.
    classifier_obj -- optional pre-built per-sentence classifier; when
                      omitted, an AV_AggressivePottsSentiScore is created.
    '''
    # calls superclass
    # NOTE(review): passing DocSentiScore to super() skips
    # DocSentiScore.__init__ in the MRO if this method belongs to a
    # subclass of DocSentiScore -- confirm the enclosing class and intent.
    super(DocSentiScore, self).__init__()
    if not classifier_obj:
        # default classifier to be used on sentences
        self.sentence_classifier = AV_AggressivePottsSentiScore(Lex)
    else:
        self.sentence_classifier = classifier_obj
    # both the per-sentence classifier and this object share the lexicon
    self.sentence_classifier.set_lexicon(Lex)
    self.set_lexicon(Lex)
    # question-adjustment tuning: sentences detected as questions that are
    # longer than min_question_size tokens get question_neg_weight added to
    # their negative score; 0.0 disables the adjustment
    self.question_neg_weight = 0.0
    self.min_question_size = 0.0
    # echo debug messages when True (set again by classify_document)
    self.verbose = False
def __init__(self, Lex, classifier_obj=None):
    '''
    Initialize with lexicon Lex and an optional sentence-level classifier.

    Lex            -- lexicon object; shared with the sentence classifier.
    classifier_obj -- optional pre-built per-sentence classifier; when
                      omitted, an AV_AggressivePottsSentiScore is created.
    '''
    # calls superclass
    # NOTE(review): passing DocSentiScore to super() skips
    # DocSentiScore.__init__ in the MRO if this method belongs to a
    # subclass of DocSentiScore -- confirm the enclosing class and intent.
    super(DocSentiScore, self).__init__()
    if not classifier_obj:
        # default classifier to be used on sentences
        self.sentence_classifier=AV_AggressivePottsSentiScore(Lex)
    else:
        self.sentence_classifier=classifier_obj
    # both the per-sentence classifier and this object share the lexicon
    self.sentence_classifier.set_lexicon(Lex)
    self.set_lexicon(Lex)
def __init__(self, Lex, classifier_obj=None):
    '''
    Initialize with lexicon Lex and an optional sentence-level classifier.

    Lex            -- lexicon object; shared with the sentence classifier.
    classifier_obj -- optional pre-built per-sentence classifier; when
                      omitted, an AV_AggressivePottsSentiScore is created.
    '''
    # calls superclass
    # NOTE(review): passing DocSentiScore to super() skips
    # DocSentiScore.__init__ in the MRO if this method belongs to a
    # subclass of DocSentiScore -- confirm the enclosing class and intent.
    super(DocSentiScore, self).__init__()
    if not classifier_obj:
        # default classifier to be used on sentences
        self.sentence_classifier=AV_AggressivePottsSentiScore(Lex)
    else:
        self.sentence_classifier=classifier_obj
    # both the per-sentence classifier and this object share the lexicon
    self.sentence_classifier.set_lexicon(Lex)
    self.set_lexicon(Lex)
    # question-adjustment tuning: sentences detected as questions that are
    # longer than min_question_size tokens get question_neg_weight added to
    # their negative score; 0.0 disables the adjustment
    self.question_neg_weight = 0.0
    self.min_question_size = 0.0
    # echo debug messages when True (set again by classify_document)
    self.verbose = False
class SentenceDocSentiScore(DocSentiScore):
    '''
    Subclass of DocSentiScore implementing a sentence-based lexicon-based
    classifier. This approach breaks a document into sentences and generates
    sentiment scores based on aggregate sentiment of each sentence, instead
    of individual tokens. Question sentences longer than min_question_size
    tokens can optionally be penalized via question_neg_weight.
    '''

    def __init__(self, Lex, classifier_obj=None):
        '''
        Lex            -- lexicon object; shared with the sentence classifier.
        classifier_obj -- optional pre-built per-sentence classifier; when
                          omitted, an AV_AggressivePottsSentiScore is created.
        '''
        # calls superclass
        # FIX: the original passed DocSentiScore to super(), which skips
        # DocSentiScore.__init__ in the MRO; pass our own class instead.
        super(SentenceDocSentiScore, self).__init__()
        if not classifier_obj:
            # default classifier to be used on sentences
            self.sentence_classifier = AV_AggressivePottsSentiScore(Lex)
        else:
            self.sentence_classifier = classifier_obj
        self.sentence_classifier.set_lexicon(Lex)
        self.set_lexicon(Lex)
        # question adjustment: 0.0 disables it (no weight added, and any
        # sentence must exceed min_question_size tokens to qualify)
        self.question_neg_weight = 0.0
        self.min_question_size = 0.0
        self.verbose = False

    def _sent_tokenize(self, doc, separator):
        '''
        Takes a pos-tagged doc and tag separator as input and return list of
        (sentence_string, token_count) tuples, one per sentence found in the
        original document.
        '''
        # POS-tagged end-of-sentence markers, e.g. "./SEP." for '.' tagged '.'
        end_of_sentence = [x + separator + '.' for x in ('.', '!', '?')]
        cur_sent = []
        sentences = []
        for token in doc.split(' '):
            cur_sent.append(token)
            if token in end_of_sentence:
                sentences.append((' '.join(cur_sent), len(cur_sent)))
                cur_sent = []
        # finally, if cur_sent is not empty count as last sentence
        if cur_sent:
            sentences.append((' '.join(cur_sent), len(cur_sent)))
        return sentences

    def _is_question(self, sentence):
        '''
        Return True when the tokenized, pos-tagged sentence ends with a
        question mark; always returns a bool (the original could fall
        through and return None).
        '''
        if len(sentence) > 0:
            # looking for 1st char of the last pos-tagged token in sentence,
            # format "char/POS"
            if len(sentence[-1]) > 0:
                return (sentence[-1][0] == '?')
        return False

    def classify_document(self, Doc, tagged=True, verbose=False, **kwargs):
        '''
        Tokenize input document into sentences, then call classifier on each
        sentence separately.
        Return aggregate positive and negative scores in tuple (postotal,negtotal)
        '''
        # Process input parameters, if any
        self.set_parameters(**kwargs)
        self.verbose = verbose
        assert self.L and self.L.is_loaded, 'Lexicon has not been assigned, or not loaded'
        # POS-tagging and tag detection
        if not tagged:
            tagged_doc = self.pos_tag(Doc)
        else:
            tagged_doc = Doc
        tagsep = self._detect_tag(tagged_doc)
        assert tagsep, 'Unable to detect tag separator'
        # tokenize into sentences
        tagged_sentences = self._sent_tokenize(tagged_doc, tagsep)
        self._debug('[sent classifier] - Found %d sentences' % len(tagged_sentences))
        # initialize data structure containing results
        self.resultdata = {
            'annotated_doc': '',
            'doc': Doc,
            'resultpos': 0,
            'resultneg': 0,
            'tokens_found': 0,
            'tokens_negated': 0,
            'found_list': collections.Counter(),
            'unscored_list': []
        }
        # loop-invariant: share our lexicon with the sentence classifier once,
        # instead of once per sentence as the original did
        self.sentence_classifier.set_lexicon(self.L)
        (tot_pos, tot_neg) = (0.0, 0.0)
        for (sentence, sent_sz) in tagged_sentences:
            try:
                self._debug('[sent classifier] %s' % sentence)
                (cur_pos, cur_neg) = self.sentence_classifier.classify_document(
                    sentence, tagged=True, verbose=verbose)
                # adjust for question
                if self._is_question(sentence.split(' ')) and sent_sz > self.min_question_size:
                    self._debug(
                        '[sent classifier] applying adjustment of %.2f to question sentence'
                        % self.question_neg_weight)
                    cur_neg += self.question_neg_weight
                tot_pos += cur_pos
                tot_neg += cur_neg
                # update algorithm results with the sentence-level details
                sres = self.sentence_classifier.resultdata
                self.resultdata['tokens_found'] += sres['tokens_found']
                self.resultdata['annotated_doc'] += sres['annotated_doc']
                self.resultdata['tokens_negated'] += sres['tokens_negated']
                self.resultdata['unscored_list'] += sres['unscored_list']
                self.resultdata['found_list'].update(sres['found_list'])
            except Exception as e:
                # Py2.6+/Py3-compatible exception syntax and print call
                print('[sent classifier] - Error processing sentence (%s): %s' % (sentence, str(e)))
                raise
        self.resultdata['resultpos'] = tot_pos
        self.resultdata['resultneg'] = tot_neg
        return (tot_pos, tot_neg)
class SentenceDocSentiScore(DocSentiScore):
    '''
    Subclass of DocSentiScore implementing a sentence-based lexicon-based
    classifier. This approach breaks a document into sentences and generates
    sentiment scores based on aggregate sentiment of each sentence, instead
    of individual tokens: each sentence contributes a (1,0), (0,1) or (0,0)
    vote which _calc_sentence_scores aggregates.
    '''

    def __init__(self, Lex, classifier_obj=None):
        '''
        Lex            -- lexicon object; shared with the sentence classifier.
        classifier_obj -- optional pre-built per-sentence classifier; when
                          omitted, an AV_AggressivePottsSentiScore is created.
        '''
        # calls superclass
        # FIX: the original passed DocSentiScore to super(), which skips
        # DocSentiScore.__init__ in the MRO; pass our own class instead.
        super(SentenceDocSentiScore, self).__init__()
        if not classifier_obj:
            # default classifier to be used on sentences
            self.sentence_classifier = AV_AggressivePottsSentiScore(Lex)
        else:
            self.sentence_classifier = classifier_obj
        self.sentence_classifier.set_lexicon(Lex)
        self.set_lexicon(Lex)

    def _sent_tokenize(self, doc, separator):
        '''
        Takes a pos-tagged doc and tag separator as input and return list of
        strings each containing a sentence from the original document.
        '''
        # list of all POS-tagged indicators of end of sentence
        end_of_sentence = [x + separator + '.' for x in ['.', '!', '?']]
        cur_sent = []
        sentences = []
        for token in doc.split(' '):
            cur_sent.append(token)
            if token in end_of_sentence:
                sentences.append(' '.join(cur_sent))
                cur_sent = []
        # finally, if cur_sent is not empty count as last sentence
        if cur_sent:
            sentences.append(' '.join(cur_sent))
        return sentences

    def _calc_sentence_scores(self, sent_scores):
        '''
        Given list of tuples indicating individual scores for each sentence,
        returns aggregate (pos,neg) scores for document. The default method
        simply returns sum of scores for positive and negative results.
        '''
        pos_total = sum(x[0] for x in sent_scores)
        neg_total = sum(x[1] for x in sent_scores)
        return (pos_total, neg_total)

    def classify_document(self, Doc, tagged=True, verbose=True, **kwargs):
        '''
        Tokenize input document into sentences, then call classifier on each
        sentence separately.
        Return aggregate positive and negative scores in tuple (postotal,negtotal)
        '''
        # Process input parameters, if any
        self.set_parameters(**kwargs)
        self.verbose = verbose
        assert self.L and self.L.is_loaded, 'Lexicon has not been assigned, or not loaded'
        # POS-tagging and tag detection
        if not tagged:
            tagged_doc = self.pos_tag(Doc)
        else:
            tagged_doc = Doc
        tagsep = self._detect_tag(tagged_doc)
        assert tagsep, 'Unable to detect tag separator'
        # tokenize into sentences
        tagged_sentences = self._sent_tokenize(tagged_doc, tagsep)
        self._debug('[sent classifier] - Found %d sentences' % len(tagged_sentences))
        sent_scores = []
        # initialize data structure containing results
        self.resultdata = {
            'annotated_doc': '',
            'doc': Doc,
            'resultpos': 0,
            'resultneg': 0,
            'tokens_found': 0,
            'tokens_negated': 0,
            'found_list': collections.Counter(),
            'unscored_list': []
        }
        for sentence in tagged_sentences:
            # classify sentence
            try:
                self._debug('[sent classifier] %s' % sentence)
                (cur_pos, cur_neg) = self.sentence_classifier.classify_document(
                    sentence, tagged=True, verbose=verbose)
                # reduce each sentence to a vote: positive, negative or neutral
                if cur_pos > cur_neg:
                    sent_scores.append((1, 0))
                elif cur_neg > cur_pos:
                    sent_scores.append((0, 1))
                else:
                    sent_scores.append((0, 0))
                self._debug('[sent classifier] - sentence scores: %s' % str(sent_scores))
                # update algorithm results with the sentence-level details
                sres = self.sentence_classifier.resultdata
                self.resultdata['tokens_found'] += sres['tokens_found']
                self.resultdata['annotated_doc'] += sres['annotated_doc']
                self.resultdata['tokens_negated'] += sres['tokens_negated']
                self.resultdata['unscored_list'] += sres['unscored_list']
                self.resultdata['found_list'].update(sres['found_list'])
            except Exception as e:
                # Py2.6+/Py3-compatible exception syntax; re-raise after logging
                self._debug('[sent classifier] - Error processing sentence: %s' % str(e))
                raise
        (resultpos, resultneg) = self._calc_sentence_scores(sent_scores)
        self.resultdata['resultpos'] = resultpos
        self.resultdata['resultneg'] = resultneg
        self.resultdata['sentence_scores'] = sent_scores
        return (resultpos, resultneg)
class SentenceDocSentiScore(DocSentiScore):
    '''
    Subclass of DocSentiScore implementing a sentence-based lexicon-based
    classifier. This approach breaks a document into sentences and generates
    sentiment scores based on aggregate sentiment of each sentence, instead
    of individual tokens. Question sentences longer than min_question_size
    tokens can optionally be penalized via question_neg_weight.
    '''

    def __init__(self, Lex, classifier_obj=None):
        '''
        Lex            -- lexicon object; shared with the sentence classifier.
        classifier_obj -- optional pre-built per-sentence classifier; when
                          omitted, an AV_AggressivePottsSentiScore is created.
        '''
        # calls superclass
        # FIX: the original passed DocSentiScore to super(), which skips
        # DocSentiScore.__init__ in the MRO; pass our own class instead.
        super(SentenceDocSentiScore, self).__init__()
        if not classifier_obj:
            # default classifier to be used on sentences
            self.sentence_classifier = AV_AggressivePottsSentiScore(Lex)
        else:
            self.sentence_classifier = classifier_obj
        self.sentence_classifier.set_lexicon(Lex)
        self.set_lexicon(Lex)
        # question adjustment: 0.0 disables it (no weight added, and any
        # sentence must exceed min_question_size tokens to qualify)
        self.question_neg_weight = 0.0
        self.min_question_size = 0.0
        self.verbose = False

    def _sent_tokenize(self, doc, separator):
        '''
        Takes a pos-tagged doc and tag separator as input and return list of
        (sentence_string, token_count) tuples, one per sentence found in the
        original document.
        '''
        # POS-tagged end-of-sentence markers, e.g. "./SEP." for '.' tagged '.'
        end_of_sentence = [x + separator + '.' for x in ('.', '!', '?')]
        cur_sent = []
        sentences = []
        for token in doc.split(' '):
            cur_sent.append(token)
            if token in end_of_sentence:
                sentences.append((' '.join(cur_sent), len(cur_sent)))
                cur_sent = []
        # finally, if cur_sent is not empty count as last sentence
        if cur_sent:
            sentences.append((' '.join(cur_sent), len(cur_sent)))
        return sentences

    def _is_question(self, sentence):
        '''
        Return True when the tokenized, pos-tagged sentence ends with a
        question mark; always returns a bool (the original could fall
        through and return None).
        '''
        if len(sentence) > 0:
            # looking for 1st char of the last pos-tagged token in sentence,
            # format "char/POS"
            if len(sentence[-1]) > 0:
                return (sentence[-1][0] == '?')
        return False

    def classify_document(self, Doc, tagged=True, verbose=False, **kwargs):
        '''
        Tokenize input document into sentences, then call classifier on each
        sentence separately.
        Return aggregate positive and negative scores in tuple (postotal,negtotal)
        '''
        # Process input parameters, if any
        self.set_parameters(**kwargs)
        self.verbose = verbose
        assert self.L and self.L.is_loaded, 'Lexicon has not been assigned, or not loaded'
        # POS-tagging and tag detection
        if not tagged:
            tagged_doc = self.pos_tag(Doc)
        else:
            tagged_doc = Doc
        tagsep = self._detect_tag(tagged_doc)
        assert tagsep, 'Unable to detect tag separator'
        # tokenize into sentences
        tagged_sentences = self._sent_tokenize(tagged_doc, tagsep)
        self._debug('[sent classifier] - Found %d sentences' % len(tagged_sentences))
        # initialize data structure containing results
        self.resultdata = {
            'annotated_doc': '',
            'doc': Doc,
            'resultpos': 0,
            'resultneg': 0,
            'tokens_found': 0,
            'tokens_negated': 0,
            'found_list': collections.Counter(),
            'unscored_list': []
        }
        # loop-invariant: share our lexicon with the sentence classifier once,
        # instead of once per sentence as the original did
        self.sentence_classifier.set_lexicon(self.L)
        (tot_pos, tot_neg) = (0.0, 0.0)
        for (sentence, sent_sz) in tagged_sentences:
            try:
                self._debug('[sent classifier] %s' % sentence)
                (cur_pos, cur_neg) = self.sentence_classifier.classify_document(
                    sentence, tagged=True, verbose=verbose)
                # adjust for question
                if self._is_question(sentence.split(' ')) and sent_sz > self.min_question_size:
                    self._debug(
                        '[sent classifier] applying adjustment of %.2f to question sentence'
                        % self.question_neg_weight)
                    cur_neg += self.question_neg_weight
                tot_pos += cur_pos
                tot_neg += cur_neg
                # update algorithm results with the sentence-level details
                sres = self.sentence_classifier.resultdata
                self.resultdata['tokens_found'] += sres['tokens_found']
                self.resultdata['annotated_doc'] += sres['annotated_doc']
                self.resultdata['tokens_negated'] += sres['tokens_negated']
                self.resultdata['unscored_list'] += sres['unscored_list']
                self.resultdata['found_list'].update(sres['found_list'])
            except Exception as e:
                # Py2.6+/Py3-compatible exception syntax and print call
                print('[sent classifier] - Error processing sentence (%s): %s' % (sentence, str(e)))
                raise
        self.resultdata['resultpos'] = tot_pos
        self.resultdata['resultneg'] = tot_neg
        return (tot_pos, tot_neg)
class SentenceDocSentiScore(DocSentiScore):
    '''
    Subclass of DocSentiScore implementing a sentence-based lexicon-based
    classifier. This approach breaks a document into sentences and generates
    sentiment scores based on aggregate sentiment of each sentence, instead
    of individual tokens: each sentence contributes a (1,0), (0,1) or (0,0)
    vote which _calc_sentence_scores aggregates.
    '''

    def __init__(self, Lex, classifier_obj=None):
        '''
        Lex            -- lexicon object; shared with the sentence classifier.
        classifier_obj -- optional pre-built per-sentence classifier; when
                          omitted, an AV_AggressivePottsSentiScore is created.
        '''
        # calls superclass
        # FIX: the original passed DocSentiScore to super(), which skips
        # DocSentiScore.__init__ in the MRO; pass our own class instead.
        super(SentenceDocSentiScore, self).__init__()
        if not classifier_obj:
            # default classifier to be used on sentences
            self.sentence_classifier = AV_AggressivePottsSentiScore(Lex)
        else:
            self.sentence_classifier = classifier_obj
        self.sentence_classifier.set_lexicon(Lex)
        self.set_lexicon(Lex)

    def _sent_tokenize(self, doc, separator):
        '''
        Takes a pos-tagged doc and tag separator as input and return list of
        strings each containing a sentence from the original document.
        '''
        # list of all POS-tagged indicators of end of sentence
        end_of_sentence = [x + separator + '.' for x in ['.', '!', '?']]
        cur_sent = []
        sentences = []
        for token in doc.split(' '):
            cur_sent.append(token)
            if token in end_of_sentence:
                sentences.append(' '.join(cur_sent))
                cur_sent = []
        # finally, if cur_sent is not empty count as last sentence
        if cur_sent:
            sentences.append(' '.join(cur_sent))
        return sentences

    def _calc_sentence_scores(self, sent_scores):
        '''
        Given list of tuples indicating individual scores for each sentence,
        returns aggregate (pos,neg) scores for document. The default method
        simply returns sum of scores for positive and negative results.
        '''
        pos_total = sum(x[0] for x in sent_scores)
        neg_total = sum(x[1] for x in sent_scores)
        return (pos_total, neg_total)

    def classify_document(self, Doc, tagged=True, verbose=True, **kwargs):
        '''
        Tokenize input document into sentences, then call classifier on each
        sentence separately.
        Return aggregate positive and negative scores in tuple (postotal,negtotal)
        '''
        # Process input parameters, if any
        self.set_parameters(**kwargs)
        self.verbose = verbose
        assert self.L and self.L.is_loaded, 'Lexicon has not been assigned, or not loaded'
        # POS-tagging and tag detection
        if not tagged:
            tagged_doc = self.pos_tag(Doc)
        else:
            tagged_doc = Doc
        tagsep = self._detect_tag(tagged_doc)
        assert tagsep, 'Unable to detect tag separator'
        # tokenize into sentences
        tagged_sentences = self._sent_tokenize(tagged_doc, tagsep)
        self._debug('[sent classifier] - Found %d sentences' % len(tagged_sentences))
        sent_scores = []
        # initialize data structure containing results
        self.resultdata = {
            'annotated_doc': '',
            'doc': Doc,
            'resultpos': 0,
            'resultneg': 0,
            'tokens_found': 0,
            'tokens_negated': 0,
            'found_list': collections.Counter(),
            'unscored_list': []
        }
        for sentence in tagged_sentences:
            # classify sentence
            try:
                self._debug('[sent classifier] %s' % sentence)
                (cur_pos, cur_neg) = self.sentence_classifier.classify_document(
                    sentence, tagged=True, verbose=verbose)
                # reduce each sentence to a vote: positive, negative or neutral
                if cur_pos > cur_neg:
                    sent_scores.append((1, 0))
                elif cur_neg > cur_pos:
                    sent_scores.append((0, 1))
                else:
                    sent_scores.append((0, 0))
                self._debug('[sent classifier] - sentence scores: %s' % str(sent_scores))
                # update algorithm results with the sentence-level details
                sres = self.sentence_classifier.resultdata
                self.resultdata['tokens_found'] += sres['tokens_found']
                self.resultdata['annotated_doc'] += sres['annotated_doc']
                self.resultdata['tokens_negated'] += sres['tokens_negated']
                self.resultdata['unscored_list'] += sres['unscored_list']
                self.resultdata['found_list'].update(sres['found_list'])
            except Exception as e:
                # Py2.6+/Py3-compatible exception syntax; re-raise after logging
                self._debug('[sent classifier] - Error processing sentence: %s' % str(e))
                raise
        (resultpos, resultneg) = self._calc_sentence_scores(sent_scores)
        self.resultdata['resultpos'] = resultpos
        self.resultdata['resultneg'] = resultneg
        self.resultdata['sentence_scores'] = sent_scores
        return (resultpos, resultneg)