Example #1
def _negation_calc(self, tags, window):
    '''
    For a list of tokens, calculate an array of negated words based on a
    negation detection algorithm (NegEx in our case).
    Returns array vNEG containing 0/1 for each index of the original tags
    list, indicating negation.
    '''
    vNEG = negdetect.getNegationArray(tags, window)
    return vNEG
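
A minimal usage sketch for the helper above (the token list and window size are illustrative; it assumes the project-local negdetect module that the snippet relies on):

import negdetect

# POS-tagged tokens as the pipeline would produce them (illustrative values)
tags = ['this/DT', 'is/VBZ', 'not/RB', 'good/JJ']

# Mark each token as negated (1) or not (0) using a 5-token negation window
vNEG = negdetect.getNegationArray(tags, 5)
print(vNEG)  # e.g. [0, 0, 0, 1] if 'good' falls inside the negation window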
Example #2
import math
import re

import nltk

import negdetect  # project-local module providing getNegationArray()

# SCOREALL, SCOREONCE, SCOREWITHFREQ, SCOREWITHSTOP and objectiveWords are
# assumed to be defined elsewhere in the surrounding module.


def docSentiScore(L,
                  Doc,
                  aflag,
                  vflag,
                  rflag,
                  nflag,
                  negflag,
                  negwindow=5,
                  w=lambda x, y, z: x,
                  scoringmethod=SCOREALL):
    '''
    docSentiScore - Computes the document sentiment score for the POS-tagged
    document Doc using sentiment lexicon L.
    The following mandatory flags specify which POS tags to take into account:
     - aflag (adjectives)
     - vflag (verbs)
     - rflag (adverbs)
     - nflag (nouns)
    In addition, negflag is a boolean indicating whether to use negation
    detection, with window scope negwindow.
    w(x, i, N) is a weight adjustment function based on word position within
    the document.
    scoringmethod selects the scoring mode (SCOREALL, SCOREONCE, SCOREWITHFREQ
    or SCOREWITHSTOP).

    Returns tuple (posscore, negscore, doc) containing the final document
    scores and the annotated document.
    '''

    # 0. Initialize
    tags = Doc.split()
    annotatedTags = []
    doclen = len(tags)
    i = 0
    postotal = 0
    negtotal = 0
    notfoundcounter = 0
    foundcounter = 0
    negcount = 0

    # Setup stem preprocessing for verbs
    wnl = nltk.stem.WordNetLemmatizer()

    # 1. Negation detection pre-processing - return an array w/ position of negated terms
    vNEG = negdetect.getNegationArray(tags, negwindow)

    #
    # 2. Scan for scores for each POS
    # After POS tagging, a term appears as either term/POS or term_POS.
    # We assume these separator patterns do not occur naturally in plain text.
    #
    posindex = 0
    negindex = 1
    tagList = []
    for tagword in tags:
        i += 1
        scoretuple = (0, 0)
        tagfound = False

        # Adjectives (JJ, JJR, JJS)
        m = re.search('[_/](JJ|JJ.)$', tagword)
        if aflag and m:
            tagfound = True
            # Slice at the separator so terms containing '_' or '/' survive
            thisterm = tagword[:m.start()]
            scoretuple = L.getadjective(thisterm)

        # Verbs (VB, VBP, VBD, ...)
        m = re.search('[_/](VB|VB.)$', tagword)
        if vflag and m:
            tagfound = True
            thisterm = tagword[:m.start()]
            thisterm = wnl.lemmatize(thisterm, pos='v')
            scoretuple = L.getverb(thisterm)

        # Adverbs (RB)
        m = re.search('[_/]RB$', tagword)
        if rflag and m:
            tagfound = True
            thisterm = tagword[:m.start()]
            scoretuple = L.getadverb(thisterm)

        # Nouns (NN)
        m = re.search('[_/]NN$', tagword)
        if nflag and m:
            tagfound = True
            thisterm = tagword[:m.start()]
            scoretuple = L.getnoun(thisterm)

        # Negation handling: for a negated term, swap the positive/negative
        # indices so its positive lexicon score feeds the negative total
        # (and vice versa)
        if negflag:
            posindex = vNEG[i - 1]
            negindex = (1 + vNEG[i - 1]) % 2

        #
        # Add to total with weight score
        #
        posval = 0.0
        negval = 0.0
        if tagfound and \
           (
             (scoringmethod == SCOREALL) or
             (scoringmethod == SCOREWITHFREQ) or
             (scoringmethod == SCOREONCE and tagword not in tagList) or
             (scoringmethod == SCOREWITHSTOP and not objectiveWords.is_stop(thisterm))
           ):

            if scoringmethod in (SCOREWITHFREQ, SCOREWITHSTOP):
                # Scoring with frequency information.
                # Frequency is a real value in [0.0, 1.0]; taking the square
                # root makes the down-weighting kick in quickly even for
                # frequencies close to 0.
                freqweight = 1.0 - math.sqrt(L.get_freq(thisterm))
                posval += w(scoretuple[posindex], i, doclen) * freqweight
                negval += w(scoretuple[negindex], i, doclen) * freqweight
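                # Worked illustration of the damping: freq = 0.04 gives a
                # multiplier of 1 - 0.2 = 0.8, while freq = 0.64 gives only
                # 1 - 0.8 = 0.2, so frequent (less informative) terms count less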

            else:
                # Just plain scoring from lexicon - add
                posval += w(scoretuple[posindex], i, doclen)
                negval += w(scoretuple[negindex], i, doclen)

        postotal += posval
        negtotal += negval

        # Found a tag - increase counters and add tag to list
        if tagfound:
            tagList.append(tagword)
            foundcounter += 1.0
            if negflag and vNEG[i - 1] == 1:
                negcount += 1
            if scoretuple == (0, 0):
                notfoundcounter += 1.0

        # add this tag back to annotated version
        if tagfound:
            if negflag:
                negtag = str(vNEG[i - 1])
            else:
                negtag = 'NONEG'
            annotatedTags.append(tagword + '##NEGAT:' + negtag + '##POS:' +
                                 str(posval) + '##NEG:' + str(negval))
        else:
            annotatedTags.append(tagword)

    # Completed scan

    # Negation-based adjustment to the negative score (Potts 2011).
    # alpha is a scaling factor based on how much of the document has been
    # negated; the adjustment itself is currently disabled.
    if foundcounter > 0.0:
        alpha = negcount / foundcounter
    else:
        alpha = 0.0
    ##negtotal += negcount * ( 0.5 * (1 - alpha) )
    #negtotal += negcount * 0.3

    if (foundcounter - notfoundcounter) > 0.0:
        # resulting scores are the normalized proportions of all *scored* terms (ignoring neutrals/unknowns)
        resultpos = float(postotal) / (foundcounter - notfoundcounter)
        resultneg = float(negtotal) / (foundcounter - notfoundcounter)
    else:
        resultpos = 0.0
        resultneg = 0.0

    return (resultpos, resultneg, ' '.join(annotatedTags))
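
A minimal, hypothetical driver for docSentiScore (the toy lexicon class, its scores, the position-weight function, and the tagged sentence are all illustrative; a real L would come from the project's lexicon loader, and SCOREALL from the surrounding module):

class ToyLexicon:
    '''Illustrative stub: maps a term to a (positive, negative) score tuple.'''
    scores = {'good': (1.0, 0.0), 'bad': (0.0, 1.0)}

    def _lookup(self, term):
        return self.scores.get(term, (0, 0))

    # All four POS lookups share the same table in this sketch
    getadjective = getverb = getadverb = getnoun = _lookup

    def get_freq(self, term):
        return 0.25  # pretend every term has corpus frequency 0.25


def posweight(x, i, N):
    # Example weight function: words later in the document count more
    return x * (0.5 + 0.5 * i / N)


doc = 'this/DT movie/NN is/VBZ not/RB good/JJ'
pos, neg, annotated = docSentiScore(ToyLexicon(), doc,
                                    aflag=True, vflag=True, rflag=True,
                                    nflag=True, negflag=True,
                                    w=posweight, scoringmethod=SCOREALL)
print(pos, neg)     # normalized positive/negative scores
print(annotated)    # original tags annotated with ##NEGAT/##POS/##NEG markers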