Example #1
# Imports assumed from the surrounding project: lxml is a guess (the code
# could equally use xml.etree.ElementTree); chunker and usefuls are
# project-local modules. getModel, extractText, countSentence, countWords
# and div are assumed to be defined elsewhere in this same module.
import glob
from lxml import etree
import chunker
import usefuls

def rankNew():
    fils = glob.glob('sentenceSplit/*') # why sentenceSplit?
    lengs = ["File\tnltk sentence\tlength by '.'\t% capitalized after '.'\tlength by ','\t% capitalized after ','\t"
             + ". or , and capital\tlength by capitals\tlength by tab\ttotal length"]
    model = getModel()
    for fil in fils:
      print 'file', fil
      #txt = removePageN(extractText(fil))
      txt = extractText(fil)

      # first count the sentences with nltk
      sents = model.tokenize(txt)
    
      # then rank '.'
      len2,caps = countSentence(txt,'.')
      
      # then count the sentences separated by ','
      (klen,caps2) = countSentence(txt,',')
    
      # then count the sentences separated by cpC
      ss = chunker.grouptext(txt,'cpC')
      (cpC,_) = countWords(ss)
    
      # then count the sentences separated by capitalized words
      ss = chunker.grouptext(txt,'cap')
      (cap,_) = countWords(ss)
    
      # then count the sentences separated by tabs
      (tlen,_) = countSentence(txt,'\t')

 
      (tot,_) = countWords([txt])
      print tot
      totlen = tot[0]
      # calculate the average length per segment for the file
      normalleng = div(totlen, sents)
      dotleng = div(totlen, len2)
      caps = len(filter(None, caps)) / float(len(caps)) if caps else 0
      cleng = div(totlen, klen)
      caps2 = len(filter(None, caps2)) / float(len(caps2)) if caps2 else 0
      cpCleng = div(totlen, cpC)
      capleng = div(totlen, cap)
      tleng   = div(totlen, tlen)

      # then count the sentences separated by paras
      # (parsed here but never added to the output row; rankSuper reports it)
      inp   = open(fil, 'r').read()
      xml   = etree.fromstring(inp)
      body  = xml.find(usefuls.prefix + 'body')
      paras = body.findall('paragraph')

      row = [fil, normalleng, dotleng, caps, cleng, caps2, cpCleng, capleng, tleng, totlen]
      lengs += ['\t'.join(map(str, row))]

    table = '\n'.join(lengs)
    open('data3.txt','w').write(table)
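
rankNew leans on several helpers that live elsewhere in the project (getModel, extractText, countSentence, countWords, div) and are not part of this listing. The sketch below is a hypothetical reconstruction of the semantics the example appears to assume, for illustration only; the real implementations may differ.

def countSentence(txt, sep):
    # Hypothetical: split txt on sep, return per-segment word counts and,
    # for each segment after a boundary, whether it starts capitalized.
    segs = [s.strip() for s in txt.split(sep) if s.strip()]
    lens = [len(s.split()) for s in segs]
    caps = [s[0].isupper() for s in segs[1:]]
    return (lens, caps)

def countWords(segments):
    # Hypothetical: word count per segment; the second slot is unused here.
    return ([len(s.split()) for s in segments], None)

def div(total, units):
    # Hypothetical: average words per unit, i.e. total word count divided
    # by the number of segments a strategy produced.
    return total / float(len(units)) if units else 0

Under these assumptions, every numeric column in data3.txt is an average number of words per segment for one segmentation strategy, apart from the two '% capitalized' ratio columns.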
Example #2
def rankSuper():
    fils = glob.glob('sentenceSplit/*')
    lengs = ["File\tnltk sentence\tlength by '.'\t% capitalized after '.'\tlength by ','\t% capitalized after ','\t"
              +". or , and capital\tlength by capitals\tlength by tab\tparagraph length\ttotal length"""]
    model = getModel()
    for fil in fils:
      print 'file', fil
      inp   = open(fil, 'r').read()
      xml   = etree.fromstring(inp)
      body  = xml.find(usefuls.prefix + 'body')
      paras = body.findall('paragraph')
      ss = []
      len1, len2, cleng, caps, caps2, cpC, cap, tlen = [[] for _ in range(8)]
      for para in paras:

          txt = list(para.itertext())
          if txt:
            #txt = removePageN(' '.join(txt))
            txt = ' '.join(txt)

            # first count the sentences with nltk
            s = model.tokenize(txt)
            (l, _) = countWords(s)
            len1 += l

            # then rank '.'
            (l, c) = countSentence(txt, '.')
            len2 += l
            caps += c

            # then count the sentences separated by ','
            (klen, capsC) = countSentence(txt, ',')
            cleng += klen
            caps2 += capsC

            # then count the sentences separated by cpC
            s = chunker.grouptext(txt, 'cpC')
            (l, _) = countWords(s)
            cpC += l

            # then count the sentences separated by capitalized words
            s = chunker.grouptext(txt, 'cap')
            (l, _) = countWords(s)
            cap += l

            # then count the sentences separated by tabs
            (l, _) = countSentence(txt, '\t')
            tlen += l

            ss += [' ' + txt]
 
      # calculate the average length per segment for the file
      normalleng = mediumLen(len1)
      dotleng = mediumLen(len2)
      caps = len(filter(None, caps)) / float(len(caps)) if caps else 0
      cleng = mediumLen(cleng)
      caps2 = len(filter(None, caps2)) / float(len(caps2)) if caps2 else 0
      cpCleng = mediumLen(cpC)
      capleng = mediumLen(cap)
      tleng = mediumLen(tlen)

      txt = extractText(fil)
      (tot,_) = countWords([txt])
      totleng = mediumLen(tot)
      # then count the sentences separated by paras
      # (the paragraph texts were already collected into ss in the loop above)
      ss = filter(lambda x: len(x.strip()) > 0, ss)
      (lens, _) = countWords(ss)
      paraleng = mediumLen(lens)

      row = [fil, normalleng, dotleng, caps, cleng, caps2, cpCleng, capleng, tleng, paraleng, totleng]
      lengs += ['\t'.join(map(str, row))]

    table = '\n'.join(lengs)
    open('data3.txt','w').write(table)
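
rankSuper accumulates the same counts per paragraph and reduces each list with mediumLen, which is likewise defined outside this listing. Judging from the 'calculate the average' comment and from the fact that its results fill the same columns div fills in rankNew, it plausibly returns the mean of a list of word counts; a minimal sketch under that assumption:

def mediumLen(lens):
    # Hypothetical: arithmetic mean of a list of word counts, 0 when empty
    # (assumed from how rankSuper uses it; the real helper may differ).
    return sum(lens) / float(len(lens)) if lens else 0

Note that rankNew and rankSuper both write their table to data3.txt, so running one after the other overwrites the earlier result.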