def setVocabularyMeasures(self):
    """Compute the vocabulary measures and store them as attributes on self.

    The items returned by ``listLemmaCats()`` for every element of
    ``self.paragraphs`` are pooled into one flat list, which is then handed
    to the ``calcPLex`` / ``calcHDD`` helpers. The module-level
    ``lemmacat2freqrank`` is passed along to the helpers that take it.

    Attributes written:
        PLex: first element of the parameters returned by
            ``calcPLex.calcPLex``; 0.0 if that computation raises.
        S: first element of the parameters returned by ``calcPLex.calcS``.
        vocd: ``calcPLex.getVOCD`` result; 0.0 when fewer than 50 items.
        mtld: ``calcPLex.getMTLD`` result.
        hdd: ``calcHDD.calcHDD`` result; 0.0 when fewer than 42 items.
        vocab1k, vocab2k, vocab3k, vocab4k, vocab8k, vocabOther, vocabUnk:
            values returned by ``calcPLex.calcLFP`` for the difficulty bins
            (1000, 2000, 3000, 4000, 8000).

    Returns:
        None.
    """
    lemmacats = list(itertools.chain.from_iterable(
        x.listLemmaCats() for x in self.paragraphs))

    try:
        # The fit can crash; in that case PLex falls back to 0.0.
        popt, _ = calcPLex.calcPLex(lemmacats, lemmacat2freqrank)
        self.PLex = popt[0]
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit still propagate.
        self.PLex = 0.0

    # NOTE(review): calcS is not guarded the way calcPLex is; if it can fail
    # in the same way, an exception here aborts the whole method -- confirm.
    popt, _ = calcPLex.calcS(lemmacats, lemmacat2freqrank)
    self.S = popt[0]

    # VOCD is only computed with at least 50 items; otherwise 0.0 is stored.
    if len(lemmacats) >= 50:
        self.vocd = calcPLex.getVOCD(lemmacats)
    else:
        self.vocd = 0.0

    self.mtld = calcPLex.getMTLD(lemmacats)

    # HDD is only computed with at least 42 items; otherwise 0.0 is stored.
    if len(lemmacats) >= 42:
        self.hdd = calcHDD.calcHDD(lemmacats)
    else:
        self.hdd = 0.0

    #self.maas = calcHDD.calcMaas(lemmacats) # don't think this works right atm

    # calcLFP returns one value per difficulty bin plus the "other" and
    # "unknown" values.
    vocabs, self.vocabOther, self.vocabUnk = calcPLex.calcLFP(
        lemmacats, lemmacat2freqrank,
        difficultybins=(1000, 2000, 3000, 4000, 8000))
    (self.vocab1k, self.vocab2k, self.vocab3k,
     self.vocab4k, self.vocab8k) = vocabs
Exemplo n.º 2
0
for testxml in testxmls:
    print "loading xml", testxml
    tok2finalforms, tok2lemmacats, verb2info, trees, (weight, maxweight) = getFinalTokenFormsAndTreesAndWeight(testxml)
    # each tok2lemmacat goes to a list to 2-tuples
    # print tok2lemmacats.values()
    for lemmacats in tok2lemmacats.values():
        wordforms.extend([x[0] + u"_" + x[1] for x in lemmacats])

print wordforms
print "tokens", len(wordforms)
print "types", len(set(wordforms))
popt, pcov = calcPLex.calcPLex( wordforms, lemmacat2freqrank)

print "popt", popt
print "pcov", pcov

print "S"
popt, pcov = calcPLex.calcS( wordforms, lemmacat2freqrank)

print "popt", popt
print "pcov", pcov


vocd = calcPLex.getVOCD( wordforms )
mtld = calcPLex.getMTLD( wordforms )

print "vocd:", vocd
print "mtld:", mtld