# Count annotated items per rounded readability class, gather every token seen
# in their texts, and build the Naive Bayes input from the collected documents.
# NOTE(review): relies on itemlist, documents, allTokensSet, dataDir and fo
# being set up earlier in the file; the nesting below was reconstructed from a
# whitespace-collapsed source -- confirm against the original layout.
classes = collections.defaultdict(int)
for item in itemlist:
    if getElement(item, 'status') == "Annotated":
        url = getElement(item, 'url')  # fetched but not used further in this fragment
        title = getElement(item, 'filename')
        readability = getElement(item, 'readability')
        score = float(readability)
        # Adding 0.5 before truncating rounds non-negative scores to the
        # nearest integer class.
        classes[int(score + 0.5)] += 1

        textObj = processFile(title, dataDir, fo, score,
                              removeWikiGarbage=False, wiki=False)
        tokens = textObj.tokens()
        documents.append(doc(title, readability, tokens))
        allTokensSet.update(tokens.keys())

# The vocabulary is handed over as a list; a previously disabled alternative
# call here was buildSVMInput(documents, allTokens).
allTokens = list(allTokensSet)
buildNBInput(documents, allTokens)

# Single-argument print(...) produces the same output under Python 2.
for label, count in classes.items():
    print("CLASS %s - %d items" % (label, count))
from __future__ import division import os import re import sys from myWikiText import MyWikiText from processFile import * if __name__ == "__main__": if len(sys.argv) < 2: paths = [ "data/en", "data/simple" ] else: paths = sys.argv[1:] print paths for path in paths: dataDir = path outFileName = re.sub("/","",path) fo = open(outFileName + ".out", "w") print "Writing %s.out" % (outFileName) printLabels(fo) for title in os.listdir(dataDir): # print title processFile(title, dataDir, fo, 0.0, removeWikiGarbage=True, wiki=True) fo.close()
import os
import re
import sys

from myWikiText import MyWikiText
from processFile import *

# Script entry point: write a header row to stdout and then run processFile on
# one hard-coded article so its output lands on stdout as well.
if __name__ == "__main__":
    dataDir = "data/simple"
    title = "Zoonosis.simple"
    fo = sys.stdout

    # Column labels of the header row, in output order. The spelling is emitted
    # verbatim, so it is left exactly as it was.
    columns = (
        "filename",
        "numWords",
        "numSentences",
        "numSyllables",
        "avgWordLengthSyl",
        "avgWordLengthInChars",
        "avgSenLengthInChars",
        "avgWordsPerSentece",
        "avgSyllablesPerSentence",
        "fleschReadingEase",
        "fleschKincaidGradeLevel",
        "colemanLiauIndex",
        "lixIndex",
    )
    # Fields are separated by three commas and the row ends with a newline.
    fo.write(",,,".join(columns) + "\n")

    myWikiText = processFile(title, dataDir, fo)
    # Note: fo is sys.stdout here, so this closes the process's stdout.
    fo.close()