# Traverse the whole file, adding canonical forms of valid words into a
# dictionary counting the number of appearances.
d = {}
try:
    for line in txtsrc:
        # get rid of ASCII em and en dashes
        line = line.replace("---", " ").replace("--", " ")
        for word in line.split():
            clean_word = clean(word)
            # Ignore words that don't parse (clean() signals that with None);
            # add or update words that do parse.
            if clean_word is not None:
                incr(clean_word, d)
finally:
    # If we're not reading from a PDF, we have to close the file handle once
    # we're done counting all the words. The other three settings close
    # themselves. Doing this in a `finally` ensures the handle is released
    # even when clean() or incr() raises part-way through the corpus.
    # NOTE(review): the comment speaks of three other settings, but only the
    # `pdf` and `gutenberg` flags are checked here -- confirm against the
    # argument parser.
    if not (args.pdf or args.gutenberg):
        txtsrc.close()

# Abort if the query makes no sense. Note that we can't check this until we
# build the dictionary: it depends on the number of unique words.
if args.number > len(d):
    raise Exception('trying to compute the ' + str(args.number)
                    + ' most used words, but there are only '
                    + str(len(d)) + ' unique words in the corpus')

# print out the answer, with more or less verbosity
def incr_if_jj(d, word):
    """Count ``word`` in ``d`` when nltk tags it as ``"JJ"``.

    Words matched by the ``start`` or ``end`` patterns (defined elsewhere in
    this file) are skipped: these are the tags that nltk introduces for
    position in a context. Every other word is tagged on its own with
    ``nltk.pos_tag``; if the resulting tag is ``"JJ"`` (presumably the
    Penn Treebank adjective tag -- confirm against the tagger in use), the
    tagged token is handed to ``incr`` together with ``d``.

    Args:
        d: container passed straight through to ``incr`` (presumably the
            word -> count dictionary; ``incr`` is defined elsewhere).
        word: the single token to examine.
    """
    # Positional marker tags are not real words; bail out early.
    if start.search(word) or end.search(word):
        return

    # pos_tag works on a token list, so wrap the lone word and read back
    # the only (token, tag) entry it yields.
    entry = nltk.pos_tag([word])[0]
    if entry[1] == "JJ":
        incr(entry[0], d)