import re
import operator

## `er` (Location, get_range_text, filenum) and bare helpers such as
## word_count, cut_text, and comp_vari are defined in other modules of this
## repo; `rf` below refers to the module containing these frequency functions.


def all_nword_freq_book(freqs, ngram):
    ## For every n-gram in `freqs`, compute its relative frequency within
    ## each of the 66 books; freqsarr[bookid - 1] maps n-gram -> frequency
    ## for one single book.
    all_text = []
    freqsarr = []
    count = [0] * 66
    for bookid in range(1, 67):
        loc0 = er.Location([bookid, 1, 1])
        loc1 = er.Location([bookid, 100, 100])
        all_text.append(er.get_range_text(er.filenum, loc0, loc1)[0])
        count[bookid - 1] = word_count(all_text[-1])
    bookid = 0
    for chunk in all_text:
        words = chunk.split()
        leng = len(words)
        chunkfreqs = dict()
        for m in range(len(words)):
            ## n-grams of length 1..ngram starting at position m (the
            ## original range(0, ngram) slice produced empty strings and
            ## never reached full ngram length)
            for n in range(1, ngram + 1):
                if m + n <= leng:
                    nword = ' '.join(words[m:m + n])
                    if nword in freqs:  ## Python 3: `in` replaces has_key()
                        if nword in chunkfreqs:
                            chunkfreqs[nword] += 1.0 / count[bookid]
                        else:
                            chunkfreqs[nword] = 1.0 / count[bookid]
        bookid += 1
        freqsarr.append(chunkfreqs)
    return freqsarr

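## Usage sketch (added for illustration, not in the original source): build a
## bigram vocabulary first, then get its per-book distribution. The chunk
## size 1000 and the call signature of all_nword_freq are taken from
## even_nword() below; treat them as assumptions.
def demo_all_nword_freq_book():
    loc0 = er.Location([1, 1, 1])
    loc1 = er.Location([100, 100, 100])
    text, punc_text = er.get_range_text(er.filenum, loc0, loc1)
    textarr = cut_text(text, 1000)
    freqs, _, _ = all_nword_freq(textarr, text, punc_text, 2, 1000)
    freqsarr = all_nword_freq_book(freqs, 2)
    ## freqsarr[0] maps each known bigram to its frequency in book 1
    return freqsarr
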
def nword_freq_book(wp, flag):
    ## Frequency of a word or phrase (regex `wp`) within each book;
    ## freqlist[bookid - 1] is the relative frequency for one single book.
    ## If flag == 0, case insensitive; if flag == 1, case sensitive.
    freqlist = []
    for bookid in range(1, 67):
        loc0 = er.Location([bookid, 1, 1])
        loc1 = er.Location([bookid, 100, 100])
        text = er.get_range_text(er.filenum, loc0, loc1)[0]
        count = word_count(text)
        if flag:
            freqlist.append(float(len(re.findall(wp, text))) / count)
        else:
            ## the (?i) prefix makes the pattern case insensitive
            freqlist.append(float(len(re.findall('(?i)' + wp, text))) / count)
    return freqlist

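## Usage sketch (illustration only): per-book relative frequency of a phrase,
## case insensitive. The query phrase is hypothetical.
def demo_nword_freq_book():
    freqlist = nword_freq_book('in the beginning', 0)
    for bookid, freq in enumerate(freqlist, start=1):
        print(bookid, freq)
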
def even_word(leng):
    ## Rank all words from most evenly distributed (lowest coefficient of
    ## variation across chunks) to most variable; the head of the ranking
    ## gives the most even words, the tail the most varied ones.
    loc0 = er.Location([1, 1, 1])
    loc1 = er.Location([100, 100, 100])
    text = er.get_range_text(er.filenum, loc0, loc1)[0]
    count = rf.word_count(text)
    ## cut the text into chunks of length leng
    textarr = rf.cut_text(text, leng)
    freqs, freqsarr = rf.all_word_freq(textarr, count, leng)
    wordstd = dict()
    for word in freqs.keys():
        wordstd[word] = comp_vari(rf.word_freq(word, freqsarr))
    rank_word = sorted(wordstd.items(), key=operator.itemgetter(1))
    return rank_word

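## Usage sketch: the "ten most evenly distributed words" are the head of the
## ranking; the tail holds the most variable ones. The chunk size 1000 is an
## arbitrary choice for illustration.
def demo_even_word():
    rank_word = even_word(1000)
    most_even = rank_word[:10]     ## lowest coefficient of variation
    most_varied = rank_word[-10:]  ## highest coefficient of variation
    return most_even, most_varied
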
def even_nword(leng, ngram):
    ## Rank n-grams from most evenly distributed to most variable.
    ## If leng == 'book', the coefficient of variation is computed per book;
    ## otherwise per chunk of `leng` words, both with and without punctuation.
    loc0 = er.Location([1, 1, 1])
    loc1 = er.Location([100, 100, 100])
    text, punc_text = er.get_range_text(er.filenum, loc0, loc1)
    nwordcv = dict()
    nwordcv0 = dict()
    nwordcv1 = dict()
    ## compute the coefficient of variation per book or per fixed-size chunk
    if leng == 'book':
        leng = 1000
        ## chunking is still needed here to build the n-gram vocabulary freqs
        textarr = rf.cut_text(text, leng)
        freqs, freqsarr0, freqsarr1 = rf.all_nword_freq(
            textarr, text, punc_text, ngram, leng)
        freqsarr = rf.all_nword_freq_book(freqs, ngram)
        for nword in freqs.keys():
            nwordcv[nword] = comp_vari(rf.word_freq(nword, freqsarr))
    else:
        textarr = rf.cut_text(text, leng)
        freqs, freqsarr0, freqsarr1 = rf.all_nword_freq(
            textarr, text, punc_text, ngram, leng)
        for nword in freqs.keys():
            nwordcv0[nword] = comp_vari(rf.word_freq(nword, freqsarr0))
            nwordcv1[nword] = comp_vari(rf.word_freq(nword, freqsarr1))
    rank_nword = sorted(nwordcv.items(), key=operator.itemgetter(1))
    rank_nword0 = sorted(nwordcv0.items(), key=operator.itemgetter(1))
    rank_nword1 = sorted(nwordcv1.items(), key=operator.itemgetter(1))
    return rank_nword, rank_nword0, rank_nword1

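## Usage sketch: per-book evenness for bigrams versus per-chunk evenness
## (with and without punctuation). The chunk size 500 is illustrative; note
## that only one of the two rankings is populated per call.
def demo_even_nword():
    rank_by_book, _, _ = even_nword('book', 2)  ## CV computed per book
    _, rank0, rank1 = even_nword(500, 2)        ## CV per 500-word chunk
    return rank_by_book[:10], rank0[:10], rank1[:10]
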
def word_phrase_freq(wp, loclist0, loclist1, flag, leng):
    ## End-to-end helper: fetch the text between two locations, cut it into
    ## chunks of `leng` words, and return the frequency of the word or
    ## phrase `wp` in each chunk.
    ## If flag == 0, case insensitive; if flag == 1, case sensitive.
    loc0 = er.Location(loclist0)
    loc1 = er.Location(loclist1)
    text, punc_text = er.get_range_text(er.filenum, loc0, loc1)
    ## cut the text into chunks of length leng
    textarr = cut_text(text, leng)
    ## return freqlist based on the casing of words
    if flag:
        freqlist = nword_freq(wp, textarr, leng)
    else:
        freqlist = nword_freq_no_case(wp, textarr, leng)
    ## show the plot if you want
    # plt.show()
    return freqlist

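## Usage sketch: frequency of a phrase in 500-word chunks between two
## locations. Reading the locations as [book, chapter, verse] triples is an
## assumption inferred from the Location calls above; the specific values
## here are hypothetical.
def demo_word_phrase_freq():
    freqlist = word_phrase_freq(
        'in the beginning', [1, 1, 1], [66, 22, 21], 0, 500)
    return freqlist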