Example #1
def all_nword_freq_book(freqs, ngram):
    ## Compute, for each book, the relative frequency of every n-gram
    ## (up to length ngram) that appears in the freqs vocabulary.
    ## freqsarr[bookid - 1] is the per-book frequency dict.
    all_text = []
    freqsarr = []
    count = [0] * 66

    for bookid in range(1, 67):
        loc0 = er.Location([bookid, 1, 1])
        loc1 = er.Location([bookid, 100, 100])
        all_text.append(er.get_range_text(er.filenum, loc0, loc1)[0])
        count[bookid - 1] = word_count(all_text[-1])

    for bookid, chunk in enumerate(all_text):
        words = chunk.split()
        leng = len(words)
        chunkfreqs = dict()
        for m in range(len(words)):
            for n in range(1, ngram + 1):  ## n-grams of length 1..ngram
                if m + n <= leng:
                    nword = ' '.join(words[m:m + n])
                    if nword in freqs:  ## dict.has_key() does not exist in Python 3
                        if nword in chunkfreqs:
                            chunkfreqs[nword] += 1.0 / count[bookid]
                        else:
                            chunkfreqs[nword] = 1.0 / count[bookid]
        freqsarr.append(chunkfreqs)
    return freqsarr
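The er module above is project-specific, so the snippet is not runnable on its own. A minimal, self-contained sketch of the per-chunk n-gram counting it performs, with a hard-coded chunk list and vocabulary standing in for er and freqs (all names here are illustrative):

def ngram_freqs_sketch(chunks, vocab, ngram):
    ## For each chunk, accumulate the relative frequency of every n-gram
    ## (lengths 1..ngram) that also appears in vocab.
    per_chunk = []
    for chunk in chunks:
        words = chunk.split()
        total = len(words)
        counts = {}
        for m in range(len(words)):
            for n in range(1, ngram + 1):
                if m + n <= total:
                    phrase = ' '.join(words[m:m + n])
                    if phrase in vocab:
                        counts[phrase] = counts.get(phrase, 0.0) + 1.0 / total
        per_chunk.append(counts)
    return per_chunk

print(ngram_freqs_sketch(["in the beginning was the word",
                          "the word was with god"],
                         {"the word", "was"}, ngram=2))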
Example #2
def nword_freq_book(wp, flag):
    ## Compute the frequency of the word or phrase wp within each book.
    ## freqlist[bookid - 1] is the frequency for a single book.
    ## if flag == 0, case insensitive; if flag == 1, case sensitive
    freqlist = []

    for bookid in range(1, 67):
        loc0 = er.Location([bookid, 1, 1])
        loc1 = er.Location([bookid, 100, 100])
        text = er.get_range_text(er.filenum, loc0, loc1)[0]
        count = word_count(text)
        if flag:
            freqlist.append(float(len(re.findall(wp, text))) / count)
        else:
            freqlist.append(float(len(re.findall('(?i)' + wp, text))) / count)
    return freqlist
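The case handling above relies on re.findall and Python's inline (?i) flag. A tiny illustration with made-up text, independent of the er module:

import re

text = "The Word was the word."
pattern = r"\bword\b"
print(len(re.findall(pattern, text)))           ## case sensitive   -> 1
print(len(re.findall('(?i)' + pattern, text)))  ## case insensitive -> 2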
Example #3
File: stats.py  Project: shinan6/pgauthor
def even_word(leng):
    ## rank words from most evenly distributed to most variable
    ## (ascending coefficient of variation)
    loc0 = er.Location([1, 1, 1])
    loc1 = er.Location([100, 100, 100])

    text = er.get_range_text(er.filenum, loc0, loc1)[0]
    count = rf.word_count(text)
    ## cut texts into chunks of length leng
    textarr = rf.cut_text(text, leng)
    freqs, freqsarr = rf.all_word_freq(textarr, count, leng)

    wordstd = dict()
    for word in freqs.keys():
        wordstd[word] = comp_vari(rf.word_freq(word, freqsarr))
    rank_word = sorted(wordstd.items(),
                       key=operator.itemgetter(1),
                       reverse=False)

    return rank_word
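comp_vari is not shown in these examples; judging from the comments it measures how variable a word's frequency series is, e.g. a coefficient of variation. A minimal sketch under that assumption (this is a guess, not the project's actual helper):

import statistics

def comp_vari(freq_series):
    ## Coefficient of variation: population standard deviation over the mean.
    ## Assumed behaviour; the project's comp_vari may differ.
    mean = statistics.mean(freq_series)
    if mean == 0:
        return 0.0
    return statistics.pstdev(freq_series) / mean

print(comp_vari([0.010, 0.012, 0.011, 0.009]))  ## small value -> evenly distributed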
Example #4
File: stats.py  Project: shinan6/pgauthor
def even_nword(leng, ngram):
    ## rank n-grams from most evenly distributed to most variable
    ## (ascending coefficient of variation)
    loc0 = er.Location([1, 1, 1])
    loc1 = er.Location([100, 100, 100])

    text, punc_text = er.get_range_text(er.filenum, loc0, loc1)
    count = rf.word_count(text)

    ## cut texts into chunks of length leng
    nwordcv = dict()
    nwordcv0 = dict()
    nwordcv1 = dict()
    ## compute coefficient of variation based on book or word numbers
    if leng == 'book':
        leng = 1000
        textarr = rf.cut_text(text, leng)
        freqs, freqsarr0, freqsarr1 = rf.all_nword_freq(
            textarr, text, punc_text, ngram, leng)
        freqsarr = rf.all_nword_freq_book(freqs, ngram)
        for nword in freqs.keys():
            nwordcv[nword] = comp_vari(rf.word_freq(nword, freqsarr))
    else:
        textarr = rf.cut_text(text, leng)
        freqs, freqsarr0, freqsarr1 = rf.all_nword_freq(
            textarr, text, punc_text, ngram, leng)
        for nword in freqs.keys():
            nwordcv0[nword] = comp_vari(rf.word_freq(nword, freqsarr0))
            nwordcv1[nword] = comp_vari(rf.word_freq(nword, freqsarr1))

    rank_nword = sorted(nwordcv.items(),
                        key=operator.itemgetter(1),
                        reverse=False)
    rank_nword0 = sorted(nwordcv0.items(),
                         key=operator.itemgetter(1),
                         reverse=False)
    rank_nword1 = sorted(nwordcv1.items(),
                         key=operator.itemgetter(1),
                         reverse=False)

    return rank_nword, rank_nword0, rank_nword1
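A hypothetical call, assuming stats.py is importable and per-book statistics are wanted; since the lists are sorted by ascending coefficient of variation, slicing the head and tail previews the most even and most variable bigrams:

import stats

rank_nword, rank_nword0, rank_nword1 = stats.even_nword('book', 2)
print(rank_nword[:10])   ## most evenly distributed bigrams
print(rank_nword[-10:])  ## most variable bigrams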
Example #5
def word_phrase_freq(wp, loclist0, loclist1, flag, leng):
    ## single entry point: fetch the text range, chunk it, and compute
    ## the per-chunk frequency of wp
    ## if flag == 0, case insensitive; if flag == 1, case sensitive
    loc0 = er.Location(loclist0)
    loc1 = er.Location(loclist1)

    # text = er.get_range_text(er.filenum, loc0, loc1)[0]
    text, punc_text = er.get_range_text(er.filenum, loc0, loc1)
    count = word_count(text)
    ## cut texts into chunks of length leng
    textarr = cut_text(text, leng)
    # freqs, freqsarr = all_word_freq(textarr, count, leng)

    ## build freqlist with case-sensitive or case-insensitive matching, per flag
    if flag:
        freqlist = nword_freq(wp, textarr, leng)
    else:
        freqlist = nword_freq_no_case(wp, textarr, leng)

    ## show the plot if you want
    # plt.show()
    return freqlist
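A hypothetical call, reusing the [book, chapter, verse] location bounds from the earlier examples: per-chunk frequencies of an illustrative phrase over the whole range, matched case-insensitively, in chunks of 1000 words.

freqlist = word_phrase_freq('in the beginning',
                            [1, 1, 1], [100, 100, 100],
                            flag=0, leng=1000)
print(freqlist[:5])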