Example #1
0
def findDFOfWords():
    DF = {}
    for nFiles, text in enumerate(ieeePapers()):
        sentences = lang.getSentenceList(text)
        words = set(sum([wordlist(s, regular=True) for s in sentences], []))
        for w in words:
            DF[w] = DF.get(w, 0) + 1
        if nFiles % 1000 == 0: print "finished", nFiles, "files"
    print "finished total", nFiles, "files"
    kbay.saveDF(DF,
                _SAVEDIR_ + SrcFileName + '_df.txt',
                sort=False,
                numDoc=idx)
Example #2
0
File: tf.py  Project: kunlubrain/legacy
def df_words():

    print "find df of words..."
    DF = {}
    for nFiles, text in enumerate(textFiles()):

        sentences = lang.getSentenceList(text)
        words = set(sum([wordlist(s, regular=True) for s in sentences], []))
        kbay.count(words, DF)

        peek(nFiles, 1000)

    print "finished total", nFiles, "files"
    kbay.saveDF(DF, SAVEDIR+SrcFileName+'_df.txt', sort=False, numDoc=nFiles)
Example #3
0
def findDFOfGrams():
    """Compute the document frequency (DF) of n-grams over all IEEE papers.

    For each paper, a local histogram of extracted grams (plus their embedded
    2/3/4-word sub-grams) is built; each distinct gram then contributes
    exactly 1 to the global DF regardless of its local count.  The table is
    written to 'ieeeGramDF.txt' via kbay.saveDF.
    """
    gramHist = {}  # gram -> number of documents containing it
    for idx_raw, text in enumerate(ieeePapers()):
        localGramHist = {}  # gram -> count within the current document
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            # Mark stop positions, then refine them with verb marks and
            # POS-based "non-ending" marks before extracting gram bounds.
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist,
                                                     tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            # Candidate gram boundaries, successively filtered by the
            # adjective / adverb / SRS heuristics in lang.
            ngb = lang.ngrambounds(tokenstoplist)
            selecedngb = lang.filterAdj(ngb, s)
            selecedngb = lang.filterAdv(selecedngb, s)
            selecedngb = lang.filterSRS(selecedngb, tokenstoplist)
            for g, l, r in selecedngb:
                localGramHist[g] = localGramHist.get(g, 0) + 1
                words = g.split()
                # For longer grams, also count embedded sub-grams of length
                # 2/3/4 whose final word has a nominal-looking POS tag
                # ("N" or "X" in the tag string).
                # NOTE(review): poslist is indexed by sentence position, but
                # ii is an offset within the gram; this likely should use the
                # gram's left bound, e.g. poslist[l + ii + 1] -- confirm
                # against lang.ngrambounds before changing.
                if len(words) >= 3:
                    for ii, w in enumerate(words[:-1]):
                        posEndingWord = poslist[ii + 1]
                        if "N" in posEndingWord or "X" in posEndingWord:
                            gg = " ".join(words[ii:ii + 2])
                            localGramHist[gg] = localGramHist.get(gg, 0) + 1
                if len(words) >= 4:
                    for ii, w in enumerate(words[:-2]):
                        posEndingWord = poslist[ii + 2]
                        if "N" in posEndingWord or "X" in posEndingWord:
                            gg = " ".join(words[ii:ii + 3])
                            localGramHist[gg] = localGramHist.get(gg, 0) + 1
                if len(words) >= 5:
                    for ii, w in enumerate(words[:-3]):
                        posEndingWord = poslist[ii + 3]
                        if "N" in posEndingWord or "X" in posEndingWord:
                            gg = " ".join(words[ii:ii + 4])
                            localGramHist[gg] = localGramHist.get(gg, 0) + 1

        # Fold per-document grams into the global DF: +1 per document,
        # independent of how many times the gram occurred locally.
        for g in localGramHist:
            gramHist[g] = gramHist.get(g, 0) + 1

        peek(idx_raw + 1, 2000)  # progress report every 2000 documents

    # NOTE(review): numDoc=idx_raw is the last 0-based enumerate index, i.e.
    # one less than the number of documents processed -- likely off by one
    # (compare peek(idx_raw + 1, ...) above).
    kbay.saveDF(gramHist, 'ieeeGramDF.txt', sort=False, numDoc=idx_raw)
Example #4
0
def filterGramDF():
    """Load the gram DF table with fif's filtervalue=3 threshold applied,
    then save the filtered table under a new name (numDoc unchanged)."""
    source_path = 'ieeeGramDF.txt'
    target_path = 'ieeeGramDF_above3.txt'
    filtered = fif.readWithFilter(source_path, filtervalue=3)
    kbay.saveDF(filtered, target_path, sort=False, numDoc=460035)