def selectTest():
    print "select test ..."
    book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)
    print "book size:", len(book)
    for idx_raw, text in enumerate(ieeePapers()):
        print text
        sentences = lang.getSentenceList(text)
        localHist = {}      # gram -> local frequency
        scoreByLang = {}    # gram -> accumulated linguistic score
        gramLeftRight = {}  # gram -> list of (left, right) context tokens
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist, tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            ngb = lang.ngrambounds(tokenstoplist)
            selectedngb = lang.filterAdj(ngb, s)
            selectedngb = lang.filterAdv(selectedngb, s)
            selectedngb = lang.filterSRS(selectedngb, tokenstoplist)
            for g, l, r in selectedngb:
                localHist[g] = localHist.get(g, 0) + 1
                scoreByLang[g] = scoreByLang.get(g, 0) + linguisticScore(g, l, r, tokenlist)
                if g not in gramLeftRight:
                    gramLeftRight[g] = []
                lefttoken = '<L>' + ('#BEGIN' if l == 0 else tokenlist[l - 1])
                righttoken = '<R>' + ('#END' if r >= (len(tokenlist) - 1) else tokenlist[r + 1])
                gramLeftRight[g].append((lefttoken, righttoken))
        # scores
        scoreByDF = {}
        totalDF = 0
        for g in localHist:
            scoreByDF[g] = book.get(g, 0)
            totalDF += scoreByDF[g]  # accumulate (was: totalDF = scoreByDF[g])
        if not scoreByDF:
            continue  # guard: no grams selected in this document
        averageDF = float(totalDF) / len(scoreByDF)
        sortedByDF = sorted(scoreByDF.items(), key=lambda x: x[1], reverse=True)
        print sortedByDF
        print "average DF", averageDF
        print "gram with DF above average"
        print [(g, count) for (g, count) in sortedByDF if count > averageDF]
        print "gram with DF below average"
        print [(g, count) for (g, count) in sortedByDF if count <= averageDF]
        print "lang score:"
        print scoreByLang
        print "gram left right"
        print gramLeftRight
        pause()
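
# ---------------------------------------------------------------------------
# linguisticScore() is called in selectTest() but not defined in this file.
# A minimal sketch of what such a scorer might look like, assuming it rewards
# grams that sit at a sentence boundary and longer grams. The name and the
# weighting are assumptions for illustration, not the actual implementation.
def linguisticScoreSketch(g, l, r, tokenlist):
    score = 0
    if l == 0:
        score += 1              # gram starts the sentence
    if r >= len(tokenlist) - 1:
        score += 1              # gram ends the sentence
    score += len(g.split())     # longer grams score higher
    return score
# ---------------------------------------------------------------------------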
def findDFOfGrams():
    gramHist = {}  # gram -> document frequency
    for idx_raw, text in enumerate(ieeePapers()):
        localGramHist = {}  # gram -> count within this document
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist, tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            ngb = lang.ngrambounds(tokenstoplist)
            selectedngb = lang.filterAdj(ngb, s)
            selectedngb = lang.filterAdv(selectedngb, s)
            selectedngb = lang.filterSRS(selectedngb, tokenstoplist)
            for g, l, r in selectedngb:
                localGramHist[g] = localGramHist.get(g, 0) + 1
                # also count 2-, 3- and 4-word sub-grams that end on a
                # noun-like token
                words = g.split()
                for sublen in (2, 3, 4):
                    if len(words) < sublen + 1:
                        break
                    for ii in range(len(words) - sublen + 1):
                        # poslist is indexed by sentence position, so offset by
                        # the gram's left bound l (the original indexed it by
                        # the gram-local position, which looks like a bug)
                        posEndingWord = poslist[l + ii + sublen - 1]
                        if "N" in posEndingWord or "X" in posEndingWord:
                            gg = " ".join(words[ii:ii + sublen])
                            localGramHist[gg] = localGramHist.get(gg, 0) + 1
        # save the local grams: each gram counted once per document
        for g in localGramHist:
            gramHist[g] = gramHist.get(g, 0) + 1
        peek(idx_raw + 1, 2000)
    kbay.saveDF(gramHist, 'ieeeGramDF.txt', sort=False, numDoc=idx_raw + 1)
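
# ---------------------------------------------------------------------------
# peek() is used above as a progress indicator but is defined elsewhere in
# the repo. A plausible minimal version, assuming it simply reports every
# `step` items (an assumption, not the actual helper):
def peekSketch(count, step):
    if count % step == 0:
        print "processed", count, "documents"
# ---------------------------------------------------------------------------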
def ngramsOfSentence(s):
    tokenlist = lang.tokenize(s)
    # skip sentences that are too short or too long to be useful
    if len(tokenlist) <= 5 or len(tokenlist) > 80:
        return None
    tokenstoplist = lang.markStops(tokenlist)
    poslist = lang.posLookup(tokenlist)
    tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
    tokenMarkList = lang.markStopOnNonending(tokenlist, poslist, tokenstoplist)
    tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
    tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
    ngb = lang.ngrambounds(tokenstoplist)
    selectedngb = lang.filterAdj(ngb, s)
    selectedngb = lang.filterAdv(selectedngb, s)
    # filterVerb disabled: it incorrectly dropped grams like "contrast", "field"
    #selectedngb = lang.filterVerb(selectedngb, s, verbose=0)
    selectedngb = lang.filterSRS(selectedngb, tokenstoplist)
    return [t[0] for t in selectedngb]
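
# Example use of ngramsOfSentence(), assuming the lang pipeline above is
# available; the sample sentence and its output are illustrative only:
#   >>> ngramsOfSentence("We propose a novel method for robust image segmentation.")
#   ['novel method', 'robust image segmentation']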
def countWordFreq():
    WordBook = {}  # word -> frequency
    GramBook = {}  # 2-gram -> frequency
    TotalCnt = 0
    for idx, text in enumerate(loopWikiData()):
        sentences = getSentences(text)
        for s in sentences:
            words = lang.tokenize(s)
            # replace non-words with ',' so they act as gram boundaries
            words = [w if __goodword(w) else ',' for w in words]
            # count single words
            for w in words:
                WordBook[w] = WordBook.get(w, 0) + 1
            TotalCnt += len(words)
            # count 2-grams
            tokenstoplist = lang.markStops(words)
            ngb = lang.ngrambounds(tokenstoplist)
            for (gram, leftidx, rightidx) in ngb:
                if rightidx > leftidx:
                    for ii in range(rightidx - leftidx):
                        twogram = words[leftidx + ii] + ' ' + words[leftidx + ii + 1]
                        GramBook[twogram] = GramBook.get(twogram, 0) + 1
        if idx % 1000 == 0:
            print "finished", idx, "files"
        if idx == 100000:
            break
    SaveFile = 'wiki_WordFreq.txt'
    with codecs.open(SaveFile, 'w', 'utf-8') as f:
        for word, count in WordBook.items():
            f.write('%s::%s\n' % (word, count))
        f.write('#TOTAL_NUMBER_OF_WORDS=%s' % TotalCnt)
    with codecs.open('wiki_GramFreq.txt', 'w', 'utf-8') as f:
        for word, count in GramBook.items():
            f.write('%s::%s\n' % (word, count))
    print "dict size %s, saved to %s" % (len(WordBook), SaveFile)
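
# ---------------------------------------------------------------------------
# __goodword() is referenced in countWordFreq() but not defined in this
# section. A minimal sketch, assuming a "good" word is purely alphabetic and
# not implausibly long (both criteria are assumptions):
def __goodwordSketch(w):
    return w.isalpha() and len(w) < 30
# ---------------------------------------------------------------------------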
def findNgrams(DO_COOCUR=False, ROUND=1):
    NGRAMS_DOC = {}
    COOCUR_S = {}  # co-occurrence in a sentence
    NGRAM_LR = {}  # gram -> left/right context profile
    for idx, text in enumerate(textFiles()):
        peek(idx + 1, 1000)
        #if idx > 1000: break
        ngram_local = {}
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posnize(tokenlist)
            # skip sentences that are too short or too long to be useful
            if len(tokenlist) <= 5 or len(tokenlist) > 80:
                continue
            tokenstoplist = lang.markStops(tokenlist)
            if 0:  # debug
                print "###### text ######", idx
                print text
                print tokenlist, len(tokenlist)
                print ["(%s, %s)" % (t, poslist[i]) for i, t in enumerate(tokenlist)]
                print "stops:", tokenstoplist
            ngb = lang.ngrambounds(tokenstoplist)
            selectedngb = lang.filterSRS(ngb, tokenstoplist)
            selectedngb = lang.filterAdj(selectedngb, s)
            selectedngb = lang.filterAdv(selectedngb, s)
            # filterVerb disabled: it incorrectly dropped grams like "contrast", "field"
            #selectedngb = lang.filterVerb(selectedngb, s, verbose=0)
            # run SRS filtering again after the pure POS-based filters
            selectedngb = lang.filterSRS(selectedngb, tokenstoplist)
            if ROUND == 1:
                # 1st round: profile the words before/after each gram
                for (gram, leftidx, rightidx) in selectedngb:
                    nextword = lang.nextword(rightidx, tokenlist)
                    prevword = lang.prevword(leftidx, tokenlist)
                    nextwordcode = lang.profilingCode(nextword)
                    prevwordcode = lang.profilingCode(prevword)
                    kbay.inc3d(gram, '_', '_', NGRAM_LR)  # '_' counts the gram itself
                    kbay.inc3d(gram, 'l', prevwordcode, NGRAM_LR)
                    kbay.inc3d(gram, 'r', nextwordcode, NGRAM_LR)
                    if lang.isSubject(leftidx, rightidx, tokenlist):
                        kbay.inc3d(gram, '_', 's', NGRAM_LR)
            if ROUND == 2:
                # 2nd round: justify each gram against the profiled contexts
                for b in selectedngb:
                    print "check this:", b
                    sg = grammer.subgram(b[0], b[1], b[2], READIN_GRAM_LR, tokenlist, poslist)
                    if sg:
                        print "gram", b, "subgram", sg
                        raw_input()
            ngrams = [t[0] for t in selectedngb]
            ngrams = [g for g in ngrams if len(g.split()) > 1]
            kbay.count(ngrams, ngram_local)
            if DO_COOCUR:
                for n1 in ngrams:
                    for n2 in ngrams:
                        kbay.inc2d(n1, n2, COOCUR_S)
        # doc. freq. - each gram counted only once per document
        kbay.count(ngram_local, NGRAMS_DOC)
    kbay.saveHist3D(NGRAM_LR, SAVEDIR + 'hist.txt')
    # optional df-doc filtering/saving, disabled:
    #filterHistByCount(NGRAMS_DOC, 2, verbose=False)
    #kbay.saveDF(NGRAMS_DOC, SAVEDIR+SrcFileName+'_ngrams_df_doc.txt', sort=False, numDoc=idx)
    if DO_COOCUR:
        print "filter coocur"
        kbay.filter2DHistByCount(COOCUR_S, 2, verbose=True)
        kbay.saveDF2D(COOCUR_S, SAVEDIR + SrcFileName + '_ngrams_coocur.txt')
    print "DONE findNgrams"
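
# ---------------------------------------------------------------------------
# kbay.inc2d/inc3d are nested-histogram helpers; their semantics can be read
# off the call sites above. Equivalent minimal versions (sketches of the
# assumed behaviour, not kbay's actual code):
def inc2dSketch(a, b, hist):
    hist.setdefault(a, {})
    hist[a][b] = hist[a].get(b, 0) + 1

def inc3dSketch(a, b, c, hist):
    hist.setdefault(a, {}).setdefault(b, {})
    hist[a][b][c] = hist[a][b].get(c, 0) + 1
# ---------------------------------------------------------------------------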
def selectGrams():
    klog.msg('select grams')
    book = fif.readWordCoocur('tmp_ieee_coocur_abstractwide_words_4000.txt')
    #book = fif.readWordCoocur('tmp_ieee_coocur_abstractwide_word_bymemo.txt', filtervalue=2)
    CoHist = {}  # gram-gram co-occurrence
    CoWord = {}  # word-word co-occurrence
    klog.msg('looping files')
    for idx_raw, text in enumerate(ieeePapers()):
        localGramHist = {}  # gram -> count
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist, tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            ngb = lang.ngrambounds(tokenstoplist)
            selectedngb = lang.filterAdj(ngb, s)
            selectedngb = lang.filterAdv(selectedngb, s)
            selectedngb = lang.filterSRS(selectedngb, tokenstoplist)
            for g, l, r in selectedngb:
                localGramHist[g] = localGramHist.get(g, 0) + 1
        if 0:  # debug
            print text
            print "#.localgrams:", len(localGramHist)
            print localGramHist
            print "#.ngram:", len([1 for g in localGramHist if ' ' in g])
            pause()
        #kbay.countCooccur(localGramHist, CoHist)
        # score each gram by mutual information against the other local grams
        gramlist = localGramHist.keys()
        if not gramlist:
            continue  # guard: no grams selected in this document
        gramscore = [relativeInfo(g, gramlist, book) for g in gramlist]
        print sorted(gramscore, key=lambda x: x[1])
        averageScore = sum(g[1] for g in gramscore) / len(gramscore)
        print "above average:", averageScore
        print [g for g in gramscore if g[1] > averageScore]
        pause()
        wordset = set(w for g in localGramHist for w in g.split())
        kbay.countCooccur(wordset, CoWord)
        peek(idx_raw + 1, 1000)
        if (idx_raw + 1) % 4000 == 0:
            #mem.refreshMemory2D(CoWord, steps=idx_raw, prefilter=True)
            #h, hh = len(CoWord), kbay.sizeOf2DHist(CoWord)
            #print "hist size", h, hh, hh/h
            #mem.refreshMemory2DFirstOrder(CoWord, steps=idx_raw)
            kbay.saveDF2D(CoWord, 'tmp_ieee_coocur_abstractwide_words_%s.txt' % (idx_raw + 1))
            CoWord = {}  # reset
            break        # stop after the first snapshot
        if 0:
            if (idx_raw + 1) % 40000 == 0:
                kbay.saveDF2D(CoHist, 'tmp_ieee_coocur_abstractwide_%s.txt' % (idx_raw + 1))
                CoHist = {}  # reset
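
# ---------------------------------------------------------------------------
# relativeInfo() is called in selectGrams() but defined elsewhere. Given the
# "mutual information" comment and that `book` holds word co-occurrence
# counts, a plausible sketch scores a gram by how often its words co-occur
# with the words of the other grams in the same document; the pairing and
# normalization scheme here are assumptions, not the actual implementation:
def relativeInfoSketch(g, gramlist, book):
    mywords = g.split()
    otherwords = set(w for gg in gramlist if gg != g for w in gg.split())
    score = 0.0
    for w1 in mywords:
        for w2 in otherwords:
            score += book.get(w1, {}).get(w2, 0)
    if otherwords:
        score /= len(mywords) * len(otherwords)
    return (g, score)
# ---------------------------------------------------------------------------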