def coocurWordFirstOrder():
    klog.msg('find word coocur')
    Hist = {}
    nNumSentences = 0
    for nFiles, text in enumerate(textFiles()):
        if nFiles > 100000:
            break
        sentences = lang.getSentenceList(text)
        nNumSentences += len(sentences)
        if nFiles % 1000 == 1:
            print nFiles, " files: #.sentences", nNumSentences
        for ids, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            tokenlist = lang.regularwords(tokenlist)
            selected = [t for t in tokenlist if not lang.isNoise(t)]
            #kbay.countCooccur(selected, Hist)
            kbay.countCooccurNearest(selected, Hist, nearest=10)
        if nFiles % 1000 == 1:
            # periodically decay and prune the histogram to bound memory use
            refreshMemory2D(Hist, steps=nFiles, prefilter=True)
            h, hh = len(Hist), kbay.sizeOf2DHist(Hist)
            print "hist size", h, hh, hh / h
        peek(nFiles + 1, 1000)
    print "number of sentences:", nNumSentences
    klog.msg("finished total %s files" % nFiles)
    refreshMemory2D(Hist, steps=nFiles, prefilter=True)
    kbay.filter2DHistByCount(Hist, 3, verbose=True)
    kbay.saveDF2D(Hist, SAVEDIR + SrcFileName + '_word_coocur.txt')
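# kbay is an external helper module. The sketch below shows what
# countCooccurNearest is assumed to do, judging from its use above: each
# token co-occurs with up to `nearest` following tokens, accumulated into a
# symmetric 2D histogram {w1: {w2: count}}. The actual kbay implementation
# may differ.
def countCooccurNearest_sketch(tokens, Hist, nearest=10):
    for i, w1 in enumerate(tokens):
        for w2 in tokens[i + 1:i + 1 + nearest]:
            if w1 == w2:
                continue
            row = Hist.setdefault(w1, {})
            row[w2] = row.get(w2, 0) + 1
            row = Hist.setdefault(w2, {})
            row[w1] = row.get(w1, 0) + 1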
def findRelatedGrams():
    klog.msg('find co-exist terms')
    Hist = {}
    for idx_raw, text in enumerate(ieeePapers()):
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            grams = ngramsOfSentence(s)
            kbay.countCooccurConsideringSubgram(grams, Hist, gramDF=GRAM_DF, debug=0)
        if idx_raw % 1000 == 0:
            # periodically decay and prune the histogram to bound memory use
            mem.refreshMemory2D(Hist, steps=idx_raw, prefilter=True)
            h, hh = len(Hist), kbay.sizeOf2DHist(Hist)
            print "hist size", h, hh, hh / h
        peek(idx_raw + 1, 1000)
        if idx_raw > 200000:
            break
    kbay.filter2DHistByCount(Hist, 2, verbose=True)
    kbay.saveDF2D(Hist, 'tmp_ieee_occur_all.txt')
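# Sketches of the kbay 2D-histogram utilities used above, written from their
# observed usage (assumed behavior, not the actual kbay code): sizeOf2DHist
# returns the total number of cells, and filter2DHistByCount prunes rare
# pairs in place.
def sizeOf2DHist_sketch(Hist):
    return sum(len(row) for row in Hist.itervalues())

def filter2DHistByCount_sketch(Hist, mincount, verbose=False):
    for w1 in Hist.keys():
        row = Hist[w1]
        for w2 in row.keys():
            if row[w2] < mincount:
                del row[w2]
        if not row:
            del Hist[w1]
    if verbose:
        print "kept", len(Hist), "rows after dropping counts below", mincount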
def memorizeCogram():
    book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)
    memo = mem.Memory()
    memo.setInitialCapacity(200)
    for idx_raw, text in enumerate(ieeePapers()):
        #if idx_raw < 220000: continue  # resume point for restarts
        sentences = lang.getSentenceList(text)
        gramsPreviousSentence = set([])
        for idofs, s in enumerate(sentences):
            grams = ngramsOfSentence(s)
            if not grams:
                continue
            goodgrams = set([g for g in grams if g in book])
            memo.learnSymbList(goodgrams)
            # cross-learn between adjacent sentences: grams of the previous
            # sentence learn grams of the current one, and vice versa
            memo.crosslearn(gramsPreviousSentence, goodgrams, crossweight=1)
            if 0 and len(list(gramsPreviousSentence) + list(goodgrams)) == 1:
                # debug toggle: inspect the rare case of a single gram
                # across two adjacent sentences
                print "only 1 gram in two sentences!!!"
                print "sentence:", s
                print "grams before filtering:", grams
                print "grams after filtering", goodgrams
                if idofs > 0:
                    print "previous sentence:", sentences[idofs - 1]
                    print "previous grams before filtering:", ngramsOfSentence(sentences[idofs - 1])
                    print "previous grams after filtering:", gramsPreviousSentence
                pause()
            gramsPreviousSentence = goodgrams
        peek(idx_raw + 1, 2000)
        if (idx_raw + 1) % 2000 == 0:
            memo.refresh()
            memo.showsize()
        #if idx_raw > 6000: break
    kbay.saveDF2D(memo.LTM, 'tmp_ieee_coocur_abstractwide_grams.txt')
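# mem.Memory is defined elsewhere. From its use above, crosslearn is assumed
# to add cross-sentence associations into the long-term memory table: every
# gram of the previous sentence learns every gram of the current one, and
# vice versa. A hypothetical sketch over a {symb: {symb: weight}} LTM:
def crosslearn_sketch(LTM, symbsA, symbsB, crossweight=1):
    for a in symbsA:
        for b in symbsB:
            if a == b:
                continue
            row = LTM.setdefault(a, {})
            row[b] = row.get(b, 0) + crossweight
            row = LTM.setdefault(b, {})
            row[a] = row.get(a, 0) + crossweight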
def memorizeCoword():
    memo = mem.Memory()
    memo.setInitialCapacity(200)
    book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)
    for idx_raw, text in enumerate(ieeePapers()):
        #if idx_raw < 70000: continue
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            grams = ngramsOfSentence(s)
            if not grams:
                continue
            words = set(' '.join(grams).split())
            words = [w for w in words if w in book]
            memo.learnSymbList(words)
        peek(idx_raw + 1, 2000)
        if (idx_raw + 1) % 2000 == 0:
            memo.refresh()
            memo.showsize()
    kbay.saveDF2D(memo.LTM, 'tmp_ieee_coocur_abstractwide_word_bymemo.txt')
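# A hypothetical minimal Memory, consistent with how memo is driven above:
# learnSymbList() counts within-list pairs into LTM, and refresh() prunes
# weak associations so the table stays bounded. The real mem.Memory is more
# elaborate; this only illustrates the interface.
class MemorySketch(object):
    def __init__(self):
        self.capacity = 200
        self.LTM = {}

    def setInitialCapacity(self, capacity):
        self.capacity = capacity

    def learnSymbList(self, symbs):
        symbs = list(symbs)
        for i, a in enumerate(symbs):
            for b in symbs[i + 1:]:
                rowa = self.LTM.setdefault(a, {})
                rowa[b] = rowa.get(b, 0) + 1
                rowb = self.LTM.setdefault(b, {})
                rowb[a] = rowb.get(a, 0) + 1

    def refresh(self):
        # prune singleton associations from oversized rows
        for a in self.LTM.keys():
            row = self.LTM[a]
            if len(row) > self.capacity:
                for b in row.keys():
                    if row[b] < 2:
                        del row[b]

    def showsize(self):
        print "LTM rows:", len(self.LTM)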
def findNgrams(DO_COOCUR=False, ROUND=1):
    NGRAMS_DOC = {}
    COOCUR_S = {}  # co-occurrence in a sentence
    NGRAM_LR = {}
    for idx, text in enumerate(textFiles()):
        peek(idx + 1, 1000)
        #if idx > 1000: break
        ngram_local = {}
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posnize(tokenlist)
            #print "\n-----tokens and poslist----"
            #print ["(%s, %s)"%(t, poslist[i]) for i, t in enumerate(tokenlist)]
            if not len(tokenlist) > 5:
                #print "Anomaly of sentence:", s
                continue
            if len(tokenlist) > 80:
                continue
            tokenstoplist = lang.markStops(tokenlist)
            if 0:
                # debug toggle: show the stop-marked tokens and the raw text
                print "stops:"
                print tokenstoplist
                print "###### text ######", idx
                print text
                print tokenlist, len(tokenlist)
            ngb = lang.ngrambounds(tokenstoplist)  #print "gram with bounds:", ngb
            selecedngb = lang.filterSRS(ngb, tokenstoplist)  #print "SRS-FIL:", selecedngb
            selecedngb = lang.filterAdj(selecedngb, s)  #print "ADJ-FIL:", selecedngb
            selecedngb = lang.filterAdv(selecedngb, s)  #print "ADV-FIL:", selecedngb
            #selecedngb = lang.filterVerb(selecedngb, s, verbose=0)  # <-- "contrast", "field" incorrectly ignored
            # run SRS filtering again after the pure pos-based filtering
            selecedngb = lang.filterSRS(selecedngb, tokenstoplist)  #print "FINAL:", selecedngb
            if ROUND == 1:
                # in the 1st round, profile the words adjacent to each gram
                for (gram, leftidx, rightidx) in selecedngb:
                    nextword = lang.nextword(rightidx, tokenlist)
                    prevword = lang.prevword(leftidx, tokenlist)
                    nextwordcode = lang.profilingCode(nextword)
                    prevwordcode = lang.profilingCode(prevword)
                    kbay.inc3d(gram, '_', '_', NGRAM_LR)  # '_' as itself
                    kbay.inc3d(gram, 'l', prevwordcode, NGRAM_LR)
                    kbay.inc3d(gram, 'r', nextwordcode, NGRAM_LR)
                    if lang.isSubject(leftidx, rightidx, tokenlist):
                        kbay.inc3d(gram, '_', 's', NGRAM_LR)
            if ROUND == 2:
                # in the 2nd round, justify each gram against the profiled contexts
                for b in selecedngb:
                    print "check this:", b
                    sg = grammer.subgram(b[0], b[1], b[2], READIN_GRAM_LR, tokenlist, poslist)
                    if sg:
                        print "gram", b, "subgram", sg
                        raw_input()
            if 0:
                print "\n\n", s
                print "raw ngb >", ngb
                print "final ngb >", selecedngb
                pause()
            ngrams = [t[0] for t in selecedngb]
            ngrams = [g for g in ngrams if len(g.split()) > 1]
            kbay.count(ngrams, ngram_local)
            if DO_COOCUR:
                for n1 in ngrams:
                    for n2 in ngrams:
                        kbay.inc2d(n1, n2, COOCUR_S)
        # document frequency: each gram counted only once per document
        kbay.count(ngram_local, NGRAMS_DOC)
    kbay.saveHist3D(NGRAM_LR, SAVEDIR + 'hist.txt')
    #print "filter df-doc"
    #filterHistByCount(NGRAMS_DOC, 2, verbose=False)
    #kbay.saveDF(NGRAMS_DOC, SAVEDIR+SrcFileName+'_ngrams_df_doc.txt', sort=False, numDoc=idx)
    if DO_COOCUR:
        print "filter coocur"
        kbay.filter2DHistByCount(COOCUR_S, 2, verbose=True)
        kbay.saveDF2D(COOCUR_S, SAVEDIR + SrcFileName + '_ngrams_coocur.txt')
    print "DONE findNgrams"
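# lang.ngrambounds(tokenstoplist) is assumed to scan a token list in which
# stop tokens have been replaced by a sentinel and to emit each maximal
# stop-free run as a (gram, leftidx, rightidx) candidate. A sketch under
# that assumption (the sentinel '#' is hypothetical):
def ngrambounds_sketch(tokenstoplist, stopmark='#'):
    grams, start = [], None
    for i, tok in enumerate(tokenstoplist + [stopmark]):  # sentinel flushes the last run
        if tok != stopmark:
            if start is None:
                start = i
        elif start is not None:
            grams.append((' '.join(tokenstoplist[start:i]), start, i - 1))
            start = None
    return grams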
def selectGrams():
    klog.msg('select grams')
    book = fif.readWordCoocur('tmp_ieee_coocur_abstractwide_words_4000.txt')
    #book = fif.readWordCoocur('tmp_ieee_coocur_abstractwide_word_bymemo.txt', filtervalue=2)
    CoHist = {}
    CoWord = {}
    klog.msg('looping files')
    for idx_raw, text in enumerate(ieeePapers()):
        localGramHist = {}  # gram -> count
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist, tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            ngb = lang.ngrambounds(tokenstoplist)
            selecedngb = lang.filterAdj(ngb, s)
            selecedngb = lang.filterAdv(selecedngb, s)
            selecedngb = lang.filterSRS(selecedngb, tokenstoplist)
            #print s
            #print "\n>INITIAL grams:\n", ngb
            #print "\n>SELECTED grams:\n", selecedngb
            for g, l, r in selecedngb:
                localGramHist[g] = localGramHist.get(g, 0) + 1
        if 0:
            print text
            print "#.localgrams:", len(localGramHist)
            print localGramHist
            print "#.ngram:", len([1 for g in localGramHist if ' ' in g])
            pause()
        #kbay.countCooccur(localGramHist, CoHist)
        # score each gram by its mutual information with the other grams of
        # the same abstract, then keep the above-average ones
        gramlist = localGramHist.keys()
        if gramlist:
            gramscore = [relativeInfo(g, gramlist, book) for g in gramlist]
            print sorted(gramscore, key=lambda x: x[1])
            averageScore = 1.0 * sum([g[1] for g in gramscore]) / len(gramscore)
            print "above average:", averageScore
            print [g for g in gramscore if g[1] > averageScore]
            pause()
        wordset = set([w for g in localGramHist for w in g.split()])
        kbay.countCooccur(wordset, CoWord)
        peek(idx_raw + 1, 1000)
        if (idx_raw + 1) % 4000 == 0:
            #mem.refreshMemory2D(CoWord, steps=idx_raw, prefilter=True)
            #h, hh = len(CoWord), kbay.sizeOf2DHist(CoWord)
            #print "hist size", h, hh, hh/h
            #mem.refreshMemory2DFirstOrder(CoWord, steps=idx_raw)
            kbay.saveDF2D(CoWord, 'tmp_ieee_coocur_abstractwide_words_%s.txt' % (idx_raw + 1))
            CoWord = {}  # reset
            break
        if 0:
            if (idx_raw + 1) % 40000 == 0:
                kbay.saveDF2D(CoHist, 'tmp_ieee_coocur_abstractwide_%s.txt' % (idx_raw + 1))
                CoHist = {}  # reset
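# relativeInfo() is defined elsewhere in this project; from its use above it
# returns a (gram, score) pair measuring how strongly the gram's words
# co-occur with the other grams of the same abstract, using the counts in
# `book`. A hypothetical PMI-flavored sketch, not the actual implementation:
import math

def relativeInfo_sketch(gram, gramlist, book):
    score = 0.0
    for other in gramlist:
        if other == gram:
            continue
        for w1 in gram.split():
            for w2 in other.split():
                score += math.log(1.0 + book.get(w1, {}).get(w2, 0))
    return (gram, score)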
def buildCoocurDemoNetwork():
    cobook = fif.readCoocurWithFilterFunc('tmp_ieee_coocur_abstractwide_grams.txt', None, filtervalue=2)
    selectedCobook = {}

    def __selectTop(pool, top, tel, gram, hist, cobook, verbose=False):
        # keep the most frequent co-occurring grams from `pool`; the cut-off
        # depends on the gram size, so longer grams keep more neighbors
        # (the `top` argument is recomputed below from the pool)
        poolsize = len(pool)
        if poolsize == 0:
            return
        gramsize = len(pool[0].split())
        if gramsize == 1:
            top = 3
        elif gramsize == 2:
            top = 6 if poolsize * 0.2 < 6 else int(poolsize * 0.2)
            if top > 10:
                top = 10
        else:
            top = 5 if poolsize * 0.5 < 5 else int(poolsize * 0.5)
        ranked = sorted([(g2, hist[g2]) for g2 in pool], key=lambda x: x[1], reverse=True)
        topx = ranked[:top]
        telx = ranked[-tel:]
        if verbose:
            print "top", top, "from", poolsize
            print topx
            print "tel", tel, "from", poolsize
            print telx
            #if raw_input('showdetails?')=='y': print pool
        for gg, cnt in topx:
            cobook[gram][gg] = cnt

    for g, hist in cobook.items():
        if not len(hist) > 1:
            continue
        onegram, twogram, threegram, fourgram, xgram = [], [], [], [], []
        # make unique keys (deal with ambiguous variants, like XX vs. XXs,
        # or "x y z" vs. "x Y z")
        uniqkeys = set([])
        for k in hist.keys():
            alreadyIn = False
            for kk in uniqkeys:
                minlen = min(len(k), len(kk))
                if k.lower()[:minlen - 2] == kk.lower()[:minlen - 2]:
                    alreadyIn = True
                    break
            if not alreadyIn:
                uniqkeys.add(k)
        for g2 in uniqkeys:
            if g2 == g:
                continue
            size = len(g2.split())
            if size == 1:
                onegram.append(g2)
            elif size == 2:
                twogram.append(g2)
            elif size == 3:
                threegram.append(g2)
            elif size == 4:
                fourgram.append(g2)
            else:
                xgram.append(g2)
        #print "> ", g
        selectedCobook[g] = {}
        __selectTop(onegram, 3, 3, g, hist, selectedCobook)
        __selectTop(twogram, 5, 5, g, hist, selectedCobook)
        __selectTop(threegram, 5, 5, g, hist, selectedCobook)
        __selectTop(fourgram, 5, 5, g, hist, selectedCobook)
        __selectTop(xgram, 5, 5, g, hist, selectedCobook)
        #print "> cobook:"
        #print selectedCobook
        #raw_input('...')
        # alternative selection kept for reference: cap at a maximum of three
        # 1-grams and take the above-average co-occurrence counts
        #averageCntOnegram = 1.0*sum([hist[g2] for g2 in onegram])/len(onegram)
        #aboveAvrgOnegram = [g2 for g2 in onegram if hist[g2] >= averageCntOnegram]
        #belowAvrgOnegram = [g2 for g2 in onegram if hist[g2] < averageCntOnegram]
        #averageCnt = 1.0*sum([cnt for g2, cnt in hist.items() if g2!=g])/(len(hist)-1)
        #aboveAvrg = [(g2, cnt) for g2, cnt in hist.items() if cnt >= averageCnt]
        #topx = sorted(aboveAvrg, key=lambda x:x[1], reverse=True)[:10]
        #topx = sorted(hist.items(), key=lambda x:x[1], reverse=True)[:10]
        #selectedCobook[g] = {}
        #for (gg, c) in topx:
        #    if not gg==g:
        #        selectedCobook[g][gg] = c
    kbay.saveDF2D(selectedCobook, 'tmp_ieee_coocur_selected.txt')
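# kbay.saveDF2D persists the nested {gram: {gram: count}} table used
# throughout this file. Its exact file format is not shown here; the sketch
# below assumes a tab-separated layout (row key, then key:count pairs)
# purely for illustration:
def saveDF2D_sketch(Hist, filename):
    with open(filename, 'w') as fout:
        for g, row in Hist.items():
            cells = ['%s:%d' % (g2, c) for g2, c in sorted(row.items())]
            fout.write('%s\t%s\n' % (g, '\t'.join(cells)))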