def findRelatedGrams(): klog.msg('find co-exist terms') Hist = {} for idx_raw, text in enumerate(ieeePapers()): sentences = lang.getSentenceList(text) for idofs, s in enumerate(sentences): grams = ngramsOfSentence(s) kbay.countCooccurConsideringSubgram(grams, Hist, gramDF=GRAM_DF, debug=0) if idx_raw % 1000 == 0: mem.refreshMemory2D(Hist, steps=idx_raw, prefilter=True) h, hh = len(Hist), kbay.sizeOf2DHist(Hist) print "hist size", h, hh, hh / h peek(idx_raw + 1, 1000) if idx_raw > 200000: break kbay.filter2DHistByCount(Hist, 2, verbose=True) kbay.saveDF2D(Hist, 'tmp_ieee_occur_all.txt')
def coocurWordFirstOrder(): klog.msg('find word coocur') Hist = {} nNumSentences = 0 for nFiles, text in enumerate(textFiles()): sentences = lang.getSentenceList(text) nNumSentences += len(sentences) if nFiles > 100000: break if nFiles % 1000 == 1: print nFiles, " files: #.sentences", nNumSentences continue for ids, s in enumerate(sentences): tokenlist = lang.tokenize(s) tokenlist = lang.regularwords(tokenlist) selected = [t for t in tokenlist if not lang.isNoise(t)] #kbay.countCooccur(selected, Hist) kbay.countCooccurNearest(selected, Hist, nearest=10) if nFiles % 1000 == 1: refreshMemory2D(Hist, steps=nFiles, prefilter=True) h, hh = len(Hist), kbay.sizeOf2DHist(Hist) print "hist size", h, hh, hh / h peek(nFiles + 1, 1000) if nFiles > 100000: break print "number of sentences:", nNumSentences return klog.msg("finished total %s files" % nFiles) refreshMemory2D(Hist, steps=nFiles, prefilter=True) kbay.filter2DHistByCount(Hist, 3, verbose=True) kbay.saveDF2D(Hist, SAVEDIR + SrcFileName + '_word_coocur.txt')
def coocurWord(save2file, verbose=0): klog.msg('find word coocur') Hist = {} for nFiles, (datafile, text) in enumerate(loopWikiData()): sentences = cleaner.getSentences_regex(text) for s in sentences: #selected = set(cleaner.capitalizedWordsOf(s)) selected = cleaner.tokenize_simple(s) if verbose or 0: print "sentence:", s print "selected:", selected pause() #kbay.countCooccurNearest(selected, Hist, nearest=10) kbay.countCooccurNearest(selected, Hist, nearest=2) #print "learn", selected #print Hist, len(Hist) #pause() if nFiles % 1000 == 0: print nFiles, " files done" if nFiles % 4000 == 0: print "before mem refresh:", len(Hist) memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True) print "after mem refresh:", len(Hist), '\n' if nFiles > 40000: break klog.msg("finished total %s files" % nFiles) memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True) kbay.filter2DHistByCount(Hist, 3, verbose=True) fif.saveHist2D(Hist, save2file) return Hist
def findNgrams(DO_COOCUR=False, ROUND=1):
    """Extract candidate n-grams from text files with a two-round protocol.

    ROUND == 1: for every selected gram, profile the words immediately to
    its left/right (and whether it appears in subject position) into the
    3-D histogram NGRAM_LR, saved at the end to SAVEDIR + 'hist.txt'.
    ROUND == 2: interactively justify each gram against the previously
    learned profile (global READIN_GRAM_LR) via grammer.subgram; blocks on
    raw_input() when a subgram is found.

    DO_COOCUR -- also count sentence-level gram co-occurrence (COOCUR_S),
    filtered and saved to SAVEDIR + SrcFileName + '_ngrams_coocur.txt'.

    NOTE(review): per-text debug prints and the raw_input() pause run
    unconditionally — presumably debugging leftovers; confirm before
    running unattended.
    """
    NGRAMS_DOC = {}        # doc-frequency: each gram counted once per text
    COOCUR_S = {}          # co-occurrence in a sentence
    NGRAM_LR = {}          # gram -> left/right neighbour-word profile
    for idx, text in enumerate(textFiles()):
        peek(idx + 1, 1000)
        #if idx>1000: break
        ngram_local = {}   # grams of the current text only (for doc freq.)
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posnize(tokenlist)
            #print "\n-----tokens and poslist----"
            #print ["(%s, %s)"%(t, poslist[i]) for i, t in enumerate(tokenlist)]
            # skip sentences too short (<= 5 tokens) ...
            if not len(tokenlist) > 5:
                #print "Anormaly of sentence:", s
                #pause()
                continue
            tokenstoplist = lang.markStops(tokenlist)
            if 0:
                print "stops:"
                print tokenstoplist
                #pause()
            # ... or too long (> 80 tokens) to be a normal sentence
            if len(tokenlist) > 80:
                continue
            print "###### text ######", idx
            print text
            print tokenlist, len(tokenlist)
            #pause()
            # candidate gram spans from stop-word boundaries, then a
            # cascade of POS-based filters
            ngb = lang.ngrambounds(tokenstoplist)
            #print "gram with bounds:", ngb
            selecedngb = lang.filterSRS(ngb, tokenstoplist)
            #print "\nSRS-FIL gram with bounds:", selecedngb
            selecedngb = lang.filterAdj(selecedngb, s)
            #print "\nADJ-FIL gram with bounds:", selecedngb
            selecedngb = lang.filterAdv(selecedngb, s)
            #print "\nADV-FIL gram with bounds:", selecedngb
            #selecedngb = lang.filterVerb(selecedngb, s, verbose=0) #<--- "contrast", "field" incorrectly ignored
            #print "\nVERB-FIL gram with bounds:", selecedngb
            # do it again after pure pos-based filtering
            selecedngb = lang.filterSRS(selecedngb, tokenstoplist)
            #print "\nFINAL selected gram with bounds:", selecedngb
            if ROUND == 1:
                # in the 1st round, profile the word before/after each gram
                for (gram, leftidx, rightidx) in selecedngb:
                    nextword = lang.nextword(rightidx, tokenlist)
                    prevword = lang.prevword(leftidx, tokenlist)
                    nextwordcode = lang.profilingCode(nextword)
                    prevwordcode = lang.profilingCode(prevword)
                    kbay.inc3d(gram, '_', '_', NGRAM_LR)  # '_' as itself
                    kbay.inc3d(gram, 'l', prevwordcode, NGRAM_LR)
                    kbay.inc3d(gram, 'r', nextwordcode, NGRAM_LR)
                    if lang.isSubject(leftidx, rightidx, tokenlist):
                        kbay.inc3d(gram, '_', 's', NGRAM_LR)
                        #print "subject:", gram
                        #pause()
            if ROUND == 2:
                # in the 2nd round, justify the gram against the learned
                # left/right profile (READIN_GRAM_LR); interactive
                for ngb in selecedngb:
                    print "check this:", ngb
                    sg = grammer.subgram(ngb[0], ngb[1], ngb[2], READIN_GRAM_LR, tokenlist, poslist)
                    if sg:
                        print "gram", ngb, "subgram", sg
                        raw_input()
            if 0:
                print "\n\n", s
                print "raw ngb >", ngb
                print "final ngb >", selecedngb
                pause()
            # keep only multi-word grams for the local (per-text) count
            ngrams = [t[0] for t in selecedngb]
            ngrams = [g for g in ngrams if len(g.split()) > 1]
            kbay.count(ngrams, ngram_local)
            if DO_COOCUR:
                for n1 in ngrams:
                    for n2 in ngrams:
                        kbay.inc2d(n1, n2, COOCUR_S)
        # doc.freq. - each gram counted only once
        kbay.count(ngram_local, NGRAMS_DOC)
    kbay.saveHist3D(NGRAM_LR, SAVEDIR + 'hist.txt')
    #print "filter df-doc"
    #filterHistByCount(NGRAMS_DOC, 2, verbose=False)
    #kbay.saveDF(NGRAMS_DOC, SAVEDIR+SrcFileName+'_ngrams_df_doc.txt', sort=False, numDoc=idx)
    if DO_COOCUR:
        print "filter coocur"
        kbay.filter2DHistByCount(COOCUR_S, 2, verbose=True)
        kbay.saveDF2D(COOCUR_S, SAVEDIR + SrcFileName + '_ngrams_coocur.txt')
    print "DONE findNgrams"