def coocurWord(save2file, verbose=0): klog.msg('find word coocur') Hist = {} for nFiles, (datafile, text) in enumerate(loopWikiData()): sentences = cleaner.getSentences_regex(text) for s in sentences: #selected = set(cleaner.capitalizedWordsOf(s)) selected = cleaner.tokenize_simple(s) if verbose or 0: print "sentence:", s print "selected:", selected pause() #kbay.countCooccurNearest(selected, Hist, nearest=10) kbay.countCooccurNearest(selected, Hist, nearest=2) #print "learn", selected #print Hist, len(Hist) #pause() if nFiles % 1000 == 0: print nFiles, " files done" if nFiles % 4000 == 0: print "before mem refresh:", len(Hist) memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True) print "after mem refresh:", len(Hist), '\n' if nFiles > 40000: break klog.msg("finished total %s files" % nFiles) memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True) kbay.filter2DHistByCount(Hist, 3, verbose=True) fif.saveHist2D(Hist, save2file) return Hist
def forgetOnFile(ctffile, memcapacity, save2file, normlized=True):
    """Apply memory-style forgetting to a co-occurrence histogram on disk.

    Reads the 2D word co-occurrence histogram from *ctffile*, decays it with
    forget() (average-based forgetting, no term-frequency weighting), then
    writes the (optionally normalized) result to *save2file* with ',' as the
    field splitter.

    ctffile:     input co-occurrence file (read via fif.readWordCoocur).
    memcapacity: capacity bound forwarded to forget(). BUG FIX: the original
                 hardcoded memcapacity=None here, silently ignoring the value
                 callers pass (e.g. memcapacity=100); it is now forwarded.
    save2file:   output path for the resulting histogram.
    normlized:   when true, normalize the histogram before saving.
                 (parameter name kept as-is for caller compatibility)
    """
    cohist = fif.readWordCoocur(ctffile)
    cohist = forget(cohist, termfreq=None, forgetby='average',
                    memcapacity=memcapacity)
    h = normalize(cohist) if normlized else cohist
    fif.saveHist2D(h, save2file, splitter=',')
# Staged processing pipeline, toggled by literal if 0 / if 1 switches.
# Stage order matters: TF -> conditional TF (co-occurrence) -> semantic 3D.
if 0:
    # Stage 1 (currently disabled): build raw term frequencies from the
    # gutefrage corpus and save them to FOUT_TF.
    #TODO - a way to register the nbatch
    read_gutefrage(FIN, TF, debug=0)
    raw_input('-----')  # manual checkpoint before saving
    fif.saveTF(TF, totalWordCount=1, fname=FOUT_TF)
    raw_input('--- tf done ---')
if 1:
    # Stage 2: build the conditional term-frequency (co-occurrence) table
    # from terms whose frequency passed the threshold.
    print "\n --- coocur ctf ---"
    tf = fif.readTF(FOUT_TF, {}, threshold=5)  # keep terms with freq >= 5
    print "selected tf size", len(tf)
    raw_input('-----')  # manual checkpoint before the long run
    do_ctf(FIN, tf, refreshperiod=2 * 1e4, denoise_ratio=0.0019, cutoff=2, ctf=CTF)
    fif.saveHist2D(CTF, FOUT_CTF)
    raw_input('--- ctf done ---')
if 1:
    # Stage 3: lift the 2D co-occurrence table into a 3D semantic table.
    print "\n --- semantic ---"
    ctf = fif.readWordCoocur(FOUT_CTF, 1)
    print "> initial ctf size", mem.histsize(ctf)
    ctf3 = do_semantic(FIN, ctf, refresh_period=1e4, ctf3={})
    print "> final size", histsize_3d(ctf3)
    fif.saveHist3D(ctf3, FOUT_CTF3)
kbay.countCooccurNearest(selected, Hist, nearest=2) #print "learn", selected #print Hist, len(Hist) #pause() if nFiles % 1000 == 0: print nFiles, " files done" if nFiles % 4000 == 0: print "before mem refresh:", len(Hist) memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True) print "after mem refresh:", len(Hist), '\n' if nFiles > 40000: break klog.msg("finished total %s files" % nFiles) memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True) kbay.filter2DHistByCount(Hist, 3, verbose=True) fif.saveHist2D(Hist, save2file) return Hist # 1. countDF() # 2. mine_helper_converter # 3. redo #nearalation() #cohist = coocurWord(save2file='stats/wiki_de_word_coocur', verbose=0) cohist = coocurWord(save2file='stats/wiki_de_word_coocur_leftright', verbose=0) cohist = coocur.forget(cohist) fif.saveHist2D(cohist, 'stats/wiki_word_coocur_filtered.txt')
# Driver chunk: stages toggled by the TEST_CTF / TEST_MEMORY / RUN_SEMANTIC
# flags (defined elsewhere in the file). Order matters: TF -> conditional
# TF -> forgetting -> semantic 3D table.
fif.saveTF(tf, SAMPLE_SIZE, FOUT_TF_TOP)
if TEST_CTF:
    print "read term frequncy"
    # read term-frequency and select those worth attention
    tf = fif.readTF(FOUT_TF_TOP)
    print "build conditional-tf"
    # build conditional-tf
    ctf, nbatch = ctfOfBatch(SAMPLE_SIZE, tf, grammode='ng', denoise_ratio=0.003)
    # save the 2D conditional tf
    fif.saveHist2D(ctf, FOUT_CTF)
if TEST_MEMORY:
    # apply forgetting with a capacity bound, writing the shrunken table
    mem.forgetOnFile(ctffile=fname_ctf, memcapacity=100, save2file=fname_ctf_mem)
    # it does not finish
    # size before forget: 100201 : 34424319
    # ==> filter when reading the file!!!
if RUN_SEMANTIC:
    # threshold 10 on read keeps the table small enough to process
    ctf = fif.readWordCoocur(fname_ctf_mem, 10)
    print "> initial ctf size", mem.histsize(ctf)
    ctf3 = semantic(ctf, grammode=GRAM_MODE, ctf3={})
    print "> final size", histsize_3d(ctf3)
    fif.saveHist3D(ctf3, fname_semet3)