Example #1
def coocurWord(save2file, verbose=0):
    klog.msg('find word coocur')
    Hist = {}
    for nFiles, (datafile, text) in enumerate(loopWikiData()):
        sentences = cleaner.getSentences_regex(text)
        for s in sentences:
            #selected = set(cleaner.capitalizedWordsOf(s))
            selected = cleaner.tokenize_simple(s)
            if verbose or 0:
                print "sentence:", s
                print "selected:", selected
                pause()
            #kbay.countCooccurNearest(selected, Hist, nearest=10)
            kbay.countCooccurNearest(selected, Hist, nearest=2)
            #print "learn", selected
            #print Hist, len(Hist)
            #pause()
        if nFiles % 1000 == 0: print nFiles, " files done"
        if nFiles % 4000 == 0:
            print "before mem refresh:", len(Hist)
            memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True)
            print "after mem refresh:", len(Hist), '\n'
        if nFiles > 40000: break

    klog.msg("finished total %s files" % nFiles)
    memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True)
    kbay.filter2DHistByCount(Hist, 3, verbose=True)

    fif.saveHist2D(Hist, save2file)

    return Hist
Example #2
def forgetOnFile(ctffile, memcapacity, save2file, normlized=True):
    """Apply a forgetting pass to a saved co-occurrence table and persist it.

    Reads a 2D word co-occurrence histogram from `ctffile`, prunes it with
    forget() (capped at `memcapacity`), optionally normalizes it, and writes
    the result to `save2file` comma-separated.

    :param ctffile: input path readable by fif.readWordCoocur
    :param memcapacity: memory cap forwarded to forget()
    :param save2file: output path for fif.saveHist2D
    :param normlized: normalize the histogram before saving (parameter name
        kept, typo and all, for backward compatibility with callers)
    """
    cohist = fif.readWordCoocur(ctffile)
    # BUG FIX: memcapacity was hard-coded to None here, silently ignoring
    # the caller-supplied cap (e.g. forgetOnFile(..., memcapacity=100)).
    cohist = forget(cohist,
                    termfreq=None,
                    forgetby='average',
                    memcapacity=memcapacity)
    if normlized:
        h = normalize(cohist)
    else:
        h = cohist
    fif.saveHist2D(h, save2file, splitter=',')
Example #3
# Pipeline stages below are gated by manual `if 0:` / `if 1:` debug toggles;
# raw_input() pauses between stages so intermediate results can be inspected.

if 0:
    # Stage 1 (disabled): build the term-frequency table TF from the raw
    # input FIN and persist it to FOUT_TF.
    #TODO - a way to register the nbatch
    read_gutefrage(FIN, TF, debug=0)
    raw_input('-----')

    fif.saveTF(TF, totalWordCount=1, fname=FOUT_TF)

raw_input('--- tf done ---')

if 1:
    # Stage 2: build the 2D conditional term-frequency (co-occurrence) table.
    print "\n --- coocur ctf ---"
    # keep only terms whose frequency meets the threshold
    tf = fif.readTF(FOUT_TF, {}, threshold=5)
    print "selected tf size", len(tf)
    raw_input('-----')
    do_ctf(FIN,
           tf,
           refreshperiod=2 * 1e4,
           denoise_ratio=0.0019,
           cutoff=2,
           ctf=CTF)
    fif.saveHist2D(CTF, FOUT_CTF)
raw_input('--- ctf done ---')

if 1:
    # Stage 3: derive the 3D semantic table from the 2D co-occurrence table.
    print "\n --- semantic ---"
    ctf = fif.readWordCoocur(FOUT_CTF, 1)
    print "> initial ctf size", mem.histsize(ctf)
    ctf3 = do_semantic(FIN, ctf, refresh_period=1e4, ctf3={})
    print "> final size", histsize_3d(ctf3)
    fif.saveHist3D(ctf3, FOUT_CTF3)
Example #4
            # NOTE(review): fragment — the enclosing `def` is not shown in
            # this snippet.  Accumulates co-occurrence counts for tokens
            # within a window of 2 nearest neighbours.
            kbay.countCooccurNearest(selected, Hist, nearest=2)
            #print "learn", selected
            #print Hist, len(Hist)
            #pause()
        if nFiles % 1000 == 0: print nFiles, " files done"
        if nFiles % 4000 == 0:
            # periodic prune so the histogram does not grow without bound
            print "before mem refresh:", len(Hist)
            memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True)
            print "after mem refresh:", len(Hist), '\n'
        if nFiles > 40000: break  # hard cap on number of files processed

    klog.msg("finished total %s files" % nFiles)
    # final prune, then drop pairs seen fewer than 3 times before saving
    memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True)
    kbay.filter2DHistByCount(Hist, 3, verbose=True)

    fif.saveHist2D(Hist, save2file)

    return Hist


# 1. countDF()
# 2. mine_helper_converter

# 3. redo
#nearalation()

#cohist = coocurWord(save2file='stats/wiki_de_word_coocur', verbose=0)
cohist = coocurWord(save2file='stats/wiki_de_word_coocur_leftright', verbose=0)
cohist = coocur.forget(cohist)
fif.saveHist2D(cohist, 'stats/wiki_word_coocur_filtered.txt')
Example #5
    fif.saveTF(tf, SAMPLE_SIZE, FOUT_TF_TOP)

if TEST_CTF:
    # Build the 2D conditional term-frequency (co-occurrence) table from the
    # previously saved top term frequencies.
    print "read term frequncy"
    # read term-frequency and select those worth attention
    tf = fif.readTF(FOUT_TF_TOP)

    print "build conditional-tf"
    # build conditional-tf
    ctf, nbatch = ctfOfBatch(SAMPLE_SIZE,
                             tf,
                             grammode='ng',
                             denoise_ratio=0.003)

    # save the 2D conditional tf
    fif.saveHist2D(ctf, FOUT_CTF)

if TEST_MEMORY:
    # Prune the saved co-occurrence table with a forgetting pass.
    mem.forgetOnFile(ctffile=fname_ctf,
                     memcapacity=100,
                     save2file=fname_ctf_mem)
    # it does not finish
    # size before forget: 100201 : 34424319
    # ==> filter when reading the file!!!

if RUN_SEMANTIC:
    # Derive the 3D semantic table from the pruned 2D co-occurrence table.
    # NOTE(review): the second argument (10) is presumably a read-time count
    # threshold -- confirm against fif.readWordCoocur.
    ctf = fif.readWordCoocur(fname_ctf_mem, 10)
    print "> initial ctf size", mem.histsize(ctf)
    ctf3 = semantic(ctf, grammode=GRAM_MODE, ctf3={})
    print "> final size", histsize_3d(ctf3)
    fif.saveHist3D(ctf3, fname_semet3)