Example #1
def forgetOnFile(ctffile, memcapacity, save2file, normlized=True):
    # Read a 2D word co-occurrence histogram from disk, decay ("forget")
    # weak counts, optionally normalize, and write the result back.
    cohist = fif.readWordCoocur(ctffile)
    cohist = forget(cohist,
                    termfreq=None,
                    forgetby='average',
                    memcapacity=memcapacity)  # pass the caller's capacity through
    if normlized:
        h = normalize(cohist)
    else:
        h = cohist
    fif.saveHist2D(h, save2file, splitter=',')
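Example #6 below drives this function from a pipeline script. A minimal usage sketch, assuming fif, forget and normalize come from this repository and that the file names (placeholders borrowed from Example #2) point at a 2D co-occurrence file written by fif.saveHist2D:

fname_ctf = 'stats/wiki_word_coocur.txt'          # input co-occurrence file
fname_ctf_mem = 'stats/wiki_word_coocur_mem.txt'  # hypothetical output file

forgetOnFile(ctffile=fname_ctf,
             memcapacity=100,          # same capacity as in Example #6
             save2file=fname_ctf_mem,
             normlized=True)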
Example #2
import kbay
import fif

#co = fif.readWordCoocur('stats/pmed/pmed_word_coocur.txt')
co = fif.readWordCoocur('stats/wiki_word_coocur.txt')

def infoscore(w1, w2):
    # Lift of w2 given w1: P(w2 | w1) / P(w2).
    try:
        n_w1        = co[w1][w1]         # count of w1 (diagonal entry)
        n_w2_on_w1  = co[w1].get(w2, 0)  # joint count of w1 and w2
        n_w2        = co[w2][w2]         # count of w2
        n_total     = 2607000            # total count used to turn n_w2 into P(w2)
    except KeyError:
        return 0

    r2    = float(n_w2)/n_total
    r2on1 = float(n_w2_on_w1)/n_w1
    score = r2on1 / r2
    return score

def sumscore(wordlist, wref):
    return sum([infoscore(w, wref) for w in wordlist])

if 0:
    for w1, hist in co.items():
        if w1 not in ['red', 'blue', 'black', 'green', 'orange', 'white', 'color']: continue
        print ">>> checking word", w1
        n_w1 = hist.get(w1)
        scores = []
        for w2, n_w2_on_w1 in hist.items():
            try:
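The score returned by infoscore is a lift: the probability of seeing w2 in documents containing w1, divided by the background probability of w2. Values well above 1 mean w2 is informative about w1. A self-contained sketch with a hand-built co-occurrence table (the counts are made up for illustration; the diagonal entry co[w][w] holds a word's own count, as in the function above):

co_toy = {
    'red':   {'red': 1000, 'color': 300, 'the': 900},
    'color': {'color': 5000},
    'the':   {'the': 2000000},
}
n_total = 2607000  # same total as above

def lift(w1, w2):
    p_w2_given_w1 = float(co_toy[w1].get(w2, 0)) / co_toy[w1][w1]
    p_w2 = float(co_toy[w2][w2]) / n_total
    return p_w2_given_w1 / p_w2

print lift('red', 'color')  # ~156: 'color' is highly informative about 'red'
print lift('red', 'the')    # ~1.2: 'the' carries almost no information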
Example #3
def selectGrams():
    klog.msg('select grams')
    book = fif.readWordCoocur('tmp_ieee_coocur_abstractwide_words_4000.txt')
    #book=fif.readWordCoocur('tmp_ieee_coocur_abstractwide_word_bymemo.txt', filtervalue=2)
    CoHist = {}
    CoWord = {}
    klog.msg('looping files')
    for idx_raw, text in enumerate(ieeePapers()):
        localGramHist = {}  # gram -> count
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist,
                                                     tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            ngb = lang.ngrambounds(tokenstoplist)
            selectedngb = lang.filterAdj(ngb, s)
            selectedngb = lang.filterAdv(selectedngb, s)
            selectedngb = lang.filterSRS(selectedngb, tokenstoplist)
            #print s
            #print "\n>INITIAL grams:\n", ngb
            #print "\n>SELECTED grams:\n", selectedngb
            for g, l, r in selectedngb:
                localGramHist[g] = localGramHist.get(g, 0) + 1

            if 0:
                print text
                print "#.localgrams:", len(localGramHist)
                print localGramHist
                print "#.ngram:", len([1 for g in localGramHist if ' ' in g])
                pause()

        #kbay.countCooccur(localGramHist, CoHist)

        # calculate mutual information: score each gram against the other
        # grams of the same abstract, then keep the above-average ones
        gramlist = localGramHist.keys()
        gramscore = []
        for g in gramlist:
            gramscore.append(relativeInfo(g, gramlist, book))
        if not gramscore:
            continue
        print sorted(gramscore, key=lambda x: x[1])
        averageScore = sum([g[1] for g in gramscore]) / len(gramscore)
        print "average score:", averageScore
        print "above average:", [g for g in gramscore if g[1] > averageScore]
        pause()

        wordset = set([w for g in localGramHist for w in g.split()])
        kbay.countCooccur(wordset, CoWord)

        peek(idx_raw + 1, 1000)

        if (idx_raw + 1) % 4000 == 0:
            #mem.refreshMemory2D(CoWord, steps=idx_raw, prefilter=True)
            #h, hh = len(CoWord), kbay.sizeOf2DHist(CoWord)
            #print "hist size", h, hh, hh/h
            #mem.refreshMemory2DFirstOrder(CoWord, steps=idx_raw)
            kbay.saveDF2D(
                CoWord,
                'tmp_ieee_coocur_abstractwide_words_%s.txt' % (idx_raw + 1))
            CoWord = {}  # reset
            break  # stop after the first saved batch

        if 0:
            if (idx_raw + 1) % 40000 == 0:
                kbay.saveDF2D(
                    CoHist,
                    'tmp_ieee_coocur_abstractwide_%s.txt' % (idx_raw + 1))
                CoHist = {}  # reset
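The CoWord structure filled by kbay.countCooccur is the same word -> word -> count table that fif.readWordCoocur loads in the other examples. countCooccur itself is not shown here; a plausible sketch, assuming it counts every pair of words from a document's word set (with the diagonal acting as a per-word document count):

def count_cooccur(wordset, hist2d):
    # hist2d[w1][w2] += 1 for every ordered pair in the document's word set;
    # hist2d[w][w] then counts the documents containing w.
    for w1 in wordset:
        row = hist2d.setdefault(w1, {})
        for w2 in wordset:
            row[w2] = row.get(w2, 0) + 1

hist = {}
count_cooccur(set(['red', 'color']), hist)
count_cooccur(set(['red', 'blue']), hist)
print hist['red']['red']           # 2 documents contain 'red'
print hist['red'].get('color', 0)  # 1 document contains both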
Example #4
    while depth < max_depth:
        new_front = []
        for new in front_nodes:
            if new not in visited:

                # visit this node
                new_front = new_front + __expand(co, new, max_width)

                # after the visit
                visited.append(new)

        # finish one depth
        depth += 1
        front_nodes = new_front # update the front

        # logging
        print "depth", depth, "new front", set(front_nodes)

    # final result: all found nodes
    return set(visited)

if __name__ == '__main__':
    #
    #print "nodes", nodes
    #print "#. nodes", len(nodes)

    hist_coocur = fif.readWordCoocur(coocurfile)
    selected = expand_network(hist_coocur, "red", max_depth=12, max_width=5)
    # trans_csv_selected(hist_coocur, ["red"])
    trans_csv_selected(hist_coocur, selected, max_width=5)
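The listing above starts inside the expansion loop; the function header and the __expand helper are not shown. A self-contained sketch of the same breadth-first idea, under the assumption that __expand returns a node's max_width strongest co-occurring neighbours (an assumption, not the repository's definition):

def expand_net(co, seed, max_depth=3, max_width=5):
    # Breadth-first expansion over the co-occurrence graph: at each depth,
    # every unvisited front node contributes its strongest neighbours.
    def neighbours(node):
        row = co.get(node, {})
        ranked = sorted(row.items(), key=lambda kv: kv[1], reverse=True)
        return [w for w, n in ranked if w != node][:max_width]

    visited, front_nodes, depth = [], [seed], 0
    while depth < max_depth:
        new_front = []
        for node in front_nodes:
            if node not in visited:
                new_front = new_front + neighbours(node)
                visited.append(node)
        depth += 1
        front_nodes = new_front
    return set(visited)

print expand_net({'red': {'red': 10, 'blue': 4}, 'blue': {'blue': 6, 'red': 4}}, 'red')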
Example #5
if 0:
    #TODO - a way to register the nbatch
    read_gutefrage(FIN, TF, debug=0)
    raw_input('-----')

    fif.saveTF(TF, totalWordCount=1, fname=FOUT_TF)

raw_input('--- tf done ---')

if 1:
    print "\n --- coocur ctf ---"
    tf = fif.readTF(FOUT_TF, {}, threshold=5)
    print "selected tf size", len(tf)
    raw_input('-----')
    do_ctf(FIN,
           tf,
           refreshperiod=2 * 1e4,
           denoise_ratio=0.0019,
           cutoff=2,
           ctf=CTF)
    fif.saveHist2D(CTF, FOUT_CTF)
raw_input('--- ctf done ---')

if 1:
    print "\n --- semantic ---"
    ctf = fif.readWordCoocur(FOUT_CTF, 1)
    print "> initial ctf size", mem.histsize(ctf)
    ctf3 = do_semantic(FIN, ctf, refresh_period=1e4, ctf3={})
    print "> final size", histsize_3d(ctf3)
    fif.saveHist3D(ctf3, FOUT_CTF3)
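fif.readTF is called with threshold=5, so only terms seen at least five times feed the co-occurrence pass, which keeps CTF small enough to hold in memory. A minimal sketch of that kind of frequency cut, assuming tf is a plain term -> count dict (the exact semantics of readTF's threshold argument are an assumption):

def threshold_tf(tf, threshold):
    # keep only terms whose count reaches the threshold
    return dict((t, n) for t, n in tf.items() if n >= threshold)

tf = {'frage': 12, 'antwort': 7, 'xylophon': 1}
print threshold_tf(tf, 5)  # only 'frage' and 'antwort' survive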
Example #6
                             grammode='ng',
                             denoise_ratio=0.003)

    # save the 2D conditional tf
    fif.saveHist2D(ctf, FOUT_CTF)

if TEST_MEMORY:
    mem.forgetOnFile(ctffile=fname_ctf,
                     memcapacity=100,
                     save2file=fname_ctf_mem)
    # it does not finish
    # size before forget: 100201 : 34424319
    # ==> filter when reading the file!!!

if RUN_SEMANTIC:
    ctf = fif.readWordCoocur(fname_ctf_mem, 10)
    print "> initial ctf size", mem.histsize(ctf)
    ctf3 = semantic(ctf, grammode=GRAM_MODE, ctf3={})
    print "> final size", histsize_3d(ctf3)
    fif.saveHist3D(ctf3, fname_semet3)


def semantic4(significants, ctf={}):

    for nbatch, terms in enumerate(termsOfBatch()):
        for t in terms:
            if t not in significants: continue
            related = significants[t]
            for tt in terms:
                if tt == t: continue
                if tt not in related: continue
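semantic4 is cut off at this point. From the loop structure, the likely intent is to count, for every significant term t, how often each of its already-known related terms tt appears in the same batch. A minimal sketch of that counting step under this assumption (not the repository's actual body):

def semantic4_sketch(significants, termsOfBatch):
    ctf = {}
    for nbatch, terms in enumerate(termsOfBatch()):
        for t in terms:
            if t not in significants: continue
            related = significants[t]
            for tt in terms:
                if tt == t: continue
                if tt not in related: continue
                # count the batch-level co-occurrence of a significant term
                # with one of its related terms
                ctf.setdefault(t, {})
                ctf[t][tt] = ctf[t].get(tt, 0) + 1
    return ctf

print semantic4_sketch({'red': set(['color', 'blue'])},
                       lambda: [['red', 'color', 'wine']])
# {'red': {'color': 1}}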