Example #1
def selectTest():
    print "select test ..."
    book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)
    print "book size:", len(book)

    for idx_raw, text in enumerate(ieeePapers()):
        print text
        sentences = lang.getSentenceList(text)
        localHist = {}
        scoreByLang = {}
        gramLeftRight = {}
        for idofs, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            poslist = lang.posLookup(tokenlist)
            tokenstoplist = lang.markStops(tokenlist)
            tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False)
            tokenMarkList = lang.markStopOnNonending(tokenlist, poslist,
                                                     tokenstoplist)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList)
            tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList)
            ngb = lang.ngrambounds(tokenstoplist)
            selectedngb = lang.filterAdj(ngb, s)
            selectedngb = lang.filterAdv(selectedngb, s)
            selectedngb = lang.filterSRS(selectedngb, tokenstoplist)
            for g, l, r in selectedngb:
                localHist[g] = localHist.get(g, 0) + 1

                scoreByLang[g] = scoreByLang.get(g, 0) + linguisticScore(
                    g, l, r, tokenlist)
                if g not in gramLeftRight:
                    gramLeftRight[g] = []
                # tokens immediately to the left/right of the gram, with
                # sentinel markers at the sentence boundaries
                lefttoken = '<L>' + ('#BEGIN' if l == 0 else tokenlist[l - 1])
                righttoken = '<R>' + ('#END' if r >= len(tokenlist) - 1
                                      else tokenlist[r + 1])
                gramLeftRight[g].append((lefttoken, righttoken))

        # scores
        scoreByDF = {}

        totalDF = 0
        for g in localHist:
            scoreByDF[g] = book.get(g, 0)
            totalDF += scoreByDF[g]  # accumulate the total document frequency
        averageDF = float(totalDF) / max(len(scoreByDF), 1)
        sortedByDF = sorted(scoreByDF.items(),
                            key=lambda x: x[1],
                            reverse=True)
        print sortedByDF
        print "average DF", averageDF
        print "gram with DF above average"
        print[(g, count) for (g, count) in sortedByDF if count > averageDF]
        print "gram with DF below average"
        print[(g, count) for (g, count) in sortedByDF if not count > averageDF]

        print "lang score:"
        print scoreByLang
        print "gram left right"
        print gramLeftRight
        pause()
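
The above-/below-average DF split at the end of selectTest can be exercised on its own. A minimal self-contained sketch, assuming only the dict shapes used above (localHist: gram -> local count, book: gram -> corpus DF); the toy data is illustrative, not from the IEEE corpus:

def splitByAverageDF(localHist, book):
    # gram -> corpus document frequency, averaged over the local grams
    scoreByDF = dict((g, book.get(g, 0)) for g in localHist)
    averageDF = float(sum(scoreByDF.values())) / max(len(scoreByDF), 1)
    above = [(g, c) for g, c in scoreByDF.items() if c > averageDF]
    below = [(g, c) for g, c in scoreByDF.items() if c <= averageDF]
    return averageDF, above, below

print splitByAverageDF({'neural network': 1, 'gradient descent': 2},
                       {'neural network': 120, 'gradient descent': 30})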
Example #2
def recommendTerms():
    dfbook = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)
    cobook = fif.readCoocurWithFilterFunc(
        'tmp_ieee_coocur_abstractwide_grams.txt', dfbook)
    for idx_raw, text in enumerate(ieeePapers()):
        sentences = lang.getSentenceList(text)
        coHist = {}  # co-occurring gram -> local grams that co-occur with it
        localHist = {}  # local gram -> occurrence count
        for idofs, s in enumerate(sentences):
            grams = ngramsOfSentence(s)
            for g in grams:
                localHist[g] = localHist.get(g, 0) + 1
        for g in localHist:
            for gg in cobook.get(g, []):
                coHist.setdefault(gg, []).append(g)
        # score by mention/occurrence: sum of conditional co-occurrence ratios
        score = {}
        for g in localHist:
            if g not in cobook:
                continue
            cograms = cobook[g]
            gcount = cobook[g][g]  # diagonal entry: co-occurrence of g with itself
            for gg in cograms:
                if gg == g:
                    continue
                cocount = cobook[g][gg]

                # ignore grams with only one degree of relevance for the moment
                if len(coHist[gg]) < 2:
                    continue

                score[gg] = score.get(gg, 0) + float(cocount) / gcount

        # grams recommended by more than one local gram (influx > 1),
        # paired with their accumulated score
        fluxAndPosterior = {}
        for g, colist in coHist.items():
            if len(g.split()) < 2: continue  # skip single words
            if len(colist) > 1:
                fluxAndPosterior[g] = (score[g], colist)

        print "grams of text:"
        print localHist.keys()

        print "cogram having influx > 2 ..."
        for g, colist in coHist.items():
            if len(colist) > 1:
                print g, colist

        print "select from coHist ..."
        print sorted(coHist.items(), key=lambda x: len(x[1]),
                     reverse=True)[:20]

        print "select from posterior..."
        print sorted(fluxAndPosterior.items(), key=lambda x: x[1][0])
        pause()
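
The recommendation weight accumulated in score is a sum of conditional co-occurrence ratios, roughly sum over g of cocount / gcount = P(gg | g). A self-contained sketch of just that step, assuming the cobook shape implied above (gram -> {gram -> count}, with the diagonal holding the self count); the toy table is made up:

def recommendFromCooccurrence(localgrams, cobook):
    # score[gg] = sum over local grams g of count(g, gg) / count(g, g)
    score = {}
    for g in localgrams:
        if g not in cobook:
            continue
        gcount = cobook[g][g]
        for gg, cocount in cobook[g].items():
            if gg == g:
                continue
            score[gg] = score.get(gg, 0.0) + float(cocount) / gcount
    return score

toy = {'svm': {'svm': 10, 'kernel method': 4},
       'margin': {'margin': 5, 'kernel method': 2}}
print recommendFromCooccurrence(['svm', 'margin'], toy)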
Example #3
def memorizeCogram():

    book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)
    memo = mem.Memory()
    memo.setInitialCapacity(200)

    for idx_raw, text in enumerate(ieeePapers()):
        #if idx_raw<220000: continue
        sentences = lang.getSentenceList(text)
        gramsPreviousSentence = set()
        for idofs, s in enumerate(sentences):
            grams = ngramsOfSentence(s)
            if not grams: continue
            goodgrams = set([g for g in grams if g in book])
            memo.learnSymbList(goodgrams)
            # grams of previous sentence: learn grams of current sentence
            # grams of current  sentence: learn grams of previous sentence
            memo.crosslearn(gramsPreviousSentence, goodgrams, crossweight=1)
            # disabled debug block; change "0 and" to "1 and" to inspect cases
            # where two consecutive sentences yield only one gram in total
            if 0 and len(list(gramsPreviousSentence) + list(goodgrams)) == 1:
                print "only 1 gram in two sentences!!!"
                print "sentence:", s
                print "grams before filtering:", grams
                print "grams after filtering", goodgrams
                if idofs > 0:
                    print "previous sentence:", sentences[idofs - 1]
                    print "previous grams before filtering:", ngramsOfSentence(
                        sentences[idofs - 1])
                    print "previous grams after filtering:", gramsPreviousSentence
                pause()
            gramsPreviousSentence = goodgrams

        peek(idx_raw + 1, 2000)
        if (idx_raw + 1) % 2000 == 0:
            memo.refresh()
            memo.showsize()

        #if idx_raw>6000:
        #    break

    kbay.saveDF2D(memo.LTM, 'tmp_ieee_coocur_abstractwide_grams.txt')
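
mem.Memory, learnSymbList and crosslearn are project APIs; the counting they perform over consecutive sentences can be mimicked with a plain dict-of-dicts. A minimal sketch under that assumption (toy input, not corpus data):

def countCrossCooccurrence(sentenceGrams):
    # sentenceGrams: one set of grams per sentence, in document order
    counts = {}

    def bump(a, b, w=1):
        counts.setdefault(a, {})
        counts[a][b] = counts[a].get(b, 0) + w

    prev = set()
    for grams in sentenceGrams:
        for g in grams:  # within-sentence pairs
            for h in grams:
                bump(g, h)
        for g in prev:  # cross-sentence pairs, both directions
            for h in grams:
                bump(g, h)
                bump(h, g)
        prev = grams
    return counts

print countCrossCooccurrence([set(['neural network']),
                              set(['gradient descent'])])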
Example #4
def memorizeCoword():

    memo = mem.Memory()
    memo.setInitialCapacity(200)

    book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)

    for idx_raw, text in enumerate(ieeePapers()):
        #if idx_raw<70000: continue
        sentences = lang.getSentenceList(text)
        for idofs, s in enumerate(sentences):
            grams = ngramsOfSentence(s)
            if not grams: continue
            # unique words appearing in any selected gram, kept only if they
            # survive the DF filter
            words = set(' '.join(grams).split())
            words = [w for w in words if w in book]
            memo.learnSymbList(words)

        peek(idx_raw + 1, 2000)
        if (idx_raw + 1) % 2000 == 0:
            memo.refresh()
            memo.showsize()

    kbay.saveDF2D(memo.LTM, 'tmp_ieee_coocur_abstractwide_word_bymemo.txt')
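
The one step new relative to memorizeCogram is flattening grams back into unique words before learning. Isolated as a sketch (toy input; book stands for the DF-filtered vocabulary):

def cowordsOfSentence(grams, book):
    # unique words from all grams, restricted to the filtered vocabulary
    words = set(' '.join(grams).split())
    return [w for w in words if w in book]

print cowordsOfSentence(['neural network', 'network latency'],
                        {'neural': 1, 'network': 1, 'latency': 1})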
Example #5
def buildGramNetwork():
    dfbook = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=5)
    vocab = {}
    groupedBySize = {}
    for g in dfbook:
        words = g.split()
        size = len(words)
        if size not in groupedBySize:
            groupedBySize[size] = []
        groupedBySize[size].append(g)
        for w in words:
            vocab[w] = 1
    print "total vocab:", len(vocab)
    print "---size hist---"
    for s in groupedBySize:
        print "size=", s, "count=", len(groupedBySize[s])

    graph = {}
    for w in vocab:
        graph[w] = []

    def __findAllLeaves(word, verbose=False):
        # breadth-first descent from a word: a leaf is a reachable gram
        # with no children of its own in the graph
        frontierGrams = graph[word]
        leafnodes = []
        nIteration = 0
        while frontierGrams:
            nextFrontier = []
            for g in frontierGrams:
                children = graph.get(g, None)
                if not children:
                    leafnodes.append(g)
                else:
                    nextFrontier += children
            frontierGrams = nextFrontier
            nIteration += 1
            if nIteration > 10:
                print "too many iterations for:", word
                print "current frontier:", frontierGrams
                raw_input('...')
        if verbose:
            print "leafnodes for", word, " - ", leafnodes
        return leafnodes

    def __debugsize(size):
        # gram sizes are small word counts, so this effectively disables the
        # verbose tracing below; lower the threshold to re-enable it
        return size > 5000

    for size in sorted(groupedBySize.keys()):
        print "checking size=", size, "current graph size", len(graph)
        if size == 1: continue
        for g in groupedBySize[size]:
            words = g.split()
            if __debugsize(size):
                print "\nchecking: ", g
            for w in words:
                leaves = __findAllLeaves(w, verbose=__debugsize(size))
                appendOnLeaf = False
                for l in leaves:
                    if l in g:  # the leaf gram is a substring of the longer gram
                        appendOnLeaf = True
                        if l == g:
                            # do not append to itself (already in graph)
                            continue
                        graph[l] = graph.get(l, [])
                        graph[l].append(g)
                        if __debugsize(size):
                            print "append: <", g, "> to:", l
                if not appendOnLeaf:
                    # append to this word
                    graph[w] = graph.get(w, [])
                    graph[w].append(g)
                    if __debugsize(size):
                        print "append: <", g, "> to:", w
            if __debugsize(size):
                raw_input()

    print "+++GRAPH+++"
    print "size:", len(graph)
    print "size(non-empty):", len([n for n in graph if graph[n]])
    #print graph
    g = raw_input('..')
    while g:
        if g == 'look':
            print graph.keys()[:10]
        elif g in graph:
            print graph[g]
        else:
            print "not in graph"
        g = raw_input('..')
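
The traversal inside __findAllLeaves is a plain breadth-first descent over the word -> gram containment graph. A standalone sketch of the same walk on a toy graph (same shape as graph above: node -> list of longer grams hanging beneath it):

def findAllLeaves(graph, node):
    # a leaf is a reachable gram with no children of its own
    frontier = graph.get(node, [])
    leaves = []
    while frontier:
        nextFrontier = []
        for g in frontier:
            children = graph.get(g)
            if children:
                nextFrontier += children
            else:
                leaves.append(g)
        frontier = nextFrontier
    return leaves

toy = {'network': ['neural network'],
       'neural network': ['deep neural network']}
print findAllLeaves(toy, 'network')  # ['deep neural network']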
Example #6
def filterGramDF():
    book = fif.readWithFilter('ieeeGramDF.txt', filtervalue=3)
    kbay.saveDF(book, 'ieeeGramDF_above3.txt', sort=False, numDoc=460035)
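
fif.readWithFilter and kbay.saveDF are project helpers whose file format is not shown here. A stand-in sketch, assuming one tab-separated "gram<TAB>count" pair per line and a strict count > filtervalue threshold (both are assumptions):

def readWithFilterSketch(path, filtervalue):
    # hypothetical reader: keep grams whose DF exceeds filtervalue
    book = {}
    with open(path) as f:
        for line in f:
            gram, count = line.rstrip('\n').rsplit('\t', 1)
            if int(count) > filtervalue:
                book[gram] = int(count)
    return book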