def selectTest(): print "select test ..." book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4) print "book size:", len(book) for idx_raw, text in enumerate(ieeePapers()): print text sentences = lang.getSentenceList(text) localHist = {} scoreByLang = {} gramLeftRight = {} for idofs, s in enumerate(sentences): tokenlist = lang.tokenize(s) poslist = lang.posLookup(tokenlist) tokenstoplist = lang.markStops(tokenlist) tokenVerbList = lang.markVerbs(tokenlist, poslist, verbose=False) tokenMarkList = lang.markStopOnNonending(tokenlist, poslist, tokenstoplist) tokenstoplist = lang.redoStops(tokenstoplist, tokenVerbList) tokenstoplist = lang.redoStops(tokenstoplist, tokenMarkList) ngb = lang.ngrambounds(tokenstoplist) selecedngb = lang.filterAdj(ngb, s) selecedngb = lang.filterAdv(selecedngb, s) selecedngb = lang.filterSRS(selecedngb, tokenstoplist) for g, l, r in selecedngb: localHist[g] = localHist.get(g, 0) + 1 scoreByLang[g] = scoreByLang.get(g, 0) + linguisticScore( g, l, r, tokenlist) if not g in gramLeftRight: gramLeftRight[g] = [] lefttoken = '<L>' + ('#BEGIN' if l == 0 else tokenlist[l - 1]) righttoken = '<R>' + ('#END' if r >= (len(tokenlist) - 1) else tokenlist[r + 1]) gramLeftRight[g].append((lefttoken, righttoken)) # scores scoreByDF = {} totalDF = 0 for g in localHist: scoreByDF[g] = book.get(g, 0) totalDF = scoreByDF[g] averageDF = totalDF / len(scoreByDF) sortedByDF = sorted(scoreByDF.items(), key=lambda x: x[1], reverse=True) print sortedByDF print "average DF", averageDF print "gram with DF above average" print[(g, count) for (g, count) in sortedByDF if count > averageDF] print "gram with DF below average" print[(g, count) for (g, count) in sortedByDF if not count > averageDF] print "lang score:" print scoreByLang print "gram left right" print gramLeftRight pause()
def recommendTerms(): dfbook = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4) cobook = fif.readCoocurWithFilterFunc( 'tmp_ieee_coocur_abstractwide_grams.txt', dfbook) for idx_raw, text in enumerate(ieeePapers()): sentences = lang.getSentenceList(text) coHist = {} # coocur_gram -> df for grams in abstract localHist = {} # local gram -> occurrence count for idofs, s in enumerate(sentences): grams = ngramsOfSentence(s) for g in grams: localHist[g] = localHist.get(g, 0) + 1 for g in localHist: cograms = cobook.get(g, []) for gg in cograms: coHist[gg] = coHist.get(gg, []) coHist[gg].append(g) # just by mention/occurrence score = {} for g in localHist: cograms = cobook.get(g, []) if not g in cobook: continue gcount = cobook[g][g] for gg in cograms: if gg == g: continue cocount = cobook[g][gg] # ignore those with only one-degree of relavance for the moment if not len(coHist[gg]) > 1: continue score[gg] = score.get(gg, 0) + float(cocount) / gcount fluxAndPosterior = {} for g, colist in coHist.items(): if not len(g.split()) > 1: continue if len(colist) > 1: fluxAndPosterior[g] = (score[g], colist) print "grams of text:" print localHist.keys() print "cogram having influx > 2 ..." for g, colist in coHist.items(): if len(colist) > 1: print g, colist print "select from coHist ..." print sorted(coHist.items(), key=lambda x: len(x[1]), reverse=True)[:20] print "select from posterior..." print sorted(fluxAndPosterior.items(), key=lambda x: x[1][0]) pause()
def memorizeCogram():
    """Learn abstract-wide gram co-occurrence with the Memory model.

    For every IEEE abstract: per sentence, keep only grams present in the DF
    book, teach them to the memory as co-occurring within the sentence, and
    cross-learn between consecutive (gram-bearing) sentences. The learned
    long-term matrix is saved as a 2-D DF table.

    Cleanup: removed the dead `if 0 and ...` debug branch (never executed)
    and stale commented-out skip/break lines; behavior is unchanged.
    """
    book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)
    memo = mem.Memory()
    memo.setInitialCapacity(200)
    for idx_raw, text in enumerate(ieeePapers()):
        sentences = lang.getSentenceList(text)
        gramsPreviousSentence = set([])
        for idofs, s in enumerate(sentences):
            grams = ngramsOfSentence(s)
            if not grams:
                # NOTE: gram-less sentences are skipped entirely, so the
                # "previous sentence" carries over across them (as original)
                continue
            goodgrams = set([g for g in grams if g in book])
            memo.learnSymbList(goodgrams)
            # grams of previous sentence: learn grams of current sentence
            # grams of current sentence: learn grams of previous sentence
            memo.crosslearn(gramsPreviousSentence, goodgrams, crossweight=1)
            gramsPreviousSentence = goodgrams
        peek(idx_raw + 1, 2000)
        if (idx_raw + 1) % 2000 == 0:
            # periodic forgetting/compaction to bound memory size
            memo.refresh()
            memo.showsize()
    kbay.saveDF2D(memo.LTM, 'tmp_ieee_coocur_abstractwide_grams.txt')
def memorizeCoword():
    """Learn within-sentence word co-occurrence with the Memory model.

    Every sentence of every IEEE abstract is reduced to the set of unique
    words appearing in its extracted grams; words present in the DF book are
    taught to the memory together. The long-term matrix is persisted at the
    end.
    """
    memo = mem.Memory()
    memo.setInitialCapacity(200)
    book = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=4)
    for idx_raw, text in enumerate(ieeePapers()):
        for idofs, s in enumerate(lang.getSentenceList(text)):
            grams = ngramsOfSentence(s)
            if not grams:
                continue
            # unique words drawn from all grams, kept only if in the DF book
            words = [w for w in set(' '.join(grams).split()) if w in book]
            memo.learnSymbList(words)
        peek(idx_raw + 1, 2000)
        if (idx_raw + 1) % 2000 == 0:
            memo.refresh()
            memo.showsize()
    kbay.saveDF2D(memo.LTM, 'tmp_ieee_coocur_abstractwide_word_bymemo.txt')
def buildGramNetwork():
    """Build a containment network of grams, then browse it interactively.

    Grams from the DF book are grouped by word count; a graph maps each word
    (and later each gram) to the larger grams appended under it. Each gram is
    attached under the deepest existing node (leaf) whose text it contains,
    falling back to the bare word otherwise. Ends in an interactive
    query loop (raw_input).
    """
    dfbook = fif.readWithFilter('ieeeGramDF_above3.txt', filtervalue=5)
    vocab = {}          # word -> 1 (set of all words seen in any gram)
    groupedBySize = {}  # word count -> list of grams of that size
    for g in dfbook:
        words = g.split()
        size = len(words)
        if not size in groupedBySize:
            groupedBySize[size] = []
        groupedBySize[size].append(g)
        for w in words:
            vocab[w] = 1
    print "total vocab:", len(vocab)
    print "---size hist---"
    for s in groupedBySize:
        print "size=", s, "count=", len(groupedBySize[s])
    # graph: node (word or gram) -> list of grams appended under it
    graph = {}
    for w in vocab:
        graph[w] = []

    def __findAllLeaves(word, verbose=False):
        # BFS from `word` down the graph; a leaf is a node with no children
        # entry in `graph` (or an empty child list). Interactive bail-out
        # after 10 levels guards against runaway/cyclic growth.
        frontierGrams = graph[word]
        leafnodes = []
        nIteration = 0
        while (frontierGrams):
            nextFrontier = []
            for g in frontierGrams:
                children = graph.get(g, None)
                if not children:
                    leafnodes.append(g)
                else:
                    nextFrontier += children
            frontierGrams = nextFrontier
            nIteration += 1
            if nIteration > 10:
                print "too many iterations for:", word
                print "current frontier:", frontierGrams
                raw_input('...')
        if verbose:
            print "leafnodes for", word, " - ", leafnodes
        return leafnodes

    def __debugsize(size):
        # verbose/interactive tracing only for implausibly large "sizes";
        # with size = word count this normally never fires
        return size > 5000

    # attach grams smallest-first so shorter grams are already in the graph
    # when longer grams that contain them arrive
    for size in sorted(groupedBySize.keys()):
        print "checking size=", size, "current graph size", len(graph)
        if size == 1:
            continue
        for g in groupedBySize[size]:
            words = g.split()
            if __debugsize(size):
                print "\nchecking: ", g
            for w in words:
                leaves = __findAllLeaves(w, verbose=__debugsize(size))
                appendOnLeaf = False
                for l in leaves:
                    # substring containment test: attach g under any leaf
                    # whose text occurs inside g
                    if l in g:
                        appendOnLeaf = True
                        if l == g:
                            # do not append to itself (already in graph)
                            continue
                        graph[l] = graph.get(l, [])
                        graph[l].append(g)
                        if __debugsize(size):
                            print "append: <", g, "> to:", l
                if not appendOnLeaf:
                    # append to this word
                    graph[w] = graph.get(w, [])
                    graph[w].append(g)
                    if __debugsize(size):
                        print "append: <", g, "> to:", w
            if __debugsize(size):
                raw_input()
    print "+++GRAPH+++"
    print "size:", len(graph)
    print "size(non-empty):", len([n for n in graph if graph[n]])
    #print graph
    # interactive browser: empty input quits, 'look' samples keys,
    # anything else is looked up as a node
    g = raw_input('..')
    while g:
        if g == 'look':
            print graph.keys()[:10]
            g = raw_input('..')
        else:
            if g in graph:
                print graph[g]
            else:
                print "not in graph"
            g = raw_input('..')
def filterGramDF():
    """Persist a DF table keeping only grams with document frequency > 3."""
    filtered = fif.readWithFilter('ieeeGramDF.txt', filtervalue=3)
    # numDoc is the corpus size the DF values were counted over
    kbay.saveDF(filtered, 'ieeeGramDF_above3.txt', sort=False, numDoc=460035)