def outRdfGdf(out=sys.stdout):
    # Google N-gram based TF-IDF: dump each term in df.db with its document
    # frequency, Google N-gram count, and df/gf ratio, ranked by df/log(gf).
    import vocab
    vocab = vocab.build()
    filename = 'df.db'
    d = shelve.open(OUT_DIR + filename)
    print >>sys.stderr, len(d.keys())
    # vocab.get(word, 2) defaults missing words to 2 so log() never sees 1
    for k, v in sorted(d.iteritems(),
                       key=lambda x: float(x[1]) / log(vocab.get(x[0].decode('utf-8'), 2)),
                       reverse=True):
        g = vocab.get(k.decode('utf-8'), 0)
        if 0 < g < 200000000:
            print >>out, k, v, g, float(v) / g
    d.close()
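
# --- Illustration, not part of the original module: both functions in this
# file rank words by tf/log(gf), i.e. local frequency damped by the log of the
# word's Google N-gram count. Module-level names such as sys, shelve, log
# (from math), defaultdict, logging, document, and OUT_DIR are assumed to be
# imported/defined elsewhere in this file. A minimal sketch of the scoring
# intuition (the helper name here is illustrative only):
from math import log

def _tf_gdf(local_cnt, google_cnt):
    # same shape as the sort keys in outRdfGdf/findSeed; clamping to 2 mirrors
    # the vocab.get(word, 2) default that keeps the log nonzero
    return float(local_cnt) / log(max(google_cnt, 2))

# equally frequent locally, but the globally rarer word scores higher:
assert _tf_gdf(50, 10 ** 5) > _tf_gdf(50, 10 ** 8)  # ~4.34 > ~2.71
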
def findSeed(targetCID, useTfIdf=True, categoryName='', initialYMax=10, initialVMax=10):
    # Collect seed entry words (seedY) and seed adjectives (seedV) for the
    # target category, ranked by raw frequency or by TF / log(gf).
    import vocab
    seedY = []
    seedV = []
    words = defaultdict(int)
    for d in document.iterDoc(targetCID):
        words[d[-1]] += 1
    logging.log(logging.INFO, u'number of unique words:%d' % len(words))
    vocab = vocab.build()
    if useTfIdf:
        sortfunc = lambda x: float(x[1]) / log(vocab.get(x[0].origin, 2))
    else:
        sortfunc = lambda x: x[1]
    for word, cnt in sorted(words.iteritems(), key=sortfunc, reverse=True):
        g = vocab.get(word.origin, 0)
        if 0 < g < 200000000:
            if word.surface == '(':  # posid == 36, so it cannot be filtered out by POS tag
                continue  # heuristic
            logging.log(logging.INFO,
                        (u'word:%s[%d], cnt:%d, google-cnt:%d' % (word, word.posid, cnt, g)))
            if word.origin in categoryName:
                continue  # heuristic: skip words contained in the category name
            if len(seedY) < initialYMax and word.willBeEntry():
                seedY.append(word.get())
            if len(seedV) < initialVMax and word.isAdj():
                seedV.append(word.get())
            if len(seedY) == initialYMax and len(seedV) == initialVMax:
                break
    print 'seedY:', repr(seedY).decode('unicode-escape')
    print 'seedV:', repr(seedV).decode('unicode-escape')
    return seedY, seedV
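
# --- Usage sketch, not part of the original module. Assumptions: targetCID is
# a category ID accepted by document.iterDoc(), each yielded record ends with
# a morpheme object exposing .origin/.surface/.posid/.willBeEntry()/.isAdj()/
# .get() as used above, and the ID and category name below are hypothetical:
#
#     seedY, seedV = findSeed(12345, useTfIdf=True, categoryName=u'example')
#     # seedY holds up to initialYMax candidate entry words and seedV up to
#     # initialVMax adjectives, both drawn from the top of the tf/log(gf) ranking.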