Пример #1
0
def outRdfGdf(out=sys.stdout): #google ngram based TF-IDF
    """Dump document-frequency entries ranked by Google-ngram-based TF-IDF.

    Reads the shelve DB at OUT_DIR + 'df.db' (word -> document frequency),
    sorts entries by df / log(google ngram count) descending, and prints
    "word df google_count df/google_count" for each word whose google
    count lies strictly between 0 and 200000000.

    out -- writable file object to print results to (default sys.stdout).
    """
    import vocab
    # keep the module and the built lookup table under distinct names
    # (the original rebound `vocab` to the table, shadowing the module)
    google_counts = vocab.build()
    d = shelve.open(OUT_DIR + 'df.db')
    try:
        print >>sys.stderr, len(d)  # len(d) avoids materializing keys()
        # default 2 plus max(..., 2) keeps log() strictly positive: a stored
        # ngram count of 1 would give log(1) == 0 -> ZeroDivisionError
        sortkey = lambda x: float(x[1]) / log(max(google_counts.get(x[0].decode('utf-8'), 2), 2))
        for k, v in sorted(d.iteritems(), key=sortkey, reverse=True):
            g = google_counts.get(k.decode('utf-8'), 0)
            if 0 < g < 200000000:
                print >>out, k, v, g, float(v)/g
    finally:
        d.close()  # always release the shelve, even if iteration raises
Пример #2
0
def findSeed(targetCID, useTfIdf=True, categoryName='', initialYMax=10, initialVMax=10):
    import vocab
    seedY = []
    seedV = []
    words = defaultdict(int)
    for d in document.iterDoc(targetCID):
        words[d[-1]] += 1
    logging.log(logging.INFO, u'number of unique words:%d' % len(words))
    vocab = vocab.build()
    if useTfIdf: sortfunc = lambda x: float(x[1])/log(vocab.get(x[0].origin,2))
    else: sortfunc = lambda x: x[1]
    for word, cnt in sorted(words.iteritems(), key=sortfunc, reverse=True):
        g = vocab.get(word.origin,0)
        if 0 < g < 200000000:
            if word.surface == '(': #posid=36なので品詞で取り除けない
                continue #huristics
            logging.log(logging.INFO, (u'word:%s[%d], cnt:%d, google-cnt:%d' % (word, word.posid, cnt, g)))
            if word.origin in categoryName: continue #huristics
            if len(seedY) < initialYMax and word.willBeEntry(): seedY.append(word.get())
            if len(seedV) < initialVMax and word.isAdj(): seedV.append(word.get())
            if len(seedY) == initialYMax and len(seedV) == initialVMax: break
    print 'seedY:', repr(seedY).decode('unicode-escape')
    print 'seedV:', repr(seedV).decode('unicode-escape')
    return seedY, seedV