def __init__(self, db_path, weight_evaluator=None):
    """Remember the database path and pick a word-weight evaluator.

    db_path: path to the sqlite document database.
    weight_evaluator: optional pre-built WordWeightEvaluation; when omitted
        a default one is created and fed the documents from db_path.
    """
    self.dbpath = db_path
    self.dbcon = None  # connection is opened later by init_db()
    if weight_evaluator:
        self.evaluator = weight_evaluator
    else:
        self.evaluator = WordWeightEvaluation(30, '../data')
        # BUG FIX: original referenced the undefined name `dbpath`
        # (parameter is `db_path`), raising NameError on this branch.
        self.evaluator.add_docs_from_db(db_path)
示例#2
0
def main():
    """Run the noise experiment: for each noise level build a fresh db of
    sampled noise documents and run the keyword-extraction passes over it.

    Noise levels are 0, 60, 120, 180, 240 documents.
    """
    import preproc_qqtopic
    import extract_keyword2
    #import worddf
    lognoisefile = '../result/noise_test.log'

    noisefile = open(lognoisefile, 'w')
    try:
        noisefile.write(out_result_header())
        for num in xrange(0, 250, 60):
            dbfile = '../data/noise%d_test.db' % (num,)
            files = sample_docs(num)
            # start from a clean database for every noise level
            if os.path.exists(dbfile):
                os.remove(dbfile)

            dbcon = preproc_qqtopic.init_db(dbfile)
            preproc_qqtopic.load_topiclist(dbcon, '../data/topicgj')
            cnt = preproc_qqtopic.load_topic(dbcon, 'noise_data', files)
            dbcon.close()
            print('add number of noise document: %d' % cnt)

            eva = WordWeightEvaluation(30, '../data/worddf')
            ke = extract_keyword2.DBKeywordExtractor(dbfile, eva)
            ke.init_db()
            ke.content_keyword()
            ke.title_keyword()
            ke.topic_keyword()
            ke.close_db()
    finally:
        # BUG FIX: the log file was never closed in the original
        noisefile.close()
def content_keyword(dbcon):
    """Extract keywords from every document body and store them in the
    kw_content column of the `document` table.

    dbcon: open sqlite connection holding a `document` table with
        docid / title / content / kw_content columns.
    """
    print('extracting keyword from content...')
    doc_num = dbutils.countOfRecs(dbcon, 'document')
    cnt = 0
    # NOTE: removed unused local `wordset = load_wordset(dbcon)` — its
    # result was never read anywhere in this function.
    eluate = WordWeightEvaluation(30)
    for r in dbutils.iterRec(dbcon, 'document', 'docid title content'):
        word_weight_list = eluate.extract_kw(r[1], r[2])
        # serialize as "word/weight word/weight ..." with 7 decimal places
        wordwstr = ' '.join(['%s/%.7f' % idw for idw in word_weight_list])
        dbutils.updateByPK(dbcon, 'document', {'kw_content': wordwstr},
                           {'docid': r[0]})

        cnt += 1
        if cnt % 20 == 0:
            utils.updateProgress(cnt, doc_num)

    print('')
    eluate.close()
    dbcon.commit()
示例#4
0
def content_keyword(dbcon):
    """Compute content keywords for all documents and persist them.

    For each row of `document`, runs the weight evaluator over title and
    content and writes the result into kw_content as a
    "word/weight word/weight ..." string.
    """
    print('extracting keyword from content...')
    doc_num = dbutils.countOfRecs(dbcon, 'document')
    cnt = 0
    # NOTE: dropped `wordset = load_wordset(dbcon)` — the value was unused.
    eluate = WordWeightEvaluation(30)
    for r in dbutils.iterRec(dbcon, 'document', 'docid title content'):
        word_weight_list = eluate.extract_kw(r[1], r[2])
        wordwstr = ' '.join(['%s/%.7f' % idw for idw in word_weight_list])
        dbutils.updateByPK(dbcon, 'document', {'kw_content': wordwstr},
                           {'docid': r[0]})

        cnt += 1
        # lightweight progress indicator every 20 documents
        if cnt % 20 == 0:
            utils.updateProgress(cnt, doc_num)

    print('')
    eluate.close()
    dbcon.commit()
示例#5
0
def main():
    """Cluster the sougou corpus and log mean/std of the quality metrics.

    Builds the document database (with keywords) on first run, clusters it
    with CommunityBuilder, and appends the aggregated metrics to the log.
    """
    import pretext
    import extract_keyword2
    import worddf
    dbfile = '../data/sougou.db'
    logsteadyfile = '../result/sougou.log'

    steadyfile = open(logsteadyfile, 'w')
    try:
        steadyfile.write(out_result_header())

        # build the document db and its keywords once; reuse on later runs
        if not os.path.exists(dbfile):
            pretext.load_topiclist(dbfile, '/home/cs/download/cluster_data/sougou')
            eva = WordWeightEvaluation(30, '../data/worddf')
            ke = extract_keyword2.DBKeywordExtractor(dbfile, eva)
            ke.init_db()
            ke.content_keyword()
            ke.title_keyword()
            ke.topic_keyword()
            ke.close_db()

        cb = CommunityBuilder(dbfile)

        metrics = []
        c = 1
        real = cmpcluster.load_doc_labels(dbfile)
        print('fudan')
        for i in range(c):
            print('Time %d' % (i + 1))
            predicted = cb.build(max_depth=5, min_doc_num=20)
            metrics.append(cmp_cluster(predicted, real))

        mean, std = mean_std(metrics)
        meanstr = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(mean)
        stdstr = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(std)
        steadyfile.write(meanstr)
        steadyfile.write(stdstr)
    finally:
        # BUG FIX: the log file leaked when any step above raised
        steadyfile.close()
    os.system('emacs ' + logsteadyfile)
示例#6
0
def topics_to_db(topics, dbfile):
    """Load each topic directory into a fresh sqlite db and extract keywords.

    topics: iterable of directory paths; the last path component becomes the
        topic name and every file inside the directory becomes a document.
    dbfile: path of the database to (re)create; removed first if present.
    Returns dbfile.
    """
    # NOTE: removed unused local `import sqlite3` — the connection is
    # created by preproc_qqtopic.init_db, not directly here.
    if os.path.exists(dbfile):
        os.remove(dbfile)
    dbcon = preproc_qqtopic.init_db(dbfile)
    for t in topics:
        # topic name = last path component, tolerating one trailing '/'
        idx = t.rindex('/')
        if idx == len(t) - 1:
            idx = t.rindex('/', 0, idx)
        tname = t[idx + 1:]
        filelist = [os.path.join(t, f) for f in os.listdir(t)]

        preproc_qqtopic.load_topic(dbcon, tname, filelist)
    dbcon.close()

    evaluator = WordWeightEvaluation(30)
    ke = extract_keyword2.DBKeywordExtractor(dbfile, evaluator)
    ke.init_db()
    ke.content_keyword()
    ke.title_keyword()
    ke.topic_keyword()
    ke.close_db()

    return dbfile
class DBKeywordExtractor:
    """Extracts content/title/topic keywords for documents held in sqlite.

    Typical use: construct, init_db(), then run content_keyword(),
    title_keyword(), topic_keyword(), and finally close_db().
    """

    def __init__(self, db_path, weight_evaluator=None):
        """db_path: sqlite database file; weight_evaluator: optional
        WordWeightEvaluation (a default one is built when omitted)."""
        self.dbpath = db_path
        self.dbcon = None  # opened by init_db()
        if weight_evaluator:
            self.evaluator = weight_evaluator
        else:
            self.evaluator = WordWeightEvaluation(30, '../data')
            # BUG FIX: original referenced undefined name `dbpath`
            # (parameter is `db_path`), raising NameError on this branch.
            self.evaluator.add_docs_from_db(db_path)

    def init_db(self):
        """Open the sqlite connection; call before any *_keyword() pass."""
        self.dbcon = sqlite3.connect(self.dbpath)

    def close_db(self):
        """Close the database connection and the evaluator's resources."""
        self.dbcon.close()
        self.evaluator.close()

    def content_keyword(self):
        """Extract keywords from every document body into kw_content."""
        print('extracting keyword from content...')
        doc_num = dbutils.countOfRecs(self.dbcon, 'document')
        cnt = 0
        for r in dbutils.iterRec(self.dbcon, 'document', 'docid title content'):
            word_weight_list = self.evaluator.extract_kw(r[1], r[2])
            # serialize as "word/weight word/weight ..." (7 decimal places)
            wordwstr = ' '.join(['%s/%.7f' % idw for idw in word_weight_list])
            dbutils.updateByPK(self.dbcon, 'document',
                               {'kw_content': wordwstr}, {'docid': r[0]})

            cnt += 1
            if cnt % 20 == 0:
                utils.updateProgress(cnt, doc_num)

        print('')
        self.dbcon.commit()

    def title_keyword(self, maxn=5):
        """Fill kw_title from the title's words, or fall back to the first
        `maxn` content keywords for documents without a title."""
        for r in dbutils.iterRec(self.dbcon, 'document', 'docid title kw_content'):
            wordlist = []
            if r[1]:
                for wt in r[1].split():
                    wordlist.append(wt.split('/')[0])
            else:
                # no title: take the top content keywords instead
                i = 0
                for ww in r[2].split():
                    wordlist.append(ww.split('/')[0])
                    i += 1
                    if i == maxn:
                        break
            kwstr = ' '.join(wordlist)
            dbutils.updateByPK(self.dbcon, 'document',
                               {'kw_title': kwstr}, {'docid': r[0]})
        self.dbcon.commit()

    def topic_keyword(self):
        """Aggregate title keywords per topic (`cats`) into the `topic`
        table with frequency-normalized weights, most frequent first.

        NOTE(review): assumes kw_title is non-null for every document, so
        title_keyword() must have run first — confirm with callers.
        """
        self.dbcon.execute("create table if not exists topic (name text unique not null, doc_num integer default 0, keyword text, weight text)")
        cur = self.dbcon.execute('select cats, count(docid) from document group by cats')
        for r in cur:
            # count occurrences of each title keyword within this topic
            kwset = dict()
            cur2 = self.dbcon.execute('select kw_title from document where cats=?', (r[0],))
            for kr in cur2:
                for w in kr[0].split():
                    try:
                        kwset[w] += 1
                    except KeyError:
                        kwset[w] = 1

            sum_weight = float(sum(kwset.values()))
            items = sorted(kwset.items(), key=lambda x: x[1], reverse=True)
            kw_str = ' '.join([w for w, f in items])
            weight_str = ' '.join([str(f / sum_weight) for w, f in items])
            dbutils.insert(self.dbcon, 'topic',
                           {'name': r[0], 'doc_num': r[1],
                            'keyword': kw_str, 'weight': weight_str})
        self.dbcon.commit()