Exemplo n.º 1
0
 def title_keyword(self, maxn=5):
     """Fill ``kw_title`` for every row of the ``document`` table.

     Each row is read as (``docid``, ``title``, ``kw_content``). The stored
     value is a space-joined list of words, where a word is the part of a
     whitespace-separated token that precedes its first ``/``:

     * if ``title`` is non-empty, all of its tokens are used;
     * otherwise the leading tokens of ``kw_content`` are used, stopping
       once ``maxn`` of them have been taken;
     * if both are empty (or NULL), an empty string is stored.

     ``maxn`` is compared for equality against a counter starting at 1, so
     a value below 1 never matches and every ``kw_content`` token is kept.

     The changes are committed on ``self.dbcon`` after the last row.
     """
     for r in dbutils.iterRec(self.dbcon, 'document', 'docid title kw_content'):
         docid, title, kw_content = r[0], r[1], r[2]
         wordlist = []
         if title:
             for token in title.split():
                 wordlist.append(token.split('/')[0])
         elif kw_content:
             # Fall back to the content keywords. The guard matters because
             # kw_content may be NULL/empty for a row, in which case calling
             # .split() on it would raise instead of storing an empty list.
             for taken, token in enumerate(kw_content.split(), 1):
                 wordlist.append(token.split('/')[0])
                 if taken == maxn:
                     break
         kwstr = ' '.join(wordlist)
         dbutils.updateByPK(self.dbcon, 'document', {'kw_title': kwstr}, {'docid': docid})
     self.dbcon.commit()
Exemplo n.º 2
0
def title_df(dbcon):
    """Increment ``word.t_df`` once per document whose ``kw_title`` has the word.

    Iterates over the ``document`` table; for every distinct
    whitespace-separated word in a row's ``kw_title`` the matching ``word``
    row is looked up and its ``t_df`` value is increased by one.
    ``utils.updateProgress`` is called every 50 documents and the changes
    are committed on ``dbcon`` at the end.

    Raises:
        AssertionError: if a title keyword has no row in the ``word`` table.
    """
    print('statistic word document frequency...')
    doc_num = dbutils.countOfRecs(dbcon, 'document')
    cnt = 0

    # docid is selected together with kw_title so that the failure message
    # below can name the offending document; with only kw_title selected the
    # row has a single column and indexing past it raises IndexError.
    for r in dbutils.iterRec(dbcon, 'document', 'docid kw_title'):
        docid, kw_title = r[0], r[1]
        for w in set(kw_title.split()):
            df_r = dbutils.queryOneRec(dbcon, 'word', 't_df', 'word=?', (w,))
            if df_r is None:
                # Raised explicitly (rather than via ``assert``) so the check
                # is not stripped under ``python -O``; %s keeps the message
                # usable whatever type docid has.
                raise AssertionError("'%s' in Document %s except" % (w, docid))
            dbutils.updateByPK(dbcon, 'word', {'t_df': df_r[0] + 1}, {'word': w})

        cnt += 1
        if cnt % 50 == 0:
            utils.updateProgress(cnt, doc_num)
    print('')
    dbcon.commit()
Exemplo n.º 3
0
    def content_keyword(self):
        """Store weighted content keywords in ``document.kw_content``.

        For every ``document`` row (``docid``, ``title``, ``content``) the
        result of ``self.evaluator.extract_kw(title, content)`` is rendered
        as space-separated ``'%s/%.7f'`` items and written to the row's
        ``kw_content`` column. ``utils.updateProgress`` is called every 20
        documents; the changes are committed on ``self.dbcon`` at the end.
        """
        print('extracting keyword from content...')
        total = dbutils.countOfRecs(self.dbcon, 'document')

        rows = dbutils.iterRec(self.dbcon, 'document', 'docid title content')
        for done, row in enumerate(rows, 1):
            weighted = self.evaluator.extract_kw(row[1], row[2])
            # Each item is interpolated into a two-slot format string
            # (presumably a (word, weight) pair -- confirm against the evaluator).
            rendered = []
            for item in weighted:
                rendered.append('%s/%.7f' % item)
            dbutils.updateByPK(self.dbcon, 'document',
                               {'kw_content': ' '.join(rendered)},
                               {'docid': row[0]})

            if not done % 20:
                utils.updateProgress(done, total)

        print('')
        self.dbcon.commit()
Exemplo n.º 4
0
def title_df(dbcon):
    """Update ``word.t_df`` from the title keywords of every document.

    For each row of ``document``, every distinct whitespace-separated word
    of ``kw_title`` has the ``t_df`` value of its ``word`` row increased by
    one. ``utils.updateProgress`` is called every 50 documents and the
    changes are committed on ``dbcon`` after the last row.

    Raises:
        AssertionError: if a title keyword has no row in the ``word`` table.
    """
    print('statistic word document frequency...')
    doc_num = dbutils.countOfRecs(dbcon, 'document')
    cnt = 0

    # Fetch docid as well: the error message needs it, and a row holding
    # only kw_title cannot be indexed beyond position 0.
    for r in dbutils.iterRec(dbcon, 'document', 'docid kw_title'):
        docid, kw_title = r[0], r[1]
        title_set = set(kw_title.split())
        for w in title_set:
            df_r = dbutils.queryOneRec(dbcon, 'word', 't_df', 'word=?', (w, ))
            if df_r is None:
                # Explicit raise keeps the check active under ``python -O``;
                # %s formats the docid regardless of its type.
                raise AssertionError(
                    "'%s' in Document %s except" % (w, docid))
            dbutils.updateByPK(dbcon, 'word', {'t_df': df_r[0] + 1},
                               {'word': w})

        cnt += 1
        if cnt % 50 == 0:
            utils.updateProgress(cnt, doc_num)
    print('')
    dbcon.commit()
Exemplo n.º 5
0
def title_keyword(dbcon):
    """Write the known words of each document title to ``kw_title``.

    Every ``document`` row (``docid``, ``title``) has its title split on
    single spaces; the part of each token before the first ``/`` is kept
    when it is contained in the collection returned by
    ``load_wordset(dbcon)``. The distinct kept words are joined with spaces
    and stored in the row's ``kw_title`` column. ``utils.updateProgress`` is
    called every 50 documents and the changes are committed at the end.
    """
    print('extrating keyword from title...')
    total = dbutils.countOfRecs(dbcon, 'document')

    known = load_wordset(dbcon)
    rows = dbutils.iterRec(dbcon, 'document', 'docid title')
    for done, row in enumerate(rows, 1):
        heads = (token.split('/')[0] for token in row[1].split(' '))
        # A set drops repeated words within one title.
        kept = set(head for head in heads if head in known)

        dbutils.updateByPK(dbcon, 'document',
                           {'kw_title': ' '.join(kept)},
                           {'docid': row[0]})

        if not done % 50:
            utils.updateProgress(done, total)

    print('')
    dbcon.commit()
Exemplo n.º 6
0
def title_keyword(dbcon):
    """Populate ``document.kw_title`` with the title words found in the word set.

    The title of each ``document`` row is split on single spaces, each
    token is cut at its first ``/``, and the distinct results that are
    members of ``load_wordset(dbcon)`` are space-joined into ``kw_title``.
    Progress is passed to ``utils.updateProgress`` every 50 rows; the
    changes are committed on ``dbcon`` once all rows are processed.
    """
    print('extrating keyword from title...')
    n_docs = dbutils.countOfRecs(dbcon, 'document')

    vocabulary = load_wordset(dbcon)
    processed = 0
    for record in dbutils.iterRec(dbcon, 'document', 'docid title'):
        docid, title = record[0], record[1]

        selected = set()
        for token in title.split(' '):
            head = token.split('/')[0]
            if head not in vocabulary:
                continue
            selected.add(head)

        dbutils.updateByPK(dbcon, 'document',
                           {'kw_title': ' '.join(selected)},
                           {'docid': docid})

        processed += 1
        if processed % 50 == 0:
            utils.updateProgress(processed, n_docs)

    print('')
    dbcon.commit()