def __init__(self, db_path, weight_evaluator=None):
    """Remember the database path and set up the word-weight evaluator.

    Args:
        db_path: path to the sqlite database of documents.
        weight_evaluator: optional pre-built WordWeightEvaluation; when
            omitted a default evaluator is created and primed with the
            documents already stored in the database.
    """
    self.dbpath = db_path
    self.dbcon = None
    if weight_evaluator:
        self.evaluator = weight_evaluator
    else:
        self.evaluator = WordWeightEvaluation(30, '../data')
        # BUG FIX: the original called add_docs_from_db(dbpath), but no
        # name `dbpath` exists -- the parameter is `db_path` (NameError).
        # NOTE(review): assumed this call belongs only to the
        # default-evaluator branch (a caller-supplied evaluator is
        # presumably already primed) -- confirm original indentation.
        self.evaluator.add_docs_from_db(db_path)
def main(): import preproc_qqtopic import extract_keyword2 #import worddf dbfile = '../data/noise_test.db' lognoisefile = '../result/noise_test.log' noisefile = open(lognoisefile,'w') noisefile.write(out_result_header()) rang = xrange(0,250, 60) for num in rang: dbfile = '../data/noise%d_test.db' % (num,) files = sample_docs(num) if os.path.exists(dbfile): os.remove(dbfile) dbcon = preproc_qqtopic.init_db(dbfile) preproc_qqtopic.load_topiclist(dbcon,'../data/topicgj') cnt = preproc_qqtopic.load_topic(dbcon,'noise_data',files) dbcon.close() print 'add number of noise document: %d' % cnt eva = WordWeightEvaluation(30, '../data/worddf') ke = extract_keyword2.DBKeywordExtractor(dbfile, eva) ke.init_db() ke.content_keyword() ke.title_keyword() ke.topic_keyword() ke.close_db()
def content_keyword(dbcon): print 'extracting keyword from content...' doc_num = dbutils.countOfRecs(dbcon, 'document') wordset = load_wordset(dbcon) cnt = 0 eluate = WordWeightEvaluation(30) for r in dbutils.iterRec(dbcon, 'document','docid title content'): word_weight_list = eluate.extract_kw(r[1],r[2]) wordwstr = ' '.join(['%s/%.7f' % idw for idw in word_weight_list]) dbutils.updateByPK(dbcon, 'document', {'kw_content':wordwstr}, {'docid':r[0]}) cnt += 1 if cnt%20==0: utils.updateProgress(cnt,doc_num) print '' eluate.close() dbcon.commit()
def content_keyword(dbcon): print 'extracting keyword from content...' doc_num = dbutils.countOfRecs(dbcon, 'document') wordset = load_wordset(dbcon) cnt = 0 eluate = WordWeightEvaluation(30) for r in dbutils.iterRec(dbcon, 'document', 'docid title content'): word_weight_list = eluate.extract_kw(r[1], r[2]) wordwstr = ' '.join(['%s/%.7f' % idw for idw in word_weight_list]) dbutils.updateByPK(dbcon, 'document', {'kw_content': wordwstr}, {'docid': r[0]}) cnt += 1 if cnt % 20 == 0: utils.updateProgress(cnt, doc_num) print '' eluate.close() dbcon.commit()
def main(): import pretext import extract_keyword2 import worddf dbfile = '../data/sougou.db' logsteadyfile = '../result/sougou.log' steadyfile = open(logsteadyfile, 'w') steadyfile.write(out_result_header()) if not os.path.exists(dbfile): pretext.load_topiclist(dbfile, '/home/cs/download/cluster_data/sougou') eva = WordWeightEvaluation(30, '../data/worddf') ke = extract_keyword2.DBKeywordExtractor(dbfile, eva) ke.init_db() ke.content_keyword() ke.title_keyword() ke.topic_keyword() ke.close_db() cb = CommunityBuilder(dbfile) metrics = list() c = 1 real = cmpcluster.load_doc_labels(dbfile) print 'fudan' for i in range(c): print 'Time %d' % (i + 1) predicted = cb.build(max_depth=5, min_doc_num=20) metrics.append(cmp_cluster(predicted, real)) mean, std = mean_std(metrics) meanstr = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(mean) stdstr = '%.1f \t%.3f \t%.3f \t%.3f \t%.3f\n' % tuple(std) steadyfile.write(meanstr) steadyfile.write(stdstr) steadyfile.close() os.system('emacs ' + logsteadyfile)
def topics_to_db(topics, dbfile):
    """Load topic directories into a fresh sqlite db and extract keywords.

    Args:
        topics: iterable of directory paths, one per topic; the last path
            component (ignoring trailing slashes) is used as the topic
            name, and every file in the directory is loaded into it.
        dbfile: path of the sqlite database to (re)create.

    Returns:
        dbfile, for caller convenience.
    """
    # NOTE(review): the original also did `import sqlite3` here but never
    # used it; removed.
    if os.path.exists(dbfile):
        os.remove(dbfile)
    dbcon = preproc_qqtopic.init_db(dbfile)
    for t in topics:
        # BUG FIX: the original hand-rolled rindex('/') logic kept a
        # trailing '/' in the topic name when the path ended with one
        # (e.g. 'a/b/' -> 'b/'). basename of the rstripped path yields
        # the intended bare directory name.
        tname = os.path.basename(t.rstrip('/'))
        files = os.listdir(t)
        filelist = [os.path.join(t, f) for f in files]
        preproc_qqtopic.load_topic(dbcon, tname, filelist)
    dbcon.close()
    # Run the keyword-extraction pipeline over the freshly built db.
    evaluator = WordWeightEvaluation(30)
    ke = extract_keyword2.DBKeywordExtractor(dbfile, evaluator)
    ke.init_db()
    ke.content_keyword()
    ke.title_keyword()
    ke.topic_keyword()
    ke.close_db()
    return dbfile
class DBKeywordExtractor: def __init__(self,db_path, weight_evaluator=None): self.dbpath = db_path self.dbcon = None if weight_evaluator: self.evaluator = weight_evaluator else: self.evaluator = WordWeightEvaluation(30,'../data') self.evaluator.add_docs_from_db(dbpath) def init_db(self): self.dbcon = sqlite3.connect(self.dbpath) def close_db(self): self.dbcon.close() self.evaluator.close() def content_keyword(self): print 'extracting keyword from content...' doc_num = dbutils.countOfRecs(self.dbcon, 'document') cnt = 0 #eluate = WordWeightEvaluation(30) for r in dbutils.iterRec(self.dbcon, 'document','docid title content'): word_weight_list = self.evaluator.extract_kw(r[1],r[2]) wordwstr = ' '.join(['%s/%.7f' % idw for idw in word_weight_list]) dbutils.updateByPK(self.dbcon, 'document', {'kw_content':wordwstr}, {'docid':r[0]}) cnt += 1 if cnt%20==0: utils.updateProgress(cnt,doc_num) print '' #eluate.close() self.dbcon.commit() def title_keyword(self, maxn=5): for r in dbutils.iterRec(self.dbcon, 'document', 'docid title kw_content'): wordlist = [] if r[1]: for wt in r[1].split(): wordlist.append(wt.split('/')[0]) else: #from content keyword i = 0 for ww in r[2].split(): wordlist.append(ww.split('/')[0]) i += 1 if i == maxn: break; kwstr = ' '.join(wordlist) dbutils.updateByPK(self.dbcon, 'document', {'kw_title':kwstr}, {'docid':r[0]}) self.dbcon.commit() def topic_keyword(self): self.dbcon.execute("create table if not exists topic (name text unique not null, doc_num integer default 0, keyword text, weight text)") cur = self.dbcon.execute('select cats, count(docid) from document group by cats') for r in cur: kwset = dict() cur2 = self.dbcon.execute('select kw_title from document where cats=?', (r[0],)) for kr in cur2: for w in kr[0].split(): try: kwset[w] += 1 except KeyError: kwset[w] = 1 sum_weight = float(sum(kwset.itervalues())) items = kwset.items() items.sort(key=lambda x:x[1],reverse=True) kw_str = ' '.join([w for w,f in items]) weight_str = ' 
'.join([str(f/sum_weight) for w,f in items]) dbutils.insert(self.dbcon, 'topic', {'name':r[0], 'doc_num':r[1], 'keyword':kw_str, 'weight':weight_str}) self.dbcon.commit()