def __init__(self):
    """Lazily initialize the shared class-level handles, then per-instance state.

    The idf model, segmenter, short-url matcher and url regex are stored on
    the hot_word class so they are built only once and shared by all
    instances.
    """
    # Shared idf handle: build and load dumps + stopwords on first use.
    if not hot_word.idf_hd:
        hot_word.idf_hd = idf()
        hot_word.idf_hd.load(tf_idf_config.idf_dumps_path,
                             tf_idf_config.stopwords_path)
    # Shared jieba segmenter.
    if not hot_word.seg_hd:
        hot_word.seg_hd = cppjieba(tf_idf_config.dict_path,
                                   tf_idf_config.hmm_path)
    # Shared short-url matcher.
    if not hot_word.short_url_hd:
        hot_word.short_url_hd = fast_search.load(tf_idf_config.short_url_path)
    # Shared url-matching regex, compiled once.
    if not hot_word.url_re:
        hot_word.url_re = re.compile(
            r"(http:\/\/)*[\w\d]+\.[\w\d\.]+\/[\w\d_!@#$%^&\*-_=\+]+")
    # Per-instance state.
    self.hot_word_dic = {}
    self.get_file_word_flag = "num"
    self.word_list_n = 5
    # Dispatch table selecting how per-file word lists are extracted.
    self.get_file_word_cbk = {
        "num": self.get_file_word_list_by_num,
        "percent": self.get_file_word_list_by_persent,
    }
def __init__(self):
    """Set up per-instance state; build the shared class-level handles once."""
    cfg = tf_idf_config
    cls = hot_word
    # The heavy resources live on the class so repeated construction of
    # hot_word instances does not reload them.
    if not cls.idf_hd:
        handle = idf()
        handle.load(cfg.idf_dumps_path, cfg.stopwords_path)
        cls.idf_hd = handle
    if not cls.seg_hd:
        cls.seg_hd = cppjieba(cfg.dict_path, cfg.hmm_path)
    if not cls.short_url_hd:
        cls.short_url_hd = fast_search.load(cfg.short_url_path)
    if not cls.url_re:
        cls.url_re = re.compile(
            r'(http:\/\/)*[\w\d]+\.[\w\d\.]+\/[\w\d_!@#$%^&\*-_=\+]+')
    self.hot_word_dic = {}
    self.get_file_word_flag = "num"
    self.word_list_n = 5
    self.get_file_word_cbk = {}
    self.get_file_word_cbk["num"] = self.get_file_word_list_by_num
    self.get_file_word_cbk["percent"] = self.get_file_word_list_by_persent
def get_rubbish_set(self, stopword_path="stopwords.txt"):
    """Build the set of "rubbish" tokens that should be filtered from word lists.

    Loads a fast_search handle over *stopword_path* and reads one stopword
    per line into the set; whitespace variants and a few hard-coded noise
    tokens are always included regardless of the file.

    :param stopword_path: path of a stopword file, one token per line.
    :return: tuple ``(rubbish_set, hd)`` where ``hd`` is the fast_search
             handle, or 0 when the stopword file could not be loaded.
    """
    rubbish_set = set()
    hd = 0
    try:
        hd = fast_search.load(stopword_path)
        with open(stopword_path, "r") as fd:
            for line in fd:
                rubbish_set.add(line.strip())
    except Exception:
        # A missing/unreadable stopword file is non-fatal: fall back to the
        # built-in tokens below.  (Was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.)
        hd = 0
    # Always filter whitespace (incl. the fullwidth space) and known noise.
    for token in (" ", "　", "\t", "\r", "\n", "\r\n", "DC", "DS", "gt"):
        rubbish_set.add(token)
    return rubbish_set, hd
def get_rubbish_set(self, stopword_path="stopwords.txt"):
    """Return the rubbish-token set plus a fast_search handle over it.

    Each line of *stopword_path* contributes one token; a fixed list of
    whitespace and noise tokens is added unconditionally.

    :param stopword_path: stopword file path, one token per line.
    :return: ``(rubbish_set, hd)``; ``hd`` is 0 when loading failed.
    """
    rubbish_set = set()
    hd = 0
    try:
        hd = fast_search.load(stopword_path)
        with open(stopword_path, "r") as fd:
            rubbish_set.update(l.strip() for l in fd)
    except Exception:
        # Ignore an unreadable stopword file (narrowed from a bare
        # `except:`, which would also have trapped SystemExit etc.).
        hd = 0
    # Tokens filtered unconditionally: whitespace (plain and fullwidth
    # space) and a few fixed noise strings.
    rubbish_set.update(
        (" ", "　", "\t", "\r", "\n", "\r\n", "DC", "DS", "gt"))
    return rubbish_set, hd
def __init__(self, db_name):
    """Open the idf model, short-url matcher and a leveldb store at db_name.

    :param db_name: path of the leveldb database directory.
    """
    self.idf_hd = idf()
    with open("idf_dumps.txt", "r") as fd:
        self.idf_hd.loads(fd.read())
    self.hot_word_dic = {}
    self.short_url_hd = fast_search.load("short_url.txt")
    self.dbhd = leveldb.LevelDB(db_name)
    self.url_re = re.compile(
        r'(http:\/\/)*[\w\d]+\.[\w\d\.]+\/[\w\d_!@#$%^&\*-_=\+]+')
    # A WriteBatch is used internally as a cache: add_doc does not write to
    # the db file immediately; entries are flushed when results are fetched
    # or once batch_counter reaches batch_limit.
    self.batch = leveldb.WriteBatch()
    self.batch_counter = 0
    self.batch_limit = 100000
    self.fid = 0
    #self.get_file_word_flag = "percent"
    self.get_file_word_flag = "num"
    self.word_list_n = 5
    # Dispatch table selecting how per-file word lists are extracted.
    self.get_file_word_cbk = {
        "num": self.get_file_word_list_by_num,
        "percent": self.get_file_word_list_by_persent,
    }
def __init__(self, db_name):
    """Initialize idf data, url matching and the leveldb-backed batch cache.

    :param db_name: leveldb database path.
    """
    idf_hd = idf()
    with open("idf_dumps.txt", "r") as fd:
        dumped = fd.read()
    idf_hd.loads(dumped)
    self.idf_hd = idf_hd
    self.hot_word_dic = {}
    self.short_url_hd = fast_search.load("short_url.txt")
    self.dbhd = leveldb.LevelDB(db_name)
    self.url_re = re.compile(
        r'(http:\/\/)*[\w\d]+\.[\w\d\.]+\/[\w\d_!@#$%^&\*-_=\+]+')
    # Writes are buffered in a WriteBatch rather than hitting the db file on
    # every add_doc; the batch is flushed when results are requested or when
    # the counter reaches batch_limit.
    self.batch = leveldb.WriteBatch()
    self.batch_counter = 0
    self.batch_limit = 100000
    self.fid = 0
    #self.get_file_word_flag = "percent"
    self.get_file_word_flag = "num"
    self.word_list_n = 5
    self.get_file_word_cbk = {}
    self.get_file_word_cbk["num"] = self.get_file_word_list_by_num
    self.get_file_word_cbk["percent"] = self.get_file_word_list_by_persent