def cluster_message(stop_words, user_dict, msg_fname, cluster_file, summary_file):
    # Init tokenizer
    jt = JiebaTokenizer(stop_words, user_dict, 'c')
    token_lines = token_message(jt, msg_fname)
    wdb = WordDictBuilder()
    wdb.add_tokens_list(token_lines)
    wdb.save('../data/word_dict.txt')
    keyword_dict = get_user_keywords(user_dict)
    cluster = Cluster(gl.gl_FUNCNUM)
    # Init feature_builder and simhash_builder
    fc = FeatureContainer(wdb.word_dict, keyword_dict)
    with open(msg_fname, 'r') as ins:
        for lineidx, line in enumerate(ins.readlines()):
            if lineidx % 100 == 0:
                print lineidx
            (time, number, sender, message) = line.strip().split('|')[0:4]
            if number == '10658368':
                continue
            # Strip digits/letters/punctuation and keep only the first sentence
            short_msg = re.split(u'。'.encode('utf8'), message)[0]
            new_msg = re.sub(r'[0-9a-zA-Z+=\./:\"<>|_&#\s\*\-]', '', short_msg)
            #new_msg = re.split(u'。'.encode('utf8'), re.sub(r'[0-9a-zA-Z+=\./:\"<>|_&#\s\*\-]', '', message))[0]
            # Tokenize
            tokens = jt.tokens(new_msg.strip().decode('utf8'))
            feature_vec, sim_hash, min_hash = fc.compute_feature(tokens)
            cluster.add_one(min_hash, sim_hash, short_msg)
    cluster.save_cluster(cluster_file, summary_file)
    print "cluster finish"
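# A minimal usage sketch for cluster_message above. The file paths are
# placeholders (assumptions), not paths from the original project; stop_words
# and user_dict are assumed to be the stop-word and user-dictionary files that
# JiebaTokenizer expects, and msg_fname a '|'-delimited message dump as parsed
# inside cluster_message.
if __name__ == "__main__":
    cluster_message(stop_words='../data/stop_words.txt',
                    user_dict='../data/user_dict.txt',
                    msg_fname='../data/messages.txt',
                    cluster_file='../data/cluster.txt',
                    summary_file='../data/summary.txt')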
def preProcessingData(filename):
    loadData(filename)
    jt_time = time.time()
    global jt
    jt = JiebaTokenizer(stopwords_path, 'c')
    end_jt_time = time.time()
    print('JiebaTokenizer time: %s' % str(end_jt_time - jt_time))
    # Build the word list and word dict from all labeled data
    wordList, wordDict = buildWords(jt, labelContents)
    end_build_time = time.time()
    print('buildWords time: %s' % str(end_build_time - end_jt_time))
    # Build the feature-vector builder
    global fb
    fb = FeatureBuilder(wordDict)
    end_fb_build_time = time.time()
    print('FeatureBuilder time: %s' % str(end_fb_build_time - end_build_time))
    # Build the fingerprint (simhash) builder
    global smb
    smb = SimhashBuilder(wordList)
    end_smb_build_time = time.time()
    print('SimhashBuilder time: %s' % str(end_smb_build_time - end_fb_build_time))
    # Compute feature vectors for all labeled data
    for flowId, processLabelDataMap in processFlowMap.items():
        processFlowMap[flowId] = generateDocFeatureVector(
            processLabelDataMap, jt, fb, smb)
    end_docFV_time = time.time()
    print('generateDocFeatureVector time: %s' % str(end_docFV_time - end_smb_build_time))
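# The internals of SimhashBuilder are not shown in this section; the sketch
# below is a standalone illustration of the standard 64-bit simhash scheme
# such a builder typically implements: hash each word to 64 bits, let each
# word vote on every bit position with its weight, and keep the sign of each
# vote as one fingerprint bit. Names here are illustrative, not the real
# simhash_imp API.
import hashlib

def simhash64(weighted_words):
    # weighted_words: list of (word, weight) pairs
    votes = [0] * 64
    for word, weight in weighted_words:
        h = int(hashlib.md5(word.encode('utf8')).hexdigest(), 16) & ((1 << 64) - 1)
        for bit in range(64):
            if (h >> bit) & 1:
                votes[bit] += weight
            else:
                votes[bit] -= weight
    fingerprint = 0
    for bit in range(64):
        if votes[bit] > 0:
            fingerprint |= (1 << bit)
    return fingerprint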
""" import os import sys import time from tokens import JiebaTokenizer from DictBuilder import WordDictBuilder if __name__ == "__main__": if len(sys.argv) < 4: print "Usage:\tpreprocess.py <docpath> <stopword_path> <worddict_path>" exit(-1) doc_path, stopword_path, worddict_path = sys.argv[1:] print "Arguments:", sys.argv[1:] # Init tokenizer jt = JiebaTokenizer(stopword_path, "c") # Load doc data with open(doc_path) as ins: doc_data = ins.read().decode("utf8") # Tokenization doc_tokens = jt.tokens(doc_data) # Write to token file with open(doc_path[: doc_path.rfind(".")] + ".token", "w") as outs: outs.write("/".join([token.encode("utf8") for token in doc_tokens])) # Load original word dict, update and save wdb = WordDictBuilder(worddict_path, tokenlist=doc_tokens) wdb.run() wdb.save(worddict_path) print "Totally", len(wdb.word_dict), "words"
doc_1_noise, doc_path_1, doc_path_2, stopword_path, word_dict, mode, threshold = (
    '../lsh_data/doc_1.data',
    '../lsh_data/doc_1.clear',
    '../lsh_data/doc_2.data',
    '../lsh_data/stopwords.txt',
    '../lsh_data/word.dict',
    '-s', 15)
print 'Arguments:', sys.argv[1:]
# Original (noisy) query document
with open(doc_1_noise) as noise_file:
    doc_noise_file = noise_file.read().decode('utf8')
# Query document after noise removal
with open(doc_path_1) as ins:
    doc_data_1 = ins.read().decode('utf8')
print 'Loaded', doc_path_1
# Init the tokenizer; this mainly loads the stop words
jt = JiebaTokenizer(stopword_path, 'c')
# Tokenize; tokens() returns the list of tokens
doc_token_1 = jt.tokens(doc_data_1)
print 'Loading word dict...'
# Load the word list and build the word dict
word_list = []
with open(word_dict, 'r') as ins:
    for line in ins.readlines():
        word_list.append(line.split()[1])
word_dict = {}
for idx, ascword in enumerate(word_list):
    word_dict[ascword.decode('utf8')] = idx
# Build the non-zero feature vector
fb = FeatureBuilder(word_dict)
doc_feat_1 = fb.compute(doc_token_1)
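# features.FeatureBuilder is not shown in this section; a plausible sketch of
# what compute() produces from the dict built above is a sparse term-frequency
# vector of (word_index, count) pairs, which matches the "non-zero feature
# vector" comment. Treat this as an assumption about the interface, not the
# original implementation.
def compute_sparse_features(tokens, word_dict):
    counts = {}
    for token in tokens:
        idx = word_dict.get(token)
        if idx is not None:          # skip out-of-vocabulary tokens
            counts[idx] = counts.get(idx, 0) + 1
    return sorted(counts.items())    # [(word_index, term_frequency), ...]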
import sys

from tokens import JiebaTokenizer
from simhash_imp import SimhashBuilder, hamming_distance
from features import FeatureBuilder

if __name__ == "__main__":
    if len(sys.argv) < 7:
        print "Usage:\tlaunch.py word_dict_path stop_words_path fingerprint_path documents_path test_path result_path"
        exit(-1)
    # Load word list
    word_list = []
    with open(sys.argv[1], 'r') as ins:
        for line in ins.readlines():
            word_list.append(line.split()[1])
    # Init tokenizer
    jt = JiebaTokenizer(sys.argv[2], 'c')
    # Init feature_builder
    word_dict = {}
    for idx, ascword in enumerate(word_list):
        word_dict[ascword.decode('utf8')] = idx
    fb = FeatureBuilder(word_dict)
    # Init simhash_builder
    smb = SimhashBuilder(word_list)
    # Load fingerprint list
    fingerprint_list = []
    with open(sys.argv[3], 'r') as ins:
        for line in ins.readlines():
            fingerprint_list.append(int(line))
    # For exp: load document content
    doc_list = []
    with open(sys.argv[4], 'r') as ins:
        for line in ins.readlines():
            doc_list.append(line.strip())
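# The fingerprint file read above holds one integer fingerprint per line. A
# minimal sketch of how such a file could be produced from tokenized documents
# using the fb/smb objects built in this script (the actual writer is not
# shown in this section):
def save_fingerprints(fingerprint_path, doc_token_lists, fb, smb):
    with open(fingerprint_path, 'w') as outs:
        for tokens in doc_token_lists:
            feature_vec = fb.compute(tokens)
            outs.write('%d\n' % smb.sim_hash(feature_vec))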
        feature_vec = self.fb.compute(token_list)
        return feature_vec, self.smb.sim_hash(feature_vec)

    """
    def __del__(self):
        with open(self.word_dict_path, 'w') as outs:
            for idx, word in enumerate(self.word_list):
                outs.write('%s\t%s%s' % (idx, word, os.linesep))
    """

if __name__ == "__main__":
    if len(sys.argv) < 7:
        print "Usage:\tlaunch_inc.py <word_dict_path> <stop_words_path> <fingerprint_path> <documents_path> <test_path> <result_path>"
        exit(-1)
    # Init tokenizer
    jt = JiebaTokenizer(sys.argv[2], "c")
    # Init feature_builder and simhash_builder
    fc = FeatureContainer(sys.argv[1])
    # Load fingerprint list
    fingerprint_list = []
    with open(sys.argv[3], "r") as ins:
        for line in ins.readlines():
            fingerprint_list.append(int(line))
    # For exp: load document content
    doc_list = []
    with open(sys.argv[4], "r") as ins:
        for line in ins.readlines():
            doc_list.append(line.strip())
    # Detection process begins
    min_sim = 64
    min_docid = 0
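# The detection loop in launch_inc.py is cut off above; min_sim starts at 64
# because the fingerprints are 64-bit, so the Hamming distance can be at most
# 64. Below is a standalone sketch of that nearest-fingerprint search; the
# hamming_distance_64 helper is illustrative and stands in for
# simhash_imp.hamming_distance, which is not shown here.
def hamming_distance_64(fp_a, fp_b):
    x = (fp_a ^ fp_b) & ((1 << 64) - 1)
    count = 0
    while x:
        x &= x - 1   # clear the lowest set bit
        count += 1
    return count

def nearest_fingerprint(sim_hash, fingerprint_list):
    min_sim, min_docid = 64, 0
    for docid, fingerprint in enumerate(fingerprint_list):
        dist = hamming_distance_64(sim_hash, fingerprint)
        if dist < min_sim:
            min_sim, min_docid = dist, docid
    return min_docid, min_sim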
if __name__ == "__main__": if len(sys.argv) < 7: print "Usage:\tisSimilar.py <doc1> <doc2> <stopword_path> <word_dict> <-c/-s> <threshold>" exit(-1) doc_path_1, doc_path_2, stopword_path, word_dict, mode, threshold = sys.argv[1:] print 'Arguments:', sys.argv[1:] with open(doc_path_1) as ins: doc_data_1 = ins.read().decode('utf8') print 'Loaded', doc_path_1 with open(doc_path_2) as ins: doc_data_2 = ins.read().decode('utf8') print 'Loaded', doc_path_2 # Init tokenizer jt = JiebaTokenizer(stopword_path, 'c') # Tokenization doc_token_1 = jt.tokens(doc_data_1) doc_token_2 = jt.tokens(doc_data_2) print 'Loading word dict...' # Load word list from word_dict word_list = [] with open(word_dict, 'r') as ins: for line in ins.readlines(): word_list.append(line.split()[1]) # Build unicode string word dict word_dict = {} for idx, ascword in enumerate(word_list):