Example #1
def cluster_message(stop_words, user_dict, msg_fname, cluster_file,
                    summary_file):
    # Init tokenizer
    jt = JiebaTokenizer(stop_words, user_dict, 'c')
    token_lines = token_message(jt, msg_fname)
    wdb = WordDictBuilder()
    wdb.add_tokens_list(token_lines)
    wdb.save('../data/word_dict.txt')
    keyword_dict = get_user_keywords(user_dict)

    cluser = Cluster(gl.gl_FUNCNUM)
    # Init feature_builder and simhash_builder
    fc = FeatureContainer(wdb.word_dict, keyword_dict)
    with open(msg_fname, 'r') as ins:
        for lineidx, line in enumerate(ins.readlines()):
            if (lineidx % 100 == 0):
                print lineidx
            (time, number, sender, message) = line.strip().split('|')[0:4]
            if (number == '10658368'):
                continue
            # Strip digits and letters, keep only the first sentence
            short_msg = re.split(u'。'.encode('utf8'), message)[0]
            new_msg = re.sub(r'[0-9a-zA-Z+=\./:\"<>|_&#\s\*\-]', '', short_msg)
            #new_msg = re.split(u'。'.encode('utf8'), re.sub(r'[0-9a-zA-Z+=\./:\"<>|_&#\s\*\-]', '', message))[0]

            # Tokenize
            tokens = jt.tokens(new_msg.strip().decode('utf8'))
            feature_vec, sim_hash, min_hash = fc.compute_feature(tokens)
            cluser.add_one(min_hash, sim_hash, short_msg)

    cluser.save_cluster(cluster_file, summary_file)
    print "cluser finish"
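For orientation, a minimal invocation sketch of the function above; every path below is a hypothetical placeholder, and the message file is assumed to contain '|'-separated lines of the form time|number|sender|message, as parsed in the loop.

# Hypothetical usage sketch -- paths are placeholders, not from the original project
if __name__ == '__main__':
    cluster_message('../data/stopwords.txt',   # stop words for JiebaTokenizer
                    '../data/user_dict.txt',   # user dictionary, also the keyword source
                    '../data/messages.txt',    # lines: time|number|sender|message
                    '../data/cluster.txt',     # cluster output file
                    '../data/summary.txt')     # summary output file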
Example #2
def preProcessingData(filename):
    loadData(filename)
    jt_time = time.time()
    global jt
    jt = JiebaTokenizer(stopwords_path, 'c')
    end_jt_time = time.time()
    print('JiebaTokenizer time: %s' % str(end_jt_time - jt_time))
    # Build the word list and dictionary from all the labeled data
    wordList, wordDict = buildWords(jt, labelContents)
    end_build_time = time.time()
    print('buildWords time: %s' % str(end_build_time - end_jt_time))
    # Generate the feature vectors
    global fb
    fb = FeatureBuilder(wordDict)
    end_fb_build_time = time.time()
    print('FeatureBuilder time: %s' % str(end_fb_build_time - end_build_time))
    # Generate the fingerprints
    global smb
    smb = SimhashBuilder(wordList)
    end_smb_build_time = time.time()
    print('SimhashBuilder time: %s' %
          str(end_smb_build_time - end_fb_build_time))
    # Generate feature vectors for all labeled data
    for flowId, processLabelDataMap in processFlowMap.items():
        processFlowMap[flowId] = generateDocFeatureVector(
            processLabelDataMap, jt, fb, smb)
    end_docFV_time = time.time()
    print('generateDocFeatureVector time: %s' %
          str(end_docFV_time - end_smb_build_time))
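Once preProcessingData has populated the module-level jt, fb and smb objects, new text can presumably be featurized the same way the labeled data is. A minimal sketch, assuming fb.compute takes a token list and smb.sim_hash takes the resulting feature vector, as the signatures visible in the other examples on this page suggest:

def featurize(text):
    # Sketch only: relies on the globals initialized by preProcessingData
    tokens = jt.tokens(text)                 # tokenize with the shared JiebaTokenizer
    feature_vec = fb.compute(tokens)         # feature vector over the built word dict
    fingerprint = smb.sim_hash(feature_vec)  # simhash fingerprint of the document
    return feature_vec, fingerprint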
"""
import os
import sys
import time
from tokens import JiebaTokenizer
from DictBuilder import WordDictBuilder

if __name__ == "__main__":
    if len(sys.argv) < 4:
        print "Usage:\tpreprocess.py <docpath> <stopword_path> <worddict_path>"
        exit(-1)
    doc_path, stopword_path, worddict_path = sys.argv[1:]
    print "Arguments:", sys.argv[1:]

    # Init tokenizer
    jt = JiebaTokenizer(stopword_path, "c")
    # Load doc data
    with open(doc_path) as ins:
        doc_data = ins.read().decode("utf8")
    # Tokenization
    doc_tokens = jt.tokens(doc_data)
    # Write to token file
    with open(doc_path[: doc_path.rfind(".")] + ".token", "w") as outs:
        outs.write("/".join([token.encode("utf8") for token in doc_tokens]))

    # Load original word dict, update and save
    wdb = WordDictBuilder(worddict_path, tokenlist=doc_tokens)
    wdb.run()
    wdb.save(worddict_path)
    print "Totally", len(wdb.word_dict), "words"
Example #4
    doc_1_noise, doc_path_1, doc_path_2, stopword_path, word_dict, mode, threshold = (
        '../lsh_data/doc_1.data',
        '../lsh_data/doc_1.clear',
        '../lsh_data/doc_2.data',
        '../lsh_data/stopwords.txt',
        '../lsh_data/word.dict',
        '-s', 15)
    print 'Arguments get success:', sys.argv[1:]
    # Original (noisy) query document
    with open(doc_1_noise) as noise_file:
        doc_noise_file = noise_file.read().decode('utf8')
    # Query document with noise removed
    with open(doc_path_1) as ins:
        doc_data_1 = ins.read().decode('utf8')
    print 'Loaded', doc_path_1
    # Initialize the tokenizer (mainly loads the stop words)
    jt = JiebaTokenizer(stopword_path, 'c')

    # Tokenize; tokens() returns the list of tokens
    doc_token_1 = jt.tokens(doc_data_1)
    print 'Loading word dict...'
    # Load the dictionary file and build the word dict
    word_list = []
    with open(word_dict, 'r') as ins:
        for line in ins.readlines():
            word_list.append(line.split()[1])
    word_dict = {}
    for idx, ascword in enumerate(word_list):
        word_dict[ascword.decode('utf8')] = idx
    # Build the non-zero feature vector
    fb = FeatureBuilder(word_dict)
    doc_feat_1 = fb.compute(
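The excerpt breaks off inside the fb.compute call. A sketch of how the pieces shown here are typically combined follows; it assumes fb.compute takes a token list, that SimhashBuilder and hamming_distance come from simhash_imp as in the later examples, and that hamming_distance returns the number of differing bits between two 64-bit fingerprints. It is a guess at the continuation, not the original file.

    # Sketch of a likely continuation (not taken from the original project)
    doc_feat_1 = fb.compute(doc_token_1)
    # Load and featurize the second document the same way as the first
    with open(doc_path_2) as ins:
        doc_data_2 = ins.read().decode('utf8')
    doc_feat_2 = fb.compute(jt.tokens(doc_data_2))
    # Compare 64-bit simhash fingerprints against the threshold (15 above)
    smb = SimhashBuilder(word_list)
    distance = hamming_distance(smb.sim_hash(doc_feat_1), smb.sim_hash(doc_feat_2))
    print 'Hamming distance:', distance
    print 'Similar' if distance <= threshold else 'Not similar'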
Example #5
import sys
from tokens import JiebaTokenizer
from simhash_imp import SimhashBuilder, hamming_distance
from features import FeatureBuilder

if __name__=="__main__":
    if len(sys.argv) < 7:
        print "Usage:\tlaunch.py word_dict_path stop_words_path fingerprint_path documents_path test_path result_path"
        exit(-1)
    # Load word list
    word_list = []
    with open(sys.argv[1], 'r') as ins:
        for line in ins.readlines():
            word_list.append(line.split()[1])
    # Init tokenizer
    jt = JiebaTokenizer(sys.argv[2], 'c')
    # Init feature_builder
    word_dict = {}
    for idx, ascword in enumerate(word_list):
        word_dict[ascword.decode('utf8')] = idx
    fb = FeatureBuilder(word_dict)
    # Init simhash_builder
    smb = SimhashBuilder(word_list)
    # Load fingerprint list
    fingerprint_list = []
    with open(sys.argv[3], 'r') as ins:
        for line in ins.readlines():
            fingerprint_list.append(int(line))
    # For exp: load document content
    doc_list = []
    with open(sys.argv[4], 'r') as ins:
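The snippet is cut off while the document file is being read. A sketch of how the detection step presumably continues, mirroring the min_sim / min_docid pattern visible in Example #7 and assuming hamming_distance returns the bit distance between two 64-bit fingerprints; sys.argv[5] and sys.argv[6] are the test_path and result_path named in the usage string.

        # Sketch of a likely continuation (not taken from the original project)
        for line in ins.readlines():
            doc_list.append(line.strip())
    # For each test document, find the closest stored fingerprint
    with open(sys.argv[5], 'r') as tests, open(sys.argv[6], 'w') as outs:
        for line in tests.readlines():
            tokens = jt.tokens(line.strip().decode('utf8'))
            fingerprint = smb.sim_hash(fb.compute(tokens))
            min_sim, min_docid = 64, 0
            for docid, known_fp in enumerate(fingerprint_list):
                dist = hamming_distance(fingerprint, known_fp)
                if dist < min_sim:
                    min_sim, min_docid = dist, docid
            outs.write('%d\t%s\n' % (min_sim, doc_list[min_docid]))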
Example #6
File: launch.py  Project: TPLink32/nlp
import sys
from tokens import JiebaTokenizer
from simhash_imp import SimhashBuilder, hamming_distance
from features import FeatureBuilder

if __name__ == "__main__":
    if len(sys.argv) < 7:
        print "Usage:\tlaunch.py word_dict_path stop_words_path fingerprint_path documents_path test_path result_path"
        exit(-1)
    # Load word list
    word_list = []
    with open(sys.argv[1], 'r') as ins:
        for line in ins.readlines():
            word_list.append(line.split()[1])
    # Init tokenizer
    jt = JiebaTokenizer(sys.argv[2], 'c')
    # Init feature_builder
    word_dict = {}
    for idx, ascword in enumerate(word_list):
        word_dict[ascword.decode('utf8')] = idx
    fb = FeatureBuilder(word_dict)
    # Init simhash_builder
    smb = SimhashBuilder(word_list)
    # Load fingerprint list
    fingerprint_list = []
    with open(sys.argv[3], 'r') as ins:
        for line in ins.readlines():
            fingerprint_list.append(int(line))
    # For exp: load document content
    doc_list = []
    with open(sys.argv[4], 'r') as ins:
Example #7
        feature_vec = self.fb.compute(token_list)
        return feature_vec, self.smb.sim_hash(feature_vec)


"""
    def __del__(self):
        with open(self.word_dict_path, 'w') as outs:
            for idx, word in enumerate(self.word_list):
                outs.write('%s\t%s%s'%(idx, word, os.linesep))
"""
if __name__ == "__main__":
    if len(sys.argv) < 7:
        print "Usage:\tlaunch_inc.py <word_dict_path> <stop_words_path> <fingerprint_path> <documents_path> <test_path> <result_path>"
        exit(-1)
    # Init tokenizer
    jt = JiebaTokenizer(sys.argv[2], "c")
    # Init feature_builder and simhash_builder
    fc = FeatureContainer(sys.argv[1])
    # Load fingerprint list
    fingerprint_list = []
    with open(sys.argv[3], "r") as ins:
        for line in ins.readlines():
            fingerprint_list.append(int(line))
    # For exp: load document content
    doc_list = []
    with open(sys.argv[4], "r") as ins:
        for line in ins.readlines():
            doc_list.append(line.strip())
    # Detection process begins
    min_sim = 64
    min_docid = 0
Example #8
if __name__ == "__main__":
    if len(sys.argv) < 7:
        print "Usage:\tisSimilar.py <doc1> <doc2> <stopword_path> <word_dict> <-c/-s> <threshold>"
        exit(-1)
    doc_path_1, doc_path_2, stopword_path, word_dict, mode, threshold = sys.argv[1:]
    print 'Arguments:', sys.argv[1:]
    with open(doc_path_1) as ins:
        doc_data_1 = ins.read().decode('utf8')
        print 'Loaded', doc_path_1
    with open(doc_path_2) as ins:
        doc_data_2 = ins.read().decode('utf8')
        print 'Loaded', doc_path_2

    # Init tokenizer
    jt = JiebaTokenizer(stopword_path, 'c')

    # Tokenization
    doc_token_1 = jt.tokens(doc_data_1)
    doc_token_2 = jt.tokens(doc_data_2)

    print 'Loading word dict...'
    # Load word list from word_dict
    word_list = []
    with open(word_dict, 'r') as ins:
        for line in ins.readlines():
            word_list.append(line.split()[1])

    # Build unicode string word dict
    word_dict = {}
    for idx, ascword in enumerate(word_list):
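The excerpt ends while the unicode word dict is being filled in. A sketch of the remaining comparison follows, assuming the same imports and signatures as Examples #5 and #6, with -s comparing simhash fingerprints via hamming_distance against the integer threshold; the -c branch is omitted because its metric is not visible in the excerpt.

        # Sketch of a likely continuation (not taken from the original project)
        # (assumes: from simhash_imp import SimhashBuilder, hamming_distance, as in Examples #5/#6)
        word_dict[ascword.decode('utf8')] = idx

    # Feature vectors for both documents
    fb = FeatureBuilder(word_dict)
    doc_feat_1 = fb.compute(doc_token_1)
    doc_feat_2 = fb.compute(doc_token_2)

    if mode == '-s':
        # Simhash mode: compare 64-bit fingerprints against the integer threshold
        smb = SimhashBuilder(word_list)
        distance = hamming_distance(smb.sim_hash(doc_feat_1), smb.sim_hash(doc_feat_2))
        print 'Hamming distance:', distance
        print 'Similar' if distance <= int(threshold) else 'Not similar'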