Example #1
def preProcessingData(filename):
    loadData(filename)
    jt_time = time.time()
    global jt
    jt = JiebaTokenizer(stopwords_path, 'c')
    end_jt_time = time.time()
    print('JiebaTokenizer time: %s' % str(end_jt_time - jt_time))
    # Build the word list and word dict (word-vector vocabulary) from all labeled data
    wordList, wordDict = buildWords(jt, labelContents)
    end_build_time = time.time()
    print('buildWords time: %s' % str(end_build_time - end_jt_time))
    # Generate feature vectors (construct the FeatureBuilder)
    global fb
    fb = FeatureBuilder(wordDict)
    end_fb_build_time = time.time()
    print('FeatureBuilder time: %s' % str(end_fb_build_time - end_build_time))
    # Generate fingerprints (construct the SimhashBuilder)
    global smb
    smb = SimhashBuilder(wordList)
    end_smb_build_time = time.time()
    print('SimhashBuilder time: %s' %
          str(end_smb_build_time - end_fb_build_time))
    # Generate feature vectors for all labeled data
    for flowId, processLabelDataMap in processFlowMap.items():
        processFlowMap[flowId] = generateDocFeatureVector(
            processLabelDataMap, jt, fb, smb)
    end_docFV_time = time.time()
    print('generateDocFeatureVector time: %s' %
          str(end_docFV_time - end_smb_build_time))
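This snippet relies on a generateDocFeatureVector helper that is not shown. Below is a minimal sketch of what such a step could look like, assuming each entry of processLabelDataMap keeps its raw text under a 'content' key; the key names and the DocFeatLoader wrapper (borrowed from Example #4 below) are assumptions, not the project's actual implementation.

# Hypothetical sketch; the 'content' / 'feature' / 'doc_feat' keys are assumptions.
def generateDocFeatureVector(processLabelDataMap, jt, fb, smb):
    for labelId, labelData in processLabelDataMap.items():
        tokens = jt.tokens(labelData['content'])    # tokenize the raw labeled text
        feature = fb.compute(tokens)                # sparse (idx, value) pairs, value > 0
        labelData['feature'] = feature
        labelData['doc_feat'] = DocFeatLoader(smb, feature)  # fingerprint wrapper, as in Example #4
    return processLabelDataMap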
Example #2
 def __init__(self, word_dict_path):
     # Load word list
     self.word_dict_path = word_dict_path
     self.word_list = []
     with open(word_dict_path, 'r') as ins:
         for line in ins.readlines():
             self.word_list.append(line.split()[1])
     self.word_dict = {}
     for idx, ascword in enumerate(self.word_list):
         self.word_dict[ascword.decode('utf8')] = idx
     self.fb = FeatureBuilder(self.word_dict)
     self.smb = SimhashBuilder(self.word_list)
     print 'Loaded ', len(self.word_list), 'words'
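The loader only uses the second whitespace-separated column of each line, so the word-dict file presumably holds one "<index-or-count> <word>" entry per line; the meaning of the first column is not shown in these examples. A small sketch of writing and re-loading such a file under that assumption (the file name and sample words are made up):

# -*- coding: utf-8 -*-
# Hypothetical word-dict file: the word sits in the second column of every line.
with open('word.dict', 'w') as outs:
    outs.write('0 中国\n1 北京\n2 天气\n')

word_list = []
with open('word.dict', 'r') as ins:
    for line in ins.readlines():
        word_list.append(line.split()[1])
word_dict = dict((word.decode('utf8'), idx) for idx, word in enumerate(word_list))
print word_dict  # {u'中国': 0, u'北京': 1, u'天气': 2}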
Example #3
    def __init__(self, word_dict, keyword_dict=None):
        # Build the word list and word dict from the passed-in mapping
        self.word_list = []
        self.word_dict = {}
        # Re-index words in descending order of their original index
        pairs = [(value, key) for key, value in word_dict.items()]
        pairs = sorted(pairs, reverse=True)
        for idx, (value, key) in enumerate(pairs):
            self.word_list.append(key)
            self.word_dict[key.decode('utf8')] = idx

        self.fb = FeatureBuilder(self.word_dict, keyword_dict)
        self.smb = SimhashBuilder(self.word_list)
        self.mnb = MinhashBuilder()
        print 'FeatureContainer OK'
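A minimal usage sketch of this constructor (the class name FeatureContainer is taken from the print statement), assuming word_dict maps utf-8 encoded words to integer indices as in Example #2; the sample words are made up.

# -*- coding: utf-8 -*-
# Hypothetical usage; FeatureBuilder / SimhashBuilder / MinhashBuilder must be importable.
word_dict = {'中国': 0, '北京': 1, '天气': 2}
fc = FeatureContainer(word_dict)
print fc.word_list  # words re-indexed in descending order of their original index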
Example #4
    # Initialize the tokenizer; this mainly loads the stopword list
    jt = JiebaTokenizer(stopword_path, 'c')

    # Tokenize: tokens() returns the list of tokens after segmentation
    doc_token_1 = jt.tokens(doc_data_1)
    print 'Loading word dict...'
    # Load the word file and build the word dict
    word_list = []
    with open(word_dict, 'r') as ins:
        for line in ins.readlines():
            word_list.append(line.split()[1])
    word_dict = {}  # note: rebinds word_dict (the file path above) to the word -> index mapping
    for idx, ascword in enumerate(word_list):
        word_dict[ascword.decode('utf8')] = idx
    # Build the non-zero feature vector
    fb = FeatureBuilder(word_dict)
    # compute() returns feature_nonzero: a sparse vector of (idx, value) pairs with value > 0
    doc_feat_1 = fb.compute(doc_token_1)
    # Give every word in the dictionary a hash value
    smb = SimhashBuilder(word_list)
    doc_fl_1 = DocFeatLoader(smb, doc_feat_1)
    # Test output file, used to evaluate the algorithm
    out_file = open('/home/lin.xiong/lsh_data/out.file', 'w')
    #fp_set = set()
    fp_arr = []
    fp_post_id_dict = {}
    with open('/home/lin.xiong/lsh_data/lsh_clear.fingerprint', 'r') as fp:
        for line in fp:
            fields = line.split('\t')  # <post_id>\t<fingerprint>
            fingerprint = long(fields[1])
            fp_post_id_dict[fingerprint] = fields[0]
            fp_arr.append(fingerprint)
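A possible follow-up to the fingerprint loading above: compare the loaded fingerprints pairwise by Hamming distance and write suspected near-duplicate post id pairs to out_file. The 3-bit threshold and the full pairwise scan are illustrative assumptions, not taken from the project (the lsh_* file names suggest it actually buckets fingerprints with LSH instead of scanning every pair).

    # Hypothetical follow-up; the threshold and the pairwise scan are assumptions.
    def hamming_distance(fp1, fp2):
        return bin(fp1 ^ fp2).count('1')

    for i in range(len(fp_arr)):
        for j in range(i + 1, len(fp_arr)):
            if hamming_distance(fp_arr[i], fp_arr[j]) <= 3:
                out_file.write('%s\t%s\n' % (fp_post_id_dict[fp_arr[i]],
                                             fp_post_id_dict[fp_arr[j]]))
    out_file.close()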