Example #1
        for line in fp:
            # Each line is "<post_id>\t<fingerprint>"; index in both directions.
            fields = line.split('\t')
            fp_post_id_dict[long(fields[1])] = fields[0]
            fp_arr.append(long(fields[1]))
    comment = []
    with open('/home/lin.xiong/lsh_data/lsh.data', 'r') as comment_file:
        for line in comment_file:
            # The comment text is the second '$&&$'-separated field.
            comment.append(line.strip().split('$&&$')[1])
    # Map each fingerprint to its comment text (the two files are line-aligned).
    fp_comment_tup = zip(fp_arr, comment)
    fp_comment_dict = dict(fp_comment_tup)
    if mode == '-s':
        print 'Matching by Simhash + hamming distance'
        #----------------------------------------------------------------------
        # Brute-force pass: Hamming distance from the query fingerprint to every
        # stored fingerprint, timed in milliseconds.
        tmp_dic = {}
        start_millis = int(round(time.time() * 1000))
        for fp in fp_arr:
            dist = hamming_distance(doc_fl_1.fingerprint, fp)
            tmp_dic[fp] = dist
        end_millis = int(round(time.time() * 1000))
        print end_millis - start_millis
        #------------------------------------------------------------------------
        # Sort candidates by ascending Hamming distance and inspect the top 100.
        dict_sorted = sorted(tmp_dic.items(), key=lambda d: d[1])
        concat = 0
        for fp_dist_tup in dict_sorted:
            if concat <= 99:
                # Compare the two fingerprints bit by bit; pad to 64 bits so the
                # positions line up (bin() alone yields strings of unequal length).
                bin_doc_1 = list(bin(doc_fl_1.fingerprint)[2:].zfill(64))
                print len(bin_doc_1),
                bin_doc_2 = list(bin(fp_dist_tup[0])[2:].zfill(64))
                print len(bin_doc_2),
                bin_zip = zip(bin_doc_1, bin_doc_2)
                cnt = 0
                for bin1_bin2_tup in bin_zip:
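
This example, like the ones below, calls a hamming_distance helper that is not shown in the snippet. A minimal sketch of what it presumably does, assuming the fingerprints are plain Python integers and the optional third argument is the bit width (parameter names here are hypothetical):

    def hamming_distance(fp_a, fp_b, bits=64):
        # XOR leaves a 1 bit at every position where the fingerprints differ,
        # so the distance is simply the popcount of the XOR.
        x = (fp_a ^ fp_b) & ((1 << bits) - 1)
        dist = 0
        while x:
            dist += 1
            x &= x - 1  # clear the lowest set bit
        return dist

With 64-bit simhash fingerprints, a distance of 0 means the fingerprints are identical and 64 means every bit differs.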

Example #3
    # Build unicode string word dict
    word_dict = {}
    for idx, ascword in enumerate(word_list):
        word_dict[ascword.decode('utf8')] = idx
    # Build nonzero feature vectors for both documents
    fb = FeatureBuilder(word_dict)
    doc_feat_1 = fb.compute(doc_token_1)
    doc_feat_2 = fb.compute(doc_token_2)

    # Init simhash_builder
    smb = SimhashBuilder(word_list)

    doc_fl_1 = DocFeatLoader(smb, doc_feat_1)
    doc_fl_2 = DocFeatLoader(smb, doc_feat_2)

    if mode == '-c':
        print 'Matching by VSM + cosine distance'
        # Higher cosine value means the two documents are more similar.
        dist = cosine_distance_nonzero(doc_fl_1.feat_vec, doc_fl_2.feat_vec, norm=False)
        if dist > float(threshold):
            print 'Matching Result:\t<True:%s>' % dist
        else:
            print 'Matching Result:\t<False:%s>' % dist
    elif mode == '-s':
        print 'Matching by Simhash + hamming distance'
        # Lower Hamming distance between fingerprints means the documents are more similar.
        dist = hamming_distance(doc_fl_1.fingerprint, doc_fl_2.fingerprint)
        if dist < float(threshold):
            print 'Matching Result:\t<True:%s>' % dist
        else:
            print 'Matching Result:\t<False:%s>' % dist
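
The '-c' branch relies on cosine_distance_nonzero, which is also not shown; judging by the comparison dist > float(threshold), it returns a cosine similarity (higher means more alike) despite the "distance" name. A rough sketch under the assumption that each feat_vec is a list of (term_index, weight) pairs sorted by term_index and that norm=False means the vectors still need to be normalised here:

    import math

    def cosine_distance_nonzero(vec_a, vec_b, norm=True):
        # Merge-style dot product over two sparse vectors sorted by term index.
        dot, i, j = 0.0, 0, 0
        while i < len(vec_a) and j < len(vec_b):
            idx_a, w_a = vec_a[i]
            idx_b, w_b = vec_b[j]
            if idx_a == idx_b:
                dot += w_a * w_b
                i += 1
                j += 1
            elif idx_a < idx_b:
                i += 1
            else:
                j += 1
        if not norm:
            # Inputs are not unit-length yet, so divide by both norms here.
            norm_a = math.sqrt(sum(w * w for _, w in vec_a))
            norm_b = math.sqrt(sum(w * w for _, w in vec_b))
            if norm_a == 0.0 or norm_b == 0.0:
                return 0.0
            dot /= (norm_a * norm_b)
        return dot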
Example #4
File: launch.py  Project: TPLink32/nlp
            doc_list.append(line.strip())
    # Detection process begins
    min_sim = 64
    min_docid = 0
    with open(sys.argv[5], 'r') as ins:
        for lineidx, line in enumerate(ins.readlines()):
            # Debug filter: only the input line at index 642 is processed.
            if lineidx != 642:
                continue
            # Tokenize
            tokens = jt.tokens(line.strip().decode('utf8'))
            # Compute text feature
            feature = fb.compute(tokens)
            # Compute simhash
            fingerprint = smb.sim_hash(feature)
            # Hamming distance from this line's fingerprint to every indexed document.
            result_list = []
            for idx, fp in enumerate(fingerprint_list):
                sim = hamming_distance(fingerprint, fp, 64)
                result_list.append((sim, idx))
            # Sort candidates by ascending distance (closest match first).
            result_list = sorted(result_list, key=lambda x: x[0])
            if result_list[0][0] < min_sim:
                # Remember the input line whose best match is the closest seen so far.
                min_sim, min_docid = result_list[0][0], lineidx
            #'''
            # Dump the query line followed by all candidates, closest first.
            with open(sys.argv[6], 'w') as outs:
                outs.write(line.strip() + os.linesep)
                for sim, idx in result_list:
                    outs.write('%s\t%s%s' % (sim, doc_list[idx], os.linesep))
            #'''
            #if lineidx == 2:
            #    break
    print min_sim, min_docid
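
smb.sim_hash(feature) follows the standard simhash construction. A sketch of the usual 64-bit variant, assuming feature is a list of (word_index, weight) pairs and that SimhashBuilder keeps a precomputed 64-bit hash per vocabulary word (word_hashes below is a hypothetical name for that table):

    def sim_hash(feature, word_hashes, bits=64):
        # Weighted bit voting: each word's hash adds its weight where the hash
        # bit is 1 and subtracts it where the bit is 0.
        v = [0.0] * bits
        for word_idx, weight in feature:
            h = word_hashes[word_idx]
            for i in range(bits):
                if (h >> i) & 1:
                    v[i] += weight
                else:
                    v[i] -= weight
        # The fingerprint keeps a 1 bit wherever the accumulated vote is positive.
        fingerprint = 0
        for i in range(bits):
            if v[i] > 0:
                fingerprint |= (1 << i)
        return fingerprint

Fingerprints built this way differ in only a few bit positions when the underlying feature vectors are similar, which is why the examples above rank candidates by Hamming distance.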