Example #1
from datasketch import MinHash

def xiaoming(lines, length_of_f, i):
    first_user = lines[i]
    a = first_user.split()
    user_ID = a[0] + '    ' + '\n'
    print(a[1])
    hash_value = [0 for i in range(0, 51)]  # list for the values that get written to file
    sort_value = [0 for i in range(0, 51)]  # list holding the top similarities as they are updated
    # Create a per-user file that stores this user's MinHash similarity to every other user
    # f_hash = open('user_minhash/' + a[0] + '.txt', mode='w', encoding='utf-8')
    # f_hash.write(a[0] + '    ' + '\n')  # write this user's ID
    for j in range(length_of_f - 1):
        if (i == j):
            continue
        else:
            second_user = lines[j]

            b = second_user.split()
            data1 = a[2].split(',')
            data2 = b[2].split(',')

            # If the two lists have too few elements in common, discard the pair
            length_of_data1_and_data2 = len(data1) + len(data2)
            # deduplicate
            both_data1_2 = list(set(data1 + data2))

            if ((length_of_data1_and_data2 - len(both_data1_2)) > 20):
                m1, m2 = MinHash(), MinHash()
                for d in data1:
                    m1.update(d.encode('utf8'))
                for d in data2:
                    m2.update(d.encode('utf8'))
                #print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))
                # 0.0078125 == 1/128: with the default 128 permutations, at most one hash matched
                if (m1.jaccard(m2) <= 0.0078125):
                    continue
                else:
                    c = [b[0], m1.jaccard(m2)]
                    # insert c into the top-50 list, keeping it ordered by similarity
                    for m in range(49, -1, -1):
                        if (sort_value[m] == 0):
                            sort_value[m] = c
                            break
                        elif (c[1] > sort_value[m][1]):
                            sort_value[m + 1] = sort_value[m]
                            sort_value[m] = c
                        else:
                            break
            else:
                continue
                #new_value = b[0] + '    ' +str(m1.jaccard(m2)) + '\n'
                #hash_value.append(new_value)

    return sort_value
Example #2
class DataPoint:
    """
  A class that handles a hash example.
  """
    def __init__(self, string, index, only_string=True):
        """
    Params:
      :string:  String to be stored.
      :index: Number of the line in the file from which this sentence was read.
      :only_string: Whether to only store string.
    """
        self.string = string.strip("\n")
        self.index = index
        self.character_level = DATA_FILTERING["character_level"]
        self.cluster_index = 0

        if not only_string:
            self.init_hash()

    # Initialize hash from string.
    def init_hash(self):
        self.min_hash = MinHash(num_perm=DATA_FILTERING["num_permutations"])
        for word in self.string.split():
            if self.character_level:
                for char in word:
                    self.min_hash.update(char.encode('utf8'))
            else:
                self.min_hash.update(word.encode('utf8'))

    # Computes the Jaccard similarity between self and another hash.
    def similarity(self, other, dist_matrix=""):
        return self.min_hash.jaccard(other.min_hash)
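DataPoint reads a module-level DATA_FILTERING config and the datasketch MinHash class. A minimal usage sketch, assuming plausible config values (both keys appear above; the values here are assumptions):

from datasketch import MinHash

DATA_FILTERING = {"character_level": False, "num_permutations": 128}  # assumed values

p1 = DataPoint("minhash is a probabilistic data structure", 0, only_string=False)
p2 = DataPoint("minhash is a probability data structure", 1, only_string=False)
print(p1.similarity(p2))  # estimated Jaccard similarity of the two sentences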
Example #3
import itertools

import numpy as np
from datasketch import MinHash
from nltk.util import ngrams  # assuming NLTK's ngrams helper

# df1 and df2 are assumed to be PySpark DataFrames (dtypes yields (name, dtype) pairs)
def multi_minhash(df1, num1, df2, num2):
    c_names1 = []
    c_names2 = []
    for name, dtype in df1.dtypes:
        if dtype == "string":
            c_names1.append(name)
    for name, dtype in df2.dtypes:
        if dtype == "string":
            c_names2.append(name)
    for col1, col2 in itertools.product(c_names1, c_names2):
        m1, m2 = MinHash(), MinHash()
        count1 = int(np.sqrt(num1)) * 100
        count2 = int(np.sqrt(num2)) * 100
        data1 = df1.select(col1).rdd.flatMap(lambda x: x).takeSample(
            False, count1)
        data2 = df2.select(col2).rdd.flatMap(lambda x: x).takeSample(
            False, count2)
        for d in data1:
            for i in ngrams(d, 4):
                m1.update(''.join(i).encode('utf-8'))
        for d in data2:
            for i in ngrams(d, 4):
                m2.update(''.join(i).encode('utf-8'))
        print("MinHash Similarity for {} and {} is {}".format(
            col1, col2, m1.jaccard(m2)))
Example #4
    def _hello_world():
        """
        This fragment was taken from the datasketch github page:
        https://github.com/ekzhu/datasketch
        """
        data1 = [
            'minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
            'estimating', 'the', 'similarity', 'between', 'datasets'
        ]
        data2 = [
            'minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
            'estimating', 'the', 'similarity', 'between', 'documents'
        ]

        m1, m2 = MinHash(), MinHash()
        for d in data1:
            m1.update(d.encode('utf8'))
        for d in data2:
            m2.update(d.encode('utf8'))
        print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

        s1 = set(data1)
        s2 = set(data2)
        actual_jaccard = float(len(s1.intersection(s2))) / float(
            len(s1.union(s2)))
        print("Actual Jaccard for data1 and data2 is", actual_jaccard)
Example #5
import math

from datasketch import MinHash

# test_source_path, test_responses_path and output are module-level globals in the original project
def similarity():
  source_file = open(test_source_path)
  target_file = open(test_responses_path)
  source_list = [line.strip("\n") for line in source_file]
  target_list = [line.strip("\n") for line in target_file]
  similarities = []

  for source, target in zip(source_list, target_list):
    source_hash = MinHash(num_perm=256)
    for word in source.split():
      source_hash.update(word.encode('utf8'))

    target_hash = MinHash(num_perm=256)
    for word in target.split():
      target_hash.update(word.encode('utf8'))

    similarities.append(source_hash.jaccard(target_hash))

  avg_similarity = sum(similarities) / len(similarities)
  sim_std = math.sqrt(
      sum([(x - avg_similarity) ** 2 for x in similarities]) /
      (len(similarities) - 1))

  sim = "average similarity: " + str(avg_similarity) + " (%f)" % (sim_std)
  print(sim)
  output.write(sim + "\n")

  source_file.close()
  target_file.close()
Example #6
from datasketch import MinHash, MinHashLSHForest

def mylshforest(corpus):
    #print(len(corpus))
    forest = MinHashLSHForest(num_perm=32)
    score_res = [0]
    mh = []
    for i in range(len(corpus) - 1):
        doc = corpus[i]
        doc2 = corpus[i + 1]
        m = MinHash(num_perm=32)
        for d in doc:
            m.update(d.encode('utf8'))
        forest.add(str(i), m)
        forest.index()
        mh.append(m)

        m2 = MinHash(num_perm=32)
        for d in doc2:
            m2.update(d.encode('utf8'))
        result = forest.query(m2, 10)
        score = 0.0
        for j in range(len(result)):
            score = score + m2.jaccard(mh[int(result[j])])
        if (len(result) > 0):
            score = score / len(result)
        score_res.append(score)
    return score_res
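A minimal usage sketch (the toy corpus below is an assumption; the expected input is one list of tokens per document). Each score approximates how similar document i+1 is to its top-10 LSH Forest matches among documents 0..i:

corpus = [
    ['minhash', 'lsh', 'forest', 'query'],
    ['minhash', 'lsh', 'forest', 'index'],
    ['completely', 'different', 'tokens'],
]
print(mylshforest(corpus))  # first entry is the seed value 0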
Example #7
 def estimate_jaccard_similarity_using_minhashing(self, first_doc, second_doc, permutations=128):
     h1 = MinHash(num_perm=permutations)
     h2 = MinHash(num_perm=permutations)
     for word in self.documents.get(first_doc):
         h1.update(get_word(word_id=word).encode('utf-8'))
     for word in self.documents.get(second_doc):
         h2.update(get_word(word_id=word).encode('utf-8'))
     return h1.jaccard(h2)
Example #8
from datasketch import MinHash

def compare_with_minhash(set1, set2):
    mh1, mh2 = MinHash(), MinHash()
    for el in set1:
        mh1.update(el.encode('utf8'))
    for el in set2:
        mh2.update(el.encode('utf8'))

    return mh1.jaccard(mh2)
Example #9
from datasketch import MinHash

def locSensitiveHashingFunction(s1, s2, tresh=.5):
    # built-in set (the Python 2 sets.Set class no longer exists)
    set_1, set_2 = set(s1.split(' ')), set(s2.split(' '))
    m1, m2 = MinHash(), MinHash()
    for d in set_1:
        m1.update(d.encode('utf-8'))
    for d in set_2:
        m2.update(d.encode('utf-8'))
    return 1 if m1.jaccard(m2) > tresh else 0
Example #10
def get_result(question_list, maskquestion_dict):
    """
    :param question_list: 需要处理的未知问题
    :param maskquestion_dict: 已知的问题,最开始没有已知问题,随着时间推移不断加入已知问题,
    maskquestion_dict是一个字典类型,键为问题id,类型为字符串,值为问题
    :return:
    """
    current_max_question_id = 0
    for key in maskquestion_dict.keys():
        value = maskquestion_dict[key]
        mask_question = set(list(jieba.cut(value)))
        intersection_set = mask_question & stop_words
        mask_question = mask_question - intersection_set
        m = MinHash(num_perm=1024)
        for d in mask_question:
            m.update(d.encode("utf8"))
        lsh.insert(str(key), m)
        current_max_question_id = max(int(key), current_max_question_id)

    new_mask_question_dict = {}
    result_list = []
    for question in question_list:
        question_set = set(list(jieba.cut(question)))
        intersection_set = question_set & stop_words
        question_set = question_set - intersection_set
        m = MinHash(num_perm=1024)
        for d in question_set:
            m.update(d.encode("utf8"))
        result = lsh.query(m)
        if len(result) == 0:
            current_max_question_id += 1
            new_mask_question_dict[str(current_max_question_id)] = question
            result_list.append([str(current_max_question_id), question])
            """
            需要进行插入操作,将str(current_max_question_id), question插入到标问表中,如果数据库插入失败,直接返回失败状态,
            同时将new_mask_question_dict, result_list分别置为{},【】
            """
            lsh.insert(str(current_max_question_id), m)
        elif len(result) == 1:
            result_list.append([str(result[0]), question])
        else:
            select_max_similarity_question_id = ''
            jaccard_score = 0
            for key in result:
                value = maskquestion_dict[key] if key in maskquestion_dict else new_mask_question_dict[key]
                value_set = set(list(jieba.cut(value)))
                intersection_set = value_set & stop_words
                value_set = value_set - intersection_set
                temp_m = MinHash(num_perm=1024)
                for d in value_set:
                    temp_m.update(d.encode("utf8"))
                temp_score = m.jaccard(temp_m)
                if temp_score > jaccard_score:
                    jaccard_score = temp_score
                    select_max_similarity_question_id = key
            result_list.append([select_max_similarity_question_id, question])
    return new_mask_question_dict, result_list
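get_result depends on module-level lsh and stop_words objects that the example does not show. A minimal sketch of how they might be initialized (the threshold and the stop-word file name are assumptions):

import jieba
from datasketch import MinHash, MinHashLSH

# num_perm must match the MinHash(num_perm=1024) objects built in get_result
lsh = MinHashLSH(threshold=0.5, num_perm=1024)
with open("stopwords.txt", encoding="utf-8") as f:  # assumed file name
    stop_words = set(line.strip() for line in f)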
Example #11
 def distance(self, s1, s2):
     word_list1 = self.get_features(s1)
     word_list2 = self.get_features(s2)
     m1, m2 = MinHash(), MinHash()
     for d in word_list1:
         m1.update(d.encode('utf8'))
     for d in word_list2:
         m2.update(d.encode('utf8'))
     return m1.jaccard(m2)
Example #12
File: MinHash.py  Project: hokaii/CodeShow
from datasketch import MinHash

def calculate_jaccard(text1, text2):  # compute the Jaccard similarity of two lines of text
    minihash1, minihash2 = MinHash(), MinHash()

    for word in text1:
        minihash1.update(word.encode('utf-8'))

    for word in text2:
        minihash2.update(word.encode('utf-8'))

    return minihash1.jaccard(minihash2)
Example #13
import hashlib
import time

from datasketch import MinHash

# page_fetcher is a helper module from the original project
def main():

    rcode_1, page_1 = page_fetcher.fetch_page(
        "http://www.e-prostor.gov.si/zbirke-prostorskih-podatkov/zbirka-vrednotenja-nepremicnin/"
    )
    rcode_2, page_2 = page_fetcher.fetch_page(
        "http://www.e-prostor.gov.si/dostop-do-podatkov/dostop-do-podatkov/")

    # Note: these timers measure MinHash construction only; the update() calls below are not timed
    start_time = time.time()
    m1_128, m2_128 = MinHash(), MinHash()
    print("128 perm hash time: ", time.time() - start_time)

    start_time = time.time()
    m1, m2 = MinHash(num_perm=256), MinHash(num_perm=256)
    print("256 perm hash time: ", time.time() - start_time)

    # if page_1 is a str, this iterates characters, so the MinHash is over the character set
    for d in page_1:
        m1_128.update(d.encode('utf8'))
        m1.update(d.encode('utf8'))
    for d in page_2:
        m2_128.update(d.encode('utf8'))
        m2.update(d.encode('utf8'))

    print("Estimated Jaccard for page_1 and page_2 is", m1_128.jaccard(m2_128))
    print("Estimated Jaccard for page_1 and page_2 is", m1.jaccard(m2))

    s1 = set(page_1)
    s2 = set(page_2)
    actual_jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
    print("Actual Jaccard for page_1 and page_2 is", actual_jaccard)

    sha1 = hashlib.sha1(page_1.encode('utf-8'))
    print(sha1.hexdigest())

    sha2 = hashlib.sha1(page_2.encode('utf-8'))
    print(sha2.hexdigest())

    sha1 = hashlib.sha1("That's what she said.".encode('utf-8'))
    print(sha1.hexdigest())
    sha2 = hashlib.sha1("~Michael Scott".encode('utf-8'))
    print(sha2.hexdigest())
    sha3 = hashlib.sha1("That's what she said.".encode('utf-8'))
    print(sha3.hexdigest())
Example #14
from datasketch import MinHash

# models is the original project's Django models module
def DIDsampling(dataset,BF,username,userid,attra_id,beta,clustdict):
    # dataset = Cora_labeled.objects.all()
    # clustdict = dextrapreclustering.minhashPreClustering(dataset)
    cluster_membership = {}

    # values = models.sigirCoraAttrValue.objects.filter(attr_id=attra_id)
    # attrasynonyms = models.sigirCoraValueSynonym.objects.filter(value_id__in=[ value.id for value in values])
    # record_hasAttra = models.sigirCoraToAttrEntity.objects.filter(user=username,attrsynonym_id__in=[ syn.id for syn in attrasynonyms])
    record_hasAttra = models.sigirCoraToAttrEntity.objects.filter(user=username,attrsynonym__value__attr_id=attra_id)
    record_noAttra = dataset.exclude(id__in = [ item.cora_id for item in record_hasAttra])

    for k, v in clustdict.items():
        for d in v:
            cluster_membership[d] = k

    total = 0.000001  # seed avoids division by zero; renamed so the built-in sum() is not shadowed
    for record in BF:
        total = total + len(clustdict[cluster_membership[record.id]])
    for record in BF:
        # AC
        cora2ae = models.sigirCoraToAttrEntity.objects.filter(cora_id=record.id,user=username)
        if cora2ae:
            attr_list = [ item.attrsynonym.value.attr.id for item in cora2ae]
            if attra_id in attr_list:
                record.orderscore = 0
                record.save()
                continue
            else:
                ac = 1-len(attr_list)/models.sigirCoraAttr.objects.filter(userid=userid).count()
        else:
            ac = 1

        # distribution on dataset
        k = cluster_membership[record.id]
        ic = len(clustdict[k])/record_noAttra.count()
        record_minhash = MinHash(num_perm=128)
        s = set(record.cleantext.split(" "))
        for d in s:
            record_minhash.update(d.encode('utf8'))
        term2sum = 0
        for rr in BF:
            rr_minhash =  MinHash(num_perm=128)
            ss = set(rr.cleantext.split(" "))
            for dd in ss:
                rr_minhash.update(dd.encode('utf8'))
            sim = record_minhash.jaccard(rr_minhash)
            sim = (sim/total)**beta
            term2sum = term2sum + sim
        did = ac*ic*term2sum
        record.orderscore = did
        record.save()
    return BF
Example #15
from datasketch import MinHash

def get_jaccard_index(sequence1, sequence2, k):
    seq1_minHash, seq2_minHash = MinHash(), MinHash()
    seq1_kmers = count_kmers(sequence1, k)
    seq2_kmers = count_kmers(sequence2, k)
    seq1_keys = list(seq1_kmers.keys())
    seq2_keys = list(seq2_kmers.keys())
    for key in seq1_keys:
        seq1_minHash.update(key.encode('utf8'))
    for key in seq2_keys:
        seq2_minHash.update(key.encode('utf8'))
    return seq1_minHash.jaccard(seq2_minHash)
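count_kmers is not shown above; a plausible stand-in that maps each k-length substring of a sequence to its occurrence count (the body is an assumption, and only the keys are used above):

def count_kmers(sequence, k):
    # count every k-length substring (k-mer) of the sequence
    kmers = {}
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i + k]
        kmers[kmer] = kmers.get(kmer, 0) + 1
    return kmers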
Example #16
from datasketch import MinHash

# extract_keywords is a project-specific helper that returns a list of keyword strings
def minhash(x, y):
    m1, m2 = MinHash(), MinHash()

    s1 = extract_keywords(x)
    s2 = extract_keywords(y)

    for data in s1:
        m1.update(data.encode('utf8'))
    for data in s2:
        m2.update(data.encode('utf8'))

    return m1.jaccard(m2)
Example #17
 def are_similar(self, logs):
     error_messages = {}
     for log in logs:
         error_message = ''
         for log_line in log['log_lines']:
             if 'ERROR' in log_line:
                 m = re.match(
                     r'^\S+\s+\S+\s+\S+\s\S+\s+\S+\s+(?P<log_content>.*)$',
                     log_line)
                 if m:
                     content = m.group('log_content')
                     if not error_message:
                         error_message += content
                     else:
                         error_message += ' ' + content
         if error_message:
             if self.service_name(log['service']) not in error_messages:
                 error_messages[self.service_name(log['service'])] = []
             error_messages[self.service_name(
                 log['service'])].append(error_message)
     error_messages = [" ".join(x) for x in error_messages.values()]
     if error_messages:
         error_messages = [self.get_anon_string(x) for x in error_messages]
         error_messages = [x for x in error_messages if x]
         error_messages = [re.sub(r'\s+', ' ', x) for x in error_messages]
         values = []
         for i in range(len(error_messages) - 1):
             for j in range(i + 1, len(error_messages)):
                 i_tokens = nltk.word_tokenize(error_messages[i])
                 j_tokens = nltk.word_tokenize(error_messages[j])
                 m1, m2 = MinHash(), MinHash()
                 for d in i_tokens:
                     m1.update(d.encode('iso-8859-1'))
                 for d in j_tokens:
                     m2.update(d.encode('iso-8859-1'))
                 value = m1.jaccard(m2)
                 values.append(value)
         if len(values) > 1:
             u_value = max(values)
         elif len(values) == 1:
             u_value = values[0]
         else:
             u_value = 0
         print(u_value)
         if u_value and u_value < 0.2:
             print(error_messages)
             input()
         if u_value < 0.5:
             return False
     else:
         return False
     return True
Example #18
from datasketch import MinHash

def calc_of_similarity(text1, text2):
    # MinHash computation
    minhash1, minhash2 = MinHash(), MinHash()
    # extract keywords
    keywords1 = extract_keyword(text1)
    keywords2 = extract_keyword(text2)

    for data in keywords1:
        minhash1.update(data.encode('utf8'))
    for data in keywords2:
        minhash2.update(data.encode('utf8'))

    return minhash1.jaccard(minhash2)
Example #19
    def main(self):
        # remove stop words
        jieba.analyse.set_stop_words('stopwords.txt')
        # MinHash computation
        m1, m2 = MinHash(), MinHash()
        # extract keywords
        s1 = self.extract_keyword(self.s1)
        s2 = self.extract_keyword(self.s2)

        for data in s1:
            m1.update(data.encode('utf8'))
        for data in s2:
            m2.update(data.encode('utf8'))
        return m1.jaccard(m2)
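Neither this example nor Example #18 shows the extract_keyword helper; a plausible jieba-based stand-in (the topK value is an assumption):

import jieba.analyse

def extract_keyword(text, top_k=20):
    # top-K TF-IDF keywords of the text, as a list of strings
    return jieba.analyse.extract_tags(text, topK=top_k)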
Example #20
File: utils.py  Project: Xuzhiqian/Demo
from colorama import Fore
from datasketch import MinHash

# logger is a module-level logging.Logger in the original project
def jaccardMeasure(hashvalue1, hashvalue2):
    """
    利用datasketch计算两个Hash值的Jaccard相似度
    :param hashvalue1: datasketch 哈希值1
    :param hashvalue2: datasketch 哈希值2
    :return: float 两个哈希值的相似度 [0, 1]
    """
    try:
        m1 = MinHash(hashvalues=hashvalue1)
        m2 = MinHash(hashvalues=hashvalue2)
        return m1.jaccard(m2)
    except Exception as exc:
        logger.error(Fore.RED + 'MinHash failed with {0}'.format(exc))
        return 0.0
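A short usage sketch: a MinHash's hashvalues array can be stored (for example in a database) and later fed back through MinHash(hashvalues=...), which is what jaccardMeasure relies on:

from datasketch import MinHash

m1, m2 = MinHash(), MinHash()
for d in ['a', 'b', 'c']:
    m1.update(d.encode('utf8'))
for d in ['a', 'b', 'd']:
    m2.update(d.encode('utf8'))
print(jaccardMeasure(m1.hashvalues, m2.hashvalues))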
Example #21
File: mining.py  Project: wafec/javadocto
import nltk
from datasketch import MinHash

def compare_with_minhash(value_a, value_b):
    str_a = _to_str_for_distance_calculation(value_a)
    str_b = _to_str_for_distance_calculation(value_b)
    try:
        tokens_a = nltk.word_tokenize(str_a)
        tokens_b = nltk.word_tokenize(str_b)
        m1, m2 = MinHash(), MinHash()
        for d in tokens_a:
            m1.update(d.encode('utf8'))
        for d in tokens_b:
            m2.update(d.encode('utf8'))
        value = m1.jaccard(m2)
        return value
    except Exception as exc:
        print(str_a, str_b)
        raise exc
Example #22
import codecs
import json

import jieba
from datasketch import MinHash

# forest (a MinHashLSHForest) and stopword are module-level globals in the original project
def query_sim(in_dir):

    js = json.load(codecs.open(in_dir, "r"))
    line = js["content_p"]
    seg_list = jieba.cut(line, cut_all=False)
    no_list = []
    for word in seg_list:
        if word not in stopword:
            no_list.append(word)

    mh = MinHash(num_perm=128)
    for word in no_list:
        mh.update(word.encode('utf8'))

    result = forest.query(mh, 1)
    return mh.jaccard(forest[result[0]])
Example #23
File: doc.py  Project: rbramwell/textpipe
 def similarity(self, other_doc, metric='jaccard', hash_method='minhash'):
     """
     Computes similarity for two documents.
     Only minhash Jaccard similarity is implemented.
     >>> doc1 = Doc('Sentence for computing the minhash')
     >>> doc2 = Doc('Sentence for computing the similarity')
     >>> doc1.similarity(doc2)
     0.7265625
     """
     if hash_method == 'minhash' and metric == 'jaccard':
         hash1 = MinHash(hashvalues=self.minhash)
         hash2 = MinHash(hashvalues=other_doc.minhash)
         return hash1.jaccard(hash2)
     else:
         raise NotImplementedError(f'Metric/hash method combination {metric}'
                                   f'/{hash_method} is not implemented as similarity metric')
Example #24
from datasketch import MinHash

def compare_two_group(crc_list1, crc_list2):
    """
    Return the Jaccard similarity of two lists, based on MinHash.

    Args:
        crc_list1(list): a list of crc values
        crc_list2(list): another list of crc values

    Returns:
        similarity: the similarity between the two lists, range [0, 1]
    """
    m1, m2 = MinHash(num_perm=800), MinHash(num_perm=800)
    for crc in crc_list1:
        m1.update(crc.encode('utf8'))
    for crc in crc_list2:
        m2.update(crc.encode('utf8'))
    similarity = m1.jaccard(m2)
    return similarity
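A minimal usage sketch (the CRC strings are made-up values):

crcs_a = ['0x1a2b', '0x3c4d', '0x5e6f']
crcs_b = ['0x1a2b', '0x3c4d', '0x7788']
print(compare_two_group(crcs_a, crcs_b))  # estimated Jaccard in [0, 1]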
Example #25
import itertools

import numpy as np
from datasketch import MinHash
from nltk.util import ngrams  # assuming NLTK's ngrams helper

# df is assumed to be a PySpark DataFrame (dtypes yields (name, dtype) pairs)
def single_minhash(df, num):
    c_names = []
    for name, dtype in df.dtypes:
        if dtype == "string":
            c_names.append(name)
    for col1, col2 in itertools.combinations(c_names, 2):
        m1, m2 = MinHash(), MinHash()
        count = int(np.sqrt(num))
        data1 = df.select(col1).rdd.flatMap(lambda x: x).takeSample(
            False, count)
        data2 = df.select(col2).rdd.flatMap(lambda x: x).takeSample(
            False, count)
        for d in data1:
            for i in ngrams(d, 4):
                m1.update(''.join(i).encode('utf-8'))
        for d in data2:
            for i in ngrams(d, 4):
                m2.update(''.join(i).encode('utf-8'))
        print("MinHash Similarity for {} and {} is {}".format(
            col1, col2, m1.jaccard(m2)))
Example #26
    def _hello_world():
        """
        This fragment was taken from the datasketch github page:
        https://github.com/ekzhu/datasketch
        """
        data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
                'estimating', 'the', 'similarity', 'between', 'datasets']
        data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
                'estimating', 'the', 'similarity', 'between', 'documents']

        m1, m2 = MinHash(), MinHash()
        for d in data1:
            m1.update(d.encode('utf8'))
        for d in data2:
            m2.update(d.encode('utf8'))
        print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

        s1 = set(data1)
        s2 = set(data2)
        actual_jaccard = float(len(s1.intersection(s2)))/float(len(s1.union(s2)))
        print("Actual Jaccard for data1 and data2 is", actual_jaccard)
Example #27
from datasketch import MinHash

def DIDsamplingInit(dataset,BF,clustdict,beta):
    # dataset = Cora_labeled.objects.all()
    # clustdict = dextrapreclustering.minhashPreClustering(dataset)
    cluster_membership = {}

    # values = models.sigirCoraAttrValue.objects.filter(attr_id=attra_id)
    # attrasynonyms = models.sigirCoraValueSynonym.objects.filter(value_id__in=[ value.id for value in values])
    # record_hasAttra = models.sigirCoraToAttrEntity.objects.filter(user=username,attrsynonym_id__in=[ syn.id for syn in attrasynonyms])

    for k, v in clustdict.items():
        for d in v:
            cluster_membership[d] = k
    total = 0.000001  # seed avoids division by zero; renamed so the built-in sum() is not shadowed
    for record in BF:
        total = total + len(clustdict[cluster_membership[record.id]])
    for record in BF:

        # distribution on dataset
        k = cluster_membership[record.id]
        ic = len(clustdict[k])/dataset.count()
        record_minhash = MinHash(num_perm=128)
        s = set(record.cleantext.split(" "))
        for d in s:
            record_minhash.update(d.encode('utf8'))
        term2sum = 0
        for rr in BF:
            rr_minhash =  MinHash(num_perm=128)
            ss = set(rr.cleantext.split(" "))
            for dd in ss:
                rr_minhash.update(dd.encode('utf8'))
            sim = record_minhash.jaccard(rr_minhash)
            sim = (sim/total)**beta
            term2sum = term2sum + sim
        did = ic*term2sum
        record.orderscore = did
        record.save()
    return BF
Example #28
    def run(self) -> dict:
        processed_key = list()
        prepare_delete_key = list()
        # print(len(self.sim_hash_dict.keys()))
        for key in self.sim_hash_dict.keys():
            source_message = self.sim_hash_dict[key][0]['message']
            source_min_hash = MinHash(hashfunc=self._hash_func)
            content_list = source_message
            # content_list = [i for i in splitWords(source_message)]
            for i in content_list:
                source_min_hash.update(i)
            self.sim_hash_dict[key][0]['minhash'] = source_min_hash

        for key in self.sim_hash_dict.keys():

            if key in processed_key:
                continue
            # Is the smallest really placed first? [0] is the smallest hash in this set
            source_min_hash = self.sim_hash_dict[key][0]['minhash']
            processed_key.append(key)

            for sub_key in self.sim_hash_dict.keys():
                if sub_key <= key or sub_key in processed_key:
                    continue

                # find the smallest hash value in the other sets
                target_min_hash = self.sim_hash_dict[sub_key][0]['minhash']
                if source_min_hash.jaccard(target_min_hash) > self.sim_value:
                    processed_key.append(sub_key)
                    self.sim_hash_dict[key].extend(self.sim_hash_dict[sub_key])
                    prepare_delete_key.append(sub_key)
        for value in prepare_delete_key:
            del self.sim_hash_dict[value]
        print('After Minhash Reduce, total: %s bin(s)' %
              len(self.sim_hash_dict.keys()))
        return self.sim_hash_dict
Example #29
        if (i == j):
            continue
        else:
            second_user = lines[j]

            b = second_user.split()
            data1 = a[2].split(',')
            data2 = b[2].split(',')

            m1, m2 = MinHash(), MinHash()
            for d in data1:
                m1.update(d.encode('utf8'))
            for d in data2:
                m2.update(d.encode('utf8'))
            #print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))
            if (m1.jaccard(m2) == 0):
                continue
            else:
                f_hash.write(b[0] + '    ' + str(m1.jaccard(m2)) + '\n')
    f_hash.close()

f.close()

# sorting
""" 
first_line = f.readline()
second_line = f.readline()
a = first_line.split()
user = a[0]
sum_of_fans = a[1]
fans = a[2].split(',')
"""
Example #30
# (truncated above: file1 was opened and read into text1, mirroring file2 below)
file1.close()
file2 = open("google-names.txt", "r")
text2 = file2.read()
file2.close()

# split into words by white space
words1 = text1.split()
words2 = text2.split()

# remove punctuation from each word
import string
table = str.maketrans('', '', string.punctuation)
stripped1 = [w.translate(table) for w in words1]
stripped2 = [w.translate(table) for w in words2]

from datasketch import MinHash 

# minhash
m1, m2 = MinHash(), MinHash()
for d in stripped1:
    m1.update(d.encode('utf8'))
for d in stripped2:
    m2.update(d.encode('utf8'))
print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

# jaccard
s1 = set(stripped1)
s2 = set(stripped2)
actual_jaccard = float(len(s1.intersection(s2)))/float(len(s1.union(s2)))
print("Actual Jaccard for data1 and data2 is", actual_jaccard)