def xiaoming(lines, length_of_f, i):
    first_user = lines[i]
    a = first_user.split()
    user_ID = a[0] + ' ' + '\n'
    print(a[1])
    hash_value = [0 for _ in range(0, 51)]  # list of values to be written to the file
    sort_value = [0 for _ in range(0, 51)]  # list that holds the values while they are being updated
    # Create a file for this user that stores the MinHash values between this user and all other users
    # f_hash = open('user_minhash/' + a[0] + '.txt', mode='w', encoding='utf-8')
    # f_hash.write(a[0] + ' ' + '\n')  # write this user's ID
    for j in range(length_of_f - 1):
        if i == j:
            continue
        second_user = lines[j]
        b = second_user.split()
        data1 = a[2].split(',')
        data2 = b[2].split(',')
        # If the two lists share zero or one element, discard the pair
        length_of_data1_and_data2 = len(data1) + len(data2)
        # Deduplicate
        both_data1_2 = list(set(data1 + data2))
        if (length_of_data1_and_data2 - len(both_data1_2)) > 20:
            m1, m2 = MinHash(), MinHash()
            for d in data1:
                m1.update(d.encode('utf8'))
            for d in data2:
                m2.update(d.encode('utf8'))
            # print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))
            if m1.jaccard(m2) <= 0.0078125:
                continue
            c = [b[0], m1.jaccard(m2)]
            # Insert c into the top-50 list, keeping it sorted by similarity
            for m in range(49, -1, -1):
                if sort_value[m] == 0:
                    sort_value[m] = c
                elif c[1] > sort_value[m][1]:
                    sort_value[m + 1] = sort_value[m]
                    sort_value[m] = c
                else:
                    break
        else:
            continue
        # new_value = b[0] + ' ' + str(m1.jaccard(m2)) + '\n'
        # hash_value.append(new_value)
    return sort_value
    # f_hash.write(b[0] + ' ' + str(m1.jaccard(m2)) + '\n')
    # for item in sort_value:
class DataPoint:
    """A class that represents one hashed example."""

    def __init__(self, string, index, only_string=True):
        """
        Params:
            :string: String to be stored.
            :index: Number of the line in the file from which this sentence was read.
            :only_string: Whether to only store the string (skip hashing).
        """
        self.string = string.strip("\n")
        self.index = index
        self.character_level = DATA_FILTERING["character_level"]
        self.cluster_index = 0

        if not only_string:
            self.init_hash()

    # Initialize the MinHash from the stored string.
    def init_hash(self):
        self.min_hash = MinHash(num_perm=DATA_FILTERING["num_permutations"])
        for word in self.string.split():
            if self.character_level:
                for char in word:
                    self.min_hash.update(char.encode('utf8'))
            else:
                self.min_hash.update(word.encode('utf8'))

    # Computes the estimated Jaccard similarity between self and another DataPoint.
    def similarity(self, other, dist_matrix=""):
        return self.min_hash.jaccard(other.min_hash)
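# A minimal usage sketch for DataPoint (not part of the original source). DATA_FILTERING is a
# module-level configuration dict that the class reads; the values below are assumptions for
# illustration only.
DATA_FILTERING = {"character_level": False, "num_permutations": 128}

p1 = DataPoint("minhash is a probabilistic data structure", index=0, only_string=False)
p2 = DataPoint("minhash is a probability data structure", index=1, only_string=False)
print(p1.similarity(p2))  # estimated Jaccard similarity of the two word sets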
def multi_minhash(df1, num1, df2, num2):
    c_names1 = []
    c_names2 = []
    for name, dtype in df1.dtypes:
        if dtype == "string":
            c_names1.append(name)
    for name, dtype in df2.dtypes:
        if dtype == "string":
            c_names2.append(name)
    for col1, col2 in itertools.product(c_names1, c_names2):
        m1, m2 = MinHash(), MinHash()
        count1 = int(np.sqrt(num1)) * 100
        count2 = int(np.sqrt(num2)) * 100
        data1 = df1.select(col1).rdd.flatMap(lambda x: x).takeSample(False, count1)
        data2 = df2.select(col2).rdd.flatMap(lambda x: x).takeSample(False, count2)
        for d in data1:
            for i in ngrams(d, 4):
                m1.update(''.join(i).encode('utf-8'))
        for d in data2:
            for i in ngrams(d, 4):
                m2.update(''.join(i).encode('utf-8'))
        print("MinHash Similarity for {} and {} is {}".format(
            col1, col2, m1.jaccard(m2)))
def _hello_world():
    """
    This fragment was taken from the datasketch github page:
    https://github.com/ekzhu/datasketch
    """
    data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
             'estimating', 'the', 'similarity', 'between', 'datasets']
    data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
             'estimating', 'the', 'similarity', 'between', 'documents']

    m1, m2 = MinHash(), MinHash()
    for d in data1:
        m1.update(d.encode('utf8'))
    for d in data2:
        m2.update(d.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

    s1 = set(data1)
    s2 = set(data2)
    actual_jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
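# Side note (illustrative, not from the original snippet): MinHash() defaults to num_perm=128
# permutations, and the expected error of the Jaccard estimate shrinks roughly as
# 1/sqrt(num_perm), so raising num_perm trades memory and CPU for accuracy.
from datasketch import MinHash

set_a = set(str(x) for x in range(100))
set_b = set(str(x) for x in range(50, 150))  # true Jaccard = 50 / 150 ≈ 0.333

for num_perm in (16, 128, 1024):
    m_a, m_b = MinHash(num_perm=num_perm), MinHash(num_perm=num_perm)
    for d in set_a:
        m_a.update(d.encode('utf8'))
    for d in set_b:
        m_b.update(d.encode('utf8'))
    print(num_perm, m_a.jaccard(m_b))  # estimates concentrate around 0.333 as num_perm grows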
def similarity():
    source_file = open(test_source_path)
    target_file = open(test_responses_path)
    source_list = [line.strip("\n") for line in source_file]
    target_list = [line.strip("\n") for line in target_file]

    similarities = []
    for source, target in zip(source_list, target_list):
        source_hash = MinHash(num_perm=256)
        for word in source.split():
            source_hash.update(word.encode('utf8'))
        target_hash = MinHash(num_perm=256)
        for word in target.split():
            target_hash.update(word.encode('utf8'))
        similarities.append(source_hash.jaccard(target_hash))

    avg_similarity = sum(similarities) / len(similarities)
    sim_std = math.sqrt(
        sum([(x - avg_similarity) ** 2 for x in similarities]) / (len(similarities) - 1))
    sim = "average similarity: " + str(avg_similarity) + " (%f)" % (sim_std)
    print(sim)
    output.write(sim + "\n")

    source_file.close()
    target_file.close()
def mylshforest(corpus):
    # print(len(corpus))
    forest = MinHashLSHForest(num_perm=32)
    score_res = [0]
    mh = []
    for i in range(len(corpus) - 1):
        doc = corpus[i]
        doc2 = corpus[i + 1]

        # Hash the current document and add it to the forest
        m = MinHash(num_perm=32)
        for d in doc:
            m.update(d.encode('utf8'))
        forest.add(str(i), m)
        forest.index()
        mh.append(m)

        # Hash the next document and query the forest with it
        m2 = MinHash(num_perm=32)
        for d in doc2:
            m2.update(d.encode('utf8'))
        result = forest.query(m2, 10)

        # Average the similarity of the query against the returned candidates
        score = 0.0
        for j in range(len(result)):
            score = score + m2.jaccard(mh[int(result[j])])
        if len(result) > 0:
            score = score / len(result)
        score_res.append(score)
    return score_res
def estimate_jaccard_similarity_using_minhashing(self, first_doc, second_doc, permutations=128):
    h1 = MinHash(num_perm=permutations)
    h2 = MinHash(num_perm=permutations)
    for word in self.documents.get(first_doc):
        h1.update(get_word(word_id=word).encode('utf-8'))
    for word in self.documents.get(second_doc):
        h2.update(get_word(word_id=word).encode('utf-8'))
    return h1.jaccard(h2)
def compare_with_minhash(set1, set2):
    mh1, mh2 = MinHash(), MinHash()
    for el in set1:
        mh1.update(el.encode('utf8'))
    for el in set2:
        mh2.update(el.encode('utf8'))
    return mh1.jaccard(mh2)
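# Example call for compare_with_minhash (illustrative only): with the default 128 permutations
# the returned estimate will be close to, though usually not exactly, the true Jaccard
# similarity of 2 / 4 = 0.5 for these two small sets.
print(compare_with_minhash({"alpha", "beta", "gamma"}, {"beta", "gamma", "delta"}))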
def locSensitiveHashingFunction(s1, s2, tresh=.5):
    set_1, set_2 = set(s1.split(' ')), set(s2.split(' '))
    m1, m2 = MinHash(), MinHash()
    for d in set_1:
        m1.update(d.encode('utf-8'))
    for d in set_2:
        m2.update(d.encode('utf-8'))
    return 1 if m1.jaccard(m2) > tresh else 0
def get_result(question_list, maskquestion_dict):
    """
    :param question_list: unknown questions that still need to be processed
    :param maskquestion_dict: known (standard) questions; initially empty and extended over time.
        A dict whose keys are question ids (strings) and whose values are the question texts.
    :return:
    """
    current_max_question_id = 0
    for key in maskquestion_dict.keys():
        value = maskquestion_dict[key]
        mask_question = set(list(jieba.cut(value)))
        intersection_set = mask_question & stop_words
        mask_question = mask_question - intersection_set
        m = MinHash(num_perm=1024)
        for d in mask_question:
            m.update(d.encode("utf8"))
        lsh.insert(str(key), m)
        current_max_question_id = max(int(key), current_max_question_id)

    new_mask_question_dict = {}
    result_list = []
    for question in question_list:
        question_set = set(list(jieba.cut(question)))
        intersection_set = question_set & stop_words
        question_set = question_set - intersection_set
        m = MinHash(num_perm=1024)
        for d in question_set:
            m.update(d.encode("utf8"))
        result = lsh.query(m)
        if len(result) == 0:
            current_max_question_id += 1
            new_mask_question_dict[str(current_max_question_id)] = question
            result_list.append([str(current_max_question_id), question])
            """
            An insert is required here: write (str(current_max_question_id), question) into the
            standard-question table. If the database insert fails, return a failure status
            directly and reset new_mask_question_dict and result_list to {} and [].
            """
            lsh.insert(str(current_max_question_id), m)
        elif len(result) == 1:
            result_list.append([str(result[0]), question])
        else:
            select_max_similarity_question_id = ''
            jaccard_score = 0
            for key in result:
                value = maskquestion_dict[key] if key in maskquestion_dict else new_mask_question_dict[key]
                value_set = set(list(jieba.cut(value)))
                intersection_set = value_set & stop_words
                value_set = value_set - intersection_set
                temp_m = MinHash(num_perm=1024)
                for d in value_set:
                    temp_m.update(d.encode("utf8"))
                temp_score = m.jaccard(temp_m)
                if temp_score > jaccard_score:
                    jaccard_score = temp_score
                    select_max_similarity_question_id = key
            result_list.append([select_max_similarity_question_id, question])
    return new_mask_question_dict, result_list
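# get_result relies on two module-level objects that are not defined in this snippet:
# `lsh` and `stop_words`. A minimal sketch of how they might be set up with datasketch's
# MinHashLSH follows; the 0.5 threshold and the stop-word file path are assumptions, not
# taken from the original code.
from datasketch import MinHashLSH

lsh = MinHashLSH(threshold=0.5, num_perm=1024)  # num_perm must match the MinHash objects above
with open("stopwords.txt", encoding="utf-8") as f:
    stop_words = set(line.strip() for line in f)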
def distance(self, s1, s2):
    # Returns the estimated Jaccard similarity (not a distance) of the feature sets of s1 and s2
    word_list1 = self.get_features(s1)
    word_list2 = self.get_features(s2)
    m1, m2 = MinHash(), MinHash()
    for d in word_list1:
        m1.update(d.encode('utf8'))
    for d in word_list2:
        m2.update(d.encode('utf8'))
    return m1.jaccard(m2)
def calculate_jaccard(text1, text2):
    # Estimate the Jaccard similarity of two lines of text
    minihash1, minihash2 = MinHash(), MinHash()
    for word in text1:
        minihash1.update(word.encode('utf-8'))
    for word in text2:
        minihash2.update(word.encode('utf-8'))
    return minihash1.jaccard(minihash2)
def main():
    rcode_1, page_1 = page_fetcher.fetch_page(
        "http://www.e-prostor.gov.si/zbirke-prostorskih-podatkov/zbirka-vrednotenja-nepremicnin/")
    rcode_2, page_2 = page_fetcher.fetch_page(
        "http://www.e-prostor.gov.si/dostop-do-podatkov/dostop-do-podatkov/")

    start_time = time.time()
    m1_128, m2_128 = MinHash(), MinHash()
    print("128 perm hash time: ", time.time() - start_time)

    start_time = time.time()
    m1, m2 = MinHash(num_perm=256), MinHash(num_perm=256)
    print("256 perm hash time: ", time.time() - start_time)

    for d in page_1:
        m1_128.update(d.encode('utf8'))
        m1.update(d.encode('utf8'))
    for d in page_2:
        m2_128.update(d.encode('utf8'))
        m2.update(d.encode('utf8'))
    print("Estimated Jaccard for page_1 and page_2 is", m1_128.jaccard(m2_128))
    print("Estimated Jaccard for page_1 and page_2 is", m1.jaccard(m2))

    s1 = set(page_1)
    s2 = set(page_2)
    actual_jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
    print("Actual Jaccard for page_1 and page_2 is", actual_jaccard)

    sha1 = hashlib.sha1(page_1.encode('utf-8'))
    print(sha1.hexdigest())
    sha2 = hashlib.sha1(page_2.encode('utf-8'))
    print(sha2.hexdigest())

    sha1 = hashlib.sha1("That's what she said.".encode('utf-8'))
    print(sha1.hexdigest())
    sha2 = hashlib.sha1("~Michael Scott".encode('utf-8'))
    print(sha2.hexdigest())
    sha3 = hashlib.sha1("That's what she said.".encode('utf-8'))
    print(sha3.hexdigest())
def DIDsampling(dataset, BF, username, userid, attra_id, beta, clustdict):
    # dataset = Cora_labeled.objects.all()
    # clustdict = dextrapreclustering.minhashPreClustering(dataset)
    cluster_membership = {}
    # values = models.sigirCoraAttrValue.objects.filter(attr_id=attra_id)
    # attrasynonyms = models.sigirCoraValueSynonym.objects.filter(value_id__in=[value.id for value in values])
    # record_hasAttra = models.sigirCoraToAttrEntity.objects.filter(user=username, attrsynonym_id__in=[syn.id for syn in attrasynonyms])
    record_hasAttra = models.sigirCoraToAttrEntity.objects.filter(user=username, attrsynonym__value__attr_id=attra_id)
    record_noAttra = dataset.exclude(id__in=[item.cora_id for item in record_hasAttra])
    for k, v in clustdict.items():
        for d in v:
            cluster_membership[d] = k

    sum = 0.000001
    for record in BF:
        sum = sum + len(clustdict[cluster_membership[record.id]])

    for record in BF:
        # AC
        cora2ae = models.sigirCoraToAttrEntity.objects.filter(cora_id=record.id, user=username)
        if cora2ae:
            list = [item.attrsynonym.value.attr.id for item in cora2ae]
            if attra_id in list:
                record.orderscore = 0
                record.save()
                continue
            else:
                ac = 1 - len(list) / models.sigirCoraAttr.objects.filter(userid=userid).count()
        else:
            ac = 1

        # distribution on dataset
        k = cluster_membership[record.id]
        ic = len(clustdict[k]) / record_noAttra.count()

        record_minhash = MinHash(num_perm=128)
        s = set(record.cleantext.split(" "))
        for d in s:
            record_minhash.update(d.encode('utf8'))

        term2sum = 0
        for rr in BF:
            rr_minhash = MinHash(num_perm=128)
            ss = set(rr.cleantext.split(" "))
            for dd in ss:
                rr_minhash.update(dd.encode('utf8'))
            sim = record_minhash.jaccard(rr_minhash)
            sim = (sim / sum) ** beta
            term2sum = term2sum + sim

        did = ac * ic * term2sum
        record.orderscore = did
        record.save()
    return BF
def get_jaccard_index(sequence1, sequence2, k):
    seq1_minHash, seq2_minHash = MinHash(), MinHash()
    seq1_kmers = count_kmers(sequence1, k)
    seq2_kmers = count_kmers(sequence2, k)
    seq1_keys = list(seq1_kmers.keys())
    seq2_keys = list(seq2_kmers.keys())
    for key in seq1_keys:
        seq1_minHash.update(key.encode('utf8'))
    for key in seq2_keys:
        seq2_minHash.update(key.encode('utf8'))
    return seq1_minHash.jaccard(seq2_minHash)
def minhash(x, y):
    m1, m2 = MinHash(), MinHash()
    s1 = extract_keywords(x)
    s2 = extract_keywords(y)
    for data in s1:
        m1.update(data.encode('utf8'))
    for data in s2:
        m2.update(data.encode('utf8'))
    return m1.jaccard(m2)
def are_similar(self, logs):
    error_messages = {}
    for log in logs:
        error_message = ''
        for log_line in log['log_lines']:
            if 'ERROR' in log_line:
                m = re.match(
                    r'^\S+\s+\S+\s+\S+\s\S+\s+\S+\s+(?P<log_content>.*)$', log_line)
                if m:
                    content = m.group('log_content')
                    if not error_message:
                        error_message += content
                    else:
                        error_message += ' ' + content
        if error_message:
            if self.service_name(log['service']) not in error_messages:
                error_messages[self.service_name(log['service'])] = []
            error_messages[self.service_name(log['service'])].append(error_message)

    error_messages = [" ".join(x) for x in error_messages.values()]
    if error_messages:
        error_messages = [self.get_anon_string(x) for x in error_messages]
        error_messages = [x for x in error_messages if x]
        error_messages = [re.sub(r'\s+', ' ', x) for x in error_messages]

        # Pairwise MinHash similarity between the per-service error messages
        values = []
        for i in range(len(error_messages) - 1):
            for j in range(i + 1, len(error_messages)):
                i_tokens = nltk.word_tokenize(error_messages[i])
                j_tokens = nltk.word_tokenize(error_messages[j])
                m1, m2 = MinHash(), MinHash()
                for d in i_tokens:
                    m1.update(d.encode('iso-8859-1'))
                for d in j_tokens:
                    m2.update(d.encode('iso-8859-1'))
                value = m1.jaccard(m2)
                values.append(value)

        if len(values) > 1:
            u_value = max(values)
        elif len(values) == 1:
            u_value = values[0]
        else:
            u_value = 0
        print(u_value)
        if u_value and u_value < 0.2:
            print(error_messages)
            input()
        if u_value < 0.5:
            return False
    else:
        return False
    return True
def calc_of_similarity(text1, text2):
    # MinHash computation
    minhash1, minhash2 = MinHash(), MinHash()
    # Extract keywords
    keywords1 = extract_keyword(text1)
    keywords2 = extract_keyword(text2)
    for data in keywords1:
        minhash1.update(data.encode('utf8'))
    for data in keywords2:
        minhash2.update(data.encode('utf8'))
    return minhash1.jaccard(minhash2)
def main(self):
    # Remove stop words
    jieba.analyse.set_stop_words('stopwords.txt')
    # MinHash computation
    m1, m2 = MinHash(), MinHash()
    # Extract keywords
    s1 = self.extract_keyword(self.s1)
    s2 = self.extract_keyword(self.s2)
    for data in s1:
        m1.update(data.encode('utf8'))
    for data in s2:
        m2.update(data.encode('utf8'))
    return m1.jaccard(m2)
def jaccardMeasure(hashvalue1, hashvalue2):
    """
    Estimate the Jaccard similarity of two hash values using datasketch.
    :param hashvalue1: datasketch hash values of the first item
    :param hashvalue2: datasketch hash values of the second item
    :return: float, similarity of the two hash values, in [0, 1]
    """
    try:
        m1 = MinHash(hashvalues=hashvalue1)
        m2 = MinHash(hashvalues=hashvalue2)
        return m1.jaccard(m2)
    except Exception as exc:
        logger.error(Fore.RED + 'MinHash failed with {0}'.format(exc))
        return 0.0
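# Usage sketch (not from the original module): jaccardMeasure expects the raw hash values of
# two previously built MinHash objects, which datasketch exposes through the .hashvalues
# attribute, so a round trip looks like this:
from datasketch import MinHash

m = MinHash()
for token in "minhash is a probabilistic data structure".split():
    m.update(token.encode('utf8'))
stored = m.hashvalues                        # e.g. what gets persisted to a database
print(jaccardMeasure(stored, m.hashvalues))  # identical hash values give similarity 1.0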
def compare_with_minhash(value_a, value_b):
    str_a = _to_str_for_distance_calculation(value_a)
    str_b = _to_str_for_distance_calculation(value_b)
    try:
        tokens_a = nltk.word_tokenize(str_a)
        tokens_b = nltk.word_tokenize(str_b)
        m1, m2 = MinHash(), MinHash()
        for d in tokens_a:
            m1.update(d.encode('utf8'))
        for d in tokens_b:
            m2.update(d.encode('utf8'))
        value = m1.jaccard(m2)
        return value
    except Exception as exc:
        print(str_a, str_b)
        raise exc
def query_sim(in_dir):
    js = json.load(codecs.open(in_dir, "r"))
    line = js["content_p"]
    seg_list = jieba.cut(line, cut_all=False)
    no_list = []
    for word in seg_list:
        if word not in stopword:
            no_list.append(word)
    mh = MinHash(num_perm=128)
    for word in no_list:
        mh.update(word.encode('utf8'))
    result = forest.query(mh, 1)
    # Assumes `forest` also supports key lookup of the stored MinHash objects;
    # a plain datasketch MinHashLSHForest only returns keys from query().
    return mh.jaccard(forest[result[0]])
def similarity(self, other_doc, metric='jaccard', hash_method='minhash'):
    """
    Computes similarity for two documents.
    Only MinHash Jaccard similarity is implemented.

    >>> doc1 = Doc('Sentence for computing the minhash')
    >>> doc2 = Doc('Sentence for computing the similarity')
    >>> doc1.similarity(doc2)
    0.7265625
    """
    if hash_method == 'minhash' and metric == 'jaccard':
        hash1 = MinHash(hashvalues=self.minhash)
        hash2 = MinHash(hashvalues=other_doc.minhash)
        return hash1.jaccard(hash2)
    else:
        raise NotImplementedError(f'Metric/hash method combination {metric}'
                                  f'/{hash_method} is not implemented as similarity metric')
def compare_two_group(crc_list1, crc_list2):
    """
    Return the Jaccard similarity of two lists, estimated with MinHash.

    Args:
        crc_list1 (list): a list of CRC values
        crc_list2 (list): a list of CRC values

    Returns:
        similarity: the similarity between the two lists, in [0, 1]
    """
    m1, m2 = MinHash(num_perm=800), MinHash(num_perm=800)
    for crc in crc_list1:
        m1.update(crc.encode('utf8'))
    for crc in crc_list2:
        m2.update(crc.encode('utf8'))
    similarity = m1.jaccard(m2)
    return similarity
def single_minhash(df, num):
    c_names = []
    for name, dtype in df.dtypes:
        if dtype == "string":
            c_names.append(name)
    for col1, col2 in itertools.combinations(c_names, 2):
        m1, m2 = MinHash(), MinHash()
        count = int(np.sqrt(num))
        data1 = df.select(col1).rdd.flatMap(lambda x: x).takeSample(False, count)
        data2 = df.select(col2).rdd.flatMap(lambda x: x).takeSample(False, count)
        for d in data1:
            for i in ngrams(d, 4):
                m1.update(''.join(i).encode('utf-8'))
        for d in data2:
            for i in ngrams(d, 4):
                m2.update(''.join(i).encode('utf-8'))
        print("MinHash Similarity for {} and {} is {}".format(
            col1, col2, m1.jaccard(m2)))
def DIDsamplingInit(dataset, BF, clustdict, beta):
    # dataset = Cora_labeled.objects.all()
    # clustdict = dextrapreclustering.minhashPreClustering(dataset)
    cluster_membership = {}
    # values = models.sigirCoraAttrValue.objects.filter(attr_id=attra_id)
    # attrasynonyms = models.sigirCoraValueSynonym.objects.filter(value_id__in=[value.id for value in values])
    # record_hasAttra = models.sigirCoraToAttrEntity.objects.filter(user=username, attrsynonym_id__in=[syn.id for syn in attrasynonyms])
    for k, v in clustdict.items():
        for d in v:
            cluster_membership[d] = k

    sum = 0.000001
    for record in BF:
        sum = sum + len(clustdict[cluster_membership[record.id]])

    for record in BF:
        # distribution on dataset
        k = cluster_membership[record.id]
        ic = len(clustdict[k]) / dataset.count()

        record_minhash = MinHash(num_perm=128)
        s = set(record.cleantext.split(" "))
        for d in s:
            record_minhash.update(d.encode('utf8'))

        term2sum = 0
        for rr in BF:
            rr_minhash = MinHash(num_perm=128)
            ss = set(rr.cleantext.split(" "))
            for dd in ss:
                rr_minhash.update(dd.encode('utf8'))
            sim = record_minhash.jaccard(rr_minhash)
            sim = (sim / sum) ** beta
            term2sum = term2sum + sim

        did = ic * term2sum
        record.orderscore = did
        record.save()
    return BF
def run(self) -> dict:
    processed_key = list()
    prepare_delete_key = list()
    # print(len(self.sim_hash_dict.keys()))
    for key in self.sim_hash_dict.keys():
        source_message = self.sim_hash_dict[key][0]['message']
        source_min_hash = MinHash(hashfunc=self._hash_func)
        content_list = source_message
        # content_list = [i for i in splitWords(source_message)]
        for i in content_list:
            source_min_hash.update(i)
        self.sim_hash_dict[key][0]['minhash'] = source_min_hash

    for key in self.sim_hash_dict.keys():
        if key in processed_key:
            continue
        # Is the smallest element guaranteed to come first? [0] is the smallest hash in this bin.
        source_min_hash = self.sim_hash_dict[key][0]['minhash']
        processed_key.append(key)
        for sub_key in self.sim_hash_dict.keys():
            if sub_key <= key or sub_key in processed_key:
                continue
            # Take the smallest hash value of the other bin.
            target_min_hash = self.sim_hash_dict[sub_key][0]['minhash']
            if source_min_hash.jaccard(target_min_hash) > self.sim_value:
                processed_key.append(sub_key)
                self.sim_hash_dict[key].extend(self.sim_hash_dict[sub_key])
                prepare_delete_key.append(sub_key)

    for value in prepare_delete_key:
        del self.sim_hash_dict[value]
    print('After Minhash Reduce, total: %s bin(s)' % len(self.sim_hash_dict.keys()))
    return self.sim_hash_dict
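# A minimal sketch of the custom hash function assumed above; the real self._hash_func is not
# shown in this snippet. datasketch's MinHash(hashfunc=...) expects a callable that maps each
# update() argument to an integer, and since run() feeds it raw str characters, a str-tolerant
# wrapper could look like this:
import hashlib
import struct

def _hash_func(data):
    if isinstance(data, str):
        data = data.encode('utf-8')
    # Interpret the first 4 bytes of the SHA-1 digest as an unsigned 32-bit integer.
    return struct.unpack('<I', hashlib.sha1(data).digest()[:4])[0]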
        if i == j:
            continue
        else:
            second_user = lines[j]
            b = second_user.split()
            data1 = a[2].split(',')
            data2 = b[2].split(',')
            m1, m2 = MinHash(), MinHash()
            for d in data1:
                m1.update(d.encode('utf8'))
            for d in data2:
                m2.update(d.encode('utf8'))
            # print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))
            if m1.jaccard(m2) == 0:
                continue
            else:
                f_hash.write(b[0] + ' ' + str(m1.jaccard(m2)) + '\n')
    f_hash.close()
f.close()

# Sorting
"""
first_line = f.readline()
second_line = f.readline()
a = first_line.split()
user = a[0]
sum_of_fans = a[1]
fans = a[2].split(',')
file1.close()

file2 = open("google-names.txt", "r")
text2 = file2.read()
file2.close()

# split into words by white space
words1 = text1.split()
words2 = text2.split()

# remove punctuation from each word
import string
table = str.maketrans('', '', string.punctuation)
stripped1 = [w.translate(table) for w in words1]
stripped2 = [w.translate(table) for w in words2]

from datasketch import MinHash

# minhash
m1, m2 = MinHash(), MinHash()
for d in stripped1:
    m1.update(d.encode('utf8'))
for d in stripped2:
    m2.update(d.encode('utf8'))
print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

# jaccard
s1 = set(stripped1)
s2 = set(stripped2)
actual_jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
print("Actual Jaccard for data1 and data2 is", actual_jaccard)