def compute_minhash(lst):
    m1 = MinHash(num_perm=128)
    for d in set(lst):
        m1.update(d.encode('utf8'))
    return m1
def create_minhash(data):
    minhash = MinHash(HASH_SIZE, seed=12)
    for qgram in data:
        minhash.update(qgram.encode('utf-8'))
    return minhash
f = open("../data/" + dataset + ".graph") ptr = f.readline().strip("\n").strip(" ").split(" ") idx = f.readline().strip("\n").strip(" ").split(' ') if len(idx) != nume: print("error idx", len(idx)) exit() if len(ptr) != numv + 1: print("error ptr", len(ptr)) exit() t0 = time.time() lsh = MinHashLSH(threshold=lsh_thres, num_perm=per) allver = [] lists = [[] for i in range(numv)] for i in range(numv): m = MinHash(num_perm=per) for iter in range((int)(ptr[i]), (int)(ptr[i + 1])): m.update(str(idx[iter]).encode('utf-8')) lists[i].append(idx[iter]) lsh.insert(str(i), m) allver.append(m) #res = lsh.query(allver[0]) #print(res) t1 = time.time() print("init LSH", t1 - t0) def jd(l1, l2): if len(l1) == 0 or len(l2) == 0: return 0 s1 = set(l1)
from datasketch import MinHash

k_sig = 1024
minhash = MinHash(num_perm=k_sig)
minhash2 = MinHash(num_perm=k_sig)
minhash3 = MinHash(num_perm=k_sig)

minhash.update(
    "This is a good algorithm that can perform wild ranges of services".encode('utf-8'))
minhash2.update(
    "The computing algorithm for this is very cool and works really well".encode('utf-8'))
minhash3.update(
    "The computing algorithm for this is very cool and works really wellx".encode('utf-8'))

# print(minhash.hashvalues)
# print(minhash2.hashvalues)
# print(minhash3.hashvalues)
print(minhash2.jaccard(minhash3))
# print(minhash2.permutations)
# print(minhash3.permutations)
def get_minhash(item_str):
    temp = MinHash()
    for d in item_str:
        temp.update(d.encode('utf8'))
    return temp
from datasketch import MinHash

data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']

m1, m2 = MinHash(), MinHash()
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))

s1 = set(data1)
s2 = set(data2)
actual_jaccard = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
print("Actual Jaccard for data1 and data2 is", actual_jaccard)

# Self-comparison and reconstruction from hashvalues both give 1.0.
print("Estimated Jaccard of data1 with itself is", m1.jaccard(m1))
print(MinHash(hashvalues=m1.hashvalues).jaccard(m1))
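# A small side experiment (not part of the snippet above, illustrative only):
# with the same data1/data2 lists, the estimate converges to the exact Jaccard
# printed above as num_perm grows.
def estimated_jaccard(tokens_a, tokens_b, num_perm):
    a, b = MinHash(num_perm=num_perm), MinHash(num_perm=num_perm)
    for t in tokens_a:
        a.update(t.encode('utf8'))
    for t in tokens_b:
        b.update(t.encode('utf8'))
    return a.jaccard(b)

for num_perm in (16, 128, 1024):
    print(num_perm, estimated_jaccard(data1, data2, num_perm))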
def test_bytesize(self):
    m1 = MinHash(4, 1, hashfunc=fake_hash_func)
    lm1 = LeanMinHash(m1)
    self.assertTrue(lm1.bytesize() == (4 * 4) + 4 + 8)
def getHashSig(tagsListOfPep):
    minHash = MinHash(num_perm=NUM_PERMUTATION)
    for tag in tagsListOfPep:
        minHash.update(tag.encode('utf-8'))
    return minHash.digest()
def min_hash(self, num_perm=64, seed=42, use_components=None, type_option=None,
             n_char=None, n_word=None, npz=None, isrequest=False):
    """
    Minhash function.

    Parameters
    ----------
    num_perm: int
        Number of permutations.
    seed: int
        Seed for the random permutations.
    use_components: list of str, optional: ['name', 'addr'], ['name'] or ['addr']
        Components to use.
    type_option: list of str, optional: ['char', 'word'], ['char'] or ['word']
        N-gram types to use.
    n_char: list of int
        Sizes of character grams.
    n_word: list of int
        Sizes of word grams.
    isrequest: bool
        False if this LpuList is created for the base.
    npz: str, optional
        Path to an .npz file with precomputed MinHashes; if given, they are
        loaded instead of being computed.
    """
    if npz:
        self.options = type_option
        self.num_perm = num_perm
        n = n_char if type_option == 'char' else n_word
        self.features['not_weighed_{}_{}_{}minhash'.format(
            use_components[0], type_option[0], n[0])] = np.load(npz)['min_hash']
        return self

    use_components = use_components or ['name']
    type_option = type_option or ['char']
    n_char = n_char or [3]
    n_word = n_word or [1]

    if 'char' not in type_option and 'word' not in type_option:
        raise ValueError("Check the value of the type_option parameter.")
    if 'name' not in use_components and 'addr' not in use_components:
        raise ValueError("Check the value of the use_components parameter.")

    self.options = type_option
    self.num_perm = num_perm
    for i in use_components:
        for j in type_option:
            n_list = n_char if j == 'char' else n_word
            for n in n_list:
                help_list = []
                for idx, name in enumerate(
                        self.features['{}_{}_{}grams'.format(i, j, n)]):
                    minhash = MinHash(num_perm, seed=seed)
                    for ngram in name:
                        minhash.update(ngram.encode('utf8'))
                    lean_minhash = LeanMinHash(minhash)
                    help_list.append(lean_minhash)
                self.features['not_weighed_{}_{}_{}minhash'.format(i, j, n)] = \
                    np.array(help_list)
                file_path = 'data/min_hash_dadata/{}_{}_{}_not_weighed_minhash.npz'.format(i, j, n)
                if not isrequest:
                    np.savez_compressed(file_path, min_hash=np.array(help_list))
    return self
# m1 = MinHash(num_perm=128)
# m2 = MinHash(num_perm=128)
# m3 = MinHash(num_perm=128)
# for d in set1:
#     m1.update(d.encode('utf8'))
# for d in set2:
#     m2.update(d.encode('utf8'))
# for d in set3:
#     m3.update(d.encode('utf8'))

# Create LSH index
# lsh = MinHashLSH(threshold=0.2, num_perm=128, params=(2, 3))

k_sig = 1024
m1 = MinHash(num_perm=k_sig)
m2 = MinHash(num_perm=k_sig)
m3 = MinHash(num_perm=k_sig)
m4 = MinHash(num_perm=k_sig)
m5 = MinHash(num_perm=k_sig)

for d in set1:
    m1.update(d.encode('utf8'))
for d in set2:
    m2.update(d.encode('utf8'))
for d in set3:
    m3.update(d.encode('utf8'))
for d in set4:
    m4.update(d.encode('utf8'))
for d in set5:
    m5.update(d.encode('utf8'))
if __name__ == '__main__':
    # data1 = 'VGFGEEWEDAAWCN'
    data1 = ['TAG', 'VGF', 'GTB', 'EEW']
    # data1 = ['TAG', 'RBT', 'WCDS']
    data2 = 'TAGDSAFDVGFGTEEWEQWWRFRSDAAWCDSNBH'
    data3 = 'TAGSSAFDDBFDTWEEWTDWWRFRSCASWCDSQBH'
    data2 = [data2[i:i + 3] for i in range(len(data2) - 2)]
    data3 = [data3[i:i + 3] for i in range(len(data3) - 2)]
    # for i in range(len(data2) - 2):
    #     print(data2[i:i + 3])

    m1, m2 = MinHash(), MinHash()
    # for d in data1:
    d = 'AVB'
    m1.update(d.encode('utf8'))
    # for d in data2:
    d = 'BVA'
    m2.update(d.encode('utf8'))
    print("Estimated Jaccard for data1 and data2 is", m2.jaccard(m1))

    # sumSimi = 0
    # m2 = MinHash()
    # m2.update(data2.encode('utf8'))
    # for d in data1:
    #     print(d)
    #     m1 = MinHash()
    #     m1.update(d.encode('utf8'))
def main():
    path = '../../../Desktop'  # Directory in local computer
    """Shows basic usage of the Drive v3 API.
    Prints the names and ids of the first 10 files the user has access to.
    """
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server()
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('drive', 'v3', credentials=creds)

    # Call the Drive v3 API
    results = service.files().list(
        pageSize=1000,
        fields="nextPageToken, files(parents, name, id, modifiedTime, mimeType, md5Checksum)").execute()
    items = results.get('files', [])

    currentTimesinceEpoc = time.time()
    currentday = time.strftime('%d', time.localtime(currentTimesinceEpoc))
    currentmonth = time.strftime('%m', time.localtime(currentTimesinceEpoc))
    currentyear = time.strftime('%Y', time.localtime(currentTimesinceEpoc))
    currentdate = dt.datetime(int(currentyear), int(currentmonth), int(currentday), 0, 0, 0)
    folderModifTimes = {}

    if not items:
        print('No files found.')
    else:
        '''location hierarchy for Drive files'''
        for item in items:
            modifyear, modifmonth, modifday = item['modifiedTime'].split('-')
            modifday = modifday.split('T')[0]
            modifdate = dt.datetime(int(modifyear), int(modifmonth), int(modifday), 0, 0, 0)
            modificationTimesinceEpoc = str(currentTimesinceEpoc - (currentdate - modifdate).total_seconds())
            item['modificationTimesinceEpoc'] = modificationTimesinceEpoc
            if item.get('parents') is not None and len(item.get('parents')) > 1:
                # more than 1 parent not handled
                print("many parents")
            if item.get('parents') is None:
                if item['mimeType'] == 'application/vnd.google-apps.folder':
                    parentname = 'Drive'
                    parentid = 'drive'
                else:
                    parentname = 'Parent-less Files'
                    parentid = 'parentlessfiles'
            else:
                parentname = (service.files().get(fileId=item.get('parents')[0]).execute())['name']
                parentid = item.get('parents')[0]
            if folderModifTimes.get(parentid) is None or \
                    float(folderModifTimes[parentid]) < float(modificationTimesinceEpoc):
                folderModifTimes[parentid] = modificationTimesinceEpoc
            item['parentid'] = parentid
            item['parentname'] = parentname

    new_items = []
    print('location' + '^' + 'dir' + 'root' + '^' + 'Root' + '^' + str(currentTimesinceEpoc) + '^' +
          'dir' + os.path.abspath(path) + '^' + 'Desktop' + '^' + str(currentTimesinceEpoc))
    print('location' + '^' + 'dir' + 'root' + '^' + 'Root' + '^' + str(currentTimesinceEpoc) + '^' +
          'dir' + 'https://drive.google.com/open?id=' + 'drive' + '^' + 'Drive' + '^' + str(currentTimesinceEpoc))
    print('location' + '^' + 'dir' + 'https://drive.google.com/open?id=' + 'drive' + '^' + 'Drive' + '^' +
          str(currentTimesinceEpoc) + '^' + 'dir' + 'https://drive.google.com/open?id=' + 'parentlessfiles' + '^' +
          'Parent-less Files' + '^' + folderModifTimes['parentlessfiles'])

    for item in items:
        if item['mimeType'] == 'application/vnd.google-apps.folder':  # if a folder
            if folderModifTimes.get(item['id']) is None:
                print('location' + '^' + 'dir' + 'https://drive.google.com/open?id=' + item['parentid'] + '^' +
                      item['parentname'] + '^' + folderModifTimes[item['parentid']] + '^' +
                      'dir' + 'https://drive.google.com/open?id=' + item['id'] + '^' + item['name'] + '^' +
                      item['modificationTimesinceEpoc'])
            else:
                print('location' + '^' + 'dir' + 'https://drive.google.com/open?id=' + item['parentid'] + '^' +
                      item['parentname'] + '^' + folderModifTimes[item['parentid']] + '^' +
                      'dir' + 'https://drive.google.com/open?id=' + item['id'] + '^' + item['name'] + '^' +
                      folderModifTimes[item['id']])
        else:  # if not a folder
            new_items.append(item)  # further links would only be between files
            print('location' + '^' + 'dir' + 'https://drive.google.com/open?id=' + item['parentid'] + '^' +
                  item['parentname'] + '^' + folderModifTimes[item['parentid']] + '^' +
                  'https://drive.google.com/open?id=' + item['id'] + '^' + item['name'] + '^' +
                  item['modificationTimesinceEpoc'])
    items = new_items

    for r, d, f in os.walk(path):
        f = [file for file in f if not (file[0] == '.' or file[0] == '_')]
        d[:] = [dr for dr in d if not (dr[0] == '.' or dr[0] == '_')]
        for file in f:
            filepath = os.path.join(r, file)
            filepath2 = filepath.split('/')
            filename = filepath2[-1]
            filemtime = str(os.path.getmtime(os.path.abspath(filepath)))
            mime = magic.Magic(mime=True)
            mimeType = mime.from_file(os.path.abspath(filepath))
            md5Checksum = hs.fileChecksum(os.path.abspath(filepath), "md5")
            item = {'id': os.path.abspath(filepath), 'name': filename,
                    'modificationTimesinceEpoc': filemtime, 'mimeType': mimeType,
                    'md5Checksum': md5Checksum}
            items.append(item)

    with open('email_metadata', 'r') as f:
        attachments = f.read().split('\n')
    for attachment in attachments:
        if len(attachment.split("^")) == 1:
            break
        attachment_id, attachment_name, attachment_mimeType, attachment_md5Checksum, attachment_mtime = \
            attachment.split("^")
        item = {'id': attachment_id + '~' + attachment_name, 'name': attachment_name,
                'modificationTimesinceEpoc': attachment_mtime, 'mimeType': attachment_mimeType,
                'md5Checksum': attachment_md5Checksum}
        items.append(item)

    '''same hash'''
    for f1 in items:
        for f2 in items:
            if f1['id'] != f2['id'] and f1.get('md5Checksum') is not None and \
                    f2.get('md5Checksum') is not None and f1.get('md5Checksum') == f2.get('md5Checksum'):
                f1mtime = f1['modificationTimesinceEpoc']
                f2mtime = f2['modificationTimesinceEpoc']
                print('content' + '^' + f1['id'] + '^' + f1['name'] + '^' + f1mtime + '^' +
                      f2['id'] + '^' + f2['name'] + '^' + f2mtime)

    '''content similarity'''
    text_files = []
    '''scanning the directory'''
    for f in items:
        filepath = f['id']
        file_type = f['mimeType']
        major_minor = file_type.split('/')
        if major_minor[0] == 'text' or file_type == 'application/vnd.google-apps.document':
            text_files.append(f)

    documents = []
    os.mkdir('DriveTextFiles')  # To temporarily store drive text files
    for f in text_files:
        try:
            documents.append((open(f['id']).read(), f))
        except FileNotFoundError:
            # downloading drive text files
            try:
                file_id = f['id']
                if f['mimeType'] == 'application/vnd.google-apps.document':
                    request = service.files().export_media(fileId=file_id, mimeType='text/plain')
                else:
                    request = service.files().get_media(fileId=file_id)
                fh = io.FileIO(os.path.join('DriveTextFiles', f['id']), 'wb')
                downloader = googleapiclient.http.MediaIoBaseDownload(fh, request)
                done = False
                while done is False:
                    status, done = downloader.next_chunk()
                documents.append((open(os.path.join('DriveTextFiles', f['id'])).read(), f))
            except (googleapiclient.errors.HttpError, FileNotFoundError):
                pass

    lsh = MinHashLSH(threshold=0.3, num_perm=128)
    for f in documents:
        setdoc = set(f[0].split())
        m = MinHash(num_perm=128)
        for d in setdoc:
            m.update(d.encode('utf8'))
        lsh.insert(f[1]['id'] + '^' + f[1]['name'] + '^' + f[1]['modificationTimesinceEpoc'], m)

    results = []
    for doc in documents:
        setdoc = set(doc[0].split())
        m = MinHash(num_perm=128)
        for d in setdoc:
            m.update(d.encode('utf8'))
        result = lsh.query(m)
        results.append((doc[1]['id'] + '^' + doc[1]['name'] + '^' + doc[1]['modificationTimesinceEpoc'], result))

    '''forming links between files with similar content'''
    for result in results:
        f2mtime = result[0].split('^')[2]
        for r in result[1]:
            if r != result[0]:
                f1mtime = r.split('^')[2]
                print('content' + '^' + r.split('^')[0] + '^' + r.split('^')[1] + '^' + f1mtime + '^' +
                      result[0].split('^')[0] + '^' + result[0].split('^')[1] + '^' + f2mtime)
    shutil.rmtree('DriveTextFiles')

    '''name similarity'''
    for f1 in items:
        for f2 in items:
            distance1 = textdistance.jaro.distance(f1['name'], f2['name'])
            distance2 = textdistance.levenshtein.distance(f1['name'], f2['name'])
            if ((distance1 <= 0.30 and f1['id'] != f2['id'] and
                 distance2 < 0.75 * min(len(f1['name']), len(f2['name']), 8)) or
                    distance1 <= 0.15 or
                    distance2 <= 0.25 * min(len(f1['name']), len(f2['name']), 8)) and f1['id'] != f2['id']:
                f1mtime = f1['modificationTimesinceEpoc']
                f2mtime = f2['modificationTimesinceEpoc']
                print('name' + '^' + f1['id'] + '^' + f1['name'] + '^' + f1mtime + '^' +
                      f2['id'] + '^' + f2['name'] + '^' + f2mtime)

    '''time similarity'''
    file_threshold = 10
    filetimes = []
    i = 0
    for item in items:
        filetimes.append([float(item['modificationTimesinceEpoc'])])
        i = i + 1
    kmeans = KMeans(n_clusters=int(i / file_threshold) + 1, random_state=0).fit(filetimes)
    labels = kmeans.labels_
    for j in range(int(i / file_threshold) + 1):  # iterating through all clusters
        idx = []
        for i in range(0, len(labels)):
            if labels[i] == j:
                idx.append(i)
        filesj = [items[i] for i in idx]  # all the files in a cluster
        # forming similar time links
        for f1 in filesj:
            for f2 in filesj:
                if f1['id'] != f2['id']:
                    f1mtime = f1['modificationTimesinceEpoc']
                    f2mtime = f2['modificationTimesinceEpoc']
                    print('time' + '^' + f1['id'] + '^' + f1['name'] + '^' + f1mtime + '^' +
                          f2['id'] + '^' + f2['name'] + '^' + f2mtime)
import database as mydb
from datasketch import MinHashLSHForest, MinHash, MinHashLSH
import pdb
from Translator import translate
from Article import Article

# Redis-backed LSH index (note: it is immediately replaced by the in-memory
# index created on the next assignment, so only the in-memory one is used).
lsh = MinHashLSH(threshold=0.1, num_perm=128,
                 storage_config={
                     "type": "redis",
                     "basename": b"docs_tech",
                     "redis": {"host": "localhost", "port": 6379}
                 })
lsh = MinHashLSH(threshold=0.2, num_perm=128)

articles_en = mydb.execute_query("SELECT id, keyword, title FROM english")
keywords_en = [
    Article(id=item[0], keyword=item[1], content=item[2]) for item in articles_en
]

# Create MinHash objects and index them by article id
for item in keywords_en:
    minhash = MinHash(num_perm=128)
    list_keyword = item.keyword.split(",")
    for k in list_keyword:
        minhash.update(k.encode("utf-8"))
    lsh.insert(str(item.id), minhash)
    # forest.add(str(item.id), minhash)
# forest.index()
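# A minimal usage sketch for the index built above (the query keywords are
# hypothetical): build a MinHash for an ad-hoc keyword list and retrieve the
# ids of articles whose keyword sets exceed the LSH threshold.
query_minhash = MinHash(num_perm=128)
for k in "machine learning,neural networks,nlp".split(","):
    query_minhash.update(k.encode("utf-8"))
print(lsh.query(query_minhash))  # list of matching article ids (as strings)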
def compare(first_signature_input, second_signature_input):
    first_minhash = MinHash(hashvalues=first_signature_input)
    second_minhash = MinHash(hashvalues=second_signature_input)
    return first_minhash.jaccard(second_minhash)
for file in f:
    filepath = os.path.join(r, file)
    mime = magic.Magic(mime=True)
    file_type = mime.from_file(filepath)
    major_minor = file_type.split('/')
    if major_minor[0] == 'text':
        text_files.append(os.path.abspath(filepath))

documents = [(open(f).read(), f) for f in text_files]

lsh = MinHashLSH(threshold=0.4, num_perm=128)
for f in documents:
    setdoc = set(f[0].split())
    m = MinHash(num_perm=128)
    for d in setdoc:
        m.update(d.encode('utf8'))
    lsh.insert(f[1], m)

results = []
for doc in documents:
    setdoc = set(doc[0].split())
    m = MinHash(num_perm=128)
    for d in setdoc:
        m.update(d.encode('utf8'))
    result = lsh.query(m)
    results.append(result)
results = np.array(results)
def minHash(self, code_tokens):
    minHash = MinHash(num_perm=self.num_perm)
    for d in code_tokens:
        # TODO modify this for n-grams
        minHash.update("".join(d).encode('utf-8'))
    return minHash
def compare(signature1, signature2):
    mh1, mh2 = MinHash(hashvalues=signature1), MinHash(hashvalues=signature2)
    return mh1.jaccard(mh2)
def generate_minhash(self):
    self.minhash = MinHash(num_perm=400)
    for shingle in self.get_n_best_shingles(n=400):
        self.minhash.update(shingle.encode('utf8'))
def test_is_empty(self):
    m = MinHash()
    lm = LeanMinHash(m)
    self.assertTrue(lm.is_empty())
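# A minimal serialization sketch for LeanMinHash (standard datasketch API, not
# part of the tests above): bytesize() sizes a buffer, and serialize()/
# deserialize() round-trip the signature without losing accuracy.
from datasketch import MinHash, LeanMinHash

m = MinHash(num_perm=128)
for token in ("hello", "world"):
    m.update(token.encode('utf8'))
lean = LeanMinHash(m)
buf = bytearray(lean.bytesize())
lean.serialize(buf)
restored = LeanMinHash.deserialize(buf)
print(restored.jaccard(lean))  # 1.0: the round trip preserves the signature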
def minhash_IDs(IDs):
    m = MinHash()
    for ID in IDs:
        m.update(ID.encode('utf-8'))
    return m
def hash(s):
    # Normalise: lowercase and strip punctuation before shingling.
    s = s.lower().translate(str.maketrans('', '', string.punctuation))
    mh = MinHash(num_perm=128)
    for d in [s[i:i + N] for i in range(len(s) - N + 1)]:
        mh.update(d.encode('utf8'))
    return mh
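# A usage sketch for hash() above. The imports, the shingle width N, and the
# two sentences are assumptions made for illustration; in the original module
# they are expected to be defined elsewhere.
import string
from datasketch import MinHash

N = 3
a = hash("MinHash sketches estimate Jaccard similarity.")
b = hash("MinHash sketches approximate Jaccard similarity!")
print(a.jaccard(b))  # high estimate: the strings differ by one word and punctuation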
def run(codebase_path, updates_file_path, commits):
    print("Creating Clone Index from HEAD~" + str(commits + 1))
    result = subprocess.run([
        'git', '-C', str(codebase_path), 'log', '-' + str(commits + 1),
        '--no-merges', '--pretty=format:"%h"'
    ], stdout=subprocess.PIPE)
    result_commits = result.stdout.decode('utf-8')
    result_commits = result_commits.replace('"', '')
    result_commits = result_commits.split('\n')

    # checkout to the current commit
    subprocess.run([
        'git', '-C', str(codebase_path), 'checkout',
        result_commits[len(result_commits) - 1]
    ], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

    # start the timer
    index_cr_start = timer()

    codebase = CodebaseReader(codebase_path)
    lines_per_files = codebase.get_lines_per_file()

    # Create LSH index
    lsh_index = MinHashLSH(threshold=config.THRESHOLD, num_perm=config.PERMUTATIONS)
    for file in lines_per_files:
        min_hash = MinHash(num_perm=config.PERMUTATIONS)
        for line in lines_per_files[file]:
            min_hash.update(line.encode('utf8'))
        lsh_index.insert(file, min_hash)

    index_cr_end = timer()
    index_cr_diff = round(index_cr_end - index_cr_start, 5)

    incremental_step_time = 0
    # We use this instead of len(data['commits']) because there might be commits
    # that only affect excluded (e.g. test) files, and in that case the specific
    # commit does not get processed.
    commits_processed = 0

    try:
        with open(updates_file_path) as f:
            data = json.load(f)
            commits = data['commits']
            for commit in commits:
                creates_lst = []
                updates_lst = []
                deletes_lst = []
                renames_lst = []

                print('========> Running Analysis for codebase @commit: ', commit['id'], "<========")

                # checkout to the current commit
                subprocess.run([
                    'git', '-C', str(codebase_path), 'checkout', commit['id']
                ], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

                is_processed = False
                for change in commit['changes']:
                    change_type = change['type']
                    if change_type in ['A', 'M', 'D']:
                        affected_filename = change['filename']
                        file_path = Path(affected_filename)

                        # skip directories not read when creating the initial index & skip invalid files
                        if is_in_exlcuded_dir(file_path) or is_in_excluded_format(file_path):
                            continue

                        # if I get here then there is at least 1 change in that commit that is processed
                        is_processed = True
                        file_path = codebase_path / file_path
                        print('-> Parsing change [', change_type, '] for file [', file_path, ']')

                        if change_type == 'A':
                            creates_lst.append(str(file_path))
                        elif change_type == 'M':
                            updates_lst.append(str(file_path))
                        elif change_type == 'D':
                            deletes_lst.append(str(file_path))
                    else:
                        affected_filenames = change['filename']
                        from_filename = Path(affected_filenames[0])
                        to_filename = Path(affected_filenames[1])

                        # skip directories not read when creating the initial index & skip invalid files
                        if is_in_exlcuded_dir(from_filename) or is_in_excluded_format(from_filename) or \
                                is_in_exlcuded_dir(to_filename) or is_in_excluded_format(to_filename):
                            continue

                        # if I get here then there is at least 1 change in that commit that is processed
                        is_processed = True
                        from_filename = codebase_path / from_filename
                        to_filename = codebase_path / to_filename
                        print('-> Parsing change [', change_type, '] for renamed/moved file [',
                              from_filename, ']', 'to [', to_filename, ']')
                        renames_lst.append((str(from_filename), str(to_filename)))

                if is_processed:
                    changes_handler = ChangesHandler(lsh_index, codebase, deletes_lst,
                                                     updates_lst, creates_lst, renames_lst)
                    # start incremental step timer
                    start = timer()
                    # handle commit changes
                    changes_handler.handle_changes()
                    # end incremental step timer
                    end = timer()

                    time_diff = round(end - start, 5)
                    print("Detection/Index update time: " + str(time_diff) + " seconds")
                    commits_processed += 1
                    incremental_step_time += time_diff
                else:
                    print("Commit " + commit['id'] + " was skipped because all files were excluded")

            # checkout back to HEAD
            subprocess.run(
                ['git', '-C', str(codebase_path), 'checkout', '-'],
                stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

            print("============================================================")
            print("Total LOCs: ", codebase.get__initial_codebase_lines())
            print("Total Index creation time: ", index_cr_diff, " seconds")
            print("Total commits: ", len(commits))
            print("Total commits processed: ", commits_processed)
            if commits_processed > 0:
                print("Average Incremental Step Time: ",
                      round(incremental_step_time / float(commits_processed), 5), " seconds")
            else:
                print("0 commits out of ", len(commits),
                      " were processed. Something went terribly wrong!")
            print("============================================================")
            f.close()
    except IOError:
        traceback.print_exc()
        print("File \"" + str(updates_file_path) + "\" not found.")
def query(self, v, n):
    m = MinHash(num_perm=self._n_perm)
    for e in v:
        m.update(str(e).encode('utf8'))
    return map(int, self._index.query(m, n))
def get_minhash_signature(shingle_set):
    mh = MinHash()
    for el in shingle_set:
        mh.update(el.encode('utf8'))
    return mh.hashvalues
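# A minimal sketch (illustrative data) tying the helpers above together:
# store only the raw hashvalues and rebuild MinHash objects when comparing,
# as compare() does. Assumes the default num_perm on both sides.
sig_a = get_minhash_signature({"the", "quick", "brown", "fox"})
sig_b = get_minhash_signature({"the", "quick", "red", "fox"})
print(compare(sig_a, sig_b))  # estimated Jaccard of the two shingle sets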
def _main():
    if len(sys.argv) != 5:
        usage()

    folder = sys.argv[1]
    label_fn = sys.argv[2]
    k = int(sys.argv[3])
    outFn = sys.argv[4]

    # Get sample labels
    labels = get_labels(folder, label_fn)

    # Randomly choose samples from labels with at least k samples in them
    samples = dict()
    for c in labels:
        if len(labels[c]) < k:
            continue
        for s in random.sample(labels[c], k):
            samples[s] = c

    stats = dict()
    history = dict()

    # Iterate over samples and calculate their similarities
    for s1 in samples:
        c1 = samples[s1]
        if s1 not in history:
            history[s1] = set()
        if c1 not in stats:
            stats[c1] = dict()
            stats[c1]['jaccard'] = dict()
            stats[c1]['lsh'] = dict()
            stats[c1]['hamming'] = dict()

        for s2 in samples:
            # Don't duplicate similarity measurements
            if s1 == s2:
                continue
            if s2 in history:
                if s1 in history[s2]:
                    continue

            c2 = samples[s2]
            if c2 not in stats:
                stats[c2] = dict()
                stats[c2]['jaccard'] = dict()
                stats[c2]['lsh'] = dict()
                stats[c2]['hamming'] = dict()
            if c2 not in stats[c1]['jaccard']:
                stats[c1]['jaccard'][c2] = list()
                stats[c1]['lsh'][c2] = Counter()
                stats[c1]['hamming'][c2] = list()
            if c1 not in stats[c2]['jaccard']:
                stats[c2]['jaccard'][c1] = list()
                stats[c2]['lsh'][c1] = Counter()
                stats[c2]['hamming'][c1] = list()

            # Note that we've compared these samples now
            history[s1].add(s2)

            # Read API sequences
            lseq1 = readFile(folder, s1)
            lseq2 = readFile(folder, s2)
            seq1 = set(lseq1)
            seq2 = set(lseq2)

            # https://ekzhu.github.io/datasketch/lsh.html
            # Compare these two samples
            m1 = MinHash(num_perm=128)
            m2 = MinHash(num_perm=128)
            for d in seq1:
                m1.update(d.encode('utf8'))
            for d in seq2:
                m2.update(d.encode('utf8'))

            # Calculate LSH similarity
            lsh = MinHashLSH(threshold=0.7, num_perm=128)
            lsh.insert(samples[s1], m1)
            result = lsh.query(m2)
            if len(result) == 1:
                rl = True
            else:
                rl = False

            # Calculate Jaccard similarity
            rj = float(len(seq1.intersection(seq2))) / float(len(seq1.union(seq2)))

            # Pad smallest sequence
            if len(lseq1) < len(lseq2):
                diff = len(lseq2) - len(lseq1)
                lseq1.extend([0] * diff)
            elif len(lseq1) > len(lseq2):
                diff = len(lseq1) - len(lseq2)
                lseq2.extend([0] * diff)

            # Calculate Hamming distance
            rh = sum(e1 != e2 for e1, e2 in zip(lseq1, lseq2))

            # Keep track of similarities
            stats[c1]['jaccard'][c2].append(rj)
            stats[c1]['lsh'][c2][rl] += 1
            stats[c1]['hamming'][c2].append(rh)
            stats[c2]['jaccard'][c1].append(rj)
            stats[c2]['lsh'][c1][rl] += 1
            stats[c2]['hamming'][c1].append(rh)

            # Print status
            sys.stdout.write(
                '{0} {4} {1} {5}: Jaccard similarity: {2} | > 0.7 LSH similarity: {3} | Hamming distance: {6}\n'.format(
                    samples[s1], samples[s2], rj, rl, s1, s2, rh))

    # Print summary stats
    with open(outFn, 'w') as fw:
        fw.write('class:\n')
        fw.write('    class jaccard_average LSH_similar_counts hamming_average\n')
        fw.write('\n')
        for c in stats:
            fw.write('{0}:\n'.format(c))
            for c2 in stats[c]['jaccard']:
                add = float(sum(stats[c]['jaccard'][c2]))
                total = float(len(stats[c]['jaccard'][c2]))
                add2 = float(sum(stats[c]['hamming'][c2]))
                total2 = float(len(stats[c]['hamming'][c2]))
                fw.write('    {0} {1} {2} {3}\n'.format(c2, add / total, stats[c]['lsh'][c2], add2 / total2))
def get_mh(values, permutations=512):
    mh = MinHash(num_perm=permutations)
    for el in values:
        mh.update(str(el).encode('utf8'))
    return mh
from datasketch import MinHash, MinHashLSH

data1 = ['这个', '程序', '代码', '太乱', '那个', '代码', '规范']
data2 = ['这个', '程序', '代码', '不', '规范', '那个', '更', '规范']
data3 = ['这个', '程序', '代码', '不', '规范', '那个', '规范', '些']

# Create MinHash objects
m1 = MinHash()
m2 = MinHash()
m3 = MinHash()
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))

# Create the LSH index
lsh = MinHashLSH(threshold=0.5, num_perm=128)
lsh.insert("m2", m2)
lsh.insert("m3", m3)
result = lsh.query(m1)
print("Approximate neighbours (Jaccard similarity > 0.5):", result)
def make_min_hash(self, words):
    min_hash = MinHash(self.__num_permutation)
    for word in words:
        min_hash.update(word.encode('utf8'))
    return min_hash
data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']
data3 = ['minhash', 'is', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']

# Create MinHash objects
m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
m3 = MinHash(num_perm=128)
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))

# Create a MinHash LSH Forest with the same num_perm parameter
forest = MinHashLSHForest(num_perm=128)

# Add m2 and m3 into the index
forest.add("m2", m2)
forest.add("m3", m3)
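# The snippet stops after adding keys; a minimal continuation following the
# documented MinHashLSHForest API (index() must be called before querying):
forest.index()
print("m2" in forest)         # check membership by key
result = forest.query(m1, 2)  # top-2 keys with (approximately) highest Jaccard to m1
print("Top 2 candidates", result)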
artist_shingle = defaultdict(list)
corpus = processLyrics(corpus)
for artist, lyrics in corpus.items():
    tokens = clean_text(lyrics)
    artist_shingle[artist].append(tokens)

from datasketch import MinHashLSHForest, MinHash
from sklearn.metrics import jaccard_similarity_score

g = []
listlsh = []
lsh = MinHashLSHForest(num_perm=128)
for artist, sets in artist_shingle.items():
    a = MinHash(num_perm=128)
    for d in sets[0]:
        a.update(d.encode('utf8'))
    listlsh.append(a)
    lsh.add(artist, a)
lsh.index()

tester = {}
with open('tester.json') as file:
    tester = json.loads(file.read().encode('latin-1'))

numcorrect_1 = 0
numcorrect_5 = 0
numcorrect_10 = 0
total = 0
for artist, songlist in tester.items():
    for song in songlist: