def query_all_documents_hamminghash(directory, shingle_pickle_filename, hamminghash_buckets_filename, num_hashes):
    """Query every document in *directory* against precomputed hamming-hash LSH buckets.

    Loads the shingle map and the pickled per-band buckets, hashes each query
    document, and collects every source document that shares at least one
    band bucket with it.

    Args:
        directory: Directory containing the query documents.
        shingle_pickle_filename: Pickle file consumed by generate_map().
        hamminghash_buckets_filename: Pickle of the per-band bucket dicts.
        num_hashes: Length of the hamming-hash signature.

    Returns:
        dict mapping each query filename to a sorted list of candidate
        source-document names (empty list when nothing collides).
    """
    generate_map(shingle_pickle_filename)
    with open(hamminghash_buckets_filename, 'rb') as handle:
        hamminghash_buckets = pickle.load(handle)
    # Ceiling division: number of LSH bands; hoisted out of the per-file loop.
    num_bands = (num_hashes + HAMMING_BAND_SIZE - 1) // HAMMING_BAND_SIZE
    results = dict()
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        hamminghash = hamming_hash_single_document(filepath, num_hashes)
        possible_sources = set()
        for i in range(num_bands):
            arr = hamminghash[i * HAMMING_BAND_SIZE:(i + 1) * HAMMING_BAND_SIZE]
            hash_value = vector_hash(arr)
            if hash_value in hamminghash_buckets[i]:
                # Every source document sharing this band bucket is a candidate.
                possible_sources.update(hamminghash_buckets[i][hash_value])
        results[filename] = sorted(possible_sources)
    return results
def query_all_documents_cosinehash(directory, shingle_pickle_filename, cosinehash_buckets_filename, num_hashes):
    """Query every document in *directory* against precomputed cosine-hash LSH buckets.

    Unlike the hamming variant, a candidate source must collide with the query
    in strictly more than COSINE_HASH_THRESHOLD bands to be reported.

    Args:
        directory: Directory containing the query documents.
        shingle_pickle_filename: Pickle file consumed by generate_map().
        cosinehash_buckets_filename: Pickle of the per-band bucket dicts.
        num_hashes: Length of the cosine-hash signature.

    Returns:
        dict mapping each query filename to a sorted list of source-document
        names whose band-collision count exceeds the threshold.
    """
    generate_map(shingle_pickle_filename)
    with open(cosinehash_buckets_filename, 'rb') as handle:
        cosinehash_buckets = pickle.load(handle)
    # Ceiling division: number of LSH bands; hoisted out of the per-file loop.
    num_bands = (num_hashes + COSINE_BAND_SIZE - 1) // COSINE_BAND_SIZE
    results = dict()
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        cosinehash = cosine_hash_single_document(filepath, num_hashes)
        # Count how many bands each source document collides in.
        possible_sources = dict()
        for i in range(num_bands):
            arr = cosinehash[i * COSINE_BAND_SIZE:(i + 1) * COSINE_BAND_SIZE]
            hash_value = vector_hash(arr)
            if hash_value in cosinehash_buckets[i]:
                for source_document in cosinehash_buckets[i][hash_value]:
                    possible_sources[source_document] = possible_sources.get(source_document, 0) + 1
        # Keep only sources above the collision threshold.
        # NOTE: change this comparison if the threshold becomes dynamic.
        ans = [source_document
               for source_document, frequency in possible_sources.items()
               if frequency > COSINE_HASH_THRESHOLD]
        results[filename] = sorted(ans)
    return results
def minhash_all_documents(directory, pickle_filename, num_hashes):
    """MinHash every document in *directory* and pickle the LSH band buckets.

    Builds one bucket dict per band: band hash value -> list of filenames
    that collide in that band. The result is written to
    'minhash_buckets.pickle' in the current working directory.

    Args:
        directory: Directory containing the source documents to index.
        pickle_filename: Pickle file consumed by generate_map().
        num_hashes: Length of the minhash signature.
    """
    generate_map(pickle_filename)
    # Ceiling division: number of LSH bands.
    num_bands = (num_hashes + BAND_SIZE - 1) // BAND_SIZE
    minhash_buckets = [dict() for _ in range(num_bands)]
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        minhash = minhash_document(filepath, num_hashes)
        for i in range(num_bands):
            arr = minhash[i * BAND_SIZE:(i + 1) * BAND_SIZE]
            hash_value = vector_hash(arr)
            # Group all documents that collide on this band's hash.
            minhash_buckets[i].setdefault(hash_value, []).append(filename)
    with open('minhash_buckets.pickle', 'wb') as handle:
        pickle.dump(minhash_buckets, handle)
def hamming_hash_all_documents(directory, pickle_filename, num_hashes):
    """Hamming-hash every document in *directory* and pickle the LSH band buckets.

    Builds one bucket dict per band: band hash value -> list of filenames
    that collide in that band. The result is written to
    'hamminghash_buckets.pickle' in the current working directory.

    Args:
        directory: Directory containing the source documents to index.
        pickle_filename: Pickle file consumed by generate_map().
        num_hashes: Length of the hamming-hash signature.
    """
    generate_map(pickle_filename)
    # Ceiling division: number of LSH bands.
    num_bands = (num_hashes + HAMMING_BAND_SIZE - 1) // HAMMING_BAND_SIZE
    hamminghash_buckets = [dict() for _ in range(num_bands)]
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        hamming_hash = hamming_hash_single_document(filepath, num_hashes)
        for i in range(num_bands):
            arr = hamming_hash[i * HAMMING_BAND_SIZE:(i + 1) * HAMMING_BAND_SIZE]
            hash_value = vector_hash(arr)
            # Group all documents that collide on this band's hash.
            hamminghash_buckets[i].setdefault(hash_value, []).append(filename)
    with open('hamminghash_buckets.pickle', 'wb') as handle:
        pickle.dump(hamminghash_buckets, handle)
def cosine_hash_all_documents(directory, pickle_filename, num_hashes):
    """Cosine-hash every document in *directory* and pickle the LSH band buckets.

    Builds one bucket dict per band: band hash value -> list of filenames
    that collide in that band. The result is written to
    'cosinehash_buckets.pickle' in the current working directory.

    Args:
        directory: Directory containing the source documents to index.
        pickle_filename: Pickle file consumed by generate_map().
        num_hashes: Length of the cosine-hash signature.
    """
    generate_map(pickle_filename)
    # Ceiling division: number of LSH bands.
    num_bands = (num_hashes + COSINE_BAND_SIZE - 1) // COSINE_BAND_SIZE
    cosinehash_buckets = [dict() for _ in range(num_bands)]
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        cosine_hash = cosine_hash_single_document(filepath, num_hashes)
        for i in range(num_bands):
            arr = cosine_hash[i * COSINE_BAND_SIZE:(i + 1) * COSINE_BAND_SIZE]
            hash_value = vector_hash(arr)
            # Group all documents that collide on this band's hash.
            cosinehash_buckets[i].setdefault(hash_value, []).append(filename)
    with open('cosinehash_buckets.pickle', 'wb') as handle:
        pickle.dump(cosinehash_buckets, handle)