# Example 1
def query_all_documents_hamminghash(directory, shingle_pickle_filename,
                                    hamminghash_buckets_filename, num_hashes):
    """Query every document in *directory* against pre-built Hamming-hash
    LSH buckets and collect candidate source documents.

    Parameters
    ----------
    directory : str
        Directory containing the query documents.
    shingle_pickle_filename : str
        Pickle file passed to generate_map() to rebuild the shingle map.
    hamminghash_buckets_filename : str
        Pickle file holding the per-band bucket dicts produced by
        hamming_hash_all_documents().
    num_hashes : int
        Length of each document's Hamming signature.

    Returns
    -------
    dict
        Maps each query filename to a sorted list of source documents that
        shared at least one band bucket with it.
    """
    generate_map(shingle_pickle_filename)
    with open(hamminghash_buckets_filename, 'rb') as handle:
        hamminghash_buckets = pickle.load(handle)

    # Number of bands, rounded up so a partial final band still counts.
    num_bands = (num_hashes + HAMMING_BAND_SIZE - 1) // HAMMING_BAND_SIZE

    results = {}
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        hamminghash = hamming_hash_single_document(filepath, num_hashes)
        possible_sources = set()

        for i in range(num_bands):
            band = hamminghash[i * HAMMING_BAND_SIZE:(i + 1) * HAMMING_BAND_SIZE]
            hash_value = vector_hash(band)
            # Every document already in this band's bucket is a candidate;
            # .get avoids the separate membership test + second lookup.
            possible_sources.update(hamminghash_buckets[i].get(hash_value, ()))

        results[filename] = sorted(possible_sources)
    return results
# Example 2
def query_all_documents_cosinehash(directory, shingle_pickle_filename,
                                   cosinehash_buckets_filename, num_hashes):
    """Query every document in *directory* against pre-built cosine-hash
    LSH buckets, counting band collisions per candidate source.

    A source document is reported only if it collided with the query in
    strictly more than COSINE_HASH_THRESHOLD bands.

    Parameters
    ----------
    directory : str
        Directory containing the query documents.
    shingle_pickle_filename : str
        Pickle file passed to generate_map() to rebuild the shingle map.
    cosinehash_buckets_filename : str
        Pickle file holding the per-band bucket dicts produced by
        cosine_hash_all_documents().
    num_hashes : int
        Length of each document's cosine signature.

    Returns
    -------
    dict
        Maps each query filename to a sorted list of source documents whose
        band-collision frequency exceeded the threshold.
    """
    generate_map(shingle_pickle_filename)
    with open(cosinehash_buckets_filename, 'rb') as handle:
        cosinehash_buckets = pickle.load(handle)

    # Number of bands, rounded up so a partial final band still counts.
    num_bands = (num_hashes + COSINE_BAND_SIZE - 1) // COSINE_BAND_SIZE

    results = {}
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        cosinehash = cosine_hash_single_document(filepath, num_hashes)
        collision_counts = {}

        for i in range(num_bands):
            band = cosinehash[i * COSINE_BAND_SIZE:(i + 1) * COSINE_BAND_SIZE]
            hash_value = vector_hash(band)
            for source_document in cosinehash_buckets[i].get(hash_value, ()):
                collision_counts[source_document] = (
                    collision_counts.get(source_document, 0) + 1)

        # Keep only frequently-colliding sources.
        # NOTE: strict '>' comparison; change if threshold becomes dynamic.
        results[filename] = sorted(
            src for src, freq in collision_counts.items()
            if freq > COSINE_HASH_THRESHOLD)
    return results
def minhash_all_documents(directory, pickle_filename, num_hashes,
                          output_filename='minhash_buckets.pickle'):
    """Build per-band MinHash LSH buckets for every document in *directory*
    and pickle the bucket list to *output_filename*.

    Parameters
    ----------
    directory : str
        Directory containing the source documents to index.
    pickle_filename : str
        Pickle file passed to generate_map() to rebuild the shingle map.
    num_hashes : int
        Length of each document's MinHash signature.
    output_filename : str, optional
        Destination pickle; defaults to the historical hard-coded name so
        existing callers are unaffected.
    """
    generate_map(pickle_filename)
    # Number of bands, rounded up so a partial final band still counts.
    num_bands = (num_hashes + BAND_SIZE - 1) // BAND_SIZE
    minhash_buckets = [dict() for _ in range(num_bands)]

    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        minhash = minhash_document(filepath, num_hashes)
        for i in range(num_bands):
            band = minhash[i * BAND_SIZE:(i + 1) * BAND_SIZE]
            hash_value = vector_hash(band)
            # setdefault replaces the membership-test-then-branch pattern.
            minhash_buckets[i].setdefault(hash_value, []).append(filename)

    with open(output_filename, 'wb') as handle:
        pickle.dump(minhash_buckets, handle)
def hamming_hash_all_documents(directory, pickle_filename, num_hashes,
                               output_filename='hamminghash_buckets.pickle'):
    """Build per-band Hamming-hash LSH buckets for every document in
    *directory* and pickle the bucket list to *output_filename*.

    Parameters
    ----------
    directory : str
        Directory containing the source documents to index.
    pickle_filename : str
        Pickle file passed to generate_map() to rebuild the shingle map.
    num_hashes : int
        Length of each document's Hamming signature.
    output_filename : str, optional
        Destination pickle; defaults to the historical hard-coded name so
        existing callers are unaffected.
    """
    generate_map(pickle_filename)
    # Number of bands, rounded up so a partial final band still counts.
    num_bands = (num_hashes + HAMMING_BAND_SIZE - 1) // HAMMING_BAND_SIZE
    hamminghash_buckets = [dict() for _ in range(num_bands)]

    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        hamming_hash = hamming_hash_single_document(filepath, num_hashes)
        for i in range(num_bands):
            band = hamming_hash[i * HAMMING_BAND_SIZE:(i + 1) * HAMMING_BAND_SIZE]
            hash_value = vector_hash(band)
            # setdefault replaces the membership-test-then-branch pattern.
            hamminghash_buckets[i].setdefault(hash_value, []).append(filename)

    with open(output_filename, 'wb') as handle:
        pickle.dump(hamminghash_buckets, handle)
def cosine_hash_all_documents(directory, pickle_filename, num_hashes,
                              output_filename='cosinehash_buckets.pickle'):
    """Build per-band cosine-hash LSH buckets for every document in
    *directory* and pickle the bucket list to *output_filename*.

    Parameters
    ----------
    directory : str
        Directory containing the source documents to index.
    pickle_filename : str
        Pickle file passed to generate_map() to rebuild the shingle map.
    num_hashes : int
        Length of each document's cosine signature.
    output_filename : str, optional
        Destination pickle; defaults to the historical hard-coded name so
        existing callers are unaffected.
    """
    generate_map(pickle_filename)
    # Number of bands, rounded up so a partial final band still counts.
    num_bands = (num_hashes + COSINE_BAND_SIZE - 1) // COSINE_BAND_SIZE
    cosinehash_buckets = [dict() for _ in range(num_bands)]

    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        cosine_hash = cosine_hash_single_document(filepath, num_hashes)
        for i in range(num_bands):
            band = cosine_hash[i * COSINE_BAND_SIZE:(i + 1) * COSINE_BAND_SIZE]
            hash_value = vector_hash(band)
            # setdefault replaces the membership-test-then-branch pattern.
            cosinehash_buckets[i].setdefault(hash_value, []).append(filename)

    with open(output_filename, 'wb') as handle:
        pickle.dump(cosinehash_buckets, handle)