# Per-record extraction (record and record_id come from the enclosing
# loop over WARC records).
payload = record.payload.read()
doc_uri[record_id] = record['WARC-Target-URI']
text = HTMLPreprocessing(payload).get_text()
doc_dict[record_id] = text
doc_count += 1

print('create vectors')
tfidf = TFIDF(doc_dict)
vect_length = tfidf.vect_length  # length of the input vector
num_hashtables = 1  # number of iterations
digest_length = 0

print('perform lsh')
lsh = LSH(digest_length, vect_length, num_hashtables=num_hashtables)
for i, k in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=tfidf._id_list[i])

# Query documents.
dedup = set()
keys = lsh.hash_tables[0].keys()
for key in keys:
    bucket = lsh.hash_tables[0].get_val(key)
    for query_object in bucket:
        candidates = lsh.query(query_object[0], distance_func='cosine')
        for c in candidates:
            # The WARC id is appended as extra data in lsh.index().
            candidate_key = c[0][1]
            if candidate_key == query_object[1]:
                continue
            # Visit each pair only once, ordering the ids lexicographically.
            if str(query_object[1]) <= str(candidate_key):
                candidate_distance = c[1]
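                # A hedged continuation sketch: assuming a cosine-distance
                # cutoff marks near-duplicates (0.1 is an illustrative value,
                # not taken from the code above), record the duplicate's id.
                if candidate_distance < 0.1:
                    dedup.add(candidate_key)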
import numpy as np
from unittest import TestCase


class TestLsh(TestCase):
    """Unit tests for the LSH class."""

    def setUp(self):
        self.lsh = LSH(3, 2, 1)
        self.lsh_two_tables = LSH(3, 2, 2)
        # Overwrite randomly initialized planes with known values.
        self.lsh.planes = [np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]])]
        self.lsh_two_tables.planes = [
            np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]]),
            np.array([[-0.1, -0.2], [0.1, 0.2], [-2.0, 2.0]]),
        ]

    def test_hashing(self):
        vector_ones = [1, 1]
        # Each plane is dotted with the vector; a projection greater than
        # zero appends "1" to the hash string, "0" otherwise.
        self.assertEqual(self.lsh.hash(self.lsh.planes[0], vector_ones), "100")
        vector_twos = [-2, 2]
        self.assertEqual(self.lsh.hash(self.lsh.planes[0], vector_twos), "101")

    def test_table_indexing(self):
        self.lsh.index([1, 1], "data1")
        self.lsh.index([-2, 2], "data2")
        self.assertDictEqual(self.lsh.hash_tables[0], {
            "100": [([1, 1], "data1")],
            "101": [([-2, 2], "data2")]
        })
        self.lsh_two_tables.index([1, 1], "data1")
        self.lsh_two_tables.index([-2, 2], "data2")
        self.assertDictEqual(
            self.lsh_two_tables.hash_tables[0],
            {
                "100": [([1, 1], "data1")],
                "101": [([-2, 2], "data2")]
            },
        )
        self.assertDictEqual(
            self.lsh_two_tables.hash_tables[1],
            {
                "010": [([1, 1], "data1")],
                "011": [([-2, 2], "data2")]
            },
        )

    def test_query(self):
        self.lsh.index([1, 1], "data1")
        self.lsh.index([-2, 2], "data2")
        output = self.lsh.query([1, 1], 1)
        self.assertEqual(output, ["data1"])
        self.lsh_two_tables.index([1, 1], "data1")
        self.lsh_two_tables.index([-2, 2], "data2")
        output = self.lsh_two_tables.query([1, 1], 1)
        self.assertEqual(output, ["data1"])
        self.lsh_two_tables.index([-1, -1], "data3")
        self.lsh_two_tables.index([6, 6], "data4")
        self.lsh_two_tables.index([-10, -10], "data5")
        output = self.lsh_two_tables.query([6, 6], 2)
        self.assertEqual(output, ["data4", "data1"])
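# A minimal sketch, consistent with test_hashing above, of the hyperplane
# hashing those tests exercise. Assumption: LSH.hash projects the vector
# onto each plane and emits "1" for a strictly positive projection and "0"
# otherwise; hyperplane_hash is an illustrative stand-in, not the class API.
import numpy as np

def hyperplane_hash(planes, vector):
    projections = np.dot(planes, vector)  # one dot product per plane
    return "".join("1" if p > 0 else "0" for p in projections)

# For planes [[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]] and vector [1, 1] the
# projections are (0.3, -0.3, 0.0), reproducing the expected hash "100".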
import re
from operator import itemgetter
from os import listdir
from os.path import isfile, join


def main(args):
    # Get input parameters.
    input_dir = args["dir"]
    th = args["th"]

    # Read all files contained in the input directory.
    print("Loading documents...")
    onlyfiles = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
    docs = []
    for fname in onlyfiles:
        with open(join(input_dir, fname), "r") as fp:
            docs.append(fp.read())

    # Clean documents, collapsing non-word characters into single blanks.
    print("Cleaning documents...")
    docs = [re.sub(r'\W+', ' ', doc) for doc in docs]

    # Compute shingles of size n.
    print("Computing shingles...")
    sh = Shingling(args["n"])
    shingles = sh.transform(docs)

    # Compute Jaccard similarities for every document pair.
    print("Jaccard similarities (on hashed shingles) > " + str(th) + ":")
    similarities = {(onlyfiles[i], onlyfiles[j]):
                    compare_shingles(shingles[i], shingles[j])
                    for i in range(0, len(docs))
                    for j in range(i + 1, len(docs))}
    # Show similarities greater than the threshold.
    print(sorted([(k, v) for k, v in similarities.items() if v > th],
                 key=itemgetter(1), reverse=True))

    # Compute minHash signatures.
    print("Computing signatures...")
    mh = MinHashing(args["k"])
    signatures = mh.transform(shingles)

    # Compute similarity estimations from the signatures.
    print("Similarity estimations using minHashing > " + str(th) + ":")
    estimations = {(onlyfiles[i], onlyfiles[j]):
                   compare_signatures(signatures[:, i], signatures[:, j])
                   for i in range(0, len(docs))
                   for j in range(i + 1, len(docs))}
    # Show similarity estimations greater than the threshold.
    print(sorted([(k, v) for k, v in estimations.items() if v > th],
                 key=itemgetter(1), reverse=True))

    # Show differences between estimations and real similarities.
    errors = {(onlyfiles[i], onlyfiles[j]):
              abs(estimations[(onlyfiles[i], onlyfiles[j])] -
                  similarities[(onlyfiles[i], onlyfiles[j])])
              for i in range(0, len(docs))
              for j in range(i + 1, len(docs))}
    # Show errors greater than 5%.
    print("Estimations with error greater than 5%:")
    print(sorted([(k, v) for k, v in errors.items() if v > 0.05],
                 key=itemgetter(1), reverse=True))

    # Apply LSH to find pairs of probably similar items.
    lsh = LSH(signatures, th)
    lsh.index()
    candidates = lsh.get_pairs()
    # Show candidates.
    print("Identified candidates with LSH:")
    print([(onlyfiles[t[0]], onlyfiles[t[1]]) for t in candidates])
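# Example invocation of main(); the values are illustrative, but the keys
# ("dir", "n", "k", "th") are exactly the ones main() reads above.
if __name__ == "__main__":
    main({"dir": "./documents", "n": 5, "k": 100, "th": 0.8})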
# Perform similarity search and get TF-IDF scores of question tokens.
from scipy.sparse import csr_matrix

from similaritySearch import SimilaritySearch

similaritySearchObj = SimilaritySearch(questionTokens)
docList = similaritySearchObj.term_document_matrix
print("Update: TF-IDF Generation Complete")
print(docList.shape)

# Now add all the docs to the LSH. First reduce the memory footprint of
# the term-document matrix with a sparse representation.
matrix = csr_matrix(docList)
print('Update: Converted TF-IDF matrix to sparse matrix')

lsh = LSH(8, matrix.shape[1], num_hashtables=10,
          storage_config={"dict": None})
print("Update: LSH initialised")
for ix in range(matrix.shape[0]):
    x = matrix.getrow(ix)
    lsh.index(x, extra_data=ix)
print("Update: LSH indexing Complete")

# Get the buckets satisfying a given criterion.
lsh.getBestRepresentative(listOfDocs)
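# A follow-up sketch, assuming this lsh object exposes the same query()
# interface used in the WARC snippet above (returning
# ((vector, extra_data), distance) tuples): fetch candidate matches for a
# single TF-IDF row.
query_row = matrix.getrow(0)
for (vec, doc_ix), distance in lsh.query(query_row, distance_func="cosine"):
    print(doc_ix, distance)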