Example #1
        # (Snippet begins inside a loop over WARC records; `record` and
        # `record_id` come from that enclosing loop.)
        payload = record.payload.read()
        doc_uri[record_id] = record['WARC-Target-URI']
        text = HTMLPreprocessing(payload).get_text()
        doc_dict[record_id] = text
        doc_count += 1

print('create vectors')
tfidf = TFIDF(doc_dict)
vect_length = tfidf.vect_length  # length of the input vector
num_hashtables = 1  # number of iterations
digest_length = 0
print('perform lsh')
lsh = LSH(digest_length, vect_length, num_hashtables=num_hashtables)
for i, doc_id in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=doc_id)
# Query documents
dedup = set()
keys = lsh.hash_tables[0].keys()
i = 0
for key in keys:
    bucket = lsh.hash_tables[0].get_val(key)
    for query_object in bucket:
        candidates = lsh.query(query_object[0], distance_func='cosine')
        for c in candidates:
            # The WARC record id was appended as extra_data in lsh.index(),
            # so each candidate c is ((vector, extra_data), distance).
            candidate_key = c[0][1]
            if candidate_key == query_object[1]:
                continue
            if str(query_object[1]) <= str(candidate_key):
                candidate_distance = c[1]
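
The loop is truncated at the distance computation. A plausible continuation (a sketch: DUP_THRESHOLD is a hypothetical cutoff, not part of the original code) would record near-duplicates in the dedup set built above:

                # Hypothetical continuation: candidates closer than the
                # cutoff are treated as duplicates of the query document.
                if candidate_distance < DUP_THRESHOLD:
                    dedup.add(candidate_key)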
Example #2
class TestLsh(TestCase):
    """TODO: Test Case docstring goes here."""
    def setUp(self):
        self.lsh = LSH(3, 2, 1)
        self.lsh_two_tables = LSH(3, 2, 2)

        # Overwrite randomly initialized planes with known values.
        self.lsh.planes = [np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]])]
        self.lsh_two_tables.planes = [
            np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]]),
            np.array([[-0.1, -0.2], [0.1, 0.2], [-2.0, 2.0]]),
        ]

    def test_hashing(self):
        vector_ones = [1, 1]
        # Hashing takes the dot product of the vector with each plane:
        # each product greater than zero appends a '1' to the string, '0' otherwise.
        self.assertEqual(self.lsh.hash(self.lsh.planes[0], vector_ones), "100")

        vector_twos = [-2, 2]
        self.assertEqual(self.lsh.hash(self.lsh.planes[0], vector_twos), "101")

    def test_table_indexing(self):
        self.lsh.index([1, 1], "data1")
        self.lsh.index([-2, 2], "data2")
        self.assertDictEqual(self.lsh.hash_tables[0], {
            "100": [([1, 1], "data1")],
            "101": [([-2, 2], "data2")]
        })

        self.lsh_two_tables.index([1, 1], "data1")
        self.lsh_two_tables.index([-2, 2], "data2")
        self.assertDictEqual(
            self.lsh_two_tables.hash_tables[0],
            {
                "100": [([1, 1], "data1")],
                "101": [([-2, 2], "data2")]
            },
        )
        self.assertDictEqual(
            self.lsh_two_tables.hash_tables[1],
            {
                "010": [([1, 1], "data1")],
                "011": [([-2, 2], "data2")]
            },
        )

    def test_query(self):
        self.lsh.index([1, 1], "data1")
        self.lsh.index([-2, 2], "data2")
        output = self.lsh.query([1, 1], 1)
        self.assertEqual(output, ["data1"])

        self.lsh_two_tables.index([1, 1], "data1")
        self.lsh_two_tables.index([-2, 2], "data2")
        output = self.lsh_two_tables.query([1, 1], 1)
        self.assertEqual(output, ["data1"])

        self.lsh_two_tables.index([-1, -1], "data3")
        self.lsh_two_tables.index([6, 6], "data4")
        self.lsh_two_tables.index([-10, -10], "data5")
        output = self.lsh_two_tables.query([6, 6], 2)
        self.assertEqual(output, ["data4", "data1"])
Example #3
def main(args):
    # Get input params
    input_dir = args["dir"]
    th = args["th"]

    # Read all files contained in the input directory
    print("Loading documents...")
    onlyfiles = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
    docs = []
    for fname in onlyfiles:
        with open(join(input_dir, fname), "r") as doc_file:
            docs.append(doc_file.read())

    # Clean documents, collapsing runs of non-word characters into single spaces
    print("Cleaning documents...")
    docs = [re.sub(r'\W+', ' ', doc) for doc in docs]

    # Compute shingles of size n
    print("Computing shingles...")
    sh = Shingling(args["n"])
    shingles = sh.transform(docs)

    # Compute jaccard similarities
    print("Jaccard similarities (on hashed shingles) > " + str(th) + ":")
    similarities = {(onlyfiles[i], onlyfiles[j]):
                    compare_shingles(shingles[i], shingles[j])
                    for i in range(0, len(docs))
                    for j in range(i + 1, len(docs))}
    # Show similarities greater than the threshold
    print(
        sorted([(k, v) for k, v in similarities.items() if v > th],
               key=itemgetter(1),
               reverse=True))

    # Compute minHash signatures
    print("Computing signatures...")
    mh = MinHashing(args["k"])
    signatures = mh.transform(shingles)

    # Compute similarity estimations
    print("Similarity estimations using minHashing > " + str(th) + ":")
    estimations = {(onlyfiles[i], onlyfiles[j]):
                   compare_signatures(signatures[:, i], signatures[:, j])
                   for i in range(0, len(docs))
                   for j in range(i + 1, len(docs))}
    # Show similarity estimations greater than a threshold
    print(
        sorted([(k, v) for k, v in estimations.items() if v > th],
               key=itemgetter(1),
               reverse=True))

    # Compute differences between estimations and the real similarities
    errors = {(onlyfiles[i], onlyfiles[j]):
              abs(estimations[(onlyfiles[i], onlyfiles[j])] -
                  similarities[(onlyfiles[i], onlyfiles[j])])
              for i in range(0, len(docs)) for j in range(i + 1, len(docs))}
    # Show errors greater than 5%
    print("Estimaions with error greater than 5%:")
    print(
        sorted([(k, v) for k, v in errors.items() if v > 0.05],
               key=itemgetter(1),
               reverse=True))

    # Apply LSH to find pairs of probable similar items
    lsh = LSH(signatures, th)
    lsh.index()
    candidates = lsh.get_pairs()

    # Show candidates
    print("Identified candidates with LSH:")
    print([(onlyfiles[t[0]], onlyfiles[t[1]]) for t in candidates])
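
compare_shingles and compare_signatures are not defined in this snippet. Plausible definitions, inferred from how they are called (a sketch; the real helpers may differ), compute the exact Jaccard similarity on shingle sets and its minHash estimate:

def compare_shingles(a, b):
    # Exact Jaccard similarity of two sets of hashed shingles.
    return len(a & b) / len(a | b)


def compare_signatures(sig_a, sig_b):
    # Fraction of minHash rows on which the two signatures agree;
    # this is an unbiased estimate of the Jaccard similarity.
    return sum(x == y for x, y in zip(sig_a, sig_b)) / len(sig_a)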
Example #4
    # Perform similarity search and get TF-IDF scores of the question tokens
    from similaritySearch import SimilaritySearch
    similaritySearchObj = SimilaritySearch(questionTokens)
    docList = similaritySearchObj.term_document_matrix

    print("Update: TF-IDF Generation Complete")

    print(docList.shape)
    # Now add all the docs to the LSH index.

    # Convert the dense TF-IDF matrix to a sparse representation to save memory.
    from scipy.sparse import csr_matrix
    matrix = csr_matrix(docList)

    print('Update: Converted TF-IDF matrix to sparse matrix')

    lsh = LSH(8,
              matrix.shape[1],
              num_hashtables=10,
              storage_config={"dict": None})

    print("Update: LSH initialised")
    for ix in range(matrix.shape[0]):
        x = matrix.getrow(ix)
        lsh.index(x, extra_data=ix)

    print("Update: LSH indexing Complete")

    # Get the buckets satisfying a given criterion
    lsh.getBestRepresentative(listOfDocs)
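
The snippet stops after indexing; getBestRepresentative and listOfDocs belong to code not shown here. Assuming an lshash-style query API (an assumption, not confirmed by the snippet), probing the index for one document's neighbours could look like:

    # Hypothetical follow-up: query with one TF-IDF row and print the
    # indices of its nearest bucket mates (cosine matches the TF-IDF space).
    probe = matrix.getrow(0)
    for (vec, doc_ix), dist in lsh.query(probe, num_results=5,
                                         distance_func='cosine'):
        print('doc', doc_ix, 'distance', dist)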