# Per-record extraction (record and record_id come from the enclosing
# loop over WARC records).
payload = record.payload.read()
doc_uri[record_id] = record['WARC-Target-URI']
text = HTMLPreprocessing(payload).get_text()
doc_dict[record_id] = text
doc_count += 1

print('create vectors')
tfidf = TFIDF(doc_dict)
vect_length = tfidf.vect_length  # length of the input vector
num_hashtables = 1  # number of iterations
digest_length = 0

print('perform lsh')
lsh = LSH(digest_length, vect_length, num_hashtables=num_hashtables)
for i, k in enumerate(tfidf._id_list):
    vect = tfidf.get_vector(i)
    lsh.index(vect, extra_data=tfidf._id_list[i])

# Query documents.
dedup = set()
keys = lsh.hash_tables[0].keys()
for key in keys:
    bucket = lsh.hash_tables[0].get_val(key)
    for query_object in bucket:
        candidates = lsh.query(query_object[0], distance_func='cosine')
        for c in candidates:
            # The WARC id is appended as extra data in lsh.index().
            candidate_key = c[0][1]
            if candidate_key == query_object[1]:
                continue
            # Visit each pair only once, ordering the ids lexicographically.
            if str(query_object[1]) <= str(candidate_key):
                candidate_distance = c[1]
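                # A hedged continuation sketch: assuming a cosine-distance
                # cutoff marks near-duplicates (0.1 is an illustrative value,
                # not taken from the code above), record the duplicate's id.
                if candidate_distance < 0.1:
                    dedup.add(candidate_key)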
import numpy as np
from unittest import TestCase


class TestLsh(TestCase):
    """Unit tests for the LSH class."""

    def setUp(self):
        self.lsh = LSH(3, 2, 1)
        self.lsh_two_tables = LSH(3, 2, 2)
        # Overwrite randomly initialized planes with known values.
        self.lsh.planes = [np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]])]
        self.lsh_two_tables.planes = [
            np.array([[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]]),
            np.array([[-0.1, -0.2], [0.1, 0.2], [-2.0, 2.0]]),
        ]

    def test_hashing(self):
        vector_ones = [1, 1]
        # Each plane is dotted with the vector; a projection greater than
        # zero appends "1" to the hash string, "0" otherwise.
        self.assertEqual(self.lsh.hash(self.lsh.planes[0], vector_ones), "100")
        vector_twos = [-2, 2]
        self.assertEqual(self.lsh.hash(self.lsh.planes[0], vector_twos), "101")

    def test_table_indexing(self):
        self.lsh.index([1, 1], "data1")
        self.lsh.index([-2, 2], "data2")
        self.assertDictEqual(self.lsh.hash_tables[0], {
            "100": [([1, 1], "data1")],
            "101": [([-2, 2], "data2")]
        })
        self.lsh_two_tables.index([1, 1], "data1")
        self.lsh_two_tables.index([-2, 2], "data2")
        self.assertDictEqual(
            self.lsh_two_tables.hash_tables[0],
            {
                "100": [([1, 1], "data1")],
                "101": [([-2, 2], "data2")]
            },
        )
        self.assertDictEqual(
            self.lsh_two_tables.hash_tables[1],
            {
                "010": [([1, 1], "data1")],
                "011": [([-2, 2], "data2")]
            },
        )

    def test_query(self):
        self.lsh.index([1, 1], "data1")
        self.lsh.index([-2, 2], "data2")
        output = self.lsh.query([1, 1], 1)
        self.assertEqual(output, ["data1"])
        self.lsh_two_tables.index([1, 1], "data1")
        self.lsh_two_tables.index([-2, 2], "data2")
        output = self.lsh_two_tables.query([1, 1], 1)
        self.assertEqual(output, ["data1"])
        self.lsh_two_tables.index([-1, -1], "data3")
        self.lsh_two_tables.index([6, 6], "data4")
        self.lsh_two_tables.index([-10, -10], "data5")
        output = self.lsh_two_tables.query([6, 6], 2)
        self.assertEqual(output, ["data4", "data1"])
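# A minimal sketch, consistent with test_hashing above, of the hyperplane
# hashing those tests exercise. Assumption: LSH.hash projects the vector
# onto each plane and emits "1" for a strictly positive projection and "0"
# otherwise; hyperplane_hash is an illustrative stand-in, not the class API.
import numpy as np

def hyperplane_hash(planes, vector):
    projections = np.dot(planes, vector)  # one dot product per plane
    return "".join("1" if p > 0 else "0" for p in projections)

# For planes [[0.1, 0.2], [-0.1, -0.2], [-1.0, 1.0]] and vector [1, 1] the
# projections are (0.3, -0.3, 0.0), reproducing the expected hash "100".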
import re
from operator import itemgetter
from os import listdir
from os.path import isfile, join


def main(args):
    # Get input parameters.
    input_dir = args["dir"]
    th = args["th"]

    # Read all files contained in the input directory.
    print("Loading documents...")
    onlyfiles = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
    docs = []
    for fname in onlyfiles:
        with open(join(input_dir, fname), "r") as fp:
            docs.append(fp.read())

    # Clean documents, collapsing non-word characters into single blanks.
    print("Cleaning documents...")
    docs = [re.sub(r'\W+', ' ', doc) for doc in docs]

    # Compute shingles of size n.
    print("Computing shingles...")
    sh = Shingling(args["n"])
    shingles = sh.transform(docs)

    # Compute Jaccard similarities for every document pair.
    print("Jaccard similarities (on hashed shingles) > " + str(th) + ":")
    similarities = {(onlyfiles[i], onlyfiles[j]):
                    compare_shingles(shingles[i], shingles[j])
                    for i in range(0, len(docs))
                    for j in range(i + 1, len(docs))}
    # Show similarities greater than the threshold.
    print(sorted([(k, v) for k, v in similarities.items() if v > th],
                 key=itemgetter(1), reverse=True))

    # Compute minHash signatures.
    print("Computing signatures...")
    mh = MinHashing(args["k"])
    signatures = mh.transform(shingles)

    # Compute similarity estimations from the signatures.
    print("Similarity estimations using minHashing > " + str(th) + ":")
    estimations = {(onlyfiles[i], onlyfiles[j]):
                   compare_signatures(signatures[:, i], signatures[:, j])
                   for i in range(0, len(docs))
                   for j in range(i + 1, len(docs))}
    # Show similarity estimations greater than the threshold.
    print(sorted([(k, v) for k, v in estimations.items() if v > th],
                 key=itemgetter(1), reverse=True))

    # Show differences between estimations and real similarities.
    errors = {(onlyfiles[i], onlyfiles[j]):
              abs(estimations[(onlyfiles[i], onlyfiles[j])] -
                  similarities[(onlyfiles[i], onlyfiles[j])])
              for i in range(0, len(docs))
              for j in range(i + 1, len(docs))}
    # Show errors greater than 5%.
    print("Estimations with error greater than 5%:")
    print(sorted([(k, v) for k, v in errors.items() if v > 0.05],
                 key=itemgetter(1), reverse=True))

    # Apply LSH to find pairs of probably similar items.
    lsh = LSH(signatures, th)
    lsh.index()
    candidates = lsh.get_pairs()
    # Show candidates.
    print("Identified candidates with LSH:")
    print([(onlyfiles[t[0]], onlyfiles[t[1]]) for t in candidates])
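# Example invocation of main(); the values are illustrative, but the keys
# ("dir", "n", "k", "th") are exactly the ones main() reads above.
if __name__ == "__main__":
    main({"dir": "./documents", "n": 5, "k": 100, "th": 0.8})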
# Perform similarity search and get TF-IDF scores of question tokens.
from scipy.sparse import csr_matrix

from similaritySearch import SimilaritySearch

similaritySearchObj = SimilaritySearch(questionTokens)
docList = similaritySearchObj.term_document_matrix
print("Update: TF-IDF Generation Complete")
print(docList.shape)

# Now add all the docs to the LSH. First reduce the memory footprint of
# the term-document matrix with a sparse representation.
matrix = csr_matrix(docList)
print('Update: Converted TF-IDF matrix to sparse matrix')

lsh = LSH(8, matrix.shape[1], num_hashtables=10,
          storage_config={"dict": None})
print("Update: LSH initialised")
for ix in range(matrix.shape[0]):
    x = matrix.getrow(ix)
    lsh.index(x, extra_data=ix)
print("Update: LSH indexing Complete")

# Get the buckets satisfying a given criterion.
lsh.getBestRepresentative(listOfDocs)
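# A follow-up sketch, assuming this lsh object exposes the same query()
# interface used in the WARC snippet above (returning
# ((vector, extra_data), distance) tuples): fetch candidate matches for a
# single TF-IDF row.
query_row = matrix.getrow(0)
for (vec, doc_ix), distance in lsh.query(query_row, distance_func="cosine"):
    print(doc_ix, distance)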