Python LSH.query примеры использования

Язык программирования: Python

Пространство имен/Пакет: snapy

Класс/Тип: LSH

Метод/Функция: query

Примеров на hotexamples.com: 5

Python LSH.query - 5 примеров найдено. Это лучшие примеры Python кода для snapy.LSH.query, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

LSH(19)

adjacency_list(6)

query(5)

update(4)

contains(2)

edge_list(2)

remove(2)

Пример #1

Показать файл

Файл: Cluster.py Проект: Daybreak2019/SLTCS

class LshCluster():
    """Build a MinHash/LSH model over a list of strings and query it by index.

    Every input string is labelled with its position in the input sequence,
    so queries are made with integer indices.
    """

    def __init__(self, Content, nGram=3, PrenutNum=16, BandNum=8, MinJaccard=0.2):
        """Remember the model parameters and build the LSH model right away.

        Args:
            Content: Sized sequence of strings to index.
            nGram: Passed to ``MinHash`` as ``n_gram``.
            PrenutNum: Passed to ``MinHash`` as ``permutations``.
            BandNum: Passed to ``LSH`` as ``no_of_bands``.
            MinJaccard: ``min_jaccard`` threshold used both when sizing the
                adjacency list and when querying.
        """
        self.nGram = nGram
        self.BandNum = BandNum
        self.PrenutNum = PrenutNum
        self.MinJaccard = MinJaccard
        # Fixed seed handed to MinHash on every build.
        self.Seed = 3
        self.CreateLsh(Content)

    def Transform(self, Contexts):
        """Return the texts stripped, with underscores removed, lower-cased."""
        return [text.strip().replace("_", "").lower() for text in Contexts]

    def CreateLsh(self, InContext):
        """Hash the normalized texts and store the resulting LSH model.

        Sets ``self.Lsh`` and ``self.MaxIndex``; the latter is the number of
        entries in the model's adjacency list at ``self.MinJaccard``.
        """
        # Labels are simply the positions of the texts in the input.
        positions = range(len(InContext))
        normalized = self.Transform(InContext)
        signatures = MinHash(
            normalized,
            n_gram=self.nGram,
            permutations=self.PrenutNum,
            hash_bits=64,
            seed=self.Seed,
        )
        self.Lsh = LSH(signatures, positions, no_of_bands=self.BandNum)
        adjacency = self.Lsh.adjacency_list(min_jaccard=self.MinJaccard)
        self.MaxIndex = len(adjacency)

    def QuerySimilars(self, Index):
        """Return the model's matches for ``Index``.

        An index at or beyond ``self.MaxIndex`` yields an empty list without
        consulting the model.
        """
        return [] if Index >= self.MaxIndex else self.Lsh.query(Index, min_jaccard=self.MinJaccard)

Пример #2

Показать файл

def find_near_duplicate(dataset, query, targets, labels, min_jaccard_value, no_of_bands, n_permutations, n_gram, n_gram_type='char'):
    """Find near duplicates of the first labelled text using an LSH model.

    A MinHash is built over ``targets``, an LSH model is built from it with
    ``labels``, and the model is queried for ``labels[0]``.

    Args:
        dataset: Value echoed back under the ``"dataset"`` key of the result.
        query: Not used by this function; kept so existing callers keep working.
        targets (list): Texts to hash. Any text shorter than ``n_gram`` is
            pretty-printed before hashing as a diagnostic.
        labels (list): Labels passed to the LSH model together with the
            MinHash of ``targets``; ``labels[0]`` is the label that is queried.
        min_jaccard_value (float): Minimum Jaccard similarity passed to the query.
        no_of_bands (int): Number of bands to break minhash signature into before hashing into buckets.
        n_permutations (int): Number of permutations used to create minhash signatures used in LSH model.
        n_gram (int): Size of each overlapping text shingle to break text into prior to hashing.
        n_gram_type (str): Passed to ``MinHash`` as ``n_gram_type``; defaults to ``'char'``.

    Returns:
        dict: ``{"dataset": dataset, "query": labels[0], "duplicates": str}``
        where ``"duplicates"`` is the space-separated list of labels returned
        by the query.
    """
    # Diagnostic only: surface texts that are too short to form one shingle.
    for text in targets:
        if len(text) < n_gram:
            pp(text)

    # Create MinHash object.
    minhash = MinHash(targets, n_gram=n_gram, n_gram_type=n_gram_type,
                      permutations=n_permutations, hash_bits=64, seed=SEED)

    # Create LSH model.
    lsh = LSH(minhash, labels, no_of_bands=no_of_bands)

    # Query the model for near duplicates of the first labelled text.
    query_label = labels[0]
    closest_results = lsh.query(query_label, min_jaccard=min_jaccard_value)

    # Labels are not necessarily strings (e.g. integer ids), so convert
    # before joining instead of raising TypeError.
    duplicates = ' '.join(map(str, closest_results))

    return {"dataset": dataset, "query": query_label, "duplicates": duplicates}

Пример #3

Показать файл

def test_lsh_query():
    """Check the errors and results of ``LSH.query`` on the shared fixture."""
    model = LSH(minhash, labels)

    # Both of these labels are expected to be rejected with KeyError.
    for rejected_label in (10, 0):
        with pytest.raises(KeyError):
            model.query(rejected_label)

    # A sensitivity of 100 is expected to be rejected with ValueError.
    with pytest.raises(ValueError):
        model.query(2, sensitivity=100)

    # Query label 1 with default, sensitivity-limited and Jaccard-limited settings.
    expectations = (
        ({}, [8, 4]),
        ({"sensitivity": 29}, [4]),
        ({"min_jaccard": 0.55}, [4]),
    )
    for options, expected in expectations:
        result = model.query(1, **options)
        assert result == expected

Пример #4

Показать файл

# Hash value size used to generate minhash signatures from shingles (32, 64, or 128 bit).
# NOTE: should be chosen based on text length and a trade-off between performance and accuracy.
hash_bits = 64

# Create MinHash object.
minhash = MinHash(content,
                  n_gram=n_gram,
                  permutations=permutations,
                  hash_bits=hash_bits,
                  seed=seed)

# Create LSH model.
lsh = LSH(minhash, labels, no_of_bands=50)

# Query to find near duplicates for text 1.
print(lsh.query(1, min_jaccard=.5))

# Update model:
# generate minhash signatures for new text, and add new texts to LSH model.
new_text = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
]

new_labels = ['new_doc1', 'new_doc2']

# 1. Create minhash signatures for new text.
new_minhash = MinHash(new_text,
                      n_gram=n_gram,
                      permutations=permutations,
                      hash_bits=hash_bits,

Пример #5

Показать файл

Файл: minhash_example.py Проект: chenAsaraf/DNAMutationReport

from snapy import MinHash, LSH
import numpy as np
from fasta_parser import parse_to_list

# Path of the contigs file handed to parse_to_list.
contigs_file = "../contigs-outputs/basic_k-mer24/basic_try_k-mer24.contigs.fa"

print("parsing contigs to list...")
contigs_list, num_contigs = parse_to_list(contigs_file)
print("number of contigs (shorter then 1,00bp):", num_contigs)

# Label every contig with its position: 0 .. num_contigs - 1.
labels = np.arange(num_contigs).tolist()

# Create MinHash object.
print("creating minhash object...")
minhash = MinHash(contigs_list, n_gram=24)

# Create LSH model.
print("creating LSH object...")
lsh = LSH(minhash, labels)

# Show the matches for the first six labels, one line per label.
print("query object:")
for index in range(6):
    print(f"similar strings to the string in index {index}:", lsh.query(index))

print(lsh.contains())