예제 #1
0
def test_minhash_errors():
    """Each invalid constructor argument must raise ValueError."""
    invalid_kwargs = (
        {'n_gram_type': 'words'},   # unsupported shingle type
        {'hash_bits': 65},          # not one of the supported hash sizes
        {'method': 'universal'},    # unknown hashing method
        {'n_gram': 63},             # shingle size out of range
    )
    for kwargs in invalid_kwargs:
        with pytest.raises(ValueError):
            MinHash(content, **kwargs)
예제 #2
0
    def create_lsh(self, content, no_of_bands, n_permutations, n_gram):
        """Build a Locality-Sensitive Hashing model over `content` to detect near-duplicate texts.

        Args:
            content (list): Strings to index in the LSH model.
            no_of_bands (int): Number of bands to break each minhash signature into before bucketing.
            n_permutations (int): Number of permutations used per minhash signature.
            n_gram (int): Size of each overlapping text shingle hashed from the text.

        Returns:
            class 'snapy.lsh.LSH':  Snapy LSH object.
        """
        # One sequential integer label per document.
        doc_labels = range(len(content))

        # 64-bit signatures; SEED keeps the model reproducible across runs.
        signatures = MinHash(
            content,
            n_gram=n_gram,
            permutations=n_permutations,
            hash_bits=64,
            seed=SEED,
        )

        return LSH(signatures, doc_labels, no_of_bands=no_of_bands)
예제 #3
0
def test_update_lsh():
    """update() rejects duplicate labels and mismatched permutation counts."""
    lsh = LSH(minhash, labels)

    # Re-adding labels already present in the model is rejected.
    with pytest.raises(ValueError):
        lsh.update(minhash, labels)

    extra_texts = [
        'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
        'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.'
    ]
    extra_labels = [11, 12]

    # A minhash whose permutation count differs from the model's is rejected.
    mismatched_minhash = MinHash(extra_texts, permutations=10)
    with pytest.raises(ValueError):
        lsh.update(mismatched_minhash, extra_labels)

    # Default permutations (100) match the existing model, so this succeeds.
    matching_minhash = MinHash(extra_texts)
    lsh.update(matching_minhash, extra_labels)
    assert lsh.permutations == 100
    assert list(lsh._i_bucket) == labels + [11, 12]
예제 #4
0
def find_near_duplicate(dataset, query, targets, labels, min_jaccard_value, no_of_bands, n_permutations, n_gram, n_gram_type='char'):
    """Using an LSH model, find strings in `targets` near-duplicate to the first label's text.

    Args:
        dataset (str): Dataset name, echoed back in the returned dict.
        query: Query identifier. NOTE(review): currently unused — the model is
            always queried with `labels[0]`; confirm this is intended.
        targets (list): Strings to index in the LSH model.
        labels (list): One label per string in `targets`; `labels[0]` is queried.
        min_jaccard_value (float): Minimum value for the Jaccard similarity of a match.
        no_of_bands (int): Number of bands to break minhash signature into before hashing into buckets.
        n_permutations (int): Number of permutations used to create minhash signatures used in LSH model.
        n_gram (int): Size of each overlapping text shingle to break text into prior to hashing.
        n_gram_type (str): Shingle unit to hash, e.g. 'char' or 'term'.

    Returns:
        dict: {"dataset": ..., "query": labels[0], "duplicates": space-joined matches}.
    """
    # Surface strings too short to yield even a single shingle of size n_gram.
    for text in targets:
        if len(text) < n_gram:
            pp(text)

    # Create MinHash object.
    minhash = MinHash(targets, n_gram=n_gram, n_gram_type=n_gram_type,
                      permutations=n_permutations, hash_bits=64, seed=SEED)

    # Create LSH model.
    lsh = LSH(minhash, labels, no_of_bands=no_of_bands)

    # Query to find near duplicates of the first labelled string.
    closest_results = lsh.query(labels[0], min_jaccard=min_jaccard_value)

    return {"dataset": dataset, "query": labels[0], "duplicates": ' '.join(closest_results)}
예제 #5
0
def find_adjacency(draws):
    """Print each draw label that has near-duplicate draws, with its matches.

    Args:
        draws (dict): Maps a draw label to a sequence of drawn numbers.
    """
    # One space-joined number string per draw, in dict insertion order.
    labels = list(draws)
    content = [" ".join(map(str, numbers)) for numbers in draws.values()]

    # Re-append the first draw under a sentinel label so it is guaranteed to
    # collide with its original — a built-in sanity check on the model.
    labels.append(99999)
    content.append(content[0])

    minhash = MinHash(content,
                      n_gram=9,
                      permutations=500,
                      hash_bits=64,
                      seed=3)
    lsh = LSH(minhash, labels, no_of_bands=100)

    # Report only draws that have at least one near duplicate.
    adjacency_list = lsh.adjacency_list(min_jaccard=0.5)
    for label, matches in adjacency_list.items():
        if matches:
            print(label, matches)
예제 #6
0
def test_terms_minhash():
    """Term-level shingling with an explicit seed yields the expected signature."""
    minhash = MinHash(content, n_gram_type='term', seed=seed)
    assert minhash.n_gram_type == 'term'

    signature = minhash.signatures
    assert type(signature) is np.ndarray
    # Nine documents x 100 default permutations.
    assert signature.shape == (9, 100)
    # Spot-check the first and last signature entries and their dtype.
    assert signature[0][0] == -8115786556000760185
    assert np.array(signature[0][0]).dtype == 'int64'
    assert signature[-1][-1] == -579511180950999701
예제 #7
0
def multi_hash_tests(first_hash, second_hash, hash_size):
    """Shared checks for multi-hash minhash signatures at a given hash size."""
    minhash = MinHash(content, hash_bits=hash_size, seed=seed)
    assert minhash.seed == 3
    assert minhash.method == 'multi_hash'

    signature = minhash.signatures
    assert type(signature) is np.ndarray
    assert signature.shape == (9, 100)
    # Spot-check the first and last signature values against expectations.
    assert signature[0][0] == first_hash
    assert signature[-1][-1] == second_hash
예제 #8
0
def test_terms_minhash():
    """Term-level shingling with the default seed yields the expected signature."""
    minhash = MinHash(content, n_gram_type='term')
    assert minhash.n_gram_type == 'term'

    signature = minhash.signatures
    assert type(signature) is np.ndarray
    # Nine documents x 100 default permutations.
    assert signature.shape == (9, 100)
    # Spot-check the first and last signature entries and their dtype.
    assert signature[0][0] == -3695830800917301951
    assert np.array(signature[0][0]).dtype == 'int64'
    assert signature[-1][-1] == -7737361925742332862
예제 #9
0
def k_smallest_hash_tests(first_hash, second_hash, hash_size):
    """Shared checks for the k-smallest-values minhash method at a given hash size."""
    minhash = MinHash(
        content,
        permutations=53,
        hash_bits=hash_size,
        method='k_smallest_values',
        seed=seed,
    )
    assert minhash._hash_seeds == 83957611
    assert minhash.method == 'k_smallest_values'

    signature = minhash.signatures
    assert type(signature) is np.ndarray
    assert signature.shape == (9, 53)
    # Spot-check the first and last signature values.
    assert signature[0][0] == first_hash
    assert signature[-1][-1] == second_hash

    # A permutation count of 200 is rejected for this method — presumably it
    # exceeds the number of available shingles; confirm against snapy docs.
    with pytest.raises(ValueError):
        MinHash(
            content,
            permutations=200,
            hash_bits=hash_size,
            method='k_smallest_values',
            seed=seed,
        )
예제 #10
0
def test_minhash_defaults():
    """Constructor defaults: 9-char shingles, 100 permutations, 64-bit multi_hash."""
    minhash = MinHash(content)

    signature = minhash.signatures
    assert type(signature) is np.ndarray
    assert signature.shape == (9, 100)

    expected_defaults = {
        'n_gram': 9,
        'n_gram_type': 'char',
        'permutations': 100,
        'hash_bits': 64,
        'method': 'multi_hash',
    }
    for attr, value in expected_defaults.items():
        assert getattr(minhash, attr) == value

    # One hash seed per permutation.
    assert minhash._hash_seeds.shape[0] == 100
예제 #11
0
def get_lsh_model(documents, seed_int):
    """Build an LSH model over `documents`.

    Args:
        documents (list): Strings to index in the LSH model.
        seed_int (int): Seed for reproducible minhash signatures.

    Returns:
        snapy.lsh.LSH: Model with 50 bands over 100-permutation, 64-bit signatures.
    """
    # BUG FIX: previously hashed the module-level global `my_documents`,
    # silently ignoring the `documents` argument.
    minhash = MinHash(documents,
                      n_gram=9,
                      permutations=100,
                      hash_bits=64,
                      seed=seed_int)
    # NOTE(review): `labels` is a module-level global — presumably aligned
    # one-to-one with `documents`; confirm against callers.
    lsh = LSH(minhash, labels, no_of_bands=50)
    return lsh
def _create_lsh(
    text: List[str],
    labels: List[int],
    n_gram: int,
    n_permutations: int,
    hash_bits: int,
    no_of_bands: int,
) -> LSH:
    """Hash `text` into a `snapy.lsh.LSH` model for near-duplicate detection."""
    signatures = MinHash(
        text,
        n_gram=n_gram,
        permutations=n_permutations,
        hash_bits=hash_bits,
        seed=SEED,
    )
    model = LSH(signatures, labels, no_of_bands=no_of_bands)

    # Progress message for the CLI user.
    typer.secho(
        f"{HASHING}  Hashed the normalized text using Locality-Sensitive Hashing (LSH).",
        bold=True,
    )

    return model
예제 #13
0
 def identify_dublicates(self, ctnt_to_dedup):
     """Return the LSH adjacency list of near-duplicate candidates in `ctnt_to_dedup`."""
     # Sequential integer label per document.
     indices = list(range(len(ctnt_to_dedup)))
     signatures = MinHash(ctnt_to_dedup, n_gram=self.n_gram, seed=self.seed)
     model = LSH(signatures, indices, no_of_bands=self.lsh_bands)
     return model.adjacency_list(min_jaccard=self.j_thresh)
예제 #14
0
# One integer label per document, aligned with `content` by position.
labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
# Nine short sentences; several are deliberate near duplicates of one another
# so the LSH bucket assertions in the tests have known collisions.
content = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
    'A helium atom has about four times as much mass as a hydrogen atom, so the composition changes '
    'when described as the proportion of mass contributed by different atoms.',
    'Jupiter is primarily composed of hydrogen and a quarter of its mass being helium',
    'A helium atom has about four times as much mass as a hydrogen atom and the composition changes '
    'when described as a proportion of mass contributed by different atoms.',
    'Theoretical models indicate that if Jupiter had much more mass than it does at present, it would shrink.',
    'This process causes Jupiter to shrink by about 2 cm each year.',
    'Jupiter is mostly composed of hydrogen with a quarter of its mass being helium',
    'The Great Red Spot is large enough to accommodate Earth within its boundaries.'
]

# Module-level fixture: minhash signatures with library defaults and a fixed seed.
minhash = MinHash(content, seed=3)


def test_initialize_from_empty_lsh():
    """An argument-less LSH starts empty and is fully populated by update()."""
    lsh = LSH()

    # Fresh model: no bands, no buckets, no permutation count yet.
    assert lsh.no_of_bands is None
    assert lsh.permutations is None
    assert lsh._buckets == defaultdict(list)
    assert lsh._i_bucket == defaultdict(list)

    lsh.update(minhash, labels)

    # After update the model adopts the minhash parameters and fills buckets.
    assert list(lsh._i_bucket) == labels
    assert lsh._buckets[4466445138223010106] == [1, 8]
    assert lsh._buckets[-3939654010681976230] == [1, 4, 8]
    assert lsh.permutations == 100
    assert lsh.no_of_bands == 50
from snapy import MinHash, LSH
import numpy as np
from fasta_parser import parse_to_list

# Assembled contigs produced with k-mer size 24.
contigs_file = "../contigs-outputs/basic_k-mer24/basic_try_k-mer24.contigs.fa"
print("parsing contigs to list...")
contigs_list, num_contigs = parse_to_list(contigs_file)
# Typo fix: "then" -> "than".
# NOTE(review): "1,00bp" looks garbled — 100bp or 1,000bp? Confirm against
# fasta_parser's filtering behavior before changing the number.
print("number of contigs (shorter than 1,00bp):", num_contigs)

# One integer label per contig.
labels = np.arange(num_contigs).tolist()

# Create MinHash object; n_gram matches the assembly k-mer size so each
# shingle is one k-mer.
print("creating minhash object...")
minhash = MinHash(contigs_list, n_gram=24)

# Create LSH model.
print("creating LSH object...")
lsh = LSH(minhash, labels)

print("query object:")
# Collapse the six copy-pasted query prints into one loop.
for idx in range(6):
    print("similar strings to the string in index {}:".format(idx), lsh.query(idx))
print(lsh.contains())

예제 #16
0
 def CreateLsh(self, InContext):
     """Fit an LSH model over `InContext` and record its adjacency-list size."""
     # One label per input document, assigned before transformation.
     Labels = range(len(InContext))
     InContext = self.Transform(InContext)
     Signatures = MinHash(InContext, n_gram=self.nGram, permutations=self.PrenutNum, hash_bits=64, seed=self.Seed)
     self.Lsh = LSH(Signatures, Labels, no_of_bands=self.BandNum)
     self.MaxIndex = len(self.Lsh.adjacency_list(min_jaccard=self.MinJaccard))
예제 #17
0
# Seed for reproducible minhash signatures.
seed = 3

# Size of each overlapping text shingle to break text into prior to hashing.
n_gram = 9

# Number of randomly sampled hash values used to generate each text's minhash
# signature (larger = more accurate & slower).
permutations = 100

# Hash value size used to generate minhash signatures from shingles
# (32, 64, or 128 bit).
# NOTE: should be chosen based on text length and a trade-off between
# performance and accuracy.
hash_bits = 64

# Create MinHash object.
minhash = MinHash(content,
                  n_gram=n_gram,
                  permutations=permutations,
                  hash_bits=hash_bits,
                  seed=seed)

# Create LSH model.
lsh = LSH(minhash, labels, no_of_bands=50)

# Query to find near duplicates for text 1.
print(lsh.query(1, min_jaccard=.5))

# Update model: generate minhash signatures for the new texts and add them to
# the LSH model.
new_text = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
]
예제 #18
0
def test_string_input_minhash():
    """A single string (rather than a list) is treated as one document."""
    minhash = MinHash(content[0])
    signature = minhash.signatures
    assert type(signature) is np.ndarray
    # One document x 100 default permutations.
    assert signature.shape == (1, 100)