Example #1
def find_near_duplicate(dataset, query, targets, labels, min_jaccard_value, no_of_bands, n_permutations, n_gram, n_gram_type='char'):
    """Using LSH object finds the near duplicate strings.

    Args:
        query_sentences (dict): Dict with query strings and version of string in lower case and without comma.
        sentences (dict): Dict with target strings and version of string in lower case and without comma.
        min_jaccard_value (float): Minimum value for the Jaccard Distance.
        no_of_bands (int): Number of bands to break minhash signature into before hashing into buckets.
        n_permutations (int): Number of permutations used to create minhash signatures used in LSH model.
        n_gram (int): Size of each overlapping text shingle to break text into prior to hashing.
    """
    # Print any target string shorter than the shingle size; MinHash cannot shingle these.
    for i in targets:
        if len(i) < n_gram:
            pp(i)
    # Create MinHash object.
    minhash = MinHash(targets, n_gram=n_gram, n_gram_type=n_gram_type,
                      permutations=n_permutations, hash_bits=64, seed=SEED)

    # Create LSH model.
    lsh = LSH(minhash, labels, no_of_bands=no_of_bands)

    # Query for near duplicates of the first labelled string.
    closest_results = lsh.query(labels[0], min_jaccard=min_jaccard_value)

    # print("QUERY: {}".format(labels[0]))
    # pp(closest_results)

    return {"dataset": dataset, "query": labels[0], "duplicates": ' '.join(closest_results)}
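A minimal invocation sketch for the function above; the sample strings, labels, and parameter values are illustrative assumptions (SEED and pp are expected at module level, as in the source snippet):

targets = [
    'the cat sat on the mat',
    'the cat sat on a mat',
    'an entirely unrelated sentence',
]
labels = ['query_doc', 'doc_a', 'doc_b']
result = find_near_duplicate('demo', targets[0], targets, labels,
                             min_jaccard_value=0.3, no_of_bands=25,
                             n_permutations=100, n_gram=4)
print(result)  # e.g. {'dataset': 'demo', 'query': 'query_doc', 'duplicates': 'doc_a'}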
Example #2
def test_lsh_errors():
    with pytest.raises(ValueError):
        LSH(content)                          # minhash supplied without labels
    with pytest.raises(ValueError):
        LSH(labels=labels)                    # labels supplied without a minhash
    with pytest.raises(ValueError):
        LSH(minhash, labels, no_of_bands=49)  # 49 does not divide the 100 permutations evenly
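For contrast, a construction that passes these checks might look like the sketch below (content and labels are assumed test fixtures, and 50 bands divide 100 permutations evenly):

minhash = MinHash(content, permutations=100, seed=3)
lsh = LSH(minhash, labels, no_of_bands=50)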
Example #3
def find_adjacency(draws):
    draws_idx = list(draws)
    draws_nos = list(draws.values())

    # labels = [draws_idx[0]]
    # content = [" ".join(map(str, draws_nos[0]))]
    # print(labels)
    # print(content)

    # minhash = MinHash(content, n_gram=9, permutations=100, hash_bits=64, seed=3)
    # lsh = LSH(minhash, labels, no_of_bands=50)
    # print(lsh.query(1, min_jaccard=0.5))

    new_labels = []
    new_content = []
    for i in range(len(draws)):
        new_labels.append(draws_idx[i])
        new_content.append(" ".join(map(str, draws_nos[i])))
    # print(new_labels)
    # print(new_content)
    # Append a copy of the first draw under sentinel label 99999, so it has a guaranteed exact duplicate.
    new_labels.append(99999)
    new_content.append(" ".join(map(str, draws_nos[0])))

    minhash = MinHash(new_content,
                      n_gram=9,
                      permutations=500,
                      hash_bits=64,
                      seed=3)
    lsh = LSH(minhash, new_labels, no_of_bands=100)

    adjacency_list = lsh.adjacency_list(min_jaccard=0.5)
    for key, value in adjacency_list.items():
        if len(value) > 0:
            print(key, value)
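A hedged usage sketch: judging by the list(draws) / draws.values() calls, draws maps a draw id to its list of drawn numbers; the sample values below are made up.

draws = {
    1: [3, 9, 14, 21, 33, 40],
    2: [3, 9, 14, 21, 33, 41],
    3: [5, 8, 17, 26, 38, 44],
}
find_adjacency(draws)  # prints pairs such as "1 [99999]" for the duplicated first draw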
Example #4
class LshCluster():
    def __init__(self, Content, nGram=3, PrenutNum=16, BandNum=8, MinJaccard=0.2):
        self.nGram      = nGram
        self.BandNum    = BandNum
        self.PrenutNum  = PrenutNum
        self.MinJaccard = MinJaccard
        self.Seed       = 3
        self.CreateLsh(Content)

    def Transform(self, Contexts):
        NewContexts = []
        for ctx in Contexts:
            ctx = ctx.strip()
            ctx = ctx.replace("_", "")
            ctx = ctx.lower()
            NewContexts.append(ctx)

        return NewContexts

    def CreateLsh(self, InContext):
        Labels = range(len(InContext))
        InContext = self.Transform(InContext)
        Hash = MinHash(InContext, n_gram=self.nGram, permutations=self.PrenutNum, hash_bits=64, seed=self.Seed)
        self.Lsh = LSH(Hash, Labels, no_of_bands=self.BandNum)
        self.MaxIndex = len(self.Lsh.adjacency_list(min_jaccard=self.MinJaccard))

    def QuerySimilars(self, Index):
        if Index >= self.MaxIndex:
            return []
        Results = self.Lsh.query(Index, min_jaccard=self.MinJaccard)
        return Results
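An illustrative instantiation of LshCluster; the documents are invented, and the defaults already satisfy snapy's requirement that the band count (8) divides the permutation count (16) evenly.

docs = [
    'build_target for the x86 platform',
    'Build_Target for the X86 platform',
    'completely unrelated build notes',
]
cluster = LshCluster(docs)       # nGram=3, PrenutNum=16, BandNum=8, MinJaccard=0.2
print(cluster.QuerySimilars(0))  # e.g. [1], after lower-casing and stripping underscores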
Example #5
def test_initialize_from_empty_lsh():
    lsh = LSH()
    assert lsh.no_of_bands is None
    assert lsh._buckets == defaultdict(list)
    assert lsh._i_bucket == defaultdict(list)
    assert lsh.permutations is None
    lsh.update(minhash, labels)
    assert list(lsh._i_bucket) == labels
    assert lsh.permutations == 100
    assert lsh.no_of_bands == 50
Example #6
def test_initialize_from_empty_lsh():
    lsh = LSH()
    assert lsh.no_of_bands is None
    assert lsh._buckets == defaultdict(list)
    assert lsh._i_bucket == defaultdict(list)
    assert lsh.permutations is None
    lsh.update(minhash, labels)
    assert list(lsh._i_bucket) == labels
    buckets = lsh._buckets
    assert buckets[4466445138223010106] == [1, 8]
    assert buckets[-3939654010681976230] == [1, 4, 8]
    assert lsh.permutations == 100
    assert lsh.no_of_bands == 50
Example #7
    def create_lsh(self, content, no_of_bands, n_permutations, n_gram):
        """Create Minhash and Locality Sensitive Hashing (LSH) to detect near duplicate texts.

        Args:
            content (list): List of strings to build the LSH from.
            no_of_bands (int): Number of bands to break minhash signature into before hashing into buckets.
            n_permutations (int): Number of permutations used to create minhash signatures used in LSH model.
            n_gram (int): Size of each overlapping text shingle to break text into prior to hashing.

        Returns:
            class 'snapy.lsh.LSH':  Snapy LSH object.

        """
        labels = range(len(content))

        # Create MinHash object.
        minhash = MinHash(content,
                          n_gram=n_gram,
                          permutations=n_permutations,
                          hash_bits=64,
                          seed=SEED)

        # Create LSH model.
        lsh = LSH(minhash, labels, no_of_bands=no_of_bands)

        return lsh
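A hedged call sketch, assuming an instance deduper of the enclosing class and a module-level SEED as in the source; the texts are illustrative.

content = ['jupiter is the largest planet in the solar system',
           'jupiter is the biggest planet in the solar system',
           'a sentence about something else entirely']
lsh = deduper.create_lsh(content, no_of_bands=50, n_permutations=100, n_gram=9)
print(lsh.adjacency_list(min_jaccard=0.5))  # e.g. {0: [1], 1: [0], 2: []}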
Example #8
def test_lsh_edge_list():
    lsh = LSH(minhash, labels)
    with pytest.raises(ValueError):
        lsh.edge_list(sensitivity=101)
    assert lsh.edge_list() == [(8, 1), (8, 4), (5, 3), (4, 1)]
    assert lsh.edge_list(sensitivity=20) == [(8, 1), (5, 3), (4, 1)]
    assert lsh.edge_list(min_jaccard=0.7) == []
    assert lsh.edge_list(min_jaccard=0.6) == [(5, 3)]
    assert lsh.edge_list(jaccard_weighted=True,
                         min_jaccard=0.55) == [(5, 3, 0.6), (4, 1, 0.58)]
Example #9
def get_lsh_model(documents, seed_int):
    labels = range(len(documents))
    # Create MinHash object.
    minhash = MinHash(documents,
                      n_gram=9,
                      permutations=100,
                      hash_bits=64,
                      seed=seed_int)
    # Create LSH model.
    lsh = LSH(minhash, labels, no_of_bands=50)
    return lsh
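A quick usage sketch for the function above; the documents are invented.

docs = ['the quick brown fox jumps over the lazy dog',
        'the quick brown fox jumped over the lazy dog',
        'lorem ipsum dolor sit amet consectetur']
model = get_lsh_model(docs, seed_int=3)
print(model.query(0, min_jaccard=0.3))  # e.g. [1]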
Example #10
def test_update_lsh():
    lsh = LSH(minhash, labels)
    with pytest.raises(ValueError):
        lsh.update(minhash, labels)  # these labels already exist in the model
    new_content = [
        'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
        'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.'
    ]
    new_labels = [11, 12]
    incorrect_minhash = MinHash(new_content, permutations=10)
    with pytest.raises(ValueError):
        lsh.update(incorrect_minhash, new_labels)  # 10 permutations do not match the model's 100
    correct_minhash = MinHash(new_content)
    lsh.update(correct_minhash, new_labels)
    assert lsh.permutations == 100
    assert list(lsh._i_bucket) == labels + [11, 12]
Example #11
def _get_duplicate_ids(text: List[str], lsh: LSH,
                       min_jaccard: float) -> Iterable[str]:
    """Uses the given `lsh` object to find near duplicate text in `text`. Returns a list of
    indices into `text` which point to duplicate texts.
    """
    duplicate_ids = set()
    adjacency_list = lsh.adjacency_list(min_jaccard=min_jaccard)
    with typer.progressbar(adjacency_list.items(),
                           label="Deduplicating text") as progress:
        for query_id, similar_ids in progress:
            # If query_id exists in duplicate_ids, we have already accounted for it.
            if query_id in duplicate_ids:
                continue
            duplicate_ids.update(similar_ids)
    typer.secho(
        f"{SEARCH} Found a total of {len(duplicate_ids)} duplicate texts.",
        bold=True,
    )
    return list(duplicate_ids)
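Note that the loop keeps the first member of each duplicate cluster: a query id already marked as a duplicate is skipped, so only its neighbours get added. A hedged wiring sketch, with text, labels, and the LSH construction assumed:

duplicate_ids = set(_get_duplicate_ids(text, lsh, min_jaccard=0.7))
deduplicated = [t for label, t in zip(labels, text) if label not in duplicate_ids]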
Example #12
def test_lsh_adjacency_list():
    lsh = LSH(minhash, labels)
    with pytest.raises(ValueError):
        lsh.adjacency_list(sensitivity=1000)
    sensitivity_list = lsh.adjacency_list(sensitivity=2)
    assert sensitivity_list == {
        1: [8, 4],
        2: [],
        3: [5],
        4: [1, 8],
        5: [3],
        6: [],
        7: [],
        8: [1, 4],
        9: []
    }
    jaccard_list = lsh.adjacency_list(min_jaccard=0.6)
    assert jaccard_list == {
        1: [],
        2: [],
        3: [5],
        4: [],
        5: [3],
        6: [],
        7: [],
        8: [],
        9: []
    }
    default_list = lsh.adjacency_list()
    assert default_list == {
        1: [8, 4],
        2: [],
        3: [5],
        4: [1, 8],
        5: [3],
        6: [],
        7: [],
        8: [1, 4],
        9: []
    }
Example #13
def _create_lsh(
    text: List[str],
    labels: List[int],
    n_gram: int,
    n_permutations: int,
    hash_bits: int,
    no_of_bands: int,
) -> LSH:
    """Returns a `snapy.lsh.LSH` object constructed from `text` to detect near duplicate texts.
    """

    minhash = MinHash(text,
                      n_gram=n_gram,
                      permutations=n_permutations,
                      hash_bits=hash_bits,
                      seed=SEED)
    lsh = LSH(minhash, labels, no_of_bands=no_of_bands)

    typer.secho(
        f"{HASHING}  Hashed the normalized text using Locality-Sensitive Hashing (LSH).",
        bold=True,
    )

    return lsh
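An illustrative call, assuming SEED and the HASHING constant are defined at module level and typer is importable, as in the source repo.

texts = ['sample text number one for hashing',
         'sample text number one for hashing!',
         'something else entirely different here']
lsh = _create_lsh(texts, labels=list(range(len(texts))), n_gram=5,
                  n_permutations=100, hash_bits=64, no_of_bands=50)
print(lsh.adjacency_list(min_jaccard=0.5))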
Example #14
    def identify_dublicates(self, ctnt_to_dedup):
        _ix = [i for i in range(len(ctnt_to_dedup))]
        _mn_hash = MinHash(ctnt_to_dedup, n_gram=self.n_gram, seed=self.seed)
        _lsh = LSH(_mn_hash, _ix, no_of_bands=self.lsh_bands)
        candidates = _lsh.adjacency_list(min_jaccard=self.j_thresh)
        return candidates
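A sketch of calling the method above, assuming an instance whose n_gram, seed, lsh_bands, and j_thresh attributes are configured (lsh_bands must divide MinHash's default 100 permutations evenly):

candidates = deduper.identify_dublicates(['first document text',
                                          'first document text!',
                                          'a different document'])
for idx, near in candidates.items():
    if near:
        print(idx, near)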
Example #15
def test_lsh_remove():
    lsh = LSH(minhash, labels)
    lsh.remove(5)
    assert list(lsh._i_bucket) == [1, 2, 3, 4, 6, 7, 8, 9]
    with pytest.raises(KeyError):
        lsh.remove(11)
Example #16
def test_lsh_contains():
    lsh = LSH(minhash, labels)
    assert lsh.contains() == labels
Example #17
def test_lsh_query():
    lsh = LSH(minhash, labels)
    with pytest.raises(KeyError):
        lsh.query(10)
    with pytest.raises(KeyError):
        lsh.query(0)
    with pytest.raises(ValueError):
        lsh.query(2, sensitivity=100)
    result = lsh.query(1)
    assert result == [8, 4]
    result = lsh.query(1, sensitivity=29)
    assert result == [4]
    result = lsh.query(1, min_jaccard=0.55)
    assert result == [4]
Example #18
    def CreateLsh(self, InContext):
        Labels = range(len(InContext))
        InContext = self.Transform(InContext)
        Hash = MinHash(InContext, n_gram=self.nGram, permutations=self.PrenutNum, hash_bits=64, seed=self.Seed)
        self.Lsh = LSH(Hash, Labels, no_of_bands=self.BandNum)
        self.MaxIndex = len(self.Lsh.adjacency_list(min_jaccard=self.MinJaccard))
Example #19
from snapy import MinHash, LSH
import numpy as np
from fasta_parser import parse_to_list

contigs_file = "../contigs-outputs/basic_k-mer24/basic_try_k-mer24.contigs.fa"
print("parsing contigs to list...")
contigs_list, num_contigs = parse_to_list(contigs_file)
print("number of contigs (shorter than 1,00bp):", num_contigs)
labels = np.arange(num_contigs).tolist()
# Create MinHash object.
print("creating minhash object...")
minhash = MinHash(contigs_list, n_gram=24)
# Create LSH model.
print("creating LSH object...")
lsh = LSH(minhash, labels)
print("query object:")
for i in range(6):
    print("similar strings to the string in index {}:".format(i), lsh.query(i))
print(lsh.contains())

Example #20
# Number of randomly sampled hash values used to generate each text's minhash signature (larger = more accurate & slower).
permutations = 100

# Hash value size used to generate minhash signatures from shingles (32, 64, or 128 bit).
# NOTE: should be chosen based on text length and a trade-off between performance and accuracy.
hash_bits = 64

# Create MinHash object.
minhash = MinHash(content,
                  n_gram=n_gram,
                  permutations=permutations,
                  hash_bits=hash_bits,
                  seed=seed)

# Create LSH model.
lsh = LSH(minhash, labels, no_of_bands=50)

# Query to find near duplicates for text 1.
print(lsh.query(1, min_jaccard=.5))

# Update model:
# generate minhash signatures for the new text, and add the new texts to the LSH model.
new_text = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
]

new_labels = ['new_doc1', 'new_doc2']

# 1. Create minhash signatures for the new text.
new_minhash = MinHash(new_text,
                      n_gram=n_gram,
                      permutations=permutations,
                      hash_bits=hash_bits,
                      seed=seed)

# 2. Add the new texts to the LSH model (the MinHash parameters are assumed to
#    mirror the originals above; the permutation counts must match for update to succeed).
lsh.update(new_minhash, new_labels)
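# 3. Query the updated model for near duplicates of one of the newly added texts
#    (an illustrative follow-on step, not part of the original snippet).
print(lsh.query('new_doc1', min_jaccard=.5))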
Example #21
def test_initialize_lsh_with_params():
    lsh = LSH(minhash, labels, no_of_bands=20)
    assert lsh.no_of_bands == 20
    assert lsh.permutations == 100
    assert list(lsh._i_bucket) == labels