Пример #1
0
class LshCluster:
    """Index a collection of texts with MinHash + LSH and look up near-duplicates.

    The index is built once, at construction time, from ``Content``. Each text
    is labelled with its position (0, 1, 2, ...) in ``Content``.

    ``MinHash`` and ``LSH`` must already be in scope where this class is
    defined (NOTE(review): the keyword names match the snapy API -- confirm
    against the file's imports).
    """

    def __init__(self, Content, nGram=3, PrenutNum=16, BandNum=8, MinJaccard=0.2):
        """Store the hashing parameters and build the index over ``Content``.

        Args:
            Content: texts to index; must support ``len()`` and iteration.
            nGram: forwarded to ``MinHash`` as ``n_gram``.
            PrenutNum: forwarded to ``MinHash`` as ``permutations``.
            BandNum: forwarded to ``LSH`` as ``no_of_bands``.
            MinJaccard: threshold passed as ``min_jaccard`` to every
                adjacency/query call made by this class.
        """
        self.Seed = 3  # fixed seed handed to MinHash
        self.MinJaccard = MinJaccard
        self.PrenutNum = PrenutNum
        self.BandNum = BandNum
        self.nGram = nGram
        self.CreateLsh(Content)

    def Transform(self, Contexts):
        """Return a new list with every text normalised.

        Each text is stripped of surrounding whitespace first, then has all
        underscores removed, then is lower-cased. Because stripping happens
        before underscore removal, whitespace that was shielded by a leading
        or trailing underscore is kept.
        """
        return [raw.strip().replace("_", "").lower() for raw in Contexts]

    def CreateLsh(self, InContext):
        """Build ``self.Lsh`` from ``InContext`` and record ``self.MaxIndex``.

        ``self.MaxIndex`` is the number of entries in the adjacency list the
        freshly built index reports at ``self.MinJaccard``.
        """
        # Positional labels are derived from the input before normalisation.
        positions = range(len(InContext))
        normalised = self.Transform(InContext)
        signatures = MinHash(
            normalised,
            n_gram=self.nGram,
            permutations=self.PrenutNum,
            hash_bits=64,
            seed=self.Seed,
        )
        self.Lsh = LSH(signatures, positions, no_of_bands=self.BandNum)
        adjacency = self.Lsh.adjacency_list(min_jaccard=self.MinJaccard)
        self.MaxIndex = len(adjacency)

    def QuerySimilars(self, Index):
        """Return what the index reports as similar to the text labelled ``Index``.

        An ``Index`` at or beyond ``self.MaxIndex`` yields an empty list
        without querying the index.
        """
        return (
            []
            if Index >= self.MaxIndex
            else self.Lsh.query(Index, min_jaccard=self.MinJaccard)
        )
Пример #2
0
def find_adjacency(draws):
    """Print each label of ``draws`` that has at least one similar neighbour.

    Args:
        draws: mapping of label -> iterable of values. Each value sequence is
            rendered as one space-separated string and used as a document.

    The documents are hashed with ``MinHash`` (9-grams, 500 permutations,
    64-bit hashes, seed 3), indexed with ``LSH`` using 100 bands, and the
    adjacency list at ``min_jaccard=0.5`` is printed, one ``label neighbours``
    line per label whose neighbour list is non-empty.

    NOTE(review): an extra document labelled 99999 that repeats the first
    draw is always appended -- presumably a probe guaranteeing at least one
    match; confirm it is intentional. An empty ``draws`` raises ``IndexError``
    when that first draw is looked up.
    """
    rows = list(draws.values())

    # One text document per draw, in the mapping's iteration order.
    documents = [" ".join(map(str, row)) for row in rows]
    labels = list(draws) + [99999]
    # Extra entry: a copy of the first draw under the label 99999.
    documents.append(" ".join(map(str, rows[0])))

    signatures = MinHash(
        documents, n_gram=9, permutations=500, hash_bits=64, seed=3
    )
    index = LSH(signatures, labels, no_of_bands=100)

    for label, neighbours in index.adjacency_list(min_jaccard=0.5).items():
        if len(neighbours) == 0:
            continue
        print(label, neighbours)
Пример #3
0
def test_lsh_adjacency_list():
    """Check ``LSH.adjacency_list`` for sensitivity, min_jaccard and defaults.

    Relies on the module-level ``minhash`` and ``labels`` fixtures.
    """
    lsh = LSH(minhash, labels)

    # sensitivity=1000 is expected to be rejected with ValueError.
    with pytest.raises(ValueError):
        lsh.adjacency_list(sensitivity=1000)

    # The same neighbours are expected for sensitivity=2 and for the defaults.
    loose_neighbours = {
        1: [8, 4],
        2: [],
        3: [5],
        4: [1, 8],
        5: [3],
        6: [],
        7: [],
        8: [1, 4],
        9: [],
    }
    # At min_jaccard=0.6 only labels 3 and 5 are expected to stay linked.
    strict_neighbours = {
        1: [],
        2: [],
        3: [5],
        4: [],
        5: [3],
        6: [],
        7: [],
        8: [],
        9: [],
    }

    by_sensitivity = lsh.adjacency_list(sensitivity=2)
    assert by_sensitivity == loose_neighbours

    by_jaccard = lsh.adjacency_list(min_jaccard=0.6)
    assert by_jaccard == strict_neighbours

    by_default = lsh.adjacency_list()
    assert by_default == loose_neighbours
def _get_duplicate_ids(text: List[str], lsh: LSH,
                       min_jaccard: float) -> Iterable[str]:
    """Collect the ids that `lsh` flags as near-duplicates of an earlier id.

    The adjacency list of `lsh` at `min_jaccard` is walked in order. Every id
    that has not itself been marked as a duplicate contributes all of its
    neighbours to the duplicate set, so the first member of each group met
    during the walk is the one that is kept. Progress and a final summary are
    reported through `typer`.

    `text` is accepted but not read by this function.

    Returns a list of the duplicate ids, in arbitrary (set) order.
    """
    neighbours_by_id = lsh.adjacency_list(min_jaccard=min_jaccard)
    duplicate_ids = set()
    with typer.progressbar(neighbours_by_id.items(),
                           label="Deduplicating text") as entries:
        for candidate, neighbours in entries:
            # Only expand ids that are not already marked as duplicates.
            if candidate not in duplicate_ids:
                duplicate_ids.update(neighbours)
    summary = f"{SEARCH} Found a total of {len(duplicate_ids)} duplicate texts."
    typer.secho(summary, bold=True)
    return list(duplicate_ids)
Пример #5
0
# Generate MinHash signatures for new texts and add them to an existing LSH model.
# Relies on `lsh`, `minhash`, `n_gram`, `permutations`, `hash_bits` and `seed`
# having been defined earlier in the script.
new_text = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
]

# One label per entry in new_text, in the same order.
new_labels = ['new_doc1', 'new_doc2']

# 1. Create MinHash signatures for the new texts, reusing the hashing
#    parameters defined earlier in the script.
new_minhash = MinHash(new_text,
                      n_gram=n_gram,
                      permutations=permutations,
                      hash_bits=hash_bits,
                      seed=seed)

# 2. Update the LSH model with the new signatures and print its contents to
#    verify that the update is reflected.
lsh.update(new_minhash, new_labels)
print(lsh.contains())

# Print the adjacency list of all docs.
print(lsh.adjacency_list())

# Print the edge list of all docs flagged as duplicates (usable for plotting a
# text-similarity graph).
print(lsh.edge_list())

# Remove a text and its label from the model (per the original note, an error
# is raised if it is not present), then print the contents again.
lsh.remove(6)
print(lsh.contains())

# Matrix (n x m) of text signatures generated by the MinHash function
# (n = text rows, m = selected permutations). As a bare expression this only
# displays a value in an interactive session or notebook.
minhash.signatures.shape
Пример #6
0
 def identify_dublicates(self, ctnt_to_dedup):
     """Return the LSH adjacency list for the texts in ``ctnt_to_dedup``.

     Each text is labelled with its position in ``ctnt_to_dedup``. Signatures
     are built with ``self.n_gram`` and ``self.seed``, indexed with
     ``self.lsh_bands`` bands, and the adjacency list is taken at
     ``min_jaccard=self.j_thresh``.
     """
     positions = list(range(len(ctnt_to_dedup)))
     signatures = MinHash(ctnt_to_dedup, n_gram=self.n_gram, seed=self.seed)
     index = LSH(signatures, positions, no_of_bands=self.lsh_bands)
     return index.adjacency_list(min_jaccard=self.j_thresh)