def test_iter_wordgrams(self):
     """ Check that the 1- and 2-wordgrams of a sentence are each generated
     exactly once.
     """
     sentence = 'nom de la rose'
     minlsh = Minlsh()
     results = list(minlsh._iter_wordgrams(sentence, 2))
     truth = ['nom de', 'nom', 'de la', 'de', 'la rose', 'la', 'rose']
     # Comparing sorted lists asserts multiset equality in one shot: unlike
     # the former len() + set() pair, it also fails when a gram is produced
     # the wrong number of times.
     self.assertEqual(sorted(results), sorted(truth))
 def test_iter_wordgrams_sklearn(self):
     """ Check wordgram extraction through a sklearn-style CountVectorizer
     tokenizer function (1- and 2-grams over two sentences).
     """
     sentences = ('nom de la rose', 'nom de la')
     tokenizer_func = partial(count_vectorizer_func, min_n=1, max_n=2)
     minlsh = Minlsh(tokenizer_func=tokenizer_func)
     # _buildmatrixdocument yields a (rows, shape) pair; tuple unpacking
     # consumes it directly, the list() wrapper was redundant.
     rows, shape = minlsh._buildmatrixdocument(sentences, 2)
     self.assertEqual(shape, (2, 7))
     self.assertEqual(rows[0], [0, 1, 2, 3, 4, 5, 6])
     self.assertEqual(rows[1], [0, 1, 2, 4, 5])
class MinHashingBlocking(BaseBlocking):
    """ A blocking technique based on MinHashing.

    Records from the reference and target sets are hashed together; records
    whose minhash signatures collide (Jaccard similarity above ``threshold``)
    end up in a common block.
    """
    def __init__(self, ref_attr_index, target_attr_index,
                 threshold=0.1, kwordsgram=1, siglen=200):
        """ Initialize the blocking.

        Parameters
        ----------
        ref_attr_index, target_attr_index:
            index of the attribute to hash in each record.
        threshold:
            similarity threshold passed to ``Minlsh.predict()``.
        kwordsgram:
            size of the word-grams used to tokenize attribute values.
        siglen:
            length of the minhashing signature.
        """
        super(MinHashingBlocking, self).__init__(ref_attr_index, target_attr_index)
        self.threshold = threshold
        self.kwordsgram = kwordsgram
        self.siglen = siglen
        self.minhasher = Minlsh()
        # Size of the reference set; set by _fit() and used by _iter_blocks()
        # to tell reference indexes apart from target indexes.
        self.nb_elements = None

    def _fit(self, refset, targetset):
        """ Find the blocking using minhashing.

        The minhasher is trained on the reference values followed by the
        target values; missing (falsy) values are replaced by the identity
        element (the empty string) so every record gets a signature.
        """
        # If an element is None (missing), use instead the identity element.
        idelement = ''
        self.minhasher.train([elt[self.ref_attr_index] or idelement for elt in refset] +
                             [elt[self.target_attr_index] or idelement for elt in targetset],
                             self.kwordsgram, self.siglen)
        self.nb_elements = len(refset)

    def _iter_blocks(self):
        """ Iterator over the different possible blocks.

        Returns
        -------

        (block1, block2): The blocks are always (reference_block, target_block)
                          and contain the indexes of the records in the
                          corresponding dataset.
        """
        for data in self.minhasher.predict(self.threshold):
            refblock, targetblock = [], []
            for i in data:
                # Indexes >= nb_elements belong to the target set, which was
                # appended after the reference set during _fit().
                if i >= self.nb_elements:
                    targetblock.append(self.targetids[i - self.nb_elements])
                else:
                    refblock.append(self.refids[i])
            # One-sided blocks cannot yield any (reference, target) pair;
            # yielding directly avoids the former accumulate-then-refilter
            # pass over an intermediate `neighbours` list.
            if refblock and targetblock:
                yield refblock, targetblock

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.minhasher = Minlsh()
        self.nb_elements = None
 def test_all(self):
     """ End-to-end check: near-duplicate French sentences are clustered
     together by train() + predict().
     """
     sentences = (u"Un nuage flotta dans le grand ciel bleu.",
                  u"Des grands nuages noirs flottent dans le ciel.",
                  u"Je n'aime pas ce genre de bandes dessinées tristes.",
                  u"J'aime les bandes dessinées de genre comiques.",
                  u"Pour quelle occasion vous êtes-vous apprêtée ?",
                  u"Je les vis ensemble à plusieurs occasions.",
                  u"Je les ai vus ensemble à plusieurs occasions.",
                  )
     minlsh = Minlsh()
     # XXX Should work independently of the seed. Instability due to the
     # number of bands?
     simplified = (simplify(s, FRENCH_LEMMAS, remove_stopwords=True)
                   for s in sentences)
     minlsh.train(simplified, 1, 200)
     expected = {(0, 1), (2, 3), (5, 6)}
     self.assertEqual(expected, minlsh.predict(0.4))
 def __init__(self, ref_attr_index, target_attr_index,
              threshold=0.1, kwordsgram=1, siglen=200):
     """ Set up the minhashing blocking.

     ``threshold`` is the similarity threshold used at prediction time,
     ``kwordsgram`` the word-gram size and ``siglen`` the length of the
     minhashing signature.
     """
     super(MinHashingBlocking, self).__init__(ref_attr_index, target_attr_index)
     self.kwordsgram = kwordsgram
     self.siglen = siglen
     self.threshold = threshold
     # nb_elements is filled in at fit time, once the datasets are known.
     self.nb_elements = None
     self.minhasher = Minlsh()
 def _cleanup(self):
     """ Reset the internal state so the blocking can be reused,
     e.g. as a step in a processing pipeline.
     """
     self.nb_elements = None
     self.minhasher = Minlsh()