def test_iter_wordgrams(self):
     """ Check that the 1- and 2-wordgrams of a sentence are each generated
     exactly once.
     """
     sentence = 'nom de la rose'
     minlsh = Minlsh()
     results = list(minlsh._iter_wordgrams(sentence, 2))
     truth = ['nom de', 'nom', 'de la', 'de', 'la rose', 'la', 'rose']
     # Comparing sorted lists asserts multiset equality in one shot: unlike
     # the former len() + set() pair, it also fails when a gram is produced
     # the wrong number of times.
     self.assertEqual(sorted(results), sorted(truth))
 def test_iter_wordgrams_sklearn(self):
     """ Check wordgram extraction through a sklearn-style CountVectorizer
     tokenizer function (1- and 2-grams over two sentences).
     """
     sentences = ('nom de la rose', 'nom de la')
     tokenizer_func = partial(count_vectorizer_func, min_n=1, max_n=2)
     minlsh = Minlsh(tokenizer_func=tokenizer_func)
     # _buildmatrixdocument yields a (rows, shape) pair; tuple unpacking
     # consumes it directly, the list() wrapper was redundant.
     rows, shape = minlsh._buildmatrixdocument(sentences, 2)
     self.assertEqual(shape, (2, 7))
     self.assertEqual(rows[0], [0, 1, 2, 3, 4, 5, 6])
     self.assertEqual(rows[1], [0, 1, 2, 4, 5])
class MinHashingBlocking(BaseBlocking):
    """ A blocking technique based on MinHashing.

    Records from the reference and target sets are hashed together; records
    whose minhash signatures collide (Jaccard similarity above ``threshold``)
    end up in a common block.
    """
    def __init__(self, ref_attr_index, target_attr_index,
                 threshold=0.1, kwordsgram=1, siglen=200):
        """ Initialize the blocking.

        Parameters
        ----------
        ref_attr_index, target_attr_index:
            index of the attribute to hash in each record.
        threshold:
            similarity threshold passed to ``Minlsh.predict()``.
        kwordsgram:
            size of the word-grams used to tokenize attribute values.
        siglen:
            length of the minhashing signature.
        """
        super(MinHashingBlocking, self).__init__(ref_attr_index, target_attr_index)
        self.threshold = threshold
        self.kwordsgram = kwordsgram
        self.siglen = siglen
        self.minhasher = Minlsh()
        # Size of the reference set; set by _fit() and used by _iter_blocks()
        # to tell reference indexes apart from target indexes.
        self.nb_elements = None

    def _fit(self, refset, targetset):
        """ Find the blocking using minhashing.

        The minhasher is trained on the reference values followed by the
        target values; missing (falsy) values are replaced by the identity
        element (the empty string) so every record gets a signature.
        """
        # If an element is None (missing), use instead the identity element.
        idelement = ''
        self.minhasher.train([elt[self.ref_attr_index] or idelement for elt in refset] +
                             [elt[self.target_attr_index] or idelement for elt in targetset],
                             self.kwordsgram, self.siglen)
        self.nb_elements = len(refset)

    def _iter_blocks(self):
        """ Iterator over the different possible blocks.

        Returns
        -------

        (block1, block2): The blocks are always (reference_block, target_block)
                          and contain the indexes of the records in the
                          corresponding dataset.
        """
        for data in self.minhasher.predict(self.threshold):
            refblock, targetblock = [], []
            for i in data:
                # Indexes >= nb_elements belong to the target set, which was
                # appended after the reference set during _fit().
                if i >= self.nb_elements:
                    targetblock.append(self.targetids[i - self.nb_elements])
                else:
                    refblock.append(self.refids[i])
            # One-sided blocks cannot yield any (reference, target) pair;
            # yielding directly avoids the former accumulate-then-refilter
            # pass over an intermediate `neighbours` list.
            if refblock and targetblock:
                yield refblock, targetblock

    def _cleanup(self):
        """ Cleanup blocking for further use (e.g. in pipeline)
        """
        self.minhasher = Minlsh()
        self.nb_elements = None
 def test_all(self):
     """ End-to-end check: near-duplicate French sentences are clustered
     together by train() + predict().
     """
     sentences = (u"Un nuage flotta dans le grand ciel bleu.",
                  u"Des grands nuages noirs flottent dans le ciel.",
                  u"Je n'aime pas ce genre de bandes dessinées tristes.",
                  u"J'aime les bandes dessinées de genre comiques.",
                  u"Pour quelle occasion vous êtes-vous apprêtée ?",
                  u"Je les vis ensemble à plusieurs occasions.",
                  u"Je les ai vus ensemble à plusieurs occasions.",
                  )
     minlsh = Minlsh()
     # XXX Should work independently of the seed. Instability due to the
     # number of bands?
     simplified = (simplify(s, FRENCH_LEMMAS, remove_stopwords=True)
                   for s in sentences)
     minlsh.train(simplified, 1, 200)
     expected = {(0, 1), (2, 3), (5, 6)}
     self.assertEqual(expected, minlsh.predict(0.4))
 def __init__(self, ref_attr_index, target_attr_index,
              threshold=0.1, kwordsgram=1, siglen=200):
     """ Set up the minhashing blocking.

     ``threshold`` is the similarity threshold used at prediction time,
     ``kwordsgram`` the word-gram size and ``siglen`` the length of the
     minhashing signature.
     """
     super(MinHashingBlocking, self).__init__(ref_attr_index, target_attr_index)
     self.kwordsgram = kwordsgram
     self.siglen = siglen
     self.threshold = threshold
     # nb_elements is filled in at fit time, once the datasets are known.
     self.nb_elements = None
     self.minhasher = Minlsh()
 def _cleanup(self):
     """ Reset the internal state so the blocking can be reused,
     e.g. as a step in a processing pipeline.
     """
     self.nb_elements = None
     self.minhasher = Minlsh()