예제 #1
0
    def calc_minhashes(_id, text, ds_key, sh_type, hashes, seeds, modulo):
        """Compute a min-hash signature for a raw document.

        The document is stripped of markup, lower-cased and whitespace
        normalised, split into shingles, and every distinct shingle is
        hashed once. That hash is XOR-ed with each of the first ``hashes``
        seeds and reduced modulo ``modulo``; the smallest value seen for
        each seed is kept.

        NOTE: relies on the Python 2 builtins ``long`` and ``xrange``.

        Args:
            _id: Not used by this function.
            text: Raw document text (parsed with BeautifulSoup).
            ds_key: Not used by this function.
            sh_type: Shingle type. ``'w'`` splits the cleaned text on
                whitespace; any other value takes the shingles from
                ``_get_list_of_shingles``. ``'c4'`` additionally hashes
                each shingle by unpacking it with struct format ``'<i'``;
                every other value hashes with ``compute_positive_hash``.
            hashes: Number of min-hash values to produce.
            seeds: Indexable collection; ``seeds[0] .. seeds[hashes - 1]``
                are converted with ``long`` and used as XOR masks.
            modulo: Modulus applied to every seeded hash.

        Returns:
            A list of ``hashes`` integers. Every slot starts at
            ``sys.maxsize`` and keeps that value when there are no
            shingles.
        """
        def parse_text(raw):
            """Return the visible text of ``raw``, lower-cased and squeezed."""
            # '\\n' is a literal backslash followed by 'n' (an escaped
            # newline inside the input), not an actual newline character.
            soup = BeautifulSoup(raw.replace('\\n', ' '))
            # Remove script/style elements so their contents do not leak
            # into the extracted text.
            for element in soup(['script', 'style']):
                element.extract()
            cleaned = soup.get_text(separator=' ', strip=True)
            cleaned = symbols.sub(' ', cleaned.lower())
            # Collapse runs of white space into single spaces.
            return ' '.join(cleaned.split())

        def c4_hash(shingle):
            """Interpret ``shingle`` as a 32-bit int and make it non-negative."""
            value = struct.unpack('<i', shingle)[0]
            return value % ((sys.maxsize + 1) * 2)

        text = parse_text(text)
        # Duplicate shingles cannot lower a minimum, so deduplicating the
        # word shingles (as is already done for the other types) only
        # avoids redundant hashing.
        if sh_type == 'w':
            shingles = set(text.split())
        else:
            shingles = set(_get_list_of_shingles(text))

        minhashes = [sys.maxsize] * hashes
        if not shingles or hashes <= 0:
            # Nothing to hash: seeds and hash functions are never touched.
            return minhashes

        base_hash = c4_hash if sh_type == 'c4' else compute_positive_hash
        # Seed conversion does not depend on the shingle; do it once.
        seed_values = [long(seeds[hno]) for hno in xrange(hashes)]
        for shingle in shingles:
            # The unseeded hash is the same for every seed; compute it once
            # per shingle (assumes compute_positive_hash is deterministic).
            hashed = base_hash(shingle)
            for hno, seed in enumerate(seed_values):
                h_value = operator.xor(hashed, seed) % modulo
                minhashes[hno] = min(h_value, minhashes[hno])
        return minhashes
예제 #2
0
def calc_minhashes(parsed_text, sh_type, hashes, seeds, modulo):
    """Compute a min-hash signature for already-parsed text.

    The text is split into shingles and every distinct shingle is hashed
    once. That hash is XOR-ed with each of the first ``hashes`` seeds and
    reduced modulo ``modulo``; the smallest value seen for each seed is
    kept.

    NOTE: relies on the Python 2 builtins ``long`` and ``xrange``.

    Args:
        parsed_text: Text to shingle.
        sh_type: Shingle type. ``'w'`` splits the text on whitespace; any
            other value takes the shingles from ``_get_list_of_shingles``.
            ``'c4'`` additionally hashes each shingle by unpacking it with
            struct format ``'<i'``; every other value hashes with
            ``compute_positive_hash``.
        hashes: Number of min-hash values to produce.
        seeds: Indexable collection; ``seeds[0] .. seeds[hashes - 1]`` are
            converted with ``long`` and used as XOR masks.
        modulo: Modulus applied to every seeded hash.

    Returns:
        A list of ``hashes`` integers. Every slot starts at
        ``sys.maxsize`` and keeps that value when there are no shingles.
    """
    def c4_hash(shingle):
        """Interpret ``shingle`` as a 32-bit int and make it non-negative."""
        value = struct.unpack('<i', shingle)[0]
        return value % ((sys.maxsize + 1) * 2)

    # Duplicate shingles cannot lower a minimum, so deduplicating the word
    # shingles (as is already done for the other types) only avoids
    # redundant hashing.
    if sh_type == 'w':
        shingles = set(parsed_text.split())
    else:
        shingles = set(_get_list_of_shingles(parsed_text))

    minhashes = [sys.maxsize] * hashes
    if not shingles or hashes <= 0:
        # Nothing to hash: seeds and hash functions are never touched.
        return minhashes

    base_hash = c4_hash if sh_type == 'c4' else compute_positive_hash
    # Seed conversion does not depend on the shingle; do it once.
    seed_values = [long(seeds[hno]) for hno in xrange(hashes)]
    for shingle in shingles:
        # The unseeded hash is the same for every seed; compute it once per
        # shingle (assumes compute_positive_hash is deterministic).
        hashed = base_hash(shingle)
        for hno, seed in enumerate(seed_values):
            h_value = operator.xor(hashed, seed) % modulo
            minhashes[hno] = min(h_value, minhashes[hno])
    return minhashes
예제 #3
0
def test_get_list_of_shingles_none_doc_param_return_empty_list():
    """A ``None`` document with shingle size 4 must yield an empty list."""
    nt.eq_(shgl._get_list_of_shingles(None, 4), [])
def test_get_list_of_shingles_none_doc_param_return_empty_list():
    """Shingling ``None`` (size 4) is expected to give back ``[]``."""
    missing_doc, shingle_size = None, 4

    produced = shgl._get_list_of_shingles(missing_doc, shingle_size)

    # No document means no shingles.
    nt.eq_(produced, [])
예제 #5
0
def test_get_list_of_shingles_return_non_empty_list():
    """Shingling a generated document (size 4) must match the faux list."""
    shingle_size = 4
    document = next(generator_string())
    expected = get_faux_list_of_four_chars_long_strings()

    nt.eq_(shgl._get_list_of_shingles(document, shingle_size), expected)
def test_get_list_of_shingles_return_non_empty_list():
    """Compare size-4 shingles of a generated string with the fixture list."""
    # Fixture: first string from the generator, and the reference shingles.
    shingle_len = 4
    sample_text = next(generator_string())
    reference = get_faux_list_of_four_chars_long_strings()

    # Exercise the shingler.
    produced = shgl._get_list_of_shingles(sample_text, shingle_len)

    # The produced shingles must equal the reference.
    nt.eq_(produced, reference)
def calc_minhashes(parsed_text, sh_type, hashes, seeds, modulo):
    """Return the min-hash signature of ``parsed_text``.

    Each distinct shingle of the text is hashed once, the hash is XOR-ed
    with every one of the first ``hashes`` seeds and taken modulo
    ``modulo``, and the per-seed minimum over all shingles is recorded.

    NOTE: relies on the Python 2 builtins ``long`` and ``xrange``.

    Args:
        parsed_text: Text to shingle.
        sh_type: Shingle type. ``'w'`` splits the text on whitespace; any
            other value takes the shingles from ``_get_list_of_shingles``.
            ``'c4'`` additionally hashes each shingle by unpacking it with
            struct format ``'<i'``; every other value hashes with
            ``compute_positive_hash``.
        hashes: Number of min-hash values to produce.
        seeds: Indexable collection; ``seeds[0] .. seeds[hashes - 1]`` are
            converted with ``long`` and used as XOR masks.
        modulo: Modulus applied to every seeded hash.

    Returns:
        A list of ``hashes`` integers. Every slot starts at
        ``sys.maxsize`` and keeps that value when there are no shingles.
    """
    def c4_hash(shingle):
        """Unpack ``shingle`` as a signed 32-bit int, folded non-negative."""
        unpacked = struct.unpack('<i', shingle)[0]
        return unpacked % ((sys.maxsize + 1) * 2)

    # A repeated shingle can never change a minimum, so word shingles are
    # deduplicated just like the other shingle types to skip redundant
    # hashing.
    if sh_type == 'w':
        shingles = set(parsed_text.split())
    else:
        shingles = set(_get_list_of_shingles(parsed_text))

    minhashes = [sys.maxsize] * hashes
    if not shingles or hashes <= 0:
        # Nothing to hash: seeds and hash functions are never touched.
        return minhashes

    base_hash = c4_hash if sh_type == 'c4' else compute_positive_hash
    # Convert the seeds once instead of once per shingle.
    seed_values = [long(seeds[hno]) for hno in xrange(hashes)]
    for shingle in shingles:
        # The unseeded hash is shared by all seeds; compute it once per
        # shingle (assumes compute_positive_hash is deterministic).
        hashed = base_hash(shingle)
        for hno, seed in enumerate(seed_values):
            h_value = operator.xor(hashed, seed) % modulo
            minhashes[hno] = min(h_value, minhashes[hno])
    return minhashes
예제 #8
0
 def shingles(self):
     """Return the shingles of ``self.text`` according to ``self.sh_type``.

     For ``'w'`` this is the list of whitespace-separated words (duplicates
     kept); for any other type it is the set built from
     ``_get_list_of_shingles(self.text)``.
     """
     if self.sh_type == 'w':
         return self.text.split()
     return set(_get_list_of_shingles(self.text))
예제 #9
0
 def shingles(self):
     """Shingle ``self.text``: a word list for type ``'w'``, else a set.

     Non-word types delegate to ``_get_list_of_shingles`` and deduplicate
     its output with ``set``.
     """
     word_mode = self.sh_type == 'w'
     if not word_mode:
         return set(_get_list_of_shingles(self.text))
     return self.text.split()
 def shingle_text(text, sh_type):
     """Return the set of distinct shingles of ``text``.

     ``sh_type == 'w'`` uses whitespace-separated words; any other value
     uses the output of ``_get_list_of_shingles(text)``.
     """
     if sh_type == 'w':
         pieces = text.split()
     else:
         pieces = _get_list_of_shingles(text)
     return set(pieces)
 def shingle_text(text, sh_type):
     """Build the deduplicated shingle set for ``text``.

     Words (split on whitespace) are used when ``sh_type`` is ``'w'``;
     otherwise the shingles come from ``_get_list_of_shingles``.
     """
     if sh_type != 'w':
         return set(_get_list_of_shingles(text))
     return set(text.split())