def calc_minhashes(_id, text, ds_key, sh_type, hashes, seeds, modulo):
    """Return the minhash signature of a raw markup document.

    The text is stripped of markup, lower-cased and whitespace-normalised,
    split into shingles, and each shingle hash is combined with every seed;
    the smallest value seen per seed forms the signature.

    Args:
        _id: Not used by this function.
        text: Raw document text; parsed with ``BeautifulSoup``.
        ds_key: Not used by this function.
        sh_type: ``'w'`` shingles on whitespace-separated words; any other
            value shingles via ``_get_list_of_shingles``. ``'c4'``
            additionally switches the hash to the struct-based one.
        hashes: Number of entries in the returned signature.
        seeds: Indexable of at least ``hashes`` values convertible with
            ``int``; ``seeds[i]`` is XOR-ed into every shingle hash.
        modulo: Modulus applied to every seeded hash value.

    Returns:
        A list of ``hashes`` integers. Entries keep their initial value
        ``sys.maxsize`` when the document yields no shingles.
    """
    def parse_text(raw):
        # Literal backslash-n sequences (two characters) become spaces
        # before the markup is parsed.
        soup = BeautifulSoup(raw.replace('\\n', ' '))
        # Drop script/style elements so their content is not shingled.
        for tag in soup(['script', 'style']):
            tag.extract()
        cleaned = soup.get_text(separator=' ', strip=True)
        cleaned = symbols.sub(' ', cleaned.lower())
        # Remove spurious white space characters
        return ' '.join(cleaned.split())

    def c4_hash(shingle):
        # Read the shingle as a little-endian signed 32-bit integer, then
        # reduce it modulo (sys.maxsize + 1) * 2 so the result is never
        # negative.
        # NOTE(review): struct.unpack needs a 4-byte bytes-like shingle;
        # confirm what _get_list_of_shingles yields.
        value = struct.unpack('<i', shingle)[0]
        return value % ((sys.maxsize + 1) * 2)

    text = parse_text(text)
    if sh_type == 'w':
        shingles = text.split()
    else:
        shingles = set(_get_list_of_shingles(text))

    minhashes = [sys.maxsize] * hashes
    if hashes <= 0 or not shingles:
        return minhashes

    # Seeds do not depend on the shingle: convert them once up front.
    # int() promotes to arbitrary precision on both Python 2 and 3.
    seed_values = [int(seeds[hno]) for hno in range(hashes)]
    hash_shingle = c4_hash if sh_type == 'c4' else compute_positive_hash

    for shingle in shingles:
        # The unseeded hash is the same for every seed: compute it once.
        base = hash_shingle(shingle)
        for hno, seed in enumerate(seed_values):
            minhashes[hno] = min((base ^ seed) % modulo, minhashes[hno])
    return minhashes
def calc_minhashes(parsed_text, sh_type, hashes, seeds, modulo):
    """Return the minhash signature of an already-parsed text.

    Args:
        parsed_text: Text to shingle. Split on whitespace when ``sh_type``
            is ``'w'``; otherwise handed to ``_get_list_of_shingles`` and
            de-duplicated.
        sh_type: Shingle type. ``'c4'`` hashes each shingle with the
            struct-based hash below; every other value uses
            ``compute_positive_hash``.
        hashes: Number of entries in the returned signature.
        seeds: Indexable of at least ``hashes`` values convertible with
            ``int``; ``seeds[i]`` is XOR-ed into every shingle hash.
        modulo: Modulus applied to every seeded hash value.

    Returns:
        A list of ``hashes`` integers, each the minimum over all shingles
        of ``(hash(shingle) ^ seed) % modulo``. Entries keep their initial
        value ``sys.maxsize`` when there are no shingles.
    """
    def c4_hash(shingle):
        # Read the shingle as a little-endian signed 32-bit integer, then
        # reduce it modulo (sys.maxsize + 1) * 2 so the result is never
        # negative.
        # NOTE(review): struct.unpack needs a 4-byte bytes-like shingle;
        # confirm what _get_list_of_shingles yields.
        value = struct.unpack('<i', shingle)[0]
        return value % ((sys.maxsize + 1) * 2)

    if sh_type == 'w':
        shingles = parsed_text.split()
    else:
        shingles = set(_get_list_of_shingles(parsed_text))

    minhashes = [sys.maxsize] * hashes
    if hashes <= 0 or not shingles:
        return minhashes

    # Seeds do not depend on the shingle: convert them once up front.
    # int() promotes to arbitrary precision on both Python 2 and 3.
    seed_values = [int(seeds[hno]) for hno in range(hashes)]
    hash_shingle = c4_hash if sh_type == 'c4' else compute_positive_hash

    for shingle in shingles:
        # The unseeded hash is the same for every seed: compute it once.
        base = hash_shingle(shingle)
        for hno, seed in enumerate(seed_values):
            minhashes[hno] = min((base ^ seed) % modulo, minhashes[hno])
    return minhashes
def test_get_list_of_shingles_none_doc_param_return_empty_list():
    """Shingling a ``None`` document with size 4 must give an empty list."""
    # execute
    shingles_found = shgl._get_list_of_shingles(None, 4)

    # asserts
    nt.eq_(shingles_found, [])
def test_get_list_of_shingles_return_non_empty_list():
    """Shingling the first generated document must match the fixture list."""
    # set up
    shingle_size = 4
    document = next(generator_string())
    wanted = get_faux_list_of_four_chars_long_strings()

    # execute
    produced = shgl._get_list_of_shingles(document, shingle_size)

    # asserts
    nt.eq_(produced, wanted)
def calc_minhashes(parsed_text, sh_type, hashes, seeds, modulo):
    """Compute a minhash signature for text that has already been cleaned.

    Args:
        parsed_text: Text to shingle. With ``sh_type == 'w'`` it is split
            on whitespace; otherwise ``_get_list_of_shingles`` produces the
            shingles, which are then de-duplicated.
        sh_type: Shingle type. ``'c4'`` selects the struct-based hash
            defined below; anything else uses ``compute_positive_hash``.
        hashes: Length of the signature to produce.
        seeds: Indexable holding at least ``hashes`` values accepted by
            ``int``; each one is XOR-ed into the shingle hashes.
        modulo: Modulus applied after the XOR.

    Returns:
        A list with ``hashes`` integers. Position ``i`` holds the smallest
        ``(hash(shingle) ^ seeds[i]) % modulo`` over all shingles, or
        ``sys.maxsize`` when no shingle was produced.
    """
    def c4_hash(shingle):
        # '<i' reads a little-endian signed 32-bit integer; the modulo by
        # (sys.maxsize + 1) * 2 makes the result non-negative.
        # NOTE(review): requires a 4-byte bytes-like shingle; confirm the
        # shingler's output type.
        return struct.unpack('<i', shingle)[0] % ((sys.maxsize + 1) * 2)

    if sh_type == 'w':
        shingles = parsed_text.split()
    else:
        shingles = set(_get_list_of_shingles(parsed_text))

    signature = [sys.maxsize] * hashes
    if hashes <= 0 or not shingles:
        return signature

    # Convert the seeds once instead of once per (shingle, seed) pair.
    # int() replaces the Python-2-only long() and works on both versions.
    int_seeds = [int(seeds[idx]) for idx in range(hashes)]
    hash_fn = c4_hash if sh_type == 'c4' else compute_positive_hash

    # Hash each shingle once, then fold it into every signature slot.
    for base in map(hash_fn, shingles):
        signature = [
            min((base ^ seed) % modulo, best)
            for seed, best in zip(int_seeds, signature)
        ]
    return signature
def shingles(self):
    """Return the shingles of ``self.text`` for the configured ``sh_type``.

    For ``sh_type == 'w'`` this is the list of whitespace-separated words,
    duplicates included. For every other type it is the set of shingles
    produced by ``_get_list_of_shingles``.
    """
    if self.sh_type == 'w':
        return self.text.split()
    return set(_get_list_of_shingles(self.text))
def shingles(self):
    """Shingle ``self.text`` according to ``self.sh_type``.

    Word mode (``'w'``) returns the whitespace-split word list unchanged;
    any other mode returns the de-duplicated output of
    ``_get_list_of_shingles``.
    """
    word_mode = self.sh_type == 'w'
    content = self.text
    if not word_mode:
        return set(_get_list_of_shingles(content))
    return content.split()
def shingle_text(text, sh_type):
    """Return the set of distinct shingles of ``text``.

    Args:
        text: Document text, or ``None`` for a missing document.
        sh_type: ``'w'`` shingles on whitespace-separated words; any other
            value delegates to ``_get_list_of_shingles``.

    Returns:
        A ``set`` of shingles. In word mode a ``None`` document yields an
        empty set instead of raising ``AttributeError``.
    """
    if sh_type != 'w':
        return set(_get_list_of_shingles(text))
    if text is None:
        # None has no .split(); treat a missing document as having no words.
        return set()
    return set(text.split())
def shingle_text(text, sh_type):
    """Build the set of unique shingles for ``text``.

    Args:
        text: Document text; ``None`` is accepted and treated as empty in
            word mode.
        sh_type: Shingle type. ``'w'`` means whitespace-separated words;
            every other value uses ``_get_list_of_shingles``.

    Returns:
        A ``set`` containing each distinct shingle once.
    """
    if sh_type == 'w':
        # Guard against None, which has no .split(); an absent document
        # contributes no words.
        pieces = text.split() if text is not None else ()
    else:
        pieces = _get_list_of_shingles(text)
    return set(pieces)