def add(self, uuid):
    """Record one key in the HyperLogLog registers.

    Falsy keys (``None``, empty string, ...) are ignored and leave the
    registers untouched.

    Args:
        uuid: The key to add. It is handed to ``smhasher.murmur3_x86_64``
            as-is; if that raises ``UnicodeEncodeError`` the key is
            re-encoded to ASCII with unencodable characters dropped and
            hashed again.
    """
    if not uuid:
        return

    # Hash the key, retrying with an ASCII-only encoding when the hasher
    # cannot encode the text it was given.
    try:
        digest = smhasher.murmur3_x86_64(uuid)
    except UnicodeEncodeError:
        ascii_key = uuid.encode('ascii', 'ignore')
        digest = smhasher.murmur3_x86_64(ascii_key)

    # The lowest self.b bits of the hash select the register to update.
    index_mask = (1 << self.b) - 1
    register = digest & index_mask

    # The bits left after discarding the index feed the rho computation.
    remainder = digest >> self.b

    # A register only ever grows: keep the larger of the stored value and
    # the rho of this key.
    current = self.M[register]
    rho = self._get_rho(remainder, self.bitcount_arr)
    self.M[register] = max(current, rho)
def _make_hashfuncs(key):
    """Return the list of bit positions derived from *key*.

    One position is produced per slice. Each hash is seeded with the
    previous hash value (the first one with 0), and each position is the
    hash reduced modulo ``nbr_bits``.

    ``nbr_slices`` and ``nbr_bits`` are read from the enclosing scope,
    which is not part of this block.

    Args:
        key: ``unicode`` keys are encoded as UTF-8; anything else is
            converted with ``str()`` before hashing.

    Returns:
        A list of ``nbr_slices`` integers.
    """
    data = key.encode('utf-8') if isinstance(key, unicode) else str(key)

    positions = []
    seed = 0
    for _ in range(nbr_slices):
        digest = smhasher.murmur3_x86_64(data, seed)
        positions.append(digest % nbr_bits)
        # Chain the hashes: the next round is seeded with this digest
        # (a falsy digest falls back to a seed of 0).
        seed = digest or 0
    return positions
# split the line by \t doc_id,content = line.split("\t") if( (not (doc_id)) or (not(content))): print "line not formated properly" break # remove punctuations content = test_re(content) # split into an array of words content_arr = content.split() #print len(content_arr) shingle = "" # if line len is less than 8 then just emit the whole line as shingle if( len(content_arr) < k): shingle = ''.join(content_arr) #s = "%s\t%s"%(doc_id,shingle) shing_hash = smhasher.murmur3_x86_64(shingle) # now output this 64 times 0 - 63 for j in xrange(64): # calculate the hash of shingle # test j'th Bit bit_num = testBit(shing_hash,j) # set bit_out = 1 if its set and bit out = 0 if not if bit_num > 0: bit_out = 1 else: bit_out = -1 #print "bit num",j,"is =",bit_out # we have to pass the bit as string( can hadoop accept bints / floats ) s = "%s,%s\t%s"%(doc_id,str(j),str(bit_out)) # to avoid overhead of copying why dont u just output +1 or -1 print s
def hash(tohash):
    """Hash ``str(tohash)`` with ``smhasher.murmur3_x86_64``.

    NOTE: this function's name shadows the builtin ``hash`` in the
    defining module.

    Args:
        tohash: Any object; its ``str()`` form is what gets hashed.

    Returns:
        Whatever ``smhasher.murmur3_x86_64`` returns for that string.
    """
    text = str(tohash)
    return smhasher.murmur3_x86_64(text)
#!/usr/bin/env python
"""Streaming mapper: emit one tab-delimited record per input word.

Reads text from STDIN and writes ``<word><TAB><hash>`` lines to STDOUT.
The original comments describe that output as the input of the reduce
step (reducer.py).
"""
import sys

# The working directory must be on the path before smhasher is imported.
sys.path.append('.')
import smhasher

# NOTE(review): the emitted value is the hash of the fixed string "hello",
# not of the word itself, so every record carries the same number. This
# looks like a smoke test of smhasher; confirm before relying on the output.
# The value never changes, so it is computed once rather than once per word.
hello_hash = str(smhasher.murmur3_x86_64("hello"))

for line in sys.stdin:
    # Drop surrounding whitespace, then split on runs of whitespace.
    for word in line.strip().split():
        # Parenthesised single-argument print emits the same bytes under
        # the Python 2 print statement and the Python 3 print function.
        print('%s\t%s' % (word, hello_hash))
def get_hash64(self, str):
    """Return the ``smhasher.murmur3_x86_64`` hash of the given value.

    Args:
        str: The value to hash, passed to the hasher unchanged. The
            parameter name shadows the builtin ``str`` inside this
            method; it is kept because it is part of the signature.

    Returns:
        The value returned by ``smhasher.murmur3_x86_64``.
    """
    digest = smhasher.murmur3_x86_64(str)
    return digest
def hash_bytes(self, s):
    """Return the ``smhasher.murmur3_x86_64`` hash of *s*.

    Args:
        s: The value to hash; it is forwarded to the hasher unchanged.

    Returns:
        The value returned by ``smhasher.murmur3_x86_64``.
    """
    digest = smhasher.murmur3_x86_64(s)
    return digest
def hash_str(s):
    """Return the ``smhasher.murmur3_x86_64`` hash of *s*.

    Args:
        s: The value to hash; it is forwarded to the hasher unchanged.

    Returns:
        The value returned by ``smhasher.murmur3_x86_64``.
    """
    digest = smhasher.murmur3_x86_64(s)
    return digest
def hash(tohash):
    """Hash ``str(tohash)`` with ``smhasher.murmur3_x86_64``.

    NOTE: this function's name shadows the builtin ``hash`` in the
    defining module.

    Args:
        tohash: Any object; its ``str()`` form is what gets hashed.

    Returns:
        Whatever ``smhasher.murmur3_x86_64`` returns for that string.
    """
    as_text = str(tohash)
    return smhasher.murmur3_x86_64(as_text)