Example #1
 def _hash_bits(self, key):
     # http://spyced.blogspot.com/2009
     # /01/all-you-ever-wanted-to-know-about.html
     hash1 = mmh3.hash(key, 0)
     hash2 = mmh3.hash(key, hash1)
     for i in range(self._hash_funcs):
         yield abs((hash1 + i * hash2) % self._bits_per_slice)
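The pair of murmur hashes above implements the Kirsch-Mitzenmacher double-hashing trick from the linked post: hash1 + i * hash2 simulates any number of hash functions from just two. A minimal, self-contained Bloom filter sketch built around the same generator (class name and sizes here are illustrative, not from the original project):

import mmh3
from bitarray import bitarray

class TinyBloom(object):
    def __init__(self, bits_per_slice=1024, hash_funcs=7):
        self._bits_per_slice = bits_per_slice
        self._hash_funcs = hash_funcs
        self._bits = bitarray(bits_per_slice)
        self._bits.setall(False)

    def _hash_bits(self, key):
        hash1 = mmh3.hash(key, 0)
        # masking keeps the derived seed in uint32 range for newer mmh3 releases
        hash2 = mmh3.hash(key, hash1 & 0xffffffff)
        for i in range(self._hash_funcs):
            yield abs((hash1 + i * hash2) % self._bits_per_slice)

    def add(self, key):
        for b in self._hash_bits(key):
            self._bits[b] = True

    def __contains__(self, key):
        return all(self._bits[b] for b in self._hash_bits(key))

bf = TinyBloom()
bf.add('hello')
print('hello' in bf, 'world' in bf)  # True False (False with high probability)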
Example #2
def data(path, label_path=None):
    fd = open(path)
    fd.readline() # skip headers
    hash_cols = [3,4,34,35,61,64,65,91,94,95]
    npairs = len(hash_cols)
    x = [0] * (146 + npairs*(npairs-1)/2)
    if label_path:
        label = open(label_path)
        label.readline() # skip headers
    for t, line in enumerate(fd):
        # parse x
        row = line.rstrip().split(',')
        for m, feat in enumerate(row):
            if m == 0:
                ID = int(feat)
            else:
                # one-hot encode everything with hash trick
                # categorical: one-hotted
                # boolean: ONE-HOTTED
                # numerical: ONE-HOTTED!
                # note: the built-in hash(), although fast, is not stable,
                #       i.e., the same value won't always get the same hash
                #       across machines or runs
                x[m] = abs(mmh3.hash(str(m) + '_' + feat)) % D
        for i in xrange(10):
            for j in xrange(i+1,10):
                m += 1
                x[m] = abs(mmh3.hash(str(m)+'_'+row[hash_cols[i]]+"_x_"+row[hash_cols[j]])) % D
        # parse y, if provided
        if label_path:
            # use float() to prevent future type casting, [1:] to ignore id
            y = [float(y) for y in label.readline().split(',')[1:]]
        yield (ID, x, y) if label_path else (ID, x)
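The comments above are the reason mmh3 keeps appearing in these feature pipelines: Python's built-in hash() is randomized per process (PYTHONHASHSEED), so hashed feature indices would differ across machines and runs, while mmh3 is fully deterministic. A small sketch of the same one-hot hashing trick on a toy row (D and the row layout are illustrative):

import mmh3

D = 2 ** 20  # illustrative feature-space size

def hash_features(row):
    # map 'column_value' strings to stable indices in [0, D)
    return [abs(mmh3.hash('%d_%s' % (m, feat))) % D
            for m, feat in enumerate(row)]

print(hash_features(['a', 'b', '1']))  # identical output on every machine and run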
Example #3
File: m_api.py Project: J4LP/mumble
 def authenticate(self, name, password, certificates, certhash, certstrong, current=None):
     with self.app.app_context():
         if name == 'SuperUser':
             return RET_FALLTHROUGH
         user = User.query.filter_by(user_id=name).first()
         if not user:
             try:
                 uuid.UUID(name, version=4)
             except ValueError:
                 return RET_DENIED
             guest_user = GuestUser.query.get(name)
             if guest_user:
                 if not guest_user.password == password or guest_user.banned:
                     return RET_DENIED
                 if guest_user.corporation:
                     self.app.logger.debug('Authenticating guest with: {} {} {}'.format(abs(mmh3.hash(guest_user.id.hex)), '[{}][GUEST] {}'.format(self.get_ticker(guest_user.corporation), guest_user.name), [u'Guest']))
                     return abs(mmh3.hash(guest_user.id.hex)), '[{}][GUEST] {}'.format(self.get_ticker(guest_user.corporation), guest_user.name), [u'Guest']
                 else:
                     self.app.logger.debug('Authenticating guest with: {} {} {}'.format(abs(mmh3.hash(guest_user.id.hex)), '[GUEST] {}'.format(guest_user.name), [u'Guest']))
                     return abs(mmh3.hash(guest_user.id.hex)), '[GUEST] {}'.format(guest_user.name), ['Guest']
             else:
                 return RET_DENIED
         if not user.mumble_password == password:
             return RET_DENIED
         self.app.logger.debug('Authenticating user with: {} {} {}'.format(mmh3.hash(user.user_id), '[{}] {}'.format(self.get_ticker(user.corporation_name), user.main_character), user.groups))
         return mmh3.hash(user.user_id), '[{}] {}'.format(self.get_ticker(user.corporation_name), user.main_character), user.groups
Example #4
File: bloom.py Project: dariajung/bloom
    def hash(self, string):
        hash_arr = []
        hash1 = mmh3.hash(string, 0)
        hash2 = mmh3.hash(string, hash1)
        for i in range(self.k):
            hash_arr.append(abs((hash1 + i * hash2) % self.m))

        return hash_arr
Example #5
 def Hashmap_WordVector(self, nbits):
     length = len(self.Words_Vector)
     self.bl_bits = nbits
     self.bloom_vector = self.bl_bits * bitarray('0')
     for i in range(length):
         self.hashmap1.append(mmh3.hash(self.Words_Vector[i]) % self.bl_bits)
         self.hashmap2.append(mmh3.hash(self.Words_Vector[i], self.hashmap1[i]) % self.bl_bits)
         self.hashmap3.append(mmh3.hash(self.Words_Vector[i], self.hashmap2[i]) % self.bl_bits)
         self.bloom_vector[self.hashmap1[i]] = 1
         self.bloom_vector[self.hashmap2[i]] = 1
         self.bloom_vector[self.hashmap3[i]] = 1
Example #6
def get_hash(label,namespace,feature,stride,mask):
    if namespace:
        namespace_hash = mmh3.hash(namespace,0)
    else:
        namespace_hash = 0
    if is_number(feature):
        feature_hash = int(feature) + namespace_hash
    else:
        feature_hash = mmh3.hash(feature,namespace_hash)
    feature_hash_oaa = feature_hash * stride
    return (feature_hash_oaa + label - 1) & mask
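This mirrors Vowpal Wabbit's hashing scheme: the namespace hash seeds the feature hash, purely numeric feature names are used as indices directly (offset by the namespace hash), and stride/mask pick the weight slot for a one-against-all label. A hedged usage sketch (is_number is a stand-in for the helper the snippet assumes; parameter values are arbitrary):

def is_number(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

mask = (1 << 18) - 1  # 2**18 weight slots
print(get_hash(1, None, 'age', 1, mask))  # hashed string feature
print(get_hash(1, None, '42', 1, mask))   # numeric feature: index 42 used directly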
Example #7
 def parse_block(block):
     index_block = []
     for file_path in block:
         file_path_hash = mmh3.hash(file_path)
         with open(file_path, 'r') as input_file:
             for line in input_file:
                 items = line.strip().split(' ')
                 index_block.append(
                     (mmh3.hash(items[0]),
                      [file_path_hash,
                      items[1]])
                 )
     return index_block
Example #8
def select_terms_meta(query_terms, term_dict_stream):
    """
    reads term dictionary generator and selects query terms meta info
    """
    terms_meta_dict = {}

    for term in query_terms:
        term_hash = mmh3.hash(term.encode("utf-8"))
        terms_meta_dict[term_hash] = {
            "term": term,
            "seek_offset": None,
            "size": None
        }

    seek_offset = 0
    unseen_terms = set(terms_meta_dict)  # a set: dict.keys() has no remove() on Python 3
    for dict_term_hash, dict_term_size in term_dict_stream:
        if dict_term_hash in unseen_terms:
            terms_meta_dict[dict_term_hash]["seek_offset"] = seek_offset
            terms_meta_dict[dict_term_hash]["size"] = dict_term_size

            unseen_terms.remove(dict_term_hash)
            if len(unseen_terms) == 0:
                break

        seek_offset += dict_term_size

    query_terms_dict = {}
    for _, term_meta in terms_meta_dict.items():
        query_terms_dict[term_meta["term"]] = {
            "seek_offset": term_meta["seek_offset"],
            "size": term_meta["size"]
        }

    return query_terms_dict
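The function streams once over (hash, size) pairs from the term dictionary, accumulating a running seek offset so each query term's postings can later be read straight from the index file. A toy run against an in-memory stream (terms and sizes are made up):

import mmh3

stream = [(mmh3.hash(t.encode('utf-8')), 100 + 10 * i)
          for i, t in enumerate(['apple', 'pear', 'banana'])]
print(select_terms_meta(['apple', 'banana'], stream))
# {'apple': {'seek_offset': 0, 'size': 100},
#  'banana': {'seek_offset': 210, 'size': 120}}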
Example #9
    def _indices(self, x):
        ''' A helper generator that yields the indices in x

            The purpose of this generator is to make the following
            code a bit cleaner when doing feature interaction.
        '''

        # first yield index of the bias term
        yield 0, 1.

        # then yield the linear indices
        if self.interaction != 2:
            for i,val in x:
                yield i,val

        # now yield interactions (if applicable)
        if self.interaction:
            D = self.D
            L = len(x)

            x = sorted(x)
            for i in xrange(L):
                for j in xrange(i+1, L):
                    # one-hot encode interactions with hash trick
                    yield abs(hash(str(x[i][0]) + '_' + str(x[j][0]))) % D, x[i][1]*x[j][1]
Example #10
 def process(self):
     # load data
     data = self.load()
     # index to elastic search
     print "\nStart processing"
     cursor = Cursor(self.es, self.data_from)
     cursor_num = cursor.get_new_cursor()
     for each_data in data:
         key_string = ''
         for each_key_string in key_value:
             key_string += each_data[each_key_string]
         hashkey = mmh3.hash(key_string)
         print "parsing id: ", hashkey
         # try to read record
         try:
             res = self.es.get(index="deltadb",
                               doc_type="data",
                               id=hashkey)
             if res["found"]:
                 node = self.update_node(res["_source"], each_data, cursor_num)
             else:
                 node = self.create_node(each_data, cursor_num)
         except:
             node = self.create_node(each_data, cursor_num)
         # insert back to es
         try:
             res = self.es.index(index="deltadb", 
                                 doc_type="data", 
                                 id=hashkey, 
                                 body=node)
         except:
             continue
     print "\nProcess finish."
Example #11
def getHash(word):
	'''
		Returns the hash value masked with 0xffffffffL so it behaves as an unsigned 32-bit value
	'''
	curHash = mmh3.hash(word)
	curHash = curHash & 0xffffffffL
	return curHash
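Masking with 0xffffffffL just reinterprets mmh3's signed 32-bit result as unsigned; newer mmh3 releases expose the same thing directly through the signed keyword used by several later examples. A quick equivalence check (sketch):

import mmh3

word = 'example'
assert mmh3.hash(word) & 0xffffffff == mmh3.hash(word, signed=False)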
Example #12
def shingles2sketch(shingles, m_baskets=20):
    baskets = defaultdict(lambda: -float("inf"))
    for shingle in shingles:
        h = mmh3.hash(shingle.encode('utf8'))
        if baskets[h % m_baskets] < h:
            baskets[h % m_baskets] = h
    return sorted(baskets.values())
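Each shingle hash is routed to one of m_baskets buckets by h % m_baskets, and the bucket keeps its maximum hash, giving a fixed-size sketch whose overlap approximates set similarity, in the spirit of one-permutation MinHash. A short usage sketch (the shingle helper below is illustrative):

def text2shingles(text, k=4):
    # character k-shingles (illustrative)
    return {text[i:i + k] for i in range(len(text) - k + 1)}

a = shingles2sketch(text2shingles('the quick brown fox'))
b = shingles2sketch(text2shingles('the quick brown dog'))
print(len(set(a) & set(b)) / float(max(len(a), len(b))))  # rough similarity in [0, 1]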
Example #13
 def save_cursor(self, cursor_data):
     cursor_id = mmh3.hash(self.data_from)
     res = self.es.index(index="lookup", 
                         doc_type="data", 
                         id=cursor_id, 
                         body=cursor_data)
     return
Example #14
def saveSuccessCrawlDoc(crawldoc):
    '''step 1: save a successfully crawled crawldoc to crawl_result, ensuring docid is unique
       step 2: save outlinks (newly found urls) to crawl_pending
       step 3: update the url's status in crawl_pending to crawled'''
    values = crawldoc.convert
    # only string can save to db, change dict or list to string.
    # reference: cccrawler.proto.db.models.CrawlResult
    values['reservation_dict'] = str(crawldoc.reservation_dict)
    values['history'] = str(values['history'])
    values['header'] = str(values['header'])
    values['created_at'] = timeutils.utcnow_ts()
    utils.convert_datetimes(values, 'created_at', 'deleted_at', 'updated_at')
    crawldoc_ref = models.CrawlResult()
    crawldoc_ref.update(values)
    crawldoc_ref.save()

    _updateCrawlStatus(crawldoc.pending_id,'crawled',crawlfail=False)

    cl = deweight.get_client()
    fresh_docs = []
    for doc in crawldoc.outlinks:
        real_url = urlutils.normalize(doc.url)
        docid = mmh3.hash(real_url)
        if not cl.has(docid):
            fresh_doc = addPendingCrawlDocDict(doc.url, int(crawldoc.level),
                        crawldoc.docid, crawldoc.reservation_dict,doc.text,
                        real_url, docid)
            print '@'*60
            print fresh_doc
            print '@'*60
            fresh_docs.append(fresh_doc)
    rushPendingCrawlDoc(fresh_docs)
Example #15
 def lookup(self, string):
     for seed in xrange(self.hash_count):
         result = mmh3.hash(string, seed) % self.size
         if self.bit_array[result] == 0:
             # bug fix: the original returned an undefined words_list[i] here
             return "Nope"
     return "Probably"
Example #16
    def lookup(self, string):

        hashlist = [mmh3.hash(string, seed=x) % 1000000 for x in xrange(Bloom.numberofhash)]
        for x in hashlist:
            if not Bloom.bit[x]:
                return False
        return True
Example #17
    def add(self, string):

        # Hash the string
        hashlist = [mmh3.hash(string, seed=x) % 1000000 for x in xrange(Bloom.numberofhash)]

        for x in hashlist:
            Bloom.bit[x] = 1
Example #18
File: bloom.py Project: brainix/pottery
    def _bit_offsets(self, value):
        '''The bit offsets to set/check in this Bloom filter for a given value.

        Instantiate a Bloom filter:

            >>> dilberts = BloomFilter(
            ...     num_values=100,
            ...     false_positives=0.01,
            ...     key='dilberts',
            ... )

        Now let's look at a few examples:

            >>> tuple(dilberts._bit_offsets('rajiv'))
            (183, 319, 787, 585, 8, 471, 711)
            >>> tuple(dilberts._bit_offsets('raj'))
            (482, 875, 725, 667, 109, 714, 595)
            >>> tuple(dilberts._bit_offsets('dan'))
            (687, 925, 954, 707, 615, 914, 620)

        Thus, if we want to insert the value 'rajiv' into our Bloom filter,
        then we must set bits 183, 319, 787, 585, 8, 471, and 711 all to 1.  If
        any/all of them are already 1, no problems.

        Similarly, if we want to check to see if the value 'rajiv' is in our
        Bloom filter, then we must check to see if the bits 183, 319, 787, 585,
        8, 471, and 711 are all set to 1.  If even one of those bits is set to
        0, then the value 'rajiv' must never have been inserted into our Bloom
        filter.  But if all of those bits are set to 1, then the value 'rajiv'
        was *probably* inserted into our Bloom filter.
        '''
        encoded_value = self._encode(value)
        for seed in range(self.num_hashes()):
            yield mmh3.hash(encoded_value, seed=seed) % self.size()
Example #19
File: pypbf.py Project: xsswfm/pypbf
def makeHashFuncs(key, size, numHashes):
    hashValue = []
    for i in range(1, (numHashes+1)):
        value = mmh3.hash(key,i) % size
        #print value
        hashValue.append(value)
    return hashValue
Example #20
def get_image_cache_name(url):
    last_segment = url.split('/')[-1]
    if last_segment.count('.') == 1:
        extension = '.' + url.split('.')[-1]
    else:
        extension = ""
    return 'img' + str(mmh3.hash(url.encode('utf-8'))) + extension.lower()
Example #21
	def in_bf(self, elem):
		for x in xrange(self.hash_count):
			index = mmh3.hash(elem, x) % self.size
			if (self.bit_arr[index] == 0):
				return False
		return True
Example #22
    def add_document_indexes(self, text, url, is_print=False):
        # TODO: Maybe, it is good idea to change key from string to hash
        self.documents.append(url)
        doc_id = len(self.documents)-1
        word_list = self._split_text(text.lower())

        for word in word_list:
            try:
                word = word.encode('utf-8')

                w_hash = mmh3.hash(word) % self.count_of_files
                if is_print:
                    print word, w_hash
                r_index = self.full_index[w_hash]

                if r_index.has_key(word):
                    r_index[word]["docs"].append(doc_id)
                else:
                    r_index[word] = {}
                    r_index[word]["docs"] = [doc_id]

                if not r_index.has_key('encoding'):
                    r_index['encoding'] = self._encoding

            except Exception as e:
                print "EXCEPTION", word
                traceback.print_exc()
Example #23
def select_hash(hashkind, line):
    """Select the kind of hashing for the line.

    :param hashkind: -- (str) The name of the hash
    :param line: -- (str) The string to hash.

    This function is a kind of hash selector which will use the hash passed
    in argument to hash the string also passed in argument.

    """
    if hashkind == "md5":
        hashline = hashlib.md5(line).hexdigest()

    elif hashkind == "sha1":
        hashline = hashlib.sha1(line).hexdigest()

    elif hashkind == "crc":
        crc32 = crcmod.Crc(0x104c11db7, initCrc=0, xorOut=0xFFFFFFFF)
        crc32.update(line)
        hashline = crc32.hexdigest()

    elif hashkind == "murmur":
        hashline = mmh3.hash(line)

    return str(hashline)
Example #24
 def count(self, item):
     counts = []
     for table, seed in zip(self.sketch, range(self.hashes)):
         # bug fix: one lookup per table (the original inner loop re-read
         # the same cell once per key in the table)
         search_key = mmh3.hash(item, seed) % self.size
         counts.append(table[search_key])
     return min(counts)
Example #25
 def last_seen(self, item):
     timestamps = []
     for table, seed in zip(self.sketch, range(self.hashes)):
         search_key = mmh3.hash(item, seed) % self.size
         timestamps.append(table[search_key])
     return max(timestamps)
Example #26
    def contingentParitiesFunction(pop, verbose=False):
        assert(pop.shape[1] == order * height)
        popMissteps = []
        traceAndFitness = []
        for c in xrange(pop.shape[0]):
            output = 0
            ctr = 0
            length = pop.shape[1]
            loci = np.arange(length)
            missteps = []
            trace = ""
            while ctr < height:
                rng.seed(abs(mmh3.hash(trace)))
                acc = 0
                trace += "|"
                for i in xrange(order):
                    idx = rng.randint(length - (ctr * order + i)) + 1
                    swap = loci[-idx]
                    loci[-idx] = loci[ctr * order + i]
                    loci[ctr * order + i] = swap
                    trace += "%2d:%s|" % (swap + 1, int(pop[c, swap]))
                    acc += pop[c, swap]
                output += acc % 2

                if acc % 2 == 0:
                    missteps.append(ctr + 1)

            ctr += 1
            popMissteps.append(missteps)
            traceAndFitness.append((trace, height - len(missteps)))
        if verbose:
            for t in sorted(traceAndFitness):
                print "%s   %s " % t
        return np.array([height - len(missteps) for missteps in popMissteps]), popMissteps
Example #27
def alert_factory(location=None,
                bssid=None,
                channel=None,
                essid=None,
                tx=None,
                intent=None):

    # all arguments are required
    assert not any([
                location is None,
                bssid is None,
                channel is None,
                essid is None,
                tx is None,
                intent is None,
            ])

    # return dict from arguments
    _id = str(mmh3.hash(''.join([ bssid, str(channel), intent])))

    return {
        
        'id' : _id,
        'location' : location,
        'bssid' : bssid,
        'channel' : channel,
        'tx' : tx,
        'essid' : essid,
        'intent' : intent,
        'timestamp' : time.time(),
    }
Example #28
def get_scatter_prop(element_list):
  """ Gets the scatter property for an entity's key path.

  This will return a property for only a small percentage of entities.

  Args:
    element_list: A list of entity_pb.Path_Element objects.
  Returns:
    An entity_pb.Property object or None.
  """
  def id_from_element(element):
    if element.has_name():
      return element.name()
    elif element.has_id():
      return str(element.id())
    else:
      return ''

  to_hash = ''.join([id_from_element(element) for element in element_list])
  full_hash = mmh3.hash(to_hash)
  hash_bytes = struct.pack('i', full_hash)[0:2]
  hash_int = struct.unpack('H', hash_bytes)[0]
  if hash_int >= dbconstants.SCATTER_PROPORTION:
    return None

  scatter_property = entity_pb.Property()
  scatter_property.set_name('__scatter__')
  scatter_property.set_meaning(entity_pb.Property.BYTESTRING)
  scatter_property.set_multiple(False)
  property_value = scatter_property.mutable_value()
  property_value.set_stringvalue(hash_bytes)

  return scatter_property
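The scatter property is assigned only to the small slice of entities whose 16-bit key hash falls below a cutoff, so an ascending scan over __scatter__ samples the key space roughly uniformly (useful for picking split points). A simplified, standalone sketch of the same idea; the ~0.8% cutoff here is an assumption, not the value from dbconstants:

import mmh3
import struct

SCATTER_PROPORTION = int(0.008 * 0xFFFF)  # assumed cutoff, ~0.8% of keys

def has_scatter(key_path):
    full_hash = mmh3.hash(''.join(key_path))
    hash_int = struct.unpack('H', struct.pack('i', full_hash)[0:2])[0]
    return hash_int < SCATTER_PROPORTION

print(has_scatter(['Person', '42']))  # True for only a small fraction of keys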
Example #29
File: client32.py Project: MarwanG/Graal
	def readHash(self):
		hll = Hll(self.p)
		x = sys.stdin.readline().rstrip('\n')
		while x:
			hll.AddItem(mmh3.hash(x))
			x = sys.stdin.readline().rstrip('\n')
		print hll.Count()
Example #30
 def lookup(self, string):
     for seed in range(self.hash_count):
         result = mmh3.hash(string, seed) % self.size
         if self.bit_array[result] == 0:
             #return "Nope"
             return False
     return True
Example #31
 def lookup(self, element):
     for seed in self.seeds:
         # bug fix: use mmh3.hash; the built-in hash() does not take a seed
         result = mmh3.hash(element, seed) % self.size
         if self.hash_values[result] == 0:
             return False
     return True
Example #32
 def exists(self, item):
     for i in range(self.hash_count):
         hashed_index = mmh3.hash(item, i) % self.size
         if self.bit_array[hashed_index] == 0:
             return False
     return True
Example #33
from ll import LL
import math


class HyperLogLog(LL):
    def __len__(self):
        indicator = sum(2**-m.counter for m in self.registers)
        E = self.alpha * (self.num_registers**2) / float(indicator)

        if E <= 5.0 / 2.0 * self.num_registers:
            V = sum(1 for m in self.registers if m.counter == 0)
            if V != 0:
                Estar = self.num_registers * \
                    math.log(self.num_registers / (1.0 * V))  # natural log, per the HLL small-range correction
            else:
                Estar = E
        else:
            if E <= 2**32 / 30.0:
                Estar = E
            else:
                Estar = -2**32 * math.log(1 - E / 2**32)  # natural log, per the HLL large-range correction
        return Estar


if __name__ == "__main__":
    import mmh3
    hll = HyperLogLog(8)
    for i in xrange(100000):
        hll.add(mmh3.hash(str(i)))
    print len(hll)
Example #34
 def add(self, s):
     for seed in range(self.hash_num):
         result = mmh3.hash(s, seed) % self.size
         self.bit_array[result] = 1
Example #35
    def get_machoc_hash(self):
        # Get Machoc Hash adapted from https://github.com/conix-security/machoke
        binary = self.r2p
        binary.cmd("aaa")
        mmh3_line = ""
        machoke_line = ""

        funcs = json.loads(binary.cmd("aflj"))
        if funcs is None:
            print("r2 could not retrieve functions list")

        def get_machoke_from_function(r2p, function):
            """Return the machoke line for a specific function.
            :rtype: object
            """
            r2p.cmd("s {}".format(function["offset"]))
            agj_error = 0
            while True:
                try:
                    fcode = json.loads(r2p.cmd("agj"))
                    break
                except:
                    print >> sys.stderr, "Fail agj: %s" % hex(
                        function["offset"])
                if agj_error == 5:
                    break
                agj_error += 1
            blocks = []
            id_block = 1
            try:
                for block in fcode[0]["blocks"]:
                    blocks.append({
                        "id_block": id_block,
                        "offset": hex(block["offset"])
                    })
                    id_block += 1
            except:
                return ""
            line = ""
            id_block = 1
            for block in fcode[0]["blocks"]:
                word = "{}:".format(id_block)
                for instruction in block["ops"]:
                    # Check if call
                    if instruction["type"] == "call":
                        word = "{}c,".format(word)
                        for ublock in blocks:
                            if hex(instruction["offset"] +
                                   2) == ublock["offset"]:
                                word = "{}{},".format(word, ublock["id_block"])

                    # Check if jmp
                    if instruction["type"] == "jmp":
                        for ublock in blocks:
                            if instruction["esil"] == ublock["offset"]:
                                word = "{}{},".format(word, ublock["id_block"])

                    # Check if conditional jmp
                    elif instruction["type"] == "cjmp":
                        for ublock in blocks:
                            if hex(instruction["jump"]) == ublock["offset"]:
                                word = "{}{},".format(word, ublock["id_block"])
                            if hex(instruction["offset"] +
                                   2) == ublock["offset"]:
                                word = "{}{},".format(word, ublock["id_block"])
                    else:
                        pass
                if word[-2] == "c":
                    for ublock in blocks:
                        if hex(instruction["offset"] + 4) == ublock["offset"]:
                            word = "{}{},".format(word, ublock["id_block"])

                    if word[-2] == "c":
                        word = "{}{},".format(word, id_block + 1)

                if word[-1] == ":" and id_block != len(fcode[0]["blocks"]):
                    word = "{}{},".format(word, id_block + 1)
                # Clean word
                if word[-1] == ",":
                    word = "{};".format(word[:-1])
                elif word[-1] == ":":
                    word = "{};".format(word)
                line = "{}{}".format(line, word)
                id_block += 1
            return line

        for function in funcs:
            machoke = get_machoke_from_function(binary, function)
            machoke_line = "{}{}".format(machoke_line, machoke)
            mmh3_line = "{}{}".format(
                mmh3_line,
                hex(mmh3.hash(machoke) & 0xffffffff).replace("0x", "").replace(
                    "L", ""),
            )
        binary.quit()

        return mmh3_line
Example #36
  def test_hash_values(self):
    """ Test that on randomized data, values computed from mmh3 and pymmh3 match. """

    for i in range(10):
      random_value = str(random.random())
      self.assertEqual(mmh3.hash(random_value), pymmh3.hash(random_value))
Example #37
def normalized_hash(identifier: str, activation_group: str) -> int:
    return mmh3.hash("{}:{}".format(identifier, activation_group)) % 100 + 1
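Hashing "identifier:activation_group" into a stable bucket in 1..100 is the usual percentage-rollout pattern: a user sees the feature if their bucket is at or below the rollout percentage, and the murmur hash keeps assignments stable across runs and machines. Illustrative usage:

ROLLOUT_PERCENTAGE = 20  # illustrative

def is_active(user_id: str, group: str) -> bool:
    return normalized_hash(user_id, group) <= ROLLOUT_PERCENTAGE

print(is_active('user-123', 'new-checkout'))  # same answer every run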
Example #38
 def lookup(self, string):
     for seed in range(self.hash_count):
         result = mmh3.hash(string, seed) % self.size
         if self.bit_array[result] == 0:
             return "Nope"
     return "Probably"
Example #39
 def add(self, string):
     for seed in range(self.hash_count):
         result = mmh3.hash(string, seed) % self.size
         self.bit_array[result] = 1
Example #40
def lookup(string, bit_array, hash_count, size):
    for seed in range(hash_count):
        result = mmh3.hash(string, seed) % size
        if bit_array[result] == 0:
            return False
    return True
Example #41
	def add(self, item):
		for i in range(self.hash_counts):
			digest = mmh3.hash(item, i) % self.bit_array_use
			self.bit_array_size[digest] = True
Example #42
def get_feature(feat_str, model):
    # The feature string may be unicode, but MurmurHash3 expects ASCII encoded strings.
    return mmh3.hash(feat_str.encode('ascii', 'xmlcharrefreplace')) % model.num_features
Example #43
def schingling(doc):
    # 9-char shingles; "+ 1" includes the final shingle (off-by-one in the original)
    return [mmh3.hash(doc[i:i + 9], signed=False) for i in range(len(doc) - 9 + 1)]
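The function turns a document into overlapping 9-character shingles hashed to unsigned 32-bit ints, the usual front end to MinHash/LSH near-duplicate detection. A small demonstration that shingle sets, unlike raw equality, tolerate small edits:

a = set(schingling('the quick brown fox jumps over the lazy dog'))
b = set(schingling('the quick brown fox jumped over the lazy dog'))
print(len(a & b) / float(len(a | b)))  # Jaccard similarity well above 0 despite the edit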
Example #44
def normalized_hash(identifier, activation_group):
    return mmh3.hash("{}:{}".format(activation_group, identifier),
                     signed=False) % 100 + 1
Example #45
                "mimeFile": mimeFile,
                "normHtmlFile": normHtmlFile,
                "plainTextFile": plainTextFile
            }

    # If enabled, remove boilerplate HTML
    if options.boilerpipe:
        logging.info(url + ": deboiling html")
        extractor = ExtrB(extractor='ArticleExtractor', html=text)
        deboiled = str(extractor.getHTML())
    else:
        deboiled = text

    # We compute a hash on the HTML (either normalized one or after boilerpipe if enabled):
    # if we get duplicate files we discard them
    html_hash = mmh3.hash(deboiled, signed=False)
    # checking for duplicate content (duplicates are discarded)
    if html_hash in seen_html:
        logging.info("Repeated file:\t" + url)
        continue

    # get text with Alcazar library
    if options.parser == "alcazar":
        logging.info(url + ": Getting text with Alcazar")
        btext = alcazar.bodytext.parse_article(deboiled)
        if btext.body_text:
            plaintext = btext.body_text
        else:
            plaintext = ""

    # or get text with beautifulsoup
Example #46
def hash32(data: bytes) -> bytes:
    return struct.pack('i', mmh3.hash(data))
Example #47
 def _hashes(self, key):
     for i in xrange(self.k):
         # bug fix: seed with i (the original seeded every hash with self.k)
         yield mmh3.hash(key, i)
Example #48
 def add(self, item):
     for i in range(self.hash_count):
         hashed_index = mmh3.hash(item, i) % self.size
         self.bit_array[hashed_index] = 1
Example #49
 def _hash(self, element):
     return [
         b % self.size
         for b in [mmh3.hash(element, i) for i in range(self.hash_count)]
     ]
Example #50
 def _hashes_opt(self, key):
     # Kirsch-Mitzenmacher optimization: two base hashes simulate k hash functions
     h0 = mmh3.hash(key, 1)
     h1 = mmh3.hash(key, 10)
     for i in xrange(self.k):
         yield h0 + i * h1
Example #51
def murmur3_32(text):
    val = mmh3.hash(text)
    return val if val >= 0 else val + 2**32
Example #52
def runSim(args):

    ## avoid one processes starting multiple threads
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["NUMEXPR_NUM_THREADS"] = "1"
    os.environ["OMP_NUM_THREADS"] = "1"

    dataset = args[0]
    if dataset[-1] != '/':
        dataset += '/'
    ground_truth, theta = args[1]

    ## load dna reads into reads_lst
    reads_lst = []
    fastaFile = dataset + "/reads.fasta"
    with open(fastaFile) as handle:
        for values in SimpleFastaParser(handle):
            reads_lst.append(values[1])
    n = len(reads_lst)

    ## load precomputed Jaccard Similarities
    JSims = np.loadtxt(dataset + "/minHashes/JSims.txt")

    ## load alignments
    gt_file = "{}/{}_ground_truth.txt".format(dataset, ground_truth)

    with open(gt_file) as f:
        lines = [[float(x) for x in line.rstrip('\n').split('\t')]
                 for line in f]

    refDict = {}
    for i in range(n):
        refDict[i] = {}
    for line in lines:
        refDict[int(line[0]) - 1][int(line[1]) -
                                  1] = line[2] / (line[3] + line[4] - line[2])
        refDict[int(line[1]) - 1][int(line[0]) -
                                  1] = line[2] / (line[3] + line[4] - line[2])

    ## convert each read into its k-mers
    symLength = 7  # k in k-mer

    def generateSymSets(reads_lst, symLength):
        symSets = {}
        for i, read in enumerate(reads_lst):
            lst = set()
            for j in range(len(read) - symLength):
                lst.add(read[j:j + symLength])
            symSets[i] = lst
        return symSets

    symSets = generateSymSets(reads_lst, symLength)

    ## load precomputed minHashes
    minHashArr = np.zeros((n, 1000))
    for i in range(n):
        minHashArr[i] = np.load(dataset +
                                "minHashes/minHashes_{}.txt".format(i),
                                allow_pickle=True)

    ## load precomputed iid sequences and their minhashes
    numRandReads = 5
    randMinHashArr = np.zeros((numRandReads, 1000))
    for i in range(numRandReads):
        randMinHashArr[i] = np.load(dataset +
                                    "randReads/randMinHashes_{}.txt".format(i),
                                    allow_pickle=True)

    minHashArrExtended = np.vstack((minHashArr[:, :1000], randMinHashArr))

    ## checking to see precomputed minHashes works
    i = np.random.randint(0, n, size=100)
    j = np.random.randint(0, 1000, size=100)
    lst = []
    for iter_round in range(100):

        iterLst = list(symSets[i[iter_round]])

        lst.append(
            min([
                mmh3.hash(sym, j[iter_round], signed=False) for sym in iterLst
            ]))

    assert np.all(minHashArrExtended[i, j] == lst)

    ## checking to see precomputed JSim works
    i = np.random.randint(0, n, size=100)
    j = np.random.randint(0, n, size=100)
    lst = []
    for iter_round in range(100):
        i1 = i[iter_round]
        j1 = j[iter_round]

        lst.append(JSims[i1, j1] == 1.0 *
                   len(symSets[i1].intersection(symSets[j1])) /
                   (len(symSets[i1].union(symSets[j1]))))

    assert np.all(lst) and np.allclose(JSims, JSims.T)

    ## Testing SVD, JSimEmp, JSim Exact, reference vs all
    storageArrGround = []
    storageArrpHatSVD = []
    storageArrJsimExact = []
    storageArrJsimEmp = []
    storageArrNumOnesCol = []
    storageArrQjs = []
    storageArrwSJS = []

    h = 1000

    for ref_read in trange(n):

        groundTruthLocs = np.array(list(refDict[ref_read].keys()))
        if len(groundTruthLocs) == 0:  ## read has no alignments in dataset
            continue
        refReadMatches = refDict[ref_read]
        groundTruthVals = [refReadMatches[i] for i in groundTruthLocs]

        lst = set(list(groundTruthLocs))
        rangeN = set(range(n))
        rangeN.discard(ref_read)

        toAppend = list(rangeN - lst)
        groundTruthLocs = np.hstack((groundTruthLocs, np.array(toAppend)))
        groundTruthVals += [0] * len(toAppend)

        empiricalMatrix = (minHashArrExtended == minHashArrExtended[ref_read])
        empiricalMatrix = np.delete(empiricalMatrix, ref_read, axis=0)[:, :h]

        updatedGroundTruthLocs = groundTruthLocs - 1 * (groundTruthLocs >=
                                                        ref_read)
        updatedGroundTruthLocs = updatedGroundTruthLocs.astype(int)

        jSimEmpirical = np.mean(empiricalMatrix, axis=1)
        jSimExact = np.delete(JSims[ref_read], ref_read)

        ## here we can modify what normalization is used without having to rerun SVDs
        # u = np.loadtxt(dataset+"/SVD/raw_pi_refread_{}.txt".format(ref_read))
        # pHatSVD = 1-np.abs(u[:n-1])/np.abs(np.median(u[:n-1])) ## normalize median of p_i
        # pHatSVD = 1-np.abs(u[:n-1])/np.abs(np.median(u[n-1:])) ## random read normalization
        # pHatSVD = 1-np.abs(u[:n-1])/np.max(np.abs(u[n-1:])) ## naive max normalziation

        pHatSVD = np.loadtxt(dataset +
                             "/SVD/pi_refread_{}.txt".format(ref_read))
        qSVD = np.loadtxt(dataset + "/SVD/qj_refread_{}.txt".format(ref_read))

        ## for approximation
        empQ = empiricalMatrix.sum(axis=0)
        x = np.matmul(empiricalMatrix - np.ones(empiricalMatrix.shape),
                      1 - np.array(empQ / np.max(empQ)))[:n - 1]
        x = np.abs(x - np.min(x))
        x /= np.max(x)
        storageArrwSJS.extend(x[updatedGroundTruthLocs])

        storageArrGround.extend(groundTruthVals)
        storageArrpHatSVD.extend(pHatSVD[updatedGroundTruthLocs])
        storageArrJsimEmp.extend(jSimEmpirical[:n - 1][updatedGroundTruthLocs])
        storageArrJsimExact.extend(jSimExact[updatedGroundTruthLocs])
        storageArrNumOnesCol.extend(np.mean(empiricalMatrix, axis=0))
        storageArrQjs.extend(qSVD)

    fpr, tpr, _ = roc_curve(
        np.array(storageArrGround) >= theta, storageArrpHatSVD)
    fpr_jsim, tpr_jsim, _ = roc_curve(
        np.array(storageArrGround) >= theta, storageArrJsimExact)
    fpr_js_emp, tpr_js_emp, _ = roc_curve(
        np.array(storageArrGround) >= theta, storageArrJsimEmp)
    fpr_wsjs, tpr_wsjs, _ = roc_curve(
        np.array(storageArrGround) >= theta, storageArrwSJS)

    pickle.dump([
        auc(fpr, tpr),
        auc(fpr_jsim, tpr_jsim),
        auc(fpr_js_emp, tpr_js_emp),
        auc(fpr_wsjs, tpr_wsjs), storageArrNumOnesCol,
        np.corrcoef(storageArrpHatSVD, storageArrGround)[0, 1],
        np.corrcoef(storageArrJsimExact, storageArrGround)[0, 1],
        np.corrcoef(storageArrJsimEmp, storageArrGround)[0, 1], storageArrQjs,
        'SJS AUC,JS AUC, JS emp AUC, wSJS AUC,numOnes per col,SJS r^2,JS r^2,JS emp r^2,storageArrQjs'
    ],
                open(
                    "AUCs/{}_{}_{}.pkl".format(dataset[:-1], ground_truth,
                                               str(theta % 1).split('.')[1]),
                    "wb"))
Example #53
 def insert(self, item):
   for i in range(self.qty_hash):
     t = mmh3.hash(bytes(item), i) % self.size
     self.bitarray[t] = True
Example #54
File: favUp.py Project: rajivraj/fav-up
 def faviconHash(self, data, web_source=None):
     if web_source:
         b64data = base64.encodebytes(data).decode()
     else:
         b64data = base64.encodebytes(data)
     return mmh3.hash(b64data)
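This is the favicon fingerprint that Shodan indexes as http.favicon.hash: the icon bytes are base64-encoded (with newlines, as encodebytes produces) and then murmur-hashed. A hedged sketch of computing it for a live site, assuming the icon is served at /favicon.ico:

import base64
import mmh3
import requests

resp = requests.get('https://example.com/favicon.ico')
b64data = base64.encodebytes(resp.content).decode()
print(mmh3.hash(b64data))  # comparable to Shodan's http.favicon.hash values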
Example #55
def hash(flowkey):
    global width
    flowkey_bytes = struct.pack("L", flowkey)
    r = mmh3.hash(flowkey_bytes, signed=False)
    return r % width
Example #56
 def add(self, element):
     for seed in self.seeds:
         # bug fix: use mmh3.hash; the built-in hash() does not take a seed
         result = mmh3.hash(element, seed) % self.size
         self.hash_values[result] = 1
     return self.hash_values
Example #57
def string_digest(item, index):
    return mmh3.hash(bytes(item, 'utf-8'), index)
Example #58
	def is_member(self, item):
		for i in range(self.hash_counts):
			digest = mmh3.hash(item, i) % self.bit_array_use
			if not self.bit_array_size[digest]:
				return False
		return True
Example #59
def murmur(key):
    return mmh3.hash(key)
Example #60
    #np.dot(m.transpose(),m)
    #Jaccard(m[0],m[10])

    #s=signature(m,10000)
    #s.shape

    #m_new = firma(m)

    m_new2 = m.dot(rndVecs) # projected matrix
    # Indexing text collection
    for doc_id in range(m_new2.shape[0]):
        docSgt = np.array(m_new2[doc_id, :] >= 0, dtype=int)
        for blk in range(NRBLK):
            # (blk*BLKSZ):((blk+1)*BLKSZ)
            blkData = docSgt[(blk*BLKSZ):((blk+1)*BLKSZ)]
            docHashVal = mmh3.hash(''.join(map(str, blkData))) % MAXBKTS
            hshTbl_blk = HshTabls[blk]
            if docHashVal not in hshTbl_blk:
                hshTbl_blk[docHashVal] = set()
            hshTbl_blk[docHashVal].add(doc_id + 1)
    collision = np.zeros((m.shape[0], m.shape[0]), dtype=int)
    for hshTbl_blk in HshTabls:
        for e in hshTbl_blk:
            for i in hshTbl_blk[e]:
                for o in hshTbl_blk[e]:
                    collision[i - 1][o - 1] += 1
    pldHaming=penalizedHcc(m_new2)
    simcos=(np.pi / 2) * (1 - hmmg(m_new2))
    simcospenalized =(np.pi / 2) * (1 - pldHaming)
    print("End!")
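The fragment above depends on names defined earlier in its script (m, rndVecs, NRBLK, BLKSZ, MAXBKTS, HshTabls). A compact, self-contained sketch of the same sign-random-projection LSH idea, with all sizes illustrative:

import numpy as np
import mmh3

rng = np.random.default_rng(0)
docs = rng.random((50, 300))             # 50 documents, 300 features
planes = rng.standard_normal((300, 64))  # 64 random hyperplanes
sgt = (docs @ planes >= 0).astype(int)   # sign signatures

NRBLK, BLKSZ, MAXBKTS = 8, 8, 10 ** 6    # 8 bands of 8 bits each
tables = [{} for _ in range(NRBLK)]
for doc_id, sig in enumerate(sgt):
    for blk in range(NRBLK):
        block = sig[blk * BLKSZ:(blk + 1) * BLKSZ]
        key = mmh3.hash(''.join(map(str, block))) % MAXBKTS
        tables[blk].setdefault(key, set()).add(doc_id)
# documents sharing a bucket in any band are candidate near-duplicates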