def _hash_bits(self, key):
    # http://spyced.blogspot.com/2009
    # /01/all-you-ever-wanted-to-know-about.html
    hash1 = mmh3.hash(key, 0)
    hash2 = mmh3.hash(key, hash1)
    for i in range(self._hash_funcs):
        yield abs((hash1 + i * hash2) % self._bits_per_slice)
def data(path, label_path=None):
    fd = open(path)
    fd.readline()  # skip headers
    hash_cols = [3, 4, 34, 35, 61, 64, 65, 91, 94, 95]
    npairs = len(hash_cols)
    x = [0] * (146 + npairs * (npairs - 1) / 2)
    if label_path:
        label = open(label_path)
        label.readline()  # skip headers

    for t, line in enumerate(fd):
        # parse x
        row = line.rstrip().split(',')
        for m, feat in enumerate(row):
            if m == 0:
                ID = int(feat)
            else:
                # one-hot encode everything with the hash trick:
                # categorical, boolean and numerical features are all one-hot encoded.
                # note: the built-in hash(), although fast, is not stable, i.e. the same
                # value won't always get the same hash on different machines
                x[m] = abs(mmh3.hash(str(m) + '_' + feat)) % D
        for i in xrange(10):
            for j in xrange(i + 1, 10):
                m += 1
                x[m] = abs(mmh3.hash(str(m) + '_' + row[hash_cols[i]] + "_x_" + row[hash_cols[j]])) % D

        # parse y, if provided
        if label_path:
            # use float() to prevent future type casting, [1:] to ignore the id
            y = [float(y) for y in label.readline().split(',')[1:]]
        yield (ID, x, y) if label_path else (ID, x)
def authenticate(self, name, password, certificates, certhash, certstrong, current=None):
    with self.app.app_context():
        if name == 'SuperUser':
            return RET_FALLTHROUGH
        user = User.query.filter_by(user_id=name).first()
        if not user:
            try:
                uuid.UUID(name, version=4)
            except ValueError:
                return RET_DENIED
            guest_user = GuestUser.query.get(name)
            if guest_user:
                if not guest_user.password == password or guest_user.banned:
                    return RET_DENIED
                if guest_user.corporation:
                    self.app.logger.debug('Authenticating guest with: {} {} {}'.format(
                        abs(mmh3.hash(guest_user.id.hex)),
                        '[{}][GUEST] {}'.format(self.get_ticker(guest_user.corporation), guest_user.name),
                        [u'Guest']))
                    return (abs(mmh3.hash(guest_user.id.hex)),
                            '[{}][GUEST] {}'.format(self.get_ticker(guest_user.corporation), guest_user.name),
                            [u'Guest'])
                else:
                    self.app.logger.debug('Authenticating guest with: {} {} {}'.format(
                        abs(mmh3.hash(guest_user.id.hex)),
                        '[GUEST] {}'.format(guest_user.name),
                        [u'Guest']))
                    return (abs(mmh3.hash(guest_user.id.hex)),
                            '[GUEST] {}'.format(guest_user.name),
                            ['Guest'])
            else:
                return RET_DENIED
        if not user.mumble_password == password:
            return RET_DENIED
        self.app.logger.debug('Authenticating user with: {} {} {}'.format(
            mmh3.hash(user.user_id),
            '[{}] {}'.format(self.get_ticker(user.corporation_name), user.main_character),
            user.groups))
        return (mmh3.hash(user.user_id),
                '[{}] {}'.format(self.get_ticker(user.corporation_name), user.main_character),
                user.groups)
def hash(self, string):
    hash_arr = []
    hash1 = mmh3.hash(string, 0)
    hash2 = mmh3.hash(string, hash1)
    for i in range(self.k):
        hash_arr.append(abs((hash1 + i * hash2) % self.m))
    return hash_arr
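# A minimal sketch (not from any of the snippets here) of how the k indices produced by
# this double-hashing scheme are typically consumed by a Bloom filter's add/contains
# operations. The class and attribute names (SimpleBloom, m, k, bits) are illustrative
# assumptions; the second seed is masked to 32 bits so it stays valid for mmh3.
import mmh3
from bitarray import bitarray


class SimpleBloom(object):
    def __init__(self, m=1024, k=5):
        self.m = m                # number of bits in the filter
        self.k = k                # number of hash functions
        self.bits = bitarray(m)
        self.bits.setall(0)

    def _indexes(self, string):
        h1 = mmh3.hash(string, 0)
        h2 = mmh3.hash(string, h1 & 0xffffffff)
        return [(h1 + i * h2) % self.m for i in range(self.k)]

    def add(self, string):
        for idx in self._indexes(string):
            self.bits[idx] = 1

    def __contains__(self, string):
        return all(self.bits[idx] for idx in self._indexes(string))


# usage: bf = SimpleBloom(); bf.add("cat"); "cat" in bf -> True; "dog" in bf -> False (with high probability)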
def Hashmap_WordVector(self, nbits):
    length = len(self.Words_Vector)
    self.bl_bits = nbits
    self.bloom_vector = self.bl_bits * bitarray('0')
    for i in range(length):
        self.hashmap1.append(mmh3.hash(self.Words_Vector[i]) % self.bl_bits)
        self.hashmap2.append(mmh3.hash(self.Words_Vector[i], self.hashmap1[i]) % self.bl_bits)
        self.hashmap3.append(mmh3.hash(self.Words_Vector[i], self.hashmap2[i]) % self.bl_bits)
        self.bloom_vector[self.hashmap1[i]] = 1
        self.bloom_vector[self.hashmap2[i]] = 1
        self.bloom_vector[self.hashmap3[i]] = 1
def get_hash(label, namespace, feature, stride, mask):
    if namespace:
        namespace_hash = mmh3.hash(namespace, 0)
    else:
        namespace_hash = 0
    if is_number(feature):
        feature_hash = int(feature) + namespace_hash
    else:
        feature_hash = mmh3.hash(feature, namespace_hash)
    feature_hash_oaa = feature_hash * stride
    return (feature_hash_oaa + label - 1) & mask
def parse_block(block):
    index_block = []
    for file_path in block:
        file_path_hash = mmh3.hash(file_path)
        with open(file_path, 'r') as input_file:
            for line in input_file:
                items = line.strip().split(' ')
                index_block.append(
                    (mmh3.hash(items[0]), [file_path_hash, items[1]])
                )
    return index_block
def select_terms_meta(query_terms, term_dict_stream):
    """ reads term dictionary generator and selects query terms meta info """
    terms_meta_dict = {}
    for term in query_terms:
        term_hash = mmh3.hash(term.encode("utf-8"))
        terms_meta_dict[term_hash] = {
            "term": term,
            "seek_offset": None,
            "size": None
        }

    seek_offset = 0
    unseen_terms = terms_meta_dict.keys()
    for dict_term_hash, dict_term_size in term_dict_stream:
        if dict_term_hash in unseen_terms:
            terms_meta_dict[dict_term_hash]["seek_offset"] = seek_offset
            terms_meta_dict[dict_term_hash]["size"] = dict_term_size
            unseen_terms.remove(dict_term_hash)
            if len(unseen_terms) == 0:
                break
        seek_offset += dict_term_size

    query_terms_dict = {}
    for _, term_meta in terms_meta_dict.items():
        query_terms_dict[term_meta["term"]] = {
            "seek_offset": term_meta["seek_offset"],
            "size": term_meta["size"]
        }
    return query_terms_dict
def _indices(self, x):
    '''
    A helper generator that yields the indices in x

    The purpose of this generator is to make the following
    code a bit cleaner when doing feature interaction.
    '''
    # first yield index of the bias term
    yield 0, 1.

    # then yield the linear indices
    if self.interaction != 2:
        for i, val in x:
            yield i, val

    # now yield interactions (if applicable)
    if self.interaction:
        D = self.D
        L = len(x)
        x = sorted(x)
        for i in xrange(L):
            for j in xrange(i + 1, L):
                # one-hot encode interactions with hash trick
                yield abs(hash(str(x[i][0]) + '_' + str(x[j][0]))) % D, x[i][1] * x[j][1]
def process(self):
    # load data
    data = self.load()

    # index to elastic search
    print "\nStart processing"
    cursor = Cursor(self.es, self.data_from)
    cursor_num = cursor.get_new_cursor()
    for each_data in data:
        key_string = ''
        for each_key_string in key_value:
            key_string += each_data[each_key_string]
        hashkey = mmh3.hash(key_string)
        print "parsing id: ", hashkey

        # try to read the record
        try:
            res = self.es.get(index="deltadb", doc_type="data", id=hashkey)
            if res["found"]:
                node = self.update_node(res["_source"], each_data, cursor_num)
            else:
                node = self.create_node(each_data, cursor_num)
        except:
            node = self.create_node(each_data, cursor_num)

        # insert back to es
        try:
            res = self.es.index(index="deltadb", doc_type="data", id=hashkey, body=node)
        except:
            continue
    print "\nProcess finish."
def getHash(word):
    '''
    Returns the hash value, ANDed with 0xffffffffL to keep it within
    32 bits
    '''
    curHash = mmh3.hash(word)
    curHash = curHash & 0xffffffffL
    return curHash
def shingles2sketch(shingles, m_baskets=20):
    baskets = defaultdict(lambda: -float("inf"))
    for shingle in shingles:
        h = mmh3.hash(shingle.encode('utf8'))
        if baskets[h % m_baskets] < h:
            baskets[h % m_baskets] = h
    return sorted(baskets.values())
def save_cursor(self, cursor_data):
    cursor_id = mmh3.hash(self.data_from)
    res = self.es.index(index="lookup", doc_type="data", id=cursor_id, body=cursor_data)
    return
def saveSuccessCrawlDoc(crawldoc):
    '''step 1: save a successfully crawled crawldoc to crawl_result, making sure docid is unique
    step 2: save outlinks (newly found urls) to crawl_pending
    step 3: update the crawl url status in crawl_pending to crawled'''
    values = crawldoc.convert
    # only strings can be saved to the db, so convert dicts and lists to strings.
    # reference: cccrawler.proto.db.models.CrawlResult
    values['reservation_dict'] = str(crawldoc.reservation_dict)
    values['history'] = str(values['history'])
    values['header'] = str(values['header'])
    values['created_at'] = timeutils.utcnow_ts()
    utils.convert_datetimes(values, 'created_at', 'deleted_at', 'updated_at')
    crawldoc_ref = models.CrawlResult()
    crawldoc_ref.update(values)
    crawldoc_ref.save()
    _updateCrawlStatus(crawldoc.pending_id, 'crawled', crawlfail=False)

    cl = deweight.get_client()
    fresh_docs = []
    for doc in crawldoc.outlinks:
        real_url = urlutils.normalize(doc.url)
        docid = mmh3.hash(real_url)
        if not cl.has(docid):
            fresh_doc = addPendingCrawlDocDict(doc.url, int(crawldoc.level),
                                               crawldoc.docid, crawldoc.reservation_dict,
                                               doc.text, real_url, docid)
            print '@' * 60
            print fresh_doc
            print '@' * 60
            fresh_docs.append(fresh_doc)
    rushPendingCrawlDoc(fresh_docs)
def lookup(self, string):
    for seed in xrange(self.hash_count):
        result = mmh3.hash(string, seed) % self.size
        if self.bit_array[result] == 0:
            # definitely not present: report the missing word
            return "--%s--" % string
    return "Probably"
def lookup(self, string):
    hashlist = [mmh3.hash(string, seed=x) % 1000000 for x in xrange(Bloom.numberofhash)]
    for x in hashlist:
        if not Bloom.bit[x]:
            return False
    return True
def add(self, string):
    # Hash the string
    hashlist = [mmh3.hash(string, seed=x) % 1000000 for x in xrange(Bloom.numberofhash)]
    for x in hashlist:
        Bloom.bit[x] = 1
def _bit_offsets(self, value):
    '''The bit offsets to set/check in this Bloom filter for a given value.

    Instantiate a Bloom filter:

        >>> dilberts = BloomFilter(
        ...     num_values=100,
        ...     false_positives=0.01,
        ...     key='dilberts',
        ... )

    Now let's look at a few examples:

        >>> tuple(dilberts._bit_offsets('rajiv'))
        (183, 319, 787, 585, 8, 471, 711)
        >>> tuple(dilberts._bit_offsets('raj'))
        (482, 875, 725, 667, 109, 714, 595)
        >>> tuple(dilberts._bit_offsets('dan'))
        (687, 925, 954, 707, 615, 914, 620)

    Thus, if we want to insert the value 'rajiv' into our Bloom filter,
    then we must set bits 183, 319, 787, 585, 8, 471, and 711 all to 1.
    If any/all of them are already 1, no problems.

    Similarly, if we want to check to see if the value 'rajiv' is in our
    Bloom filter, then we must check to see if the bits 183, 319, 787,
    585, 8, 471, and 711 are all set to 1.  If even one of those bits is
    set to 0, then the value 'rajiv' must never have been inserted into
    our Bloom filter.  But if all of those bits are set to 1, then the
    value 'rajiv' was *probably* inserted into our Bloom filter.
    '''
    encoded_value = self._encode(value)
    for seed in range(self.num_hashes()):
        yield mmh3.hash(encoded_value, seed=seed) % self.size()
def makeHashFuncs(key, size, numHashes):
    hashValue = []
    for i in range(1, numHashes + 1):
        value = mmh3.hash(key, i) % size
        # print value
        hashValue.append(value)
    return hashValue
def get_image_cache_name(url):
    last_segment = url.split('/')[-1]
    if last_segment.count('.') == 1:
        extension = '.' + url.split('.')[-1]
    else:
        extension = ""
    return 'img' + str(mmh3.hash(url.encode('utf-8'))) + extension.lower()
def in_bf(self, elem):
    for x in xrange(self.hash_count):
        index = mmh3.hash(elem, x) % self.size
        if self.bit_arr[index] == 0:
            return False
    return True
def add_document_indexes(self, text, url, is_print=False):
    # TODO: Maybe it is a good idea to change the key from a string to a hash
    self.documents.append(url)
    doc_id = len(self.documents) - 1
    word_list = self._split_text(text.lower())
    for word in word_list:
        try:
            word = word.encode('utf-8')
            w_hash = mmh3.hash(word) % self.count_of_files
            if is_print:
                print word, w_hash
            r_index = self.full_index[w_hash]
            if r_index.has_key(word):
                r_index[word]["docs"].append(doc_id)
            else:
                r_index[word] = {}
                r_index[word]["docs"] = [doc_id]
            if not r_index.has_key('encoding'):
                r_index['encoding'] = self._encoding
        except Exception as e:
            print "EXCEPTION", word
            traceback.print_exc()
def select_hash(hashkind, line):
    """Select the kind of hashing for the line.

    :param hashkind: -- (str) The name of the hash
    :param line: -- (str) The string to hash.

    This function is a kind of hash selector which will use the hash
    passed in argument to hash the string also passed in argument.
    """
    if hashkind == "md5":
        hashline = hashlib.md5(line).hexdigest()
    elif hashkind == "sha1":
        hashline = hashlib.sha1(line).hexdigest()
    elif hashkind == "crc":
        crc32 = crcmod.Crc(0x104c11db7, initCrc=0, xorOut=0xFFFFFFFF)
        crc32.update(line)
        hashline = crc32.hexdigest()
    elif hashkind == "murmur":
        hashline = mmh3.hash(line)
    return str(hashline)
def count(self, item):
    counts = []
    for k, v in zip(self.sketch, range(self.hashes)):
        for j in k:
            search_key = mmh3.hash(item, v) % self.size
            counts.append(k[search_key])
    return min(counts)
def last_seen(self, item):
    timestamps = []
    for k, v in zip(self.sketch, range(self.hashes)):
        for j in k:
            search_key = mmh3.hash(item, v) % self.size
            timestamps.append(k[search_key])
    return max(timestamps)
def contingentParitiesFunction(pop, verbose=False):
    assert(pop.shape[1] == order * height)
    popMissteps = []
    traceAndFitness = []
    for c in xrange(pop.shape[0]):
        output = 0
        ctr = 0
        length = pop.shape[1]
        loci = np.arange(length)
        missteps = []
        trace = ""
        while ctr < height:
            rng.seed(abs(mmh3.hash(trace)))
            acc = 0
            trace += "|"
            for i in xrange(order):
                idx = rng.randint(length - (ctr * order + i)) + 1
                swap = loci[-idx]
                loci[-idx] = loci[ctr * order + i]
                loci[ctr * order + i] = swap
                trace += "%2d:%s|" % (swap + 1, int(pop[c, swap]))
                acc += pop[c, swap]
            output += acc % 2
            if acc % 2 == 0:
                missteps.append(ctr + 1)
            ctr += 1
        popMissteps.append(missteps)
        traceAndFitness.append((trace, height - len(missteps)))
    if verbose:
        for t in sorted(traceAndFitness):
            print "%s %s " % t
    return np.array([height - len(missteps) for missteps in popMissteps]), popMissteps
def alert_factory(location=None, bssid=None, channel=None, essid=None, tx=None, intent=None):
    # all arguments are required
    assert not any([
        location is None,
        bssid is None,
        channel is None,
        essid is None,
        tx is None,
        intent is None,
    ])
    # return a dict built from the arguments
    _id = str(mmh3.hash(''.join([bssid, str(channel), intent])))
    return {
        'id': _id,
        'location': location,
        'bssid': bssid,
        'channel': channel,
        'tx': tx,
        'essid': essid,
        'intent': intent,
        'timestamp': time.time(),
    }
def get_scatter_prop(element_list):
    """ Gets the scatter property for an entity's key path.

    This will return a property for only a small percentage of entities.

    Args:
        element_list: A list of entity_pb.Path_Element objects.
    Returns:
        An entity_pb.Property object or None.
    """
    def id_from_element(element):
        if element.has_name():
            return element.name()
        elif element.has_id():
            return str(element.id())
        else:
            return ''

    to_hash = ''.join([id_from_element(element) for element in element_list])
    full_hash = mmh3.hash(to_hash)
    hash_bytes = struct.pack('i', full_hash)[0:2]
    hash_int = struct.unpack('H', hash_bytes)[0]
    if hash_int >= dbconstants.SCATTER_PROPORTION:
        return None

    scatter_property = entity_pb.Property()
    scatter_property.set_name('__scatter__')
    scatter_property.set_meaning(entity_pb.Property.BYTESTRING)
    scatter_property.set_multiple(False)
    property_value = scatter_property.mutable_value()
    property_value.set_stringvalue(hash_bytes)
    return scatter_property
def readHash(self):
    hll = Hll(self.p)
    x = sys.stdin.readline().rstrip('\n')
    while x:
        hll.AddItem(mmh3.hash(x))
        x = sys.stdin.readline().rstrip('\n')
    print hll.Count()
def lookup(self, string):
    for seed in range(self.hash_count):
        result = mmh3.hash(string, seed) % self.size
        if self.bit_array[result] == 0:
            # return "Nope"
            return False
    return True
def lookup(self, element):
    for seed in self.seeds:
        result = mmh3.hash(element, seed) % self.size
        if self.hash_values[result] == 0:
            return False
    return True
def exists(self, item):
    for i in range(self.hash_count):
        hashed_index = mmh3.hash(item, i) % self.size
        if self.bit_array[hashed_index] == 0:
            return False
    return True
from ll import LL
import math


class HyperLogLog(LL):
    def __len__(self):
        indicator = sum(2**-m.counter for m in self.registers)
        E = self.alpha * (self.num_registers**2) / float(indicator)

        if E <= 5.0 / 2.0 * self.num_registers:
            V = sum(1 for m in self.registers if m.counter == 0)
            if V != 0:
                Estar = self.num_registers * \
                    math.log(self.num_registers / (1.0 * V), 2)
            else:
                Estar = E
        else:
            if E <= 2**32 / 30.0:
                Estar = E
            else:
                Estar = -2**32 * math.log(1 - E / 2**32, 2)
        return Estar


if __name__ == "__main__":
    import mmh3
    hll = HyperLogLog(8)
    for i in xrange(100000):
        hll.add(mmh3.hash(str(i)))
    print len(hll)
def add(self, s):
    for seed in range(self.hash_num):
        result = mmh3.hash(s, seed) % self.size
        self.bit_array[result] = 1
def get_machoc_hash(self):
    # Get Machoc Hash, adapted from https://github.com/conix-security/machoke
    binary = self.r2p
    binary.cmd("aaa")
    mmh3_line = ""
    machoke_line = ""
    funcs = json.loads(binary.cmd("aflj"))
    if funcs is None:
        print("r2 could not retrieve functions list")

    def get_machoke_from_function(r2p, function):
        """Return machoke from a specific function
        :rtype: object
        """
        r2p.cmd("s {}".format(function["offset"]))
        agj_error = 0
        while True:
            try:
                fcode = json.loads(r2p.cmd("agj"))
                break
            except:
                print >> sys.stderr, "Fail agj: %s" % hex(function["offset"])
                if agj_error == 5:
                    break
                agj_error += 1

        blocks = []
        id_block = 1
        try:
            for block in fcode[0]["blocks"]:
                blocks.append({
                    "id_block": id_block,
                    "offset": hex(block["offset"])
                })
                id_block += 1
        except:
            return ""

        line = ""
        id_block = 1
        for block in fcode[0]["blocks"]:
            word = "{}:".format(id_block)
            for instruction in block["ops"]:
                # Check if call
                if instruction["type"] == "call":
                    word = "{}c,".format(word)
                    for ublock in blocks:
                        if hex(instruction["offset"] + 2) == ublock["offset"]:
                            word = "{}{},".format(word, ublock["id_block"])
                # Check if jmp
                if instruction["type"] == "jmp":
                    for ublock in blocks:
                        if instruction["esil"] == ublock["offset"]:
                            word = "{}{},".format(word, ublock["id_block"])
                # Check if conditional jmp
                elif instruction["type"] == "cjmp":
                    for ublock in blocks:
                        if hex(instruction["jump"]) == ublock["offset"]:
                            word = "{}{},".format(word, ublock["id_block"])
                        if hex(instruction["offset"] + 2) == ublock["offset"]:
                            word = "{}{},".format(word, ublock["id_block"])
                else:
                    pass
            if word[-2] == "c":
                for ublock in blocks:
                    if hex(instruction["offset"] + 4) == ublock["offset"]:
                        word = "{}{},".format(word, ublock["id_block"])
                if word[-2] == "c":
                    word = "{}{},".format(word, id_block + 1)
            if word[-1] == ":" and id_block != len(fcode[0]["blocks"]):
                word = "{}{},".format(word, id_block + 1)
            # Clean word
            if word[-1] == ",":
                word = "{};".format(word[:-1])
            elif word[-1] == ":":
                word = "{};".format(word)
            line = "{}{}".format(line, word)
            id_block += 1
        return line

    for function in funcs:
        machoke = get_machoke_from_function(binary, function)
        machoke_line = "{}{}".format(machoke_line, machoke)
        mmh3_line = "{}{}".format(
            mmh3_line,
            hex(mmh3.hash(machoke) & 0xffffffff).replace("0x", "").replace("L", ""),
        )
    binary.quit()
    return mmh3_line
def test_hash_values(self):
    """ Test that on randomized data, values computed from mmh3 and pymmh3 match. """
    for i in range(10):
        random_value = str(random.random())
        self.assertEqual(mmh3.hash(random_value), pymmh3.hash(random_value))
def normalized_hash(identifier: str, activation_group: str) -> int:
    return mmh3.hash("{}:{}".format(identifier, activation_group)) % 100 + 1
def lookup(self, string):
    for seed in range(self.hash_count):
        result = mmh3.hash(string, seed) % self.size
        if self.bit_array[result] == 0:
            return "Nope"
    return "Probably"
def add(self, string):
    for seed in range(self.hash_count):
        result = mmh3.hash(string, seed) % self.size
        self.bit_array[result] = 1
def lookup(string, bit_array, hash_count, size):
    for seed in range(hash_count):
        result = mmh3.hash(string, seed) % size
        if bit_array[result] == 0:
            return False
    return True
def add(self, item):
    digests = []
    for i in range(self.hash_counts):
        digest = mmh3.hash(item, i) % self.bit_array_use
        self.bit_array_size[digest] = True
def get_feature(feat_str, model):
    # The feature string may be unicode, but MurmurHash3 expects ASCII encoded strings.
    return mmh3.hash(feat_str.encode('ascii', 'xmlcharrefreplace')) % model.num_features
def schingling(doc):
    return [mmh3.hash(doc[i:i + 9], signed=False) for i in range(len(doc) - 9)]
def normalized_hash(identifier, activation_group):
    return mmh3.hash("{}:{}".format(activation_group, identifier), signed=False) % 100 + 1
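# Illustrative rollout helper (an assumption, not part of the snippet above): with buckets
# in 1..100, an identifier counts as enabled when its bucket does not exceed the configured
# rollout percentage, giving a deterministic, evenly spread gradual rollout.
def is_enabled(identifier, activation_group, rollout_percentage):
    return normalized_hash(identifier, activation_group) <= rollout_percentage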
"mimeFile": mimeFile, "normHtmlFile": normHtmlFile, "plainTextFile": plainTextFile } # If enabled, remove boilerplate HTML if options.boilerpipe: logging.info(url + ": deboiling html") extractor = ExtrB(extractor='ArticleExtractor', html=text) deboiled = str(extractor.getHTML()) else: deboiled = text # We compute a hash on the HTML (either normalized one or after boilerpipe if enabled): # if we get duplicate files we discard them html_hash = mmh3.hash(deboiled, signed=False) # checking for duplicate content (duplicates are discarded) if html_hash in seen_html: logging.info("Repeated file:\t" + url) continue # get text with Alcazar library if options.parser == "alcazar": logging.info(url + ": Getting text with Alcazar") btext = alcazar.bodytext.parse_article(deboiled) if btext.body_text: plaintext = btext.body_text else: plaintext = "" # or get text with beautifulsoup
def hash32(data: bytes) -> bytes:
    return struct.pack('i', mmh3.hash(data))
def _hashes(self, key):
    # seed mmh3 with i so that each of the k hashes is distinct
    for i in xrange(self.k):
        yield mmh3.hash(key, i)
def add(self, item):
    for i in range(self.hash_count):
        hashed_index = mmh3.hash(item, i) % self.size
        self.bit_array[hashed_index] = 1
def _hash(self, element):
    return [
        b % self.size
        for b in [mmh3.hash(element, i) for i in range(self.hash_count)]
    ]
def _hashes_opt(self, key):
    # Kirsch-Mitzenmacher optimization
    h0 = mmh3.hash(key, 1)
    h1 = mmh3.hash(key, 10)
    for i in xrange(self.k):
        yield h0 + i * h1
def murmur3_32(text):
    val = mmh3.hash(text)
    return val if val >= 0 else val + 2**32
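# Sanity check (an illustrative assumption, not from the original source): the manual
# signed-to-unsigned conversion above should agree with mmh3's own unsigned output,
# available through the signed=False keyword used elsewhere in these snippets.
import mmh3
assert murmur3_32("example") == mmh3.hash("example", signed=False) == mmh3.hash("example") % 2**32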
def runSim(args):
    ## avoid one process starting multiple threads
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["NUMEXPR_NUM_THREADS"] = "1"
    os.environ["OMP_NUM_THREADS"] = "1"

    dataset = args[0]
    if dataset[-1] != '/':
        dataset += '/'
    ground_truth, theta = args[1]

    ## load dna reads into reads_lst
    reads_lst = []
    fastaFile = dataset + "/reads.fasta"
    with open(fastaFile) as handle:
        for values in SimpleFastaParser(handle):
            reads_lst.append(values[1])
    n = len(reads_lst)

    ## load precomputed Jaccard similarities
    JSims = np.loadtxt(dataset + "/minHashes/JSims.txt")

    ## load alignments
    gt_file = "{}/{}_ground_truth.txt".format(dataset, ground_truth)
    with open(gt_file) as f:
        lines = [[float(x) for x in line.rstrip('\n').split('\t')] for line in f]
    refDict = {}
    for i in range(n):
        refDict[i] = {}
    for line in lines:
        refDict[int(line[0]) - 1][int(line[1]) - 1] = line[2] / (line[3] + line[4] - line[2])
        refDict[int(line[1]) - 1][int(line[0]) - 1] = line[2] / (line[3] + line[4] - line[2])

    ## convert each read into its k-mers
    symLength = 7  # k in k-mer

    def generateSymSets(reads_lst, symLength):
        symSets = {}
        for i, read in enumerate(reads_lst):
            lst = set()
            for j in range(len(read) - symLength):
                lst.add(read[j:j + symLength])
            symSets[i] = lst
        return symSets

    symSets = generateSymSets(reads_lst, symLength)

    ## load precomputed minHashes
    minHashArr = np.zeros((n, 1000))
    for i in range(n):
        minHashArr[i] = np.load(dataset + "minHashes/minHashes_{}.txt".format(i), allow_pickle=True)

    ## load precomputed iid sequences and their minhashes
    numRandReads = 5
    randMinHashArr = np.zeros((numRandReads, 1000))
    for i in range(numRandReads):
        randMinHashArr[i] = np.load(dataset + "randReads/randMinHashes_{}.txt".format(i), allow_pickle=True)
    minHashArrExtended = np.vstack((minHashArr[:, :1000], randMinHashArr))

    ## checking to see that the precomputed minHashes work
    i = np.random.randint(0, n, size=100)
    j = np.random.randint(0, 1000, size=100)
    lst = []
    for iter_round in range(100):
        iterLst = list(symSets[i[iter_round]])
        lst.append(
            min([
                mmh3.hash(sym, j[iter_round], signed=False) for sym in iterLst
            ]))
    assert (np.alltrue(minHashArrExtended[i, j] == lst))

    ## checking to see that the precomputed JSims work
    i = np.random.randint(0, n, size=100)
    j = np.random.randint(0, n, size=100)
    lst = []
    for iter_round in range(100):
        i1 = i[iter_round]
        j1 = j[iter_round]
        lst.append(JSims[i1, j1] == 1.0 * len(symSets[i1].intersection(symSets[j1])) /
                   (len(symSets[i1].union(symSets[j1]))))
    assert (np.alltrue(lst) and np.allclose(JSims, JSims.T))

    ## Testing SVD, JSimEmp, JSim Exact, reference vs all
    storageArrGround = []
    storageArrpHatSVD = []
    storageArrJsimExact = []
    storageArrJsimEmp = []
    storageArrNumOnesCol = []
    storageArrQjs = []
    storageArrwSJS = []
    h = 1000
    for ref_read in trange(n):
        groundTruthLocs = np.array(list(refDict[ref_read].keys()))
        if len(groundTruthLocs) == 0:
            ## read has no alignments in dataset
            continue
        refReadMatches = refDict[ref_read]
        groundTruthVals = [refReadMatches[i] for i in groundTruthLocs]
        lst = set(list(groundTruthLocs))
        rangeN = set(range(n))
        rangeN.discard(ref_read)
        toAppend = list(rangeN - lst)
        groundTruthLocs = np.hstack((groundTruthLocs, np.array(toAppend)))
        groundTruthVals += [0] * len(toAppend)

        empiricalMatrix = (minHashArrExtended == minHashArrExtended[ref_read])
        empiricalMatrix = np.delete(empiricalMatrix, ref_read, axis=0)[:, :h]
        updatedGroundTruthLocs = groundTruthLocs - 1 * (groundTruthLocs >= ref_read)
        updatedGroundTruthLocs = updatedGroundTruthLocs.astype(int)

        jSimEmpirical = np.mean(empiricalMatrix, axis=1)
        jSimExact = np.delete(JSims[ref_read], ref_read)

        ## here we can modify what normalization is used without having to rerun SVDs
        # u = np.loadtxt(dataset+"/SVD/raw_pi_refread_{}.txt".format(ref_read))
        # pHatSVD = 1-np.abs(u[:n-1])/np.abs(np.median(u[:n-1]))  ## normalize median of p_i
        # pHatSVD = 1-np.abs(u[:n-1])/np.abs(np.median(u[n-1:]))  ## random read normalization
        # pHatSVD = 1-np.abs(u[:n-1])/np.max(np.abs(u[n-1:]))     ## naive max normalization
        pHatSVD = np.loadtxt(dataset + "/SVD/pi_refread_{}.txt".format(ref_read))
        qSVD = np.loadtxt(dataset + "/SVD/qj_refread_{}.txt".format(ref_read))

        ## for approximation
        empQ = empiricalMatrix.sum(axis=0)
        x = np.matmul(empiricalMatrix - np.ones(empiricalMatrix.shape),
                      1 - np.array(empQ / np.max(empQ)))[:n - 1]
        x = np.abs(x - np.min(x))
        x /= np.max(x)
        storageArrwSJS.extend(x[updatedGroundTruthLocs])

        storageArrGround.extend(groundTruthVals)
        storageArrpHatSVD.extend(pHatSVD[updatedGroundTruthLocs])
        storageArrJsimEmp.extend(jSimEmpirical[:n - 1][updatedGroundTruthLocs])
        storageArrJsimExact.extend(jSimExact[updatedGroundTruthLocs])
        storageArrNumOnesCol.extend(np.mean(empiricalMatrix, axis=0))
        storageArrQjs.extend(qSVD)

    fpr, tpr, _ = roc_curve(np.array(storageArrGround) >= theta, storageArrpHatSVD)
    fpr_jsim, tpr_jsim, _ = roc_curve(np.array(storageArrGround) >= theta, storageArrJsimExact)
    fpr_js_emp, tpr_js_emp, _ = roc_curve(np.array(storageArrGround) >= theta, storageArrJsimEmp)
    fpr_wsjs, tpr_wsjs, _ = roc_curve(np.array(storageArrGround) >= theta, storageArrwSJS)

    pickle.dump([
        auc(fpr, tpr),
        auc(fpr_jsim, tpr_jsim),
        auc(fpr_js_emp, tpr_js_emp),
        auc(fpr_wsjs, tpr_wsjs),
        storageArrNumOnesCol,
        np.corrcoef(storageArrpHatSVD, storageArrGround)[0, 1],
        np.corrcoef(storageArrJsimExact, storageArrGround)[0, 1],
        np.corrcoef(storageArrJsimEmp, storageArrGround)[0, 1],
        storageArrQjs,
        'SJS AUC,JS AUC, JS emp AUC, wSJS AUC,numOnes per col,SJS r^2,JS r^2,JS emp r^2,storageArrQjs'
    ], open(
        "AUCs/{}_{}_{}.pkl".format(dataset[:-1], ground_truth,
                                   str(theta % 1).split('.')[1]), "wb"))
def insert(self, item):
    for i in range(self.qty_hash):
        t = mmh3.hash(bytes(item), i) % self.size
        self.bitarray[t] = True
def faviconHash(self, data, web_source=None):
    if web_source:
        b64data = base64.encodebytes(data).decode()
    else:
        b64data = base64.encodebytes(data)
    return mmh3.hash(b64data)
def hash(flowkey):
    global width
    flowkey_bytes = struct.pack("L", flowkey)
    r = mmh3.hash(flowkey_bytes, signed=False)
    return r % width
def add(self, element):
    for seed in self.seeds:
        result = mmh3.hash(element, seed) % self.size
        self.hash_values[result] = 1
    return self.hash_values
def string_digest(item, index):
    return mmh3.hash(bytes(item, 'utf-8'), index)
def is_member(self, item):
    for i in range(self.hash_counts):
        digest = mmh3.hash(item, i) % self.bit_array_use
        if self.bit_array_size[digest] == False:
            return False
    return True
def murmur(key):
    return mmh3.hash(key)
# np.dot(m.transpose(), m)
# Jaccard(m[0], m[10])
# s = signature(m, 10000)
# s.shape
# m_new = firma(m)

m_new2 = m.dot(rndVecs)  # projected matrix

# Indexing text collection
for doc_id in range(m_new2.shape[0]):
    docSgt = np.array(m_new2[doc_id, :] >= 0, dtype=np.int)
    for blk in range(NRBLK):
        # (blk*BLKSZ):((blk+1)*BLKSZ)
        blkData = docSgt[(blk * BLKSZ):((blk + 1) * BLKSZ)]
        docHashVal = mmh3.hash(''.join(map(str, blkData))) % MAXBKTS
        hshTbl_blk = HshTabls[blk]
        if docHashVal not in hshTbl_blk:
            hshTbl_blk[docHashVal] = set()
        hshTbl_blk[docHashVal].add(doc_id + 1)

collision = np.zeros((m.shape[0], m.shape[0]), dtype=np.int)
for hshTbl_blk in HshTabls:
    for e in hshTbl_blk:
        for i in hshTbl_blk[e]:
            for o in hshTbl_blk[e]:
                collision[i - 1][o - 1] += 1

pldHaming = penalizedHcc(m_new2)
simcos = (np.pi / 2) * (1 - hmmg(m_new2))
simcospenalized = (np.pi / 2) * (1 - pldHaming)
print("End!")