def __init__(self, tokens, length=100000):
    """Calculates a Charikar simhash with appropriate bitlength.

    Input can be any iterable, but for strings it will automatically
    break it into words first, assuming you don't want to iterate over
    the individual characters. Returns nothing.
    """
    if isinstance(tokens, str):
        tokens = tokens.split()
    v = {}
    if isinstance(tokens, dict):
        # Dict input: keys are tokens, values are per-token weights.
        for value, w in tokens.items():
            k = xxhash.xxh64(value).intdigest()
            x = v.get(k % length, 0)
            if k & 1 << 63:
                v[k % length] = x + w
            else:
                v[k % length] = x - w
    else:
        # Plain iterable: every token carries weight 1.
        for value in tokens:
            k = xxhash.xxh64(value).intdigest()
            x = v.get(k % length, 0)
            if k & 1 << 63:
                v[k % length] = x + 1
            else:
                v[k % length] = x - 1
    self.hash = v
    self.vector = v
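# A minimal similarity sketch, assuming the __init__ above belongs to a
# Charikar-style `simhash` class (the class name and the comparison metric
# here are assumptions, not confirmed by the source).
def simhash_similarity(a, b):
    # Buckets where both fingerprints agree in sign count as matches.
    keys = set(a.hash) | set(b.hash)
    agree = sum(1 for k in keys if a.hash.get(k, 0) * b.hash.get(k, 0) > 0)
    return agree / len(keys) if keys else 1.0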
def __hash_from_argument(self, argument):
    arg_string = ""
    if hasattr(argument, 'md5hash'):
        return argument.md5hash
    if hasattr(argument, 'xxhash64'):
        return argument.xxhash64
    if type(argument) is numpy.ndarray:
        if argument.size > 181440000:
            return self.__hash_choice(argument.data)
        else:
            return xxhash.xxh64(argument.data).hexdigest()
    if type(argument) is pandas.core.frame.DataFrame:
        col_values_list = list(argument.columns.values)
        try:
            col_values_string = ''.join(col_values_list)
            arg_string = col_values_string
            if argument.values.size > 181440000:
                return (xxhash.xxh64(argument.values.data).hexdigest()
                        + "+" + xxhash.xxh64(arg_string).hexdigest())
            else:
                return (self.__hash_choice(argument.values.data)
                        + "+" + xxhash.xxh64(arg_string).hexdigest())
        except TypeError:
            # Column names were not all strings; hash the values only.
            if argument.values.size > 181440000:
                return xxhash.xxh64(argument.values.data).hexdigest()
            else:
                return self.__hash_choice(argument.values.data)
    if type(argument) is list or type(argument) is tuple:
        arg_string = str(len(argument))
        arg_string += str(argument)
    return self.__hash_choice(arg_string)
def _xxhash(self):
    """
    An xxhash.xxh64 hash of the array.

    Returns
    -------------
    xx: int, xxhash.xxh64 hash of array.
    """
    # repeat the bookkeeping to get a contiguous array inside
    # the function to avoid additional function calls
    # these functions are called millions of times so everything helps
    if self._modified_x or not hasattr(self, '_hashed_xx'):
        if self.flags['C_CONTIGUOUS']:
            hasher = xxhash.xxh64(self)
            self._hashed_xx = hasher.intdigest()
        else:
            # the case where we have sliced our nice
            # contiguous array into a non-contiguous block
            # for example (note slice *after* track operation):
            # t = util.tracked_array(np.random.random(10))[::-1]
            contiguous = np.ascontiguousarray(self)
            hasher = xxhash.xxh64(contiguous)
            self._hashed_xx = hasher.intdigest()
    self._modified_x = False
    return self._hashed_xx
def hashRequests(authTicket, payload):
    baseHash = xxhash.xxh64(
        authTicket.SerializeToString(),
        seed=0x1B845238
    ).intdigest()
    # Serialize and hash each request
    return [xxhash.xxh64(
        request.SerializeToString(),
        seed=baseHash
    ).intdigest() for request in payload]
def string_hash(value, length=11):
    s = ''
    for i in range(0, length, 11):
        s = s + xxhash.xxh64(value + str(i)).hexdigest()
    s = encode_hash(int(s, 16))[:length]
    if len(s) < length:
        s = s + "A" * (length - len(s))
    return s
def string_hash_bits(value, length_in_bits=128):
    '''Length must be a multiple of 4'''
    hex_length = length_in_bits // 4  # integer division; used as a slice index
    s = ''
    for i in range(0, length_in_bits, 64):
        s = s + xxhash.xxh64(value + str(i)).hexdigest()
    s = s[:hex_length]
    x = int(s, 16)
    return x
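# Usage sketch for string_hash_bits: a 128-bit fingerprint rendered as
# 32 hex characters (the input string is illustrative).
fp = string_hash_bits('hello world', length_in_bits=128)
print(format(fp, '032x'))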
def xxhash64(path, block_size=4096):
    try:
        with open(path, 'rb') as rf:
            h = xxhash.xxh64()
            for chunk in iter(lambda: rf.read(block_size), b''):
                h.update(chunk)
            return h.hexdigest(), path
    except IOError:
        return None, path
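# A small driver sketch built on xxhash64() above: hash every file under a
# directory and group duplicates by digest (the function name is hypothetical).
import collections
import os

def find_duplicates(root):
    groups = collections.defaultdict(list)
    for dirpath, _, files in os.walk(root):
        for name in files:
            digest, filepath = xxhash64(os.path.join(dirpath, name))
            if digest is not None:
                groups[digest].append(filepath)
    # keep only digests seen more than once
    return {d: ps for d, ps in groups.items() if len(ps) > 1}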
def hash(self):
    """Return hash of motif.

    This is a unique identifier of a motif, regardless of the id.

    Returns:
        hash : str
    """
    return xxhash.xxh64(self._pwm_to_str(3)).hexdigest()
def subscribe(request):
    """
    Subscribe the given email to the given URL.

    TODO BEN: Include Subscription title or description in POST variables
    """
    url = request.POST['subscription_url']
    email = request.POST['email']
    user, created_user = User.objects.get_or_create(email=email)
    if created_user:
        user_verification_hash = uuid.uuid4().hex
        user_verification = Verification.objects.create(
            verified=False,
            verification_hash=user_verification_hash)
        user.verification = user_verification
        user.save()
    content, created_content = SubscribedContent.objects.get_or_create(url=url)
    if created_content:
        content_response = requests.get(url)
        content_hash = xxhash.xxh64(content_response.text).hexdigest()
        content.latest_content_hash = content_hash
        content.save()  # persist the initial hash; without this it is lost
    if not Subscription.objects.filter(user=user, content=content).exists():
        verification_hash = uuid.uuid4().hex
        verification_url = request.build_absolute_uri(
            reverse(
                'verify',
                kwargs={
                    'email': email,
                    'key': verification_hash}
            )
        )
        verification_item = Verification.objects.create(
            verified=False,
            verification_hash=verification_hash
        )
        Subscription.objects.create(
            user=user,
            content=content,
            verification=verification_item
        )
        email_sent = EMAIL.VERIFY_SUBSCRIPTION.send(
            email, {'verification_url': verification_url})
        if email_sent:
            message = MESSAGES.EMAIL.VERIFICATION_SENT.format(email)
        else:
            message = MESSAGES.EMAIL.ERROR_SENDING_EMAIL.format(email)
    else:
        message = MESSAGES.EMAIL.ALREADY_SUBSCRIBED
    messages.add_message(request, messages.INFO, message)
    return redirect(request.META['HTTP_REFERER'], {'message': message})
def test_XXH64_reset(self):
    x = xxhash.xxh64()
    h = x.intdigest()
    for i in range(10, 50):
        x.update(os.urandom(i))
        x.reset()
    self.assertEqual(h, x.intdigest())
def save(self, *args, **kwargs):
    new_hash = xxhash.xxh64(self.content_raw).hexdigest()
    mentioned_users = []
    if new_hash != self.raw_content_hash or (not self.pk):
        # To (re-)render the content if content changed or topic is newly created
        self.content_rendered, mentioned_users = render_content(
            self.content_raw, sender=self.user.username)
    super(Topic, self).save(*args, **kwargs)
    self.raw_content_hash = new_hash
    for to in mentioned_users:
        notify.delay(to=to.username, sender=self.user.username, topic=self.pk)
def cached_parse_dhcp(self, lines, cur_time=None):
    if cur_time is None:
        cur_time = dt.utcnow()
    m = xxhash.xxh64()
    m.update("".join(lines[:self.dhcp_cache_len]).encode("utf8"))
    new_hash = m.digest()
    # new_len = len(lines)
    if new_hash != self.dhcp_hash:
        # The cached prefix changed: invalidate and hash from scratch
        self.dhcp_cache_len = 0
        self.dhcp_cache = []
        m = xxhash.xxh64()
    lines = lines[self.dhcp_cache_len:]
    self.dhcp_cache.extend(self.from_dhcp(lines, cur_time))
    m.update("".join(lines).encode("utf8"))
    self.dhcp_hash = m.digest()
    self.dhcp_cache_len += len(lines)
    return self.dhcp_cache
def xxhash_file(srcfile, logger, block_size=2**20):
    f_name = func_name()
    logger.info(f_name + "\t\tCalculating xx-hash on : " + srcfile)
    x = xxhash.xxh64()
    # Read in binary mode so the bytes hashed match the file on disk
    with open(srcfile, 'rb') as f:
        while True:
            data = f.read(block_size)
            if not data:
                break
            x.update(data)
    return x.hexdigest()
def generate_content_hashes(self):
    """
    Generate a dictionary which maps parl_ids to their respective hashes

    Used for speedy comparison of changes
    """
    es_response = json.loads(self.get_content())
    content_hashes = {}
    for res in es_response['result']:
        content_hashes[res['parl_id']] = xxhash.xxh64(
            json.dumps(res)).hexdigest()
    return json.dumps(content_hashes)
def save(self, *args, **kwargs):
    new_hash = xxhash.xxh64(self.content_raw).hexdigest()
    mentioned_users = []
    if new_hash != self.raw_content_hash or (not self.pk):
        self.content_rendered, mentioned_users = render_content(
            self.content_raw, sender=self.user.username)
    super(Post, self).save(*args, **kwargs)
    t = self.topic
    t.reply_count = t.get_reply_count()
    t.last_replied = t.get_last_replied()
    t.save(update_fields=['last_replied', 'reply_count'])
    for to in mentioned_users:
        notify.delay(to=to.username, sender=self.user.username, post=self.pk)
def memoize_wrapper(*args, **kwargs):
    hash = xxhash.xxh64(str(args) + str(kwargs)).intdigest()
    path = path_pattern.format(hash=hash)
    try:
        with open(path, 'rb') as file:
            logger.debug("Loading pickle %s", path)
            data = pickle.load(file)
    except (FileNotFoundError, EOFError):
        data = fn(*args, **kwargs)
        with open(path, 'wb') as file:
            pickle.dump(data, file)
    return data
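# A plausible enclosing decorator for the wrapper above; `memoize_to_disk`
# is a hypothetical name, since the snippet only shows the inner function
# and leaves `fn`, `path_pattern`, and `logger` as free variables.
import functools

def memoize_to_disk(path_pattern, logger):
    def decorator(fn):
        @functools.wraps(fn)
        def memoize_wrapper(*args, **kwargs):
            ...  # body as defined above
        return memoize_wrapper
    return decorator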
def test_XXH64(self):
    x = xxhash.xxh64()
    x.update('a')
    self.assertEqual(xxhash.xxh64('a').digest(), x.digest())
    x.update('b')
    self.assertEqual(xxhash.xxh64('ab').digest(), x.digest())
    x.update('c')
    self.assertEqual(xxhash.xxh64('abc').digest(), x.digest())

    seed = random.randint(0, 2**32)
    x = xxhash.xxh64(seed=seed)
    x.update('a')
    self.assertEqual(xxhash.xxh64('a', seed).digest(), x.digest())
    x.update('b')
    self.assertEqual(xxhash.xxh64('ab', seed).digest(), x.digest())
    x.update('c')
    self.assertEqual(xxhash.xxh64('abc', seed).digest(), x.digest())
def parse(self, response):
    soup = BeautifulSoup(response.body)
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    response.meta.update(score=KeywordScorer.score(text))
    response.meta.update(
        content_hash=xxhash.xxh64(text.encode('ascii', 'ignore')).intdigest())
    for link in self.link_extractor.extract_links(response):
        request = Request(url=link.url)
        request.meta.update(link_text=link.text)
        link_score = KeywordScorer.score(link.text)
        request.meta.update(score=link_score)
        yield request
def _calculate(reaction, descriptorDict, verbose=False, whitelist=None):
    """Calculate descriptors for this plugin with descriptorDict already created."""
    # descriptor Value classes
    cat = DRP.models.CatRxnDescriptorValue
    perm = DRP.models.CategoricalDescriptorPermittedValue

    # reaction space descriptor
    heading = 'rxnSpaceHash1'
    if whitelist is None or heading in whitelist:
        h = xxhash.xxh64()  # generates a hash
        for reactant in reaction.compounds.order_by('abbrev'):
            h.update(reactant.abbrev)
        p = perm.objects.get_or_create(
            descriptor=descriptorDict[heading], value=h.hexdigest())[0]
        cat.objects.update_or_create(
            defaults={'value': p}, reaction=reaction,
            descriptor=descriptorDict['rxnSpaceHash1'])[0]
def feature_hash_string(s, window, dim):
    # Generate window-char n-grams and accumulate them into a
    # fixed-size feature-hash count vector
    v = {}
    for x in range(0, dim):
        v[x] = 0
    length = len(s)
    for x in range(0, length - window):
        key = xxhash.xxh64(s[x:x + window]).intdigest() % dim
        v[key] += 1
    return numpy.asarray(list(v.values()))
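# Usage sketch: hash the 3-grams of a short string into a 16-bucket count
# vector; the vector sums to the number of 3-grams hashed.
vec = feature_hash_string('hello world', window=3, dim=16)
print(vec.shape, vec.sum())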
def wrapper(*args, **kwds):
    if not cache:
        return f(*args, **kwds)
    if key_func:
        key = 'django_vimeo_cache:{}'.format(key_func(*args, **kwds))
    else:
        key = 'django_vimeo_cache:' + f.__name__ + ':' + \
              str(list(args) + list(sorted(kwds.items())))
    key = xxhash.xxh64(key).hexdigest()
    value = cache.get(key)
    if value is None:
        value = f(*args, **kwds)
        cache.set(key, value, expires)
        value = cache.get(key)
        if value is None:
            raise Exception('failed to fetch cached value, try again')
    return value
def bv_hash(self):
    """
    Iterate over all the BinaryView (flat iteration over the hex values themselves)
    :return: (INT) Hash of the whole file
    """
    # create file object
    br = BinaryReader(self.bv)
    # calculate file hash
    file_hash = xxhash.xxh64()
    # for some reason a BinaryReader won't read more than 1000 or so bytes
    temp_hash = br.read(1000)
    while temp_hash:
        file_hash.update(temp_hash)
        temp_hash = br.read(1000)
    return file_hash.hexdigest()
def ducos1xxh(lastBlockHash, expectedHash, difficulty):
    # XXHASH algorithm
    # Measure starting time
    timeStart = time()
    # Loop from 0 to 100*diff
    for ducos1xxres in range(100 * int(difficulty) + 1):
        # Generate hash
        ducos1xx = xxhash.xxh64(str(lastBlockHash) + str(ducos1xxres), seed=2811)
        ducos1xx = ducos1xx.hexdigest()
        # Check if result was found
        if ducos1xx == expectedHash:
            # Measure finish time
            timeStop = time()
            # Calculate hashrate
            timeDelta = timeStop - timeStart
            hashrate = ducos1xxres / timeDelta
            return [ducos1xxres, hashrate]
def read(mount_point, incoming_data, **kwargs):
    outgoing_data = {}
    flock = kwargs['flock']
    offset = incoming_data['offset']
    chunk_size = incoming_data['repeats']
    f_path = ''.join([mount_point, incoming_data['target']])
    with open(f_path, 'rb') as f:
        f.seek(offset)
        flock.lockf(f.fileno(), fcntl.LOCK_SH | fcntl.LOCK_NB, chunk_size, offset, 0)
        buf = f.read(chunk_size)
        flock.lockf(f.fileno(), fcntl.LOCK_UN, chunk_size, offset)
    outgoing_data['hash'] = xxhash.xxh64(buf).intdigest()
    outgoing_data['offset'] = offset
    outgoing_data['chunk_size'] = chunk_size
    outgoing_data['uuid'] = incoming_data['uuid']
    outgoing_data['tid'] = incoming_data['tid']
    # outgoing_data['buffer'] = buf[:256].decode()
    return outgoing_data
def extract_attribute(self, base_object: BDBasicBlock) -> int:
    # Check if value already exists
    BasicBlockHash_value = base_object.get_attribute_value('BasicBlockHash')
    if not BasicBlockHash_value:
        hash_value = xxhash.xxh64()
        for instruction_expression in base_object.underlying_obj:
            for instruction in instruction_expression[0]:
                hash_value.update(instruction.text)
        base_object.add_attribute_value('BasicBlockHash',
                                        {'hash': hash_value.intdigest()})
        BasicBlockHash_value = base_object.get_attribute_value('BasicBlockHash')
    return BasicBlockHash_value['hash'] if BasicBlockHash_value else None
def write(self, path, data, offset, fh):
    realpath = self.remotepath(path)
    cachefile = self.cachefile(realpath)
    if not os.path.exists(cachefile):
        if self.empty_file(realpath):
            self.create(path, 'wb')
        else:
            raise FuseOSError(ENOENT)
    with open(cachefile, 'rb+') as outfile:
        outfile.seek(offset, 0)
        outfile.write(data)
    self.attributes.insert(realpath, self.extract(os.lstat(cachefile)))
    task = Task(
        xxhash.xxh64(realpath).intdigest(),
        self._write, realpath, data, offset)
    self.taskpool.submit(task)
    return len(data)
def set_motifs(self, motifs):
    try:
        # Check if motifs is a list of Motif instances
        motifs[0].to_pwm()
        tmp = NamedTemporaryFile(mode="w", delete=False)
        for m in motifs:
            tmp.write("{}\n".format(m.to_pwm()))
        tmp.close()
        motif_file = tmp.name
    except AttributeError:
        # Not Motif instances; treat the argument as a file name
        motif_file = motifs

    self.motifs = motif_file
    with open(motif_file) as f:
        self.motif_ids = [m.id for m in read_motifs(f)]
    self.checksum = {}
    if self.use_cache:
        chksum = xxhash.xxh64("\n".join(sorted(self.motif_ids))).digest()
        self.checksum[self.motif_file] = chksum
def filesget(id):  # helper: *deep level route*
    rfile = StaticFile.get(id=id)
    # guess_type() returns a (type, encoding) tuple; fall back to a generic
    # binary type when the name is not recognised
    mimetype = (mimetypes.guess_type(rfile.original, strict=True)[0]
                or 'application/octet-stream')
    response.set_header('Accept-Ranges', 'bytes')
    response.set_header('Content-Length', str(rfile.length))
    response.set_header('Content-Type', mimetype)
    response.set_header(
        'Last-Modified',
        datetime.fromtimestamp(
            rfile.created,
            tz=pytz.timezone('GMT')).strftime('%a, %d %b %Y %H:%M:%S GMT'))
    response.set_header(
        'ETag', '"{0}"'.format(xxhash.xxh64(rfile.content).hexdigest()))
    return io.BytesIO(rfile.content)
def iter_archive(self, archive_dir):
    archivehashes = []
    if os.path.isdir(archive_dir):
        for path, dirs, files in walk(archive_dir):
            for filename in files:
                filepath = joinpath(path, filename)
                if os.path.isfile(filepath):
                    filesize = stat(filepath).st_size
                    filehash = ''  # guard against a failed read below
                    try:
                        with open(filepath, 'rb') as f:
                            filehash = xxhash.xxh64(f.read()).hexdigest()
                    except OSError:
                        print("Didn't like this file", filepath)
                    if filehash != '':
                        archivehashes.append([{
                            'filehash': filehash,
                            'path': filepath
                        }])
    return archivehashes
def handle(self, *args, **kwargs):
    hashDictionary = {}
    collisionCount = 0
    for reaction in Reaction.objects.all():
        reactantString = ''
        h = xxhash.xxh64()
        for reactant in reaction.compounds:
            h.update(reactant.abbrev)
            reactantString += reactant.abbrev
        # Key on the digest, not the hasher object, so equal reactant
        # strings map to the same entry
        digest = h.hexdigest()
        if digest in hashDictionary:
            if hashDictionary[digest] != reactantString:
                collisionCount += 1
        else:
            hashDictionary[digest] = reactantString
    if collisionCount > 0:
        e = EmailToAdmins(
            'Dark Reactions Project: Hash Collision Failure',
            'A collision between reaction space hashes has occurred. Please '
            'contact the DRP development team and file a bug report.')
        e.send()
        exit(1)
def data(path, D):
    '''
    GENERATOR: Apply hash-trick to the original csv row
               and for simplicity, we one-hot-encode everything

    INPUT:
        path: path to training or testing file
        D: the max index that we can hash to

    YIELDS:
        x: a list of hashed and one-hot-encoded 'indices'
           we only need the index since all values are either 0 or 1
        y: y = 1 if we have a click, else we have y = 0
    '''
    with open(path, 'r', encoding='utf-8') as f:
        csvreader = reader(f)  # create a CSV reader
        header = next(csvreader)
        for row in csvreader:  # iterate over the available rows
            row = dict(zip(header, row))

            # ts and bid_id are used only while updating train data
            for feat in ['bid_id', 'ts']:
                if feat in row:
                    del row[feat]

            # process clicks
            y = 0.
            target = 'click'
            if target in row:
                if row[target] == '1':
                    y = 1.
                del row[target]

            # build x
            x = []
            for key in row:
                value = row[key]
                # one-hot encode everything with hash trick
                index = xxh64(key + '_' + value).intdigest() % D
                x.append(index)
            yield x, y
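# Minimal driver sketch for data(): write a two-row CSV and stream hashed
# feature indices from it (the column names are made up for illustration).
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False) as tmp:
    tmp.write('click,site,hour\n1,example.com,10\n0,example.org,11\n')

for x, y in data(tmp.name, D=2 ** 20):
    print(y, x)  # label, then the hashed one-hot indices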
def iter_duplicates(self, duplicates_dir):
    tobevalidated = []
    if os.path.isdir(duplicates_dir):
        for path, dirs, files in walk(duplicates_dir):
            for filename in files:
                filepath = joinpath(path, filename)
                if os.path.isfile(filepath):
                    filesize = stat(filepath).st_size
                    filehash = ''  # guard against a failed read below
                    try:
                        with open(filepath, 'rb') as f:
                            filehash = xxhash.xxh64(f.read()).hexdigest()
                    except OSError:
                        print("Didn't like this file: ", filepath)
                    if filehash != '':
                        tobevalidated.append([{
                            'filehash': filehash,
                            'path': filepath
                        }])
    return tobevalidated
def store(val, srcID):
    # if srcID not seen yet, make new entry in srcIDs
    if srcID not in srcIDs:
        srcIDs[srcID] = []
    key = xxhash.xxh64(val).intdigest() & 0xffff
    # remove expired keys in srcID's list
    srcIDs[srcID] = [k for k in srcIDs[srcID] if data[k][2] + 300 >= time.time()]
    # if over the limit, discard the store
    if len(srcIDs[srcID]) >= srcLimit:
        return
    # if key not yet stored
    if key not in data:
        data[key] = (val, srcID, time.time())
        srcIDs[srcID].append(key)
    # check for key expiration
    elif data[key][2] + 300 < time.time():
        srcIDs[data[key][1]].remove(key)
        data[key] = (val, srcID, time.time())
        srcIDs[srcID].append(key)
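# Sketch of the module-level state store() relies on (names taken from the
# body above; the limit value is an assumption). Entries expire after 300 s
# and each source may hold at most srcLimit keys at a time.
data = {}      # key -> (value, srcID, stored_at)
srcIDs = {}    # srcID -> list of keys stored by that source
srcLimit = 16

store(b'some payload', 'node-a')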
def generate_content_hashes(self, content=None):
    """
    Generate a dictionary which maps parl_ids to their respective hashes

    Used for speedy comparison of changes
    """
    if not content:
        es_response = json.loads(self.get_content())
    else:
        try:
            es_response = json.loads(content)
        except ValueError:
            # supplied content was not valid JSON; fall back to a fresh fetch
            es_response = json.loads(self.get_content())
    content_hashes = {}
    for res in es_response['result']:
        content_hashes[res['parl_id']] = xxhash.xxh64(
            json.dumps(res)).hexdigest()
    return json.dumps(content_hashes)
def hash_file2(fpath, blocksize=65536, hasher='xx64'):
    r"""
    Hashes the data in a file on disk using xxHash

    xxHash is much faster than sha1, bringing computation time down from
    .57 seconds to .12 seconds for a 387M file.

        my_weights_fpath_ = ub.truepath('~/tmp/my_weights.pt')
        xdata = 2 ** np.array([8, 12, 14, 16])
        ydatas = ub.ddict(list)
        for blocksize in xdata:
            print('blocksize = {!r}'.format(blocksize))
            ydatas['sha1'].append(ub.Timerit(2).call(ub.hash_file, my_weights_fpath_, hasher='sha1', blocksize=blocksize).min())
            ydatas['sha256'].append(ub.Timerit(2).call(ub.hash_file, my_weights_fpath_, hasher='sha256', blocksize=blocksize).min())
            ydatas['sha512'].append(ub.Timerit(2).call(ub.hash_file, my_weights_fpath_, hasher='sha512', blocksize=blocksize).min())
            ydatas['md5'].append(ub.Timerit(2).call(ub.hash_file, my_weights_fpath_, hasher='md5', blocksize=blocksize).min())
            ydatas['xx32'].append(ub.Timerit(2).call(hash_file2, my_weights_fpath_, hasher='xx32', blocksize=blocksize).min())
            ydatas['xx64'].append(ub.Timerit(2).call(hash_file2, my_weights_fpath_, hasher='xx64', blocksize=blocksize).min())
        import netharn as nh
        nh.util.qtensure()
        nh.util.multi_plot(xdata, ydatas)
    """
    import xxhash
    if hasher == 'xx32':
        hasher = xxhash.xxh32()
    elif hasher == 'xx64':
        hasher = xxhash.xxh64()
    with open(fpath, 'rb') as file:
        buf = file.read(blocksize)
        # hash the file contents in blocksize chunks
        while len(buf) > 0:
            hasher.update(buf)
            buf = file.read(blocksize)
    # Get the hashed representation
    text = ub.util_hash._digest_hasher(hasher, hashlen=None,
                                       base=ub.util_hash.DEFAULT_ALPHABET)
    return text
def handle(self, *args, **kwargs):
    """Handle the command call."""
    hashDictionary = {}
    collisionCount = 0
    for reaction in Reaction.objects.all():
        reactantString = ''
        h = xxhash.xxh64()
        for reactant in reaction.compounds:
            h.update(reactant.abbrev)
            reactantString += reactant.abbrev
        # Key on the digest, not the hasher object, so equal reactant
        # strings map to the same entry
        digest = h.hexdigest()
        if digest in hashDictionary:
            if hashDictionary[digest] != reactantString:
                collisionCount += 1
        else:
            hashDictionary[digest] = reactantString
    if collisionCount > 0:
        e = EmailToAdmins(
            'Dark Reactions Project: Hash Collision Failure',
            'A collision between reaction space hashes has occurred. Please '
            'contact the DRP development team and file a bug report.')
        e.send()
        exit(1)
def _encValKey(v):
    '''
    Encode a value as used in a key.

    Non-negative numbers are msgpack encoded.  Negative numbers are encoded
    as a marker, then the encoded negative of that value, so that the
    ordering of the encodings is easily mapped to the ordering of the
    negative numbers.  Strings too long are hashed.  Note that this scheme
    prevents interleaving of value types: all string encodings compare
    larger than all negative number encodings compare larger than all
    nonnegative encodings.
    '''
    if isinstance(v, int):
        if v >= 0:
            return s_msgpack.en(v)
        else:
            return NEGATIVE_VAL_MARKER_ENC + s_msgpack.en(-v)
    else:
        if len(v) >= LARGE_STRING_SIZE:
            return (HASH_VAL_MARKER_ENC +
                    s_msgpack.en(xxhash.xxh64(v).intdigest()))
        else:
            return STRING_VAL_MARKER_ENC + s_msgpack.en(v)
def hash_file(path):
    """
    Calculates hash for the file of given path

    Parameters
    ----------
    path: string
        Path to file to be hashed

    Returns
    -------
    Hash digest calculated for given file
    """
    if path is None or not os.path.isfile(path):
        return None
    with open(path, 'rb') as fo:
        return xxhash.xxh64(fo.read()).hexdigest()
def validate(data_dst, data):
    match = False
    try:
        dst_checksum = data_dst.attrs["checksum"]
    except KeyError:
        # checksum does not exist, since it is only updated when dump was
        # completed
        logger.warning(f'"{data_dst.path}" contains partial dump, rewrite')
    else:
        src_checksum = xxhash.xxh64(data.compute()).hexdigest()
        if dst_checksum == src_checksum:
            match = True
        else:
            # checksum mismatch, reset
            logger.warning(f'"{data_dst.path}" does not match the source')
            del data_dst.attrs["checksum"]
    return match, (data_dst, data)
def getHashSum(file_path):
    hashsums = {}
    result = {}
    hashsums['xxh32'] = xxhash.xxh32()
    hashsums['xxh64'] = xxhash.xxh64()
    hashsums['md5'] = hashlib.md5()
    with open(file_path, 'rb') as f:
        while True:
            chunk = f.read(64 * 1024)
            if len(chunk):
                for key in hashsums.keys():
                    hashsums[key].update(chunk)
            else:
                break
    for key, value in hashsums.items():
        result[key] = value.hexdigest()
    return result
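# Usage sketch: one read pass yields xxh32, xxh64, and md5 digests together
# (the path is illustrative).
for algo, digest in sorted(getHashSum('/etc/hostname').items()):
    print(algo, digest)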
def _digestAndWrite(self, myqueue, topath):
    digest = xxhash.xxh64()
    try:
        os.makedirs(os.path.dirname(topath))
    except OSError:
        pass  # destination directory already exists
    finally:
        with open(topath, 'w') as f:
            while True:
                item = myqueue.get()
                if isinstance(item, exitcode):
                    myqueue.put(digest.hexdigest())
                    sys.stdout.write(self.greencolor + '[COPY PASS]'.rjust(16))
                    sys.stdout.flush()
                    break
                digest.update(item)
                f.write(item)
def write_content(content, **kwargs):
    h64 = xxhash.xxh64(content).hexdigest()
    filepath = 'trnews-data/' + h64[:2] + '/' + h64[2:4] + '/' + h64
    d = os.path.dirname(filepath)
    if not os.path.isdir(d):
        os.makedirs(d)
    elif os.path.exists(filepath):
        return filepath, False
    with open(filepath + ".meta", "wb") as fp:
        fp.write(yaml.safe_dump(kwargs, default_flow_style=False,
                                allow_unicode=True, indent=2, encoding="utf-8"))
    with gzip.open(filepath, "wb") as fp:
        fp.write(content)
    return filepath, True
def constrained(pattern: Union[str, re.Pattern]) -> Type[SymbolName]:
    """Create a new SymbolName subclass using the provided string as validation RE."""
    if isinstance(pattern, re.Pattern):
        regex = pattern
        pattern = pattern.pattern
    else:
        try:
            regex = re.compile(pattern)
        except re.error as e:
            raise TypeError(
                f"Invalid regular expression definition: '{pattern}'."
            ) from e
    assert isinstance(pattern, str)
    xxh64 = xxhash.xxh64()
    xxh64.update(pattern.encode())
    subclass_name = f"SymbolName_{xxh64.hexdigest()[-8:]}"
    namespace = dict(regex=regex)
    return type(subclass_name, (SymbolName,), namespace)
def _calculate_asset_hash(asset_file, dev_mode):
    """
    1. calculate the hash of asset file, use the hash as version number
       to control (maximize) the HTTP cache.
    2. the hash value will be cached in memory until the python app server
       restarted.
    3. only process text asset file (js and css), no binary file
       (img, fonts) processed. ##Todo##
    """
    if dev_mode:
        return random.random()
    hash = _asset_hash_cache_.get(asset_file)
    if not hash:
        file = os.path.join(os.path.dirname(__file__),
                            *[x for x in asset_file.split('/')])
        if os.path.isfile(file):
            with open(file, 'r', encoding='utf-8') as f:
                data = f.read()
                hash = xxhash.xxh64(data).hexdigest()
                _asset_hash_cache_[asset_file] = hash
    return hash
def build_recursive_tree(tree, base, depth, width):
    """
    Args:
        tree: Tree
        base: Node
        depth: int
        width: int
    """
    if depth >= 0:
        depth -= 1
        for _ in range(width):
            directory = Directory(None)
            # use the name's digest as a stable node identifier; formatting
            # the hasher object itself would only capture its repr
            tree.create_node(
                "{0}".format(directory.name),
                "{0}".format(xxhash.xxh64(directory.name).hexdigest()),
                parent=base.identifier,
                data=directory)
        dirs_nodes = tree.children(base.identifier)
        for dir_node in dirs_nodes:
            newbase = tree.get_node(dir_node.identifier)
            build_recursive_tree(tree, newbase, depth, width)
    else:
        return
def parse(self, response):
    soup = BeautifulSoup(response.body)
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    if text:
        response.meta.update(
            content_hash=xxhash.xxh64(text.encode('ascii', 'ignore')).intdigest())
        try:
            langid = detect(text)
        except LangDetectException:
            return
        if langid == 'en':
            tagged = filter(lambda x: x[2] >= 0.99,
                            tag_locations(MySpider.geo_names, text))
            gid_count = collections.Counter(gid for name, gid, score in tagged)
            score = scorer(float(sum(gid_count.values())) / float(len(text)))
            response.meta.update(score=score)
            for link in self.link_extractor.extract_links(response):
                request = Request(url=link.url)
                request.meta.update(link_text=link.text)
                request.meta.update(score=score)
                yield request
            date = datetime.datetime.now()
            for gid, count in gid_count.items():
                yield LocationsItem(date=date, geoname_id=gid, count=count)
        else:
            response.meta.update(score=0)
def _hash_image(image):
    """
    Two hash variants are possible:
    - if imgui_cv.USE_FAST_HASH is True: select 100 random pixels and hash them
    - otherwise: compute the hash of the whole image (using xxhash for performance)
    :param image:
    :return: hash
    """
    if USE_FAST_HASH:
        rng = np.random.RandomState(89)
        inds = rng.randint(low=0, high=image.size, size=100)
        b = image.flat[inds]
        result = hash(tuple(b.data))
        return result
    else:
        # cf https://stackoverflow.com/questions/16589791/most-efficient-property-to-hash-for-numpy-array
        h = xxhash.xxh64()
        h.update(image)
        result = h.intdigest()
        h.reset()
        return result
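# Comparison sketch for _hash_image(); USE_FAST_HASH is module state assumed
# by the function above, pinned here so the xxhash path is exercised.
import numpy as np

USE_FAST_HASH = False
img = np.zeros((64, 64, 3), dtype=np.uint8)
print(_hash_image(img))  # full-image xxh64 intdigest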
def parse_binary_view(self):
    bv_hash = xxhash.xxh64()
    bv_hash.update(self.bv.file.filename)
    bv_object = BinaryView.BinaryViewNode(self.bv, str(bv_hash.hexdigest()),
                                          parent_uuid='0',
                                          parent_node_label='RootNode')
    # Update node list
    self.node_list.append(bv_object)
    func_index = 0
    # Iterate all functions
    for function in self.bv.functions:
        # NOTE: THE FOLLOWING CODE IS NOT THREAD SAFE~!!!!!!!!!!!!
        # Each function needs its own basic block cache
        self.basic_block_cache = dict()
        self.parse_function(function.mlil, bv_object, str(func_index))
        func_index += 1
    self.run_post_processing()
    return self.node_list
def ducos1xxh(lastBlockHash, expectedHash, difficulty, efficiency):
    # XXHASH algorithm
    # Measure starting time
    timeStart = time()
    # Loop from 0 to 100*diff
    for ducos1xxres in range(100 * int(difficulty) + 1):
        # If efficiency is lower than 100% sleep to use less CPU
        if ducos1xxres % 1000000 == 0 and float(100 - efficiency * 100) < 100:
            sleep(float(efficiency))
        # Generate hash
        ducos1xx = xxhash.xxh64(str(lastBlockHash) + str(ducos1xxres), seed=2811)
        ducos1xx = ducos1xx.hexdigest()
        # Check if result was found
        if ducos1xx == expectedHash:
            # Measure finish time
            timeStop = time()
            # Calculate hashrate
            timeDelta = timeStop - timeStart
            hashrate = ducos1xxres / timeDelta
            return [ducos1xxres, hashrate]
def test_hash_file(fs):
    """
    Test to hash a file

    Cases
    -----
    - Not existing file (should return None)
    - Existing file (should return Digest)
    - None (should return None)
    - Pass directory (should return None)
    """
    # prepare file system
    fs.create_file('/phonyDir/testfile', contents='test')
    # Not existing file (should return None)
    assert DupFinder.fs.hash_file('notexisting.txt') is None
    # Existing file (should return Digest)
    assert DupFinder.fs.hash_file('/phonyDir/testfile') == xxhash.xxh64(
        'test').hexdigest()
    # None (should return None)
    assert DupFinder.fs.hash_file(None) is None
    # Pass directory (should return None)
    assert DupFinder.fs.hash_file('/phonyDir') is None
    assert DupFinder.fs.hash_file('/phonyDir/') is None
def test_key_s3_incomprehensible_range_start(self):
    """
    Check that the key is constructed as we expect
    """
    import common.tztools

    job_scope = JobScope(
        ad_account_id=gen_string_id(),
        report_type=ReportType.day_platform,
        report_variant=Entity.Campaign,
        range_start='blah-blah',
    )

    # even though range_start is provided ^ above, it's not date-like and we
    # should be ok with that and just fall back to datetime.utcnow()
    now_dt = datetime(2000, 1, 2, 3, 4, 5)
    with mock.patch.object(common.tztools, 'now', return_value=now_dt) as now_mocked, mock.patch.object(
        uuid, 'uuid4', return_value='UUID-HERE'
    ):
        storage_key = cold_storage.store({'data': 'yeah!'}, job_scope)

    assert now_mocked.called

    prefix = xxhash.xxh64(job_scope.ad_account_id).hexdigest()[:6]
    expected_key = (
        f'fb/'
        f'{prefix}-{job_scope.ad_account_id}/'
        f'{job_scope.report_type}/'
        f'{now_dt.strftime("%Y")}/'
        f'{now_dt.strftime("%m")}/'
        f'{now_dt.strftime("%d")}/'
        f'{now_dt.strftime("%Y-%m-%dT%H:%M:%SZ")}-'
        f'{job_scope.job_id}-'
        f'UUID-HERE'
        f'.json'
    )
    assert storage_key == expected_key
def touch_success(logger, incoming_message, dir_tree):
    logger.debug(f"Successful touch arrived {incoming_message['target']}")
    path = incoming_message['target'].split('/')[1:]  # folder:file
    syncdir = dir_tree.get_dir_by_name(path[0])
    dir_index = xxhash.xxh64(path[0]).hexdigest()
    if not syncdir:
        logger.debug(
            f"Directory {path[0]} already removed from active dirs list, dropping touch {path[1]}"
        )
        return
    # There might be a race when a successful mkdir message arrives after a
    # successful touch message, so we won't check here if dir is already synced
    f = syncdir.data.get_file_by_name(path[1])
    # Now, when we got reply from client that file was created,
    # we can mark it as synced
    syncdir.data.size += 1
    f.ondisk = True
    f.creation_time = datetime.datetime.strptime(incoming_message['timestamp'],
                                                 '%Y/%m/%d %H:%M:%S.%f')
    # Unique session ID, will be modified on each file modify action
    f.uuid = uuid.uuid4().hex[-5:]
    logger.debug(f"File {path[0]}/{path[1]} was created at: {f.creation_time}")
    logger.debug(
        f"File {path[0]}/{path[1]} is synced. Directory size updated to {syncdir.data.size} bytes"
    )
    if syncdir.data.size > MAX_FILES_PER_DIR:
        try:
            logger.debug(f"Directory {path[0]} going to be removed from dir tree")
            dir_tree.remove_dir_by_name(path[0])
            del dir_tree.synced_nodes[dir_index]
            del dir_tree.nids[dir_index]
            logger.debug(
                f"Directory {path[0]} reached its size limit and was removed from active dirs list"
            )
        except (NodeIDAbsentError, KeyError):
            logger.debug(
                f"Directory {path[0]} already removed from active dirs list, skipping...."
            )
def test_xxh64_update(self):
    x = xxhash.xxh64()
    x.update('a')
    self.assertEqual(xxhash.xxh64('a').digest(), x.digest())
    self.assertEqual(xxhash.xxh64_digest('a'), x.digest())
    x.update('b')
    self.assertEqual(xxhash.xxh64('ab').digest(), x.digest())
    self.assertEqual(xxhash.xxh64_digest('ab'), x.digest())
    x.update('c')
    self.assertEqual(xxhash.xxh64('abc').digest(), x.digest())
    self.assertEqual(xxhash.xxh64_digest('abc'), x.digest())

    seed = random.randint(0, 2**64)
    x = xxhash.xxh64(seed=seed)
    x.update('a')
    self.assertEqual(xxhash.xxh64('a', seed).digest(), x.digest())
    self.assertEqual(xxhash.xxh64_digest('a', seed), x.digest())
    x.update('b')
    self.assertEqual(xxhash.xxh64('ab', seed).digest(), x.digest())
    self.assertEqual(xxhash.xxh64_digest('ab', seed), x.digest())
    x.update('c')
    self.assertEqual(xxhash.xxh64('abc', seed).digest(), x.digest())
    self.assertEqual(xxhash.xxh64_digest('abc', seed), x.digest())
def _job_scope_to_storage_key(
        job_scope: JobScope,
        chunk_marker: Optional[int] = DEFAULT_CHUNK_NUMBER,
        custom_namespace: Optional[str] = None) -> str:
    """
    Puts together the S3 object key we need for given report data. This is
    just a helper function.

    :param job_scope: The job scope (dict representation)
    :param chunk_marker: Order number of written chunk
    :param custom_namespace: Custom job namespace
    :return string: The full S3 key to use
    """
    assert isinstance(job_scope, JobScope)
    prefix = xxhash.xxh64(job_scope.ad_account_id).hexdigest()[:6]

    # datetime is a subclass of date, so we must check for date first
    if isinstance(job_scope.range_start, date):
        report_datetime = datetime.combine(job_scope.range_start,
                                           datetime.min.time())
    elif isinstance(job_scope.range_start, datetime):
        report_datetime = job_scope.range_start
    else:
        # long import line to allow mocking of call to now() in tests.
        report_datetime = common.tztools.now()

    key = (f'{custom_namespace or job_scope.namespace}/'
           f'{prefix}-{job_scope.ad_account_id}/'
           f'{job_scope.report_type}/'
           f'{report_datetime.strftime("%Y")}/'
           f'{report_datetime.strftime("%m")}/'
           f'{report_datetime.strftime("%d")}/'
           f'{report_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")}-'
           f'{job_scope.job_id}-'
           f'{str(chunk_marker)+"-" if chunk_marker else ""}'
           f'{uuid.uuid4()}'
           f'.json')
    return key
def import_opml(user_id, path):
    _opml = opml.parse(path)
    uncategorized = None
    for outline in _opml:
        if hasattr(outline, 'xmlUrl'):
            if uncategorized is None:
                # not looked up yet
                uncategorized = Category.query.filter_by(
                    user_id=user_id, name="Uncategorized").first()
                if uncategorized is None:
                    # not found
                    uncategorized = Category(user_id, "Uncategorized", order_id=9999)
                    uncategorized.save()
            feed = Feed(outline.xmlUrl)
            feed.save()
            user_feed = UserFeed(user_id, uncategorized.id, feed.id, outline.text)
            user_feed.save()
        else:
            category = Category.query.filter_by(user_id=user_id,
                                                name=outline.text).first()
            if category is None:
                category = Category(user_id, outline.text)
                category.save()
            for child in outline:
                if hasattr(child, 'xmlUrl'):
                    # look up an existing feed by the digest of its URL; the
                    # original compared against an empty, unused hasher object
                    hash = xxhash.xxh64(child.xmlUrl).hexdigest()
                    feed = Feed.query.filter_by(feed_url_hash=hash).first()
                    if feed is None:
                        feed = Feed(child.xmlUrl)
                        feed.save()
                    user_feed = UserFeed(user_id=user_id,
                                         category_id=category.id,
                                         feed_id=feed.id,
                                         feed_name=child.text)
                    user_feed.save()
                else:
                    logger.warn("Nested category is not supported yet, ignored!")
def hashdirectory(self, directory, map):
    hashfunc = xxhash.xxh32()
    for file in os.listdir(directory):
        if os.path.isdir(os.path.join(directory, file)):
            # print(os.path.join(directory, file))
            key = self.hashdirectory(os.path.join(directory, file), map)
            if key in map:
                map[key] = map[key] + "?" + os.path.join(directory, file)
            else:
                map[key] = os.path.join(directory, file)
            hashfunc.update(key)
        if os.path.isfile(os.path.join(directory, file)):
            hf = xxhash.xxh64()
            f = open(os.path.join(directory, file), 'rb').read()
            byts = bytes(f)
            # mem = memoryview(byts)
            buffersize = 1048576
            bytesize = sys.getsizeof(byts)
            self.ldb.pgb.step(bytesize / 1024)
            if bytesize - buffersize > 0:
                # hash large files in 1 MiB chunks
                for i in range(0, bytesize - buffersize, buffersize):
                    if bytesize - i > buffersize:
                        hf.update(byts[i:(i + buffersize)])
                    else:
                        hf.update(byts[i:])
            else:
                hf.update(byts[0:])
            key = hf.digest()
            if key in map:
                map[key] = map[key] + "?" + os.path.join(directory, file)
            else:
                map[key] = os.path.join(directory, file)
            hashfunc.update(key)
    key = hashfunc.digest()
    return key
def _hash_xxhash(buf):
    """
    Produce an 8-byte hash of *buf* using xxHash.
    """
    return xxhash.xxh64(buf).digest()
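# xxh64 digests are always 8 bytes, which is what makes them usable as
# fixed-width keys.
assert len(_hash_xxhash(b'payload')) == 8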
def __init__(self, *args, **kwargs):
    super(Topic, self).__init__(*args, **kwargs)
    self.raw_content_hash = xxhash.xxh64(self.content_raw).hexdigest()
def generateRequestHash(authticket, request):
    firstHash = xxhash.xxh64(authticket, seed=0x1B845238).intdigest()
    return xxhash.xxh64(request, seed=firstHash).intdigest()
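# Usage sketch with raw byte strings standing in for the serialized
# protobuf messages the function expects (the values are illustrative).
ticket = b'\x08\x01\x12\x04auth'
request = b'\x0a\x07request'
print(generateRequestHash(ticket, request))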