def fetch(self, fingerprints):
    """Load the states of fingerprints missing from ``self._cache``.

    Fingerprints already present in the cache are skipped. The remaining
    ones are looked up through ``self.session`` in batches of 128 and each
    returned row's ``state`` is stored in the cache under
    ``str(row.fingerprint)``.

    :param fingerprints: sized collection of fingerprints to make available
        in the cache.
    :return: None; the cache is updated in place.
    """
    missing = [fp for fp in fingerprints if fp not in self._cache]
    self.logger.debug("cache size %s", len(self._cache))
    self.logger.debug("to fetch %d from %d", len(missing), len(fingerprints))
    for batch in chunks(missing, 128):
        # One IN (...) query per batch keeps the statement size bounded.
        query = self.session.query(self.model)
        matching_rows = query.filter(self.model.fingerprint.in_(batch))
        for row in matching_rows:
            cache_key = str(row.fingerprint)
            self._cache[cache_key] = row.state
def fetch(self, fingerprints):
    """Load the states of fingerprints missing from ``self._cache``.

    Fingerprints already present in the cache are skipped. The remaining
    ones are looked up via ``self.model.objects.filter`` (restricted to
    ``self.crawl_id``) in batches of 128, and each returned row's ``state``
    is stored in the cache under its ``fingerprint``.

    :param fingerprints: sized collection of fingerprints to make available
        in the cache.
    :return: None; the cache is updated in place.
    """
    to_fetch = [f for f in fingerprints if f not in self._cache]
    self.logger.debug("cache size %s", len(self._cache))
    # The counts must be separate positional arguments: a single tuple
    # argument leaves the second %d placeholder without a value and makes
    # the record fail to format.
    self.logger.debug("to fetch %d from %d", len(to_fetch), len(fingerprints))
    for chunk in chunks(to_fetch, 128):
        rows = self.model.objects.filter(crawl=self.crawl_id, fingerprint__in=chunk)
        for state in rows:
            self._cache[state.fingerprint] = state.state
def fetch(self, fingerprints):
    """Load the states of fingerprints missing from ``self._state_cache``.

    Fingerprints already present in the cache are skipped. The remaining
    ones are unhexlified into row keys and read from the table named
    ``self._table_name`` in batches of 65536, requesting only the
    ``s:state`` column. Rows that carry that column have its value decoded
    as a single unsigned byte and stored in the cache under the hexlified
    row key.

    :param fingerprints: sized collection of hex-encoded fingerprints.
    :return: None; the cache is updated in place.
    """
    to_fetch = [f for f in fingerprints if f not in self._state_cache]
    # Lazy logger arguments: the message is only formatted when DEBUG is
    # actually enabled, and the rendered text is unchanged.
    self.logger.debug("cache size %s", len(self._state_cache))
    self.logger.debug("to fetch %d from %d", len(to_fetch), len(fingerprints))
    for chunk in chunks(to_fetch, 65536):
        keys = [unhexlify(fprint) for fprint in chunk]
        table = self.connection.table(self._table_name)
        records = table.rows(keys, columns=[b's:state'])
        for key, cells in records:
            # Rows without the state column are left out of the cache.
            if b's:state' in cells:
                state = unpack('>B', cells[b's:state'])[0]
                self._state_cache[hexlify(key)] = state
def flush(self, force_clear):
    """Write every cached state to the table and optionally empty the cache.

    All ``(fingerprint, state)`` pairs in ``self._state_cache`` are written
    in batches of 32768, each batch inside its own
    ``table.batch(transaction=True)`` context. The cache is cleared
    afterwards when ``force_clear`` is true or when the cache holds more
    than ``self._cache_size_limit`` entries.

    :param force_clear: when true, clear the cache after writing regardless
        of its size.
    :return: None.
    """
    # An oversized cache is always cleared, whatever the caller asked for.
    if len(self._state_cache) > self._cache_size_limit:
        force_clear = True
    table = self.connection.table(self._table_name)
    for chunk in chunks(list(self._state_cache.items()), 32768):
        with table.batch(transaction=True) as b:
            for fprint, state in chunk:
                hb_obj = prepare_hbase_object(state=state)
                b.put(unhexlify(fprint), hb_obj)
    if force_clear:
        # Lazy logger arguments: formatting is skipped when DEBUG is off.
        self.logger.debug("Cache has %d requests, clearing", len(self._state_cache))
        self._state_cache.clear()
def fetch(self, fingerprints):
    """Populate ``self._state_cache`` with states for uncached fingerprints.

    Returns immediately when every requested fingerprint is already cached.
    Otherwise the missing fingerprints are unhexlified into row keys and
    read from the table named ``self._table_name`` in batches of 65536,
    requesting only the ``s:state`` column. Each row carrying that column
    has its value decoded as one unsigned byte and cached under the
    hexlified row key.

    :param fingerprints: sized collection of hex-encoded fingerprints.
    :return: None; the cache is updated in place.
    """
    missing = [fp for fp in fingerprints if fp not in self._state_cache]
    if not missing:
        return

    self.logger.debug(
        'Fetching %d/%d elements from HBase (cache size %d)',
        len(missing),
        len(fingerprints),
        len(self._state_cache),
    )

    for batch in chunks(missing, 65536):
        row_keys = [unhexlify(fp) for fp in batch]
        table = self.connection.table(self._table_name)
        for row_key, cells in table.rows(row_keys, columns=[b's:state']):
            # Rows without the state column are left out of the cache.
            if b's:state' not in cells:
                continue
            raw_state = cells[b's:state']
            self._state_cache[hexlify(row_key)] = unpack('>B', raw_state)[0]
def test_non_multiple_length(self):
    """Eight items in chunks of three end with a shorter final chunk."""
    items = [1, 2, 3, 4, 5, 6, 7, 8]
    expected = [[1, 2, 3], [4, 5, 6], [7, 8]]
    result = list(chunks(items, 3))
    assert result == expected
def test_multiple_length(self):
    """Six items in chunks of two split into three full chunks."""
    items = [1, 2, 3, 4, 5, 6]
    expected = [[1, 2], [3, 4], [5, 6]]
    result = list(chunks(items, 2))
    assert result == expected
def test_empty_list(self):
    """Chunking an empty list yields no chunks at all."""
    result = list(chunks([], 1))
    expected = []
    assert result == expected