def add_urls_to_index(self, stream, params, filename, length):
    """Index the incoming WARC stream and update Redis size accounting.

    Wraps *stream* in a SizeTrackingReader when an upload id
    ('param.upid') is present, delegates the actual CDX indexing to the
    superclass, then, in a single Redis pipeline:
      * increments the 'size' field of every size-tracking hash;
      * stamps 'updated_at' on the recording-info hash when new CDX
        entries were produced;
      * adds *length* to the per-day usage hash for temp or registered
        users, applying the upload rate limit for temp users.

    Returns the CDX list produced by the superclass indexer.
    """
    upload_key = params.get('param.upid')
    if upload_key:
        # track bytes consumed so upload progress can be observed via redis
        stream = SizeTrackingReader(stream, length, self.redis, upload_key)

    cdx_list = (super(WebRecRedisIndexer, self).add_urls_to_index(stream, params, filename, length))

    # batch all accounting writes into one round trip
    with redis_pipeline(self.redis) as pi:
        for key_templ in self.size_keys:
            key = res_template(key_templ, params)
            pi.hincrby(key, 'size', length)

            # only touch 'updated_at' when something was actually indexed
            if key_templ == self.rec_info_key_templ and cdx_list:
                pi.hset(key, 'updated_at', str(int(time.time())))

        # write size to usage hashes, bucketed by calendar day
        # NOTE(review): datetime.now() is naive local time — confirm UTC
        # is not expected for the usage bucket key
        ts = datetime.now().date().isoformat()

        if 'param.user' in params:
            if params['param.user'].startswith(self.temp_prefix):
                # temporary (anonymous) user: separate usage hash + rate limit
                key = self.temp_usage_key

                rate_limit_key = self.get_rate_limit_key(params)
                if rate_limit_key:
                    pi.incrby(rate_limit_key, length)
                    pi.expire(rate_limit_key, self.rate_limit_ttl)
            else:
                # registered user
                key = self.user_usage_key

            if key:
                pi.hincrby(key, ts, length)

    return cdx_list
def add_urls_to_index(self, stream, params, filename, length):
    """Index the incoming WARC stream and update Redis size accounting.

    Delegates the CDX indexing to the superclass, then, inside one
    Redis pipeline, increments the 'size' field of every size-tracking
    hash, stamps 'updated_at' on the recording-info hash when new CDX
    entries were produced, and adds *length* to the per-day usage hash
    for the temp or registered user.

    Returns the CDX list produced by the superclass indexer.
    """
    cdx_list = (super(WebRecRedisIndexer, self).add_urls_to_index(stream, params, filename, length))

    # FIX: was `redis.utils.pipeline(self.redis)` — the redis-py module
    # exposes no `utils.pipeline` attribute; use the project helper
    # redis_pipeline, consistent with the sibling implementation in
    # this file.
    with redis_pipeline(self.redis) as pi:
        for key_templ in self.size_keys:
            key = res_template(key_templ, params)
            pi.hincrby(key, 'size', length)

            # only stamp 'updated_at' when something was actually indexed
            if key_templ == self.rec_info_key_templ and cdx_list:
                pi.hset(key, 'updated_at', str(int(time.time())))

        # write size to usage hashes, bucketed by calendar day
        ts = datetime.now().date().isoformat()

        if 'param.user' in params:
            if params['param.user'].startswith(self.temp_prefix):
                key = self.temp_usage_key
            else:
                key = self.user_usage_key

            if key:
                pi.hincrby(key, ts, length)

    return cdx_list
def allow_new_file(self, filename, params):
    """Decide whether a new WARC file may be opened for this recording.

    The write is permitted only when the recording's info key already
    exists in Redis; otherwise a skip message is printed and the open
    is refused.
    """
    info_key = res_template(self.info_key, params)

    if self.redis.exists(info_key):
        return True

    # recording missing — abort before any warc data is written
    print('Writing skipped, recording does not exist for ' + filename)
    return False
def _is_write_req(self, req, params):
    """Return True when this request should be recorded.

    Requires a request object with record headers and a configured
    skip-key template; a skip key set to b'1' in Redis suppresses the
    write for this request.
    """
    if req and req.rec_headers and self.skip_key_template:
        skip_key = res_template(self.skip_key_template, params)
        skip_flag = self.redis.get(skip_key)
        if skip_flag != b'1':
            return True
        print('SKIPPING REQ', params.get('url'))

    return False
def _iter_sources(self, params):
    """Yield (key, source) pairs for every Redis key matching the template.

    A pattern containing '*' triggers a keyspace scan; a literal
    pattern is used as the single key.  Keys with mount data get a
    source built from that data; unmounted keys are skipped when
    self.mounts_only is set and otherwise fall back to a per-key
    source.
    """
    pattern = res_template(self.redis_key_template, params)

    if '*' in pattern:
        keys = self.scan_keys(pattern, params)
    else:
        keys = [pattern.encode('utf-8')]

    source = None
    for raw_key, mount in zip(keys, self._get_mounts(keys)):
        name = raw_key.decode('utf-8')

        if mount:
            source = init_index_source(mount.decode('utf-8'),
                                       source_list=self.SUPPORTED_SOURCES)
        elif self.mounts_only:
            continue
        else:
            source = self._get_source_for_key(name)

        yield name, source
def create_buffer(self, params, name):
    """Build a TempWriteBuffer bound to this recording's info key and url."""
    rec_info_key = res_template(self.info_keys['rec'], params)
    return TempWriteBuffer(self.redis, rec_info_key, name, params['url'])