def _set_cached_url(self, url, data, timeout):
    cache_key = self._get_cache_key(url)

    try:
        self.redis_client.setex(cache_key, timeout, json.dumps(data))
        statsd_client.incr('redis_cache_write')
    except redis.RedisError:
        raise self.MetadataClientException('Unable to write to redis.')
def _domain_limit_urls(self, urls):
    allowed_urls = []

    for url in urls:
        domain = urlparse.urlparse(url).netloc.encode('utf8')

        if self.domain_limiter.checked_insert(domain):
            allowed_urls.append(url)
        else:
            statsd_client.incr('domain_rate_limit_exceeded')

    return allowed_urls
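# Hedged sketch of the per-domain throttle used by _domain_limit_urls above:
# rratelimit's SimpleLimiter allows `limit` checked insertions per `period`
# seconds for a given key (here, a URL's domain). The parameters mirror the
# limiter configured in MetadataClient.__init__ further down; the local redis
# connection is an assumption for illustration.
import redis
import rratelimit

redis_client = redis.StrictRedis()
domain_limiter = rratelimit.SimpleLimiter(
    redis=redis_client,
    action='domain_limit',
    limit=20,
    period=1,
)

# The first 20 checks for a domain within one second succeed; later ones
# return False, so the URL is dropped and domain_rate_limit_exceeded is bumped.
for attempt in range(25):
    print(attempt, domain_limiter.checked_insert('example.com'))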
def heartbeat():
    status = 200

    # Check cache connectivity
    try:
        current_app.redis_client.ping()
        statsd_client.incr('heartbeat.pass')
    except redis.ConnectionError:
        statsd_client.incr('heartbeat.fail')
        status = 500

    return Response('', status=status)
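# Hedged wiring sketch for the heartbeat view above. The route path and app
# setup are illustrative assumptions, not the project's actual app factory;
# the view reads redis_client off the application object via current_app.
from flask import Flask

app = Flask(__name__)
app.redis_client = redis_client  # reuse the connection from the sketch above
app.add_url_rule('/__heartbeat__', 'heartbeat', heartbeat)

with app.test_client() as test_client:
    # 200 when redis answers PING, 500 when the connection fails.
    print(test_client.get('/__heartbeat__').status_code)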
def fetch_recommended_urls(start_time, redis_client=None):
    import time

    from proxy.app import get_pocket_client
    from proxy.stats import statsd_client

    statsd_client.incr('task_fetch_recommended_start')

    pocket_client = get_pocket_client(redis_client=redis_client)
    pocket_client.fetch_recommended_urls()

    job_time = int((time.time() - start_time) * 1000)
    statsd_client.timing('task_fetch_recommended_time', job_time)
def fetch_mozilla_data(urls, start_time, redis_client=None):
    import time

    from proxy.app import get_mozilla_client
    from proxy.stats import statsd_client

    statsd_client.incr('task_fetch_mozilla_start')

    mozilla_client = get_mozilla_client(redis_client=redis_client)
    url_data = mozilla_client.get_remote_urls(urls)

    statsd_client.gauge('task_fetch_mozilla_cached', len(url_data.keys()))

    job_time = int((time.time() - start_time) * 1000)
    statsd_client.timing('task_fetch_mozilla_time', job_time)
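# Hedged sketch of how the two RQ tasks above get queued: the web process
# enqueues them on a Redis-backed rq queue and a separate rq worker executes
# them. The queue name and TTL values are assumptions; redis_client is reused
# from the limiter sketch above. The argument lists mirror the enqueue calls
# made by PocketClient and MetadataClient below.
import time

from rq import Queue

job_queue = Queue('proxy', connection=redis_client)

job_queue.enqueue(fetch_recommended_urls, time.time(), ttl=300, at_front=True)
job_queue.enqueue(
    fetch_mozilla_data,
    ['https://example.com/article'],
    time.time(),
    ttl=300,
    at_front=True,
)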
class PocketClient(object):

    class PocketException(Exception):
        pass

    def __init__(self, pocket_url, redis_client, redis_data_timeout,
                 job_queue, job_ttl):
        self.pocket_url = pocket_url
        self.redis_client = redis_client
        self.redis_key = 'POCKET_RECOMMENDED_URLS'
        self.redis_in_flight_value = 'JOB_IN_FLIGHT'
        self.redis_data_timeout = redis_data_timeout
        self.job_queue = job_queue
        self.job_ttl = job_ttl

    def fetch_recommended_urls(self):
        with statsd_client.timer('pocket_request_timer'):
            try:
                response = requests.get(self.pocket_url)
            except requests.RequestException as e:
                raise self.PocketException(
                    ('Unable to communicate '
                     'with pocket: {error}').format(error=e))

        if response.status_code != 200:
            statsd_client.incr('pocket_request_failure')
            raise self.PocketException(
                ('Error status returned from '
                 'pocket: {error_code} {error_message}').format(
                    error_code=response.status_code,
                    error_message=response.content,
                ))

        statsd_client.incr('pocket_request_success')

        pocket_data = []

        if response is not None:
            try:
                pocket_data = json.loads(response.content)
            except (TypeError, ValueError) as e:
                statsd_client.incr('pocket_parse_failure')
                raise self.PocketException(
                    ('Unable to parse the JSON '
                     'response from pocket: {error}').format(error=e))
def _get_cached_url(self, url):
    cache_key = self._get_cache_key(url)

    try:
        cached_data = self.redis_client.get(cache_key)
    except redis.RedisError:
        raise self.MetadataClientException('Unable to read from redis.')

    if cached_data is not None:
        statsd_client.incr('redis_cache_hit')

        try:
            return json.loads(cached_data)
        except ValueError:
            raise self.MetadataClientException(
                ('Unable to load JSON data '
                 'from cache for key: {key}').format(key=cache_key))
    else:
        statsd_client.incr('redis_cache_miss')
def extract_urls_async(self, urls):
    all_cached_url_data = self.get_cached_urls(urls)

    if self.IN_JOB_QUEUE in all_cached_url_data.values():
        statsd_client.incr('request_in_job_queue')

    cached_url_data = {
        url: url_data
        for (url, url_data) in all_cached_url_data.items()
        if url_data != self.IN_JOB_QUEUE
    }

    uncached_urls = set(urls) - set(all_cached_url_data.keys())

    if uncached_urls:
        allowed_urls = self._domain_limit_urls(uncached_urls)
        self._queue_url_jobs(allowed_urls)

    return cached_url_data
def _queue_url_jobs(self, urls):
    batched_urls = group_by(list(urls), self.url_batch_size)

    for url_batch in batched_urls:
        try:
            self.job_queue.enqueue(
                self.TASK,
                url_batch,
                time.time(),
                ttl=self.job_ttl,
                at_front=True,
            )

            statsd_client.gauge('request_fetch_job_create', len(url_batch))
            statsd_client.gauge(
                'request_fetch_job_queue_size', self.job_queue.count)

            for queued_url in url_batch:
                self._set_cached_url(
                    queued_url, self.IN_JOB_QUEUE, self.redis_job_timeout)
        except Exception:
            statsd_client.incr('request_fetch_job_create_fail')
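# `group_by` above is the project's batching helper. A minimal stand-in with
# the assumed semantics (split a list into consecutive chunks of at most
# `size` items) might look like this; the real helper may differ.
def group_by(items, size):
    return [items[i:i + size] for i in range(0, len(items), size)]

# e.g. group_by(['a', 'b', 'c', 'd', 'e'], 2) -> [['a', 'b'], ['c', 'd'], ['e']]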
def get_recommended_urls(self):
    try:
        recommended_urls = self.redis_client.get(self.redis_key)
    except redis.RedisError:
        raise self.PocketException('Unable to read from redis.')

    if recommended_urls is None:
        recommended_urls = []
        statsd_client.incr('redis_recommended_cache_miss')

        try:
            self.job_queue.enqueue(
                fetch_recommended_urls,
                time.time(),
                ttl=self.job_ttl,
                at_front=True,
            )
        except Exception:
            statsd_client.incr('request_recommended_job_create_fail')
            raise self.PocketException(
                'Unable to start the pocket fetch job.')

        statsd_client.incr('request_recommended_job_create')

        try:
            self.redis_client.setex(
                self.redis_key, self.job_ttl, self.redis_in_flight_value)
        except redis.RedisError:
            raise self.PocketException('Unable to write to redis.')
    else:
        statsd_client.incr('redis_recommended_cache_hit')

        try:
            recommended_urls = json.loads(recommended_urls)
        except ValueError:
            raise self.PocketException(
                ('Unable to load JSON data '
                 'from cache for key: {key}').format(key=self.redis_key))

    return recommended_urls
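# Hedged construction sketch for PocketClient, reusing redis_client and
# job_queue from the sketches above; the pocket_url and timeout values are
# placeholder assumptions.
pocket_client = PocketClient(
    pocket_url='https://pocket.example/recommended',  # assumed endpoint
    redis_client=redis_client,
    redis_data_timeout=3600,
    job_queue=job_queue,
    job_ttl=300,
)

recommended = pocket_client.get_recommended_urls()
# Cold cache: [] is returned, a fetch_recommended_urls job is enqueued and the
# JOB_IN_FLIGHT marker is written under POCKET_RECOMMENDED_URLS. Once the
# background task has stored the JSON payload under that key (the storage step
# is not shown in the excerpt above), the cached list is returned instead.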
class MetadataClient(object):
    IN_JOB_QUEUE = 'in job queue'

    class MetadataClientException(Exception):
        pass

    def __init__(self, redis_client, redis_data_timeout, redis_job_timeout,
                 blocked_domains, job_queue, job_ttl, url_batch_size):
        self.redis_client = redis_client
        self.redis_data_timeout = redis_data_timeout
        self.redis_job_timeout = redis_job_timeout
        self.schema = EmbedlyURLSchema(blocked_domains=blocked_domains)
        self.job_queue = job_queue
        self.job_ttl = job_ttl
        self.url_batch_size = url_batch_size

        self.domain_limiter = rratelimit.SimpleLimiter(
            redis=self.redis_client,
            action='domain_limit',
            limit=20,
            period=1,
        )

    def _get_cache_key(self, url):
        return u'{service}:{url}'.format(service=self.SERVICE_NAME, url=url)

    def _get_cached_url(self, url):
        cache_key = self._get_cache_key(url)

        try:
            cached_data = self.redis_client.get(cache_key)
        except redis.RedisError:
            raise self.MetadataClientException('Unable to read from redis.')

        if cached_data is not None:
            statsd_client.incr('redis_cache_hit')

            try:
                return json.loads(cached_data)
            except ValueError:
                raise self.MetadataClientException(
                    ('Unable to load JSON data '
                     'from cache for key: {key}').format(key=cache_key))
        else:
            statsd_client.incr('redis_cache_miss')

    def _set_cached_url(self, url, data, timeout):
        cache_key = self._get_cache_key(url)

        try:
            self.redis_client.setex(cache_key, timeout, json.dumps(data))
            statsd_client.incr('redis_cache_write')
        except redis.RedisError:
            raise self.MetadataClientException('Unable to write to redis.')

    def _queue_url_jobs(self, urls):
        batched_urls = group_by(list(urls), self.url_batch_size)

        for url_batch in batched_urls:
            try:
                self.job_queue.enqueue(
                    self.TASK,
                    url_batch,
                    time.time(),
                    ttl=self.job_ttl,
                    at_front=True,
                )

                statsd_client.gauge(
                    'request_fetch_job_create', len(url_batch))
                statsd_client.gauge(
                    'request_fetch_job_queue_size', self.job_queue.count)

                for queued_url in url_batch:
                    self._set_cached_url(
                        queued_url, self.IN_JOB_QUEUE, self.redis_job_timeout)
            except Exception:
                statsd_client.incr('request_fetch_job_create_fail')

    def _remove_cached_keys(self, urls):
        self.redis_client.delete(*[self._get_cache_key(url) for url in urls])

    def get_cached_urls(self, urls):
        url_data = {}

        for url in urls:
            cached_url_data = self._get_cached_url(url)

            if cached_url_data is not None:
                url_data[url] = cached_url_data

        return url_data

    def _make_remote_request(self, urls):
        raise NotImplementedError

    def _parse_remote_data(self, remote_data):
        raise NotImplementedError

    def _get_remote_urls_data(self, urls):
        statsd_client.gauge(
            '{service}_request_url_count'.format(service=self.SERVICE_NAME),
            len(urls))

        with statsd_client.timer(
                '{service}_request_timer'.format(service=self.SERVICE_NAME)):
            try:
                response = self._make_remote_request(urls)
            except requests.RequestException as e:
                raise self.MetadataClientException(
                    ('Unable to communicate '
                     'with {service}: {error}').format(
                        service=self.SERVICE_NAME, error=e))

        if response.status_code != 200:
            statsd_client.incr(
                '{service}_request_failure'.format(service=self.SERVICE_NAME))
            raise self.MetadataClientException(
                ('Error status returned from '
                 '{service}: {error_code} {error_message}').format(
                    service=self.SERVICE_NAME,
                    error_code=response.status_code,
                    error_message=response.content,
                ))

        statsd_client.incr(
            '{service}_request_success'.format(service=self.SERVICE_NAME))

        remote_data = []

        if response is not None:
            try:
                remote_data = json.loads(response.content)
            except (TypeError, ValueError) as e:
                statsd_client.incr('{service}_parse_failure'.format(
                    service=self.SERVICE_NAME))
                raise self.MetadataClientException(
                    ('Unable to parse the JSON '
                     'response from {service}: {error}').format(
                        service=self.SERVICE_NAME, error=e))

        return remote_data
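# Hedged sketch of a concrete MetadataClient. The base class above leaves
# SERVICE_NAME, TASK, _make_remote_request and _parse_remote_data to
# subclasses; the class name, endpoint, payload shape and config values below
# are illustrative assumptions, not the project's real client.
import requests


class ExampleMetadataClient(MetadataClient):
    SERVICE_NAME = 'example'
    # staticmethod keeps the module-level RQ task from being bound as a
    # method, so self.TASK hands rq the plain function _queue_url_jobs expects.
    TASK = staticmethod(fetch_mozilla_data)

    def _make_remote_request(self, urls):
        # One batched GET; the endpoint and parameter name are assumed.
        return requests.get(
            'https://metadata.example.com/extract',
            params={'urls': ','.join(urls)},
        )

    def _parse_remote_data(self, remote_data):
        # Assumes the service already returns a {url: metadata} mapping.
        return remote_data


# Usage sketch, reusing redis_client and job_queue from the sketches above:
# extract_urls_async returns whatever is already cached and queues
# rate-limited background fetches for the rest.
client = ExampleMetadataClient(
    redis_client=redis_client,
    redis_data_timeout=3600,
    redis_job_timeout=300,
    blocked_domains=[],
    job_queue=job_queue,
    job_ttl=300,
    url_batch_size=5,
)

cached = client.extract_urls_async(['https://example.com/article'])
# On a cache miss this returns {}, marks the URL IN_JOB_QUEUE and enqueues a
# fetch_mozilla_data job; later calls return whatever metadata the background
# job has since cached.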