Example #1
    def _set_cached_url(self, url, data, timeout):
        cache_key = self._get_cache_key(url)

        try:
            self.redis_client.setex(cache_key, timeout, json.dumps(data))
            statsd_client.incr('redis_cache_write')
        except redis.RedisError:
            raise self.MetadataClientException('Unable to write to redis.')
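The statsd calls used throughout these examples (incr, gauge, timing, and the timer context manager) match the interface of StatsClient in the python statsd package. A minimal sketch of how such a client might be configured, assuming that package; the host, port, and prefix values are placeholders, not the project's real settings:

import statsd

# Hypothetical configuration; the project's real proxy.stats module is not shown here.
statsd_client = statsd.StatsClient('localhost', 8125, prefix='proxy')

statsd_client.incr('redis_cache_write')      # counter increment
statsd_client.gauge('queue_size', 42)        # point-in-time value
statsd_client.timing('job_time_ms', 125)     # duration in milliseconds
with statsd_client.timer('request_timer'):   # times the enclosed block
    pass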
Example #2
    def _domain_limit_urls(self, urls):
        allowed_urls = []

        for url in urls:
            domain = urlparse.urlparse(url).netloc.encode('utf8')
            if self.domain_limiter.checked_insert(domain):
                allowed_urls.append(url)
            else:
                statsd_client.incr('domain_rate_limit_exceeded')

        return allowed_urls
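The per-domain throttling above relies on rratelimit's SimpleLimiter, whose checked_insert call appears to return True while the domain is still under its limit. A small sketch reusing the constructor arguments shown in Example #11; the redis connection and the domain are placeholders:

import redis
import rratelimit

redis_client = redis.StrictRedis()
domain_limiter = rratelimit.SimpleLimiter(
    redis=redis_client,
    action='domain_limit',
    limit=20,   # at most 20 URLs per domain...
    period=1,   # ...per one-second window
)

# True while 'example.com' has been seen fewer than 20 times this second.
allowed = domain_limiter.checked_insert('example.com')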
Example #3
def heartbeat():
    status = 200

    # Check cache connectivity
    try:
        current_app.redis_client.ping()
        statsd_client.incr('heartbeat.pass')
    except redis.ConnectionError:
        statsd_client.incr('heartbeat.fail')
        status = 500

    return Response('', status=status)
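The use of current_app and Response suggests this heartbeat is a Flask view. A minimal sketch of wiring it into an app, assuming the heartbeat function above (and its flask, redis, and statsd imports) is in scope; the route path and the way redis_client is attached are illustrative assumptions:

import flask
import redis

app = flask.Flask(__name__)
app.redis_client = redis.StrictRedis()   # reached by the view via current_app

# Register the view; the '/__heartbeat__' path is a placeholder.
app.add_url_rule('/__heartbeat__', 'heartbeat', heartbeat)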
Example #4
def fetch_recommended_urls(start_time, redis_client=None):
    import time
    from proxy.app import get_pocket_client
    from proxy.stats import statsd_client

    statsd_client.incr('task_fetch_recommended_start')

    pocket_client = get_pocket_client(redis_client=redis_client)

    pocket_client.fetch_recommended_urls()

    job_time = int((time.time() - start_time) * 1000)
    statsd_client.timing('task_fetch_recommended_time', job_time)
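This function is written as a background task: Example #10 enqueues it with job_queue.enqueue(fetch_recommended_urls, time.time(), ttl=..., at_front=True), a call pattern consistent with an RQ queue, though that is an assumption here. A sketch of enqueueing it under that assumption; the TTL value is a placeholder:

import time

import redis
from rq import Queue

job_queue = Queue(connection=redis.StrictRedis())
job_queue.enqueue(fetch_recommended_urls, time.time(), ttl=300, at_front=True)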
Example #5
def fetch_mozilla_data(urls, start_time, redis_client=None):
    import time
    from proxy.app import get_mozilla_client
    from proxy.stats import statsd_client

    statsd_client.incr('task_fetch_mozilla_start')

    mozilla_client = get_mozilla_client(redis_client=redis_client)

    url_data = mozilla_client.get_remote_urls(urls)

    statsd_client.gauge('task_fetch_mozilla_cached', len(url_data.keys()))

    job_time = int((time.time() - start_time) * 1000)
    statsd_client.timing('task_fetch_mozilla_time', job_time)
Example #6
class PocketClient(object):

    class PocketException(Exception):
        pass

    def __init__(self, pocket_url, redis_client, redis_data_timeout,
                 job_queue, job_ttl):
        self.pocket_url = pocket_url
        self.redis_client = redis_client
        self.redis_key = 'POCKET_RECOMMENDED_URLS'
        self.redis_in_flight_value = 'JOB_IN_FLIGHT'
        self.redis_data_timeout = redis_data_timeout
        self.job_queue = job_queue
        self.job_ttl = job_ttl

    def fetch_recommended_urls(self):
        with statsd_client.timer('pocket_request_timer'):
            try:
                response = requests.get(self.pocket_url)
            except requests.RequestException, e:
                raise self.PocketException(
                    ('Unable to communicate '
                     'with pocket: {error}').format(error=e))

        if response.status_code != 200:
            statsd_client.incr('pocket_request_failure')
            raise self.PocketException(
                ('Error status returned from '
                 'pocket: {error_code} {error_message}').format(
                    error_code=response.status_code,
                    error_message=response.content,
                  ))

        statsd_client.incr('pocket_request_success')

        pocket_data = []

        if response is not None:
            try:
                pocket_data = json.loads(response.content)
            except (TypeError, ValueError), e:
                statsd_client.incr('pocket_parse_failure')
                raise self.PocketException(
                    ('Unable to parse the JSON '
                     'response from pocket: {error}').format(error=e))
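Callers reach the nested exception class through the outer class, so errors from the request, the status check, and the JSON parse above can all be caught in one place. A usage sketch; the constructor arguments are placeholders, not the project's real configuration:

client = PocketClient(
    pocket_url='https://example.com/recommended',  # placeholder endpoint
    redis_client=redis_client,
    redis_data_timeout=300,
    job_queue=job_queue,
    job_ttl=60,
)

try:
    client.fetch_recommended_urls()
except PocketClient.PocketException:
    pass  # e.g. fall back to an empty recommendation list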
Example #7
    def _get_cached_url(self, url):
        cache_key = self._get_cache_key(url)

        try:
            cached_data = self.redis_client.get(cache_key)
        except redis.RedisError:
            raise self.MetadataClientException('Unable to read from redis.')

        if cached_data is not None:
            statsd_client.incr('redis_cache_hit')
            try:
                return json.loads(cached_data)
            except ValueError:
                raise self.MetadataClientException(
                    ('Unable to load JSON data '
                     'from cache for key: {key}').format(key=cache_key))
        else:
            statsd_client.incr('redis_cache_miss')
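A round-trip sketch of the caching pattern in _get_cached_url and _set_cached_url, using fakeredis purely so the example runs without a Redis server; the service name and URL in the cache key are placeholders:

import json

import fakeredis

r = fakeredis.FakeStrictRedis()
cache_key = 'embedly:http://example.com/'   # '{service}:{url}' key format

# Write with a TTL, as _set_cached_url does.
r.setex(cache_key, 60, json.dumps({'title': 'Example'}))

# Read it back and decode, as _get_cached_url does on a cache hit.
assert json.loads(r.get(cache_key)) == {'title': 'Example'}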
Example #8
    def extract_urls_async(self, urls):
        all_cached_url_data = self.get_cached_urls(urls)

        if self.IN_JOB_QUEUE in all_cached_url_data.values():
            statsd_client.incr('request_in_job_queue')

        cached_url_data = {
            url: url_data
            for (url, url_data) in all_cached_url_data.items()
            if url_data != self.IN_JOB_QUEUE
        }

        uncached_urls = set(urls) - set(all_cached_url_data.keys())

        if uncached_urls:
            allowed_urls = self._domain_limit_urls(uncached_urls)
            self._queue_url_jobs(allowed_urls)

        return cached_url_data
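A worked illustration of the filtering step above, with made-up values: URLs whose cached value is the IN_JOB_QUEUE marker are dropped from the response, and URLs with no cache entry at all are sent on for rate limiting and queueing:

IN_JOB_QUEUE = 'in job queue'

urls = ['http://a.example/', 'http://b.example/', 'http://c.example/']
all_cached_url_data = {
    'http://a.example/': {'title': 'A'},   # real cached metadata
    'http://b.example/': IN_JOB_QUEUE,     # a fetch job is already in flight
}

cached_url_data = {
    url: url_data
    for (url, url_data) in all_cached_url_data.items()
    if url_data != IN_JOB_QUEUE
}
uncached_urls = set(urls) - set(all_cached_url_data.keys())

assert cached_url_data == {'http://a.example/': {'title': 'A'}}
assert uncached_urls == {'http://c.example/'}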
Example #9
    def _queue_url_jobs(self, urls):
        batched_urls = group_by(list(urls), self.url_batch_size)

        for url_batch in batched_urls:
            try:
                self.job_queue.enqueue(
                    self.TASK,
                    url_batch,
                    time.time(),
                    ttl=self.job_ttl,
                    at_front=True,
                )
                statsd_client.gauge('request_fetch_job_create', len(url_batch))
                statsd_client.gauge('request_fetch_job_queue_size',
                                    self.job_queue.count)

                for queued_url in url_batch:
                    self._set_cached_url(queued_url, self.IN_JOB_QUEUE,
                                         self.redis_job_timeout)

            except Exception:
                statsd_client.incr('request_fetch_job_create_fail')
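The group_by helper used above is not shown on this page. A plausible batching helper with the behavior this code appears to expect, splitting a list into chunks of at most url_batch_size items, might look like this sketch:

def group_by(items, size):
    # e.g. group_by([1, 2, 3, 4, 5], 2) -> [[1, 2], [3, 4], [5]]
    return [items[i:i + size] for i in range(0, len(items), size)]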
Example #10
    def get_recommended_urls(self):
        try:
            recommended_urls = self.redis_client.get(self.redis_key)
        except redis.RedisError:
            raise self.PocketException('Unable to read from redis.')

        if recommended_urls is None:
            recommended_urls = []
            statsd_client.incr('redis_recommended_cache_miss')

            try:
                self.job_queue.enqueue(
                    fetch_recommended_urls,
                    time.time(),
                    ttl=self.job_ttl,
                    at_front=True,
                )
            except Exception:
                statsd_client.incr('request_recommended_job_create_fail')
                raise self.PocketException(
                    'Unable to start the pocket fetch job.')

            statsd_client.incr('request_recommended_job_create')

            try:
                self.redis_client.setex(
                    self.redis_key, self.job_ttl, self.redis_in_flight_value)
            except redis.RedisError:
                raise self.PocketException('Unable to write to redis.')

        else:
            statsd_client.incr('redis_recommended_cache_hit')

            try:
                recommended_urls = json.loads(recommended_urls)
            except ValueError:
                raise self.PocketException(
                    ('Unable to load JSON data '
                     'from cache for key: {key}').format(key=self.redis_key))

        return recommended_urls
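An illustration of the two cache states this method distinguishes, using fakeredis and the key and sentinel names from the constructor in Example #6; the TTL and URL values are made up:

import json

import fakeredis

r = fakeredis.FakeStrictRedis()

# State 1: nothing cached yet. get_recommended_urls enqueues the fetch job,
# writes the JOB_IN_FLIGHT sentinel, and returns an empty list.
assert r.get('POCKET_RECOMMENDED_URLS') is None

# State 2: the background job has stored JSON, so subsequent calls parse
# and return it.
r.setex('POCKET_RECOMMENDED_URLS', 300, json.dumps(['http://example.com/']))
assert json.loads(r.get('POCKET_RECOMMENDED_URLS')) == ['http://example.com/']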
Example #11
class MetadataClient(object):
    IN_JOB_QUEUE = 'in job queue'

    class MetadataClientException(Exception):
        pass

    def __init__(self, redis_client, redis_data_timeout, redis_job_timeout,
                 blocked_domains, job_queue, job_ttl, url_batch_size):
        self.redis_client = redis_client
        self.redis_data_timeout = redis_data_timeout
        self.redis_job_timeout = redis_job_timeout
        self.schema = EmbedlyURLSchema(blocked_domains=blocked_domains)
        self.job_queue = job_queue
        self.job_ttl = job_ttl
        self.url_batch_size = url_batch_size
        self.domain_limiter = rratelimit.SimpleLimiter(
            redis=self.redis_client,
            action='domain_limit',
            limit=20,
            period=1,
        )

    def _get_cache_key(self, url):
        return u'{service}:{url}'.format(service=self.SERVICE_NAME, url=url)

    def _get_cached_url(self, url):
        cache_key = self._get_cache_key(url)

        try:
            cached_data = self.redis_client.get(cache_key)
        except redis.RedisError:
            raise self.MetadataClientException('Unable to read from redis.')

        if cached_data is not None:
            statsd_client.incr('redis_cache_hit')
            try:
                return json.loads(cached_data)
            except ValueError:
                raise self.MetadataClientException(
                    ('Unable to load JSON data '
                     'from cache for key: {key}').format(key=cache_key))
        else:
            statsd_client.incr('redis_cache_miss')

    def _set_cached_url(self, url, data, timeout):
        cache_key = self._get_cache_key(url)

        try:
            self.redis_client.setex(cache_key, timeout, json.dumps(data))
            statsd_client.incr('redis_cache_write')
        except redis.RedisError:
            raise self.MetadataClientException('Unable to write to redis.')

    def _queue_url_jobs(self, urls):
        batched_urls = group_by(list(urls), self.url_batch_size)

        for url_batch in batched_urls:
            try:
                self.job_queue.enqueue(
                    self.TASK,
                    url_batch,
                    time.time(),
                    ttl=self.job_ttl,
                    at_front=True,
                )
                statsd_client.gauge('request_fetch_job_create', len(url_batch))
                statsd_client.gauge('request_fetch_job_queue_size',
                                    self.job_queue.count)

                for queued_url in url_batch:
                    self._set_cached_url(queued_url, self.IN_JOB_QUEUE,
                                         self.redis_job_timeout)

            except Exception:
                statsd_client.incr('request_fetch_job_create_fail')

    def _remove_cached_keys(self, urls):
        self.redis_client.delete(*[self._get_cache_key(url) for url in urls])

    def get_cached_urls(self, urls):
        url_data = {}

        for url in urls:
            cached_url_data = self._get_cached_url(url)

            if cached_url_data is not None:
                url_data[url] = cached_url_data

        return url_data

    def _make_remote_request(self, urls):
        raise NotImplementedError

    def _parse_remote_data(self, remote_data):
        raise NotImplementedError

    def _get_remote_urls_data(self, urls):
        statsd_client.gauge(
            '{service}_request_url_count'.format(service=self.SERVICE_NAME),
            len(urls))

        with statsd_client.timer(
                '{service}_request_timer'.format(service=self.SERVICE_NAME)):
            try:
                response = self._make_remote_request(urls)
            except requests.RequestException, e:
                raise self.MetadataClientException(
                    ('Unable to communicate '
                     'with {service}: {error}').format(
                         service=self.SERVICE_NAME, error=e))

        if response.status_code != 200:
            statsd_client.incr(
                '{service}_request_failure'.format(service=self.SERVICE_NAME))
            raise self.MetadataClientException(
                ('Error status returned from '
                 '{service}: {error_code} {error_message}').format(
                     service=self.SERVICE_NAME,
                     error_code=response.status_code,
                     error_message=response.content,
                 ))

        statsd_client.incr(
            '{service}_request_success'.format(service=self.SERVICE_NAME))

        remote_data = []

        if response is not None:
            try:
                remote_data = json.loads(response.content)
            except (TypeError, ValueError), e:
                statsd_client.incr('{service}_parse_failure'.format(
                    service=self.SERVICE_NAME))
                raise self.MetadataClientException(
                    ('Unable to parse the JSON '
                     'response from {service}: {error}').format(
                         service=self.SERVICE_NAME, error=e))
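MetadataClient leaves _make_remote_request and _parse_remote_data as hooks and expects subclasses to provide SERVICE_NAME and TASK. A hypothetical sketch of a concrete subclass; the class name, endpoint, task path, and payload shape are illustrative assumptions, not the project's real Embedly or Mozilla clients:

import requests

class ExampleMetadataClient(MetadataClient):
    SERVICE_NAME = 'example'
    TASK = 'proxy.tasks.fetch_example_data'   # placeholder dotted task path

    def _make_remote_request(self, urls):
        # One request for the whole batch of URLs.
        return requests.get('https://metadata.example.com/extract',
                            params={'urls': ','.join(urls)})

    def _parse_remote_data(self, remote_data):
        # Assume the remote service already returns a {url: metadata} mapping.
        return remote_data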