def _get_project_enhancements_config(project):
    enhancements = project.get_option('sentry:grouping_enhancements')
    enhancements_base = project.get_option('sentry:grouping_enhancements_base')
    if not enhancements and not enhancements_base:
        return DEFAULT_ENHANCEMENTS_CONFIG

    if enhancements_base is None or enhancements_base not in ENHANCEMENT_BASES:
        enhancements_base = DEFAULT_ENHANCEMENT_BASE

    # Instead of parsing and dumping out config here, we can make a
    # shortcut
    from sentry.utils.cache import cache
    from sentry.utils.hashlib import md5_text
    cache_key = 'grouping-enhancements:' + \
        md5_text('%s|%s' % (enhancements_base, enhancements)).hexdigest()
    rv = cache.get(cache_key)
    if rv is not None:
        return rv

    try:
        rv = Enhancements.from_config_string(
            enhancements or '', bases=[enhancements_base]).dumps()
    except InvalidEnhancerConfig:
        rv = DEFAULT_ENHANCEMENTS_CONFIG
    cache.set(cache_key, rv)
    return rv
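Every snippet in this collection leans on md5_text from sentry.utils.hashlib. For readers without the Sentry source at hand, a minimal sketch of what such a helper plausibly does (an assumption, not the actual implementation):

import hashlib

def md5_text_sketch(*args):
    # Hypothetical stand-in for sentry.utils.hashlib.md5_text: hash the text
    # form of each argument (UTF-8 encoded) and return the hashlib object so
    # callers can invoke .hexdigest(). The real helper may differ in details.
    m = hashlib.md5()
    for arg in args:
        m.update(str(arg).encode('utf-8'))
    return m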
def _finish_login_pipeline(self, identity):
    """
    The login flow executes both with anonymous and authenticated users.

    Upon completion a few branches exist:

    If the identity is already linked, the user should be logged in
    and redirected immediately.

    Otherwise, the user is presented with a confirmation window. That window
    will show them the new account that will be created, and if they're
    already authenticated an optional button to associate the identity with
    their account.
    """
    auth_provider = self.auth_provider
    user_id = identity['id']

    lock = locks.get(
        'sso:auth:{}:{}'.format(
            auth_provider.id,
            md5_text(user_id).hexdigest(),
        ),
        duration=5,
    )
    with TimedRetryPolicy(5)(lock.acquire):
        try:
            auth_identity = AuthIdentity.objects.select_related('user').get(
                auth_provider=auth_provider,
                ident=user_id,
            )
        except AuthIdentity.DoesNotExist:
            auth_identity = None

        # Handle migration of identity keys
        if not auth_identity and isinstance(user_id, MigratingIdentityId):
            try:
                auth_identity = AuthIdentity.objects.select_related('user').get(
                    auth_provider=auth_provider,
                    ident=user_id.legacy_id,
                )
                auth_identity.update(ident=user_id.id)
            except AuthIdentity.DoesNotExist:
                auth_identity = None

        if not auth_identity:
            return self._handle_unknown_identity(identity)

        # If the User attached to this AuthIdentity is not active,
        # we want to clobber the old account and take it over, rather than
        # getting logged into the inactive account.
        if not auth_identity.user.is_active:
            # Current user is also not logged in, so we have to
            # assume unknown.
            if not self.request.user.is_authenticated():
                return self._handle_unknown_identity(identity)

            auth_identity = self._handle_attach_identity(identity)

        return self._handle_existing_identity(auth_identity, identity)
def __init__(self, request, organization, flow, auth_provider=None, provider_key=None):
    assert provider_key or auth_provider

    self.request = request
    self.auth_provider = auth_provider
    self.organization = organization
    self.flow = flow

    if auth_provider:
        provider = auth_provider.get_provider()
    elif provider_key:
        provider = manager.get(provider_key)
    else:
        raise NotImplementedError
    self.provider = provider

    if flow == self.FLOW_LOGIN:
        self.pipeline = provider.get_auth_pipeline()
    elif flow == self.FLOW_SETUP_PROVIDER:
        self.pipeline = provider.get_setup_pipeline()
    else:
        raise NotImplementedError

    # we serialize the pipeline to be [AuthView().get_ident(), ...] which
    # allows us to determine if the pipeline has changed during the auth
    # flow or if the user is somehow circumventing a chunk of it
    self.signature = md5_text(
        ' '.join(av.get_ident() for av in self.pipeline)
    ).hexdigest()
def _get_event_user(self, project, data):
    user_data = data.get('sentry.interfaces.User')
    if not user_data:
        return

    euser = EventUser(
        project=project,
        ident=user_data.get('id'),
        email=user_data.get('email'),
        username=user_data.get('username'),
        ip_address=user_data.get('ip_address'),
    )

    if not euser.tag_value:
        return

    cache_key = 'euser:{}:{}'.format(
        project.id,
        md5_text(euser.tag_value).hexdigest(),
    )
    cached = default_cache.get(cache_key)
    if cached is None:
        try:
            with transaction.atomic(using=router.db_for_write(EventUser)):
                euser.save()
        except IntegrityError:
            pass
        default_cache.set(cache_key, '', 3600)

    return euser
def test_query_hash(self):
    recent_search = RecentSearch.objects.create(
        organization=self.organization,
        user=self.user,
        type=0,
        query='hello',
    )
    recent_search = RecentSearch.objects.get(id=recent_search.id)
    assert recent_search.query_hash == md5_text(recent_search.query).hexdigest()
def make_key(model, prefix, kwargs):
    kwargs_bits = []
    for k, v in sorted(six.iteritems(kwargs)):
        k = __prep_key(model, k)
        v = smart_text(__prep_value(model, k, v))
        kwargs_bits.append("%s=%s" % (k, v))
    kwargs_bits = ":".join(kwargs_bits)

    return "%s:%s:%s" % (prefix, model.__name__, md5_text(kwargs_bits).hexdigest())
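An illustrative key shape produced by make_key. __prep_key and __prep_value are internal helpers not shown here; the example below assumes they leave the key and value unchanged, and the model/prefix names are hypothetical:

#   make_key(Project, 'modelcache', {'id': 1})
#   # -> 'modelcache:Project:' + md5_text('id=1').hexdigest()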
def _make_key(self, model, filters):
    """
    Returns a Redis-compatible key for the model given filters.
    """
    return 'b:k:%s:%s' % (
        model._meta,
        md5_text(
            '&'.join('%s=%s' % (k, self._coerce_val(v))
                     for k, v in sorted(six.iteritems(filters)))
        ).hexdigest(),
    )
def for_tags(cls, project_id, values):
    """
    Finds matching EventUser objects from a list of tag values.

    Returns a dictionary of {tag_value: event_user}.
    """
    hashes = [md5_text(v.split(':', 1)[-1]).hexdigest() for v in values]
    return {e.tag_value: e for e in cls.objects.filter(
        project_id=project_id,
        hash__in=hashes,
    )}
def get_conf_version(self, project):
    """
    Returns a version string that represents the current configuration
    state. If any option changes or new options are added, the version
    will change.

    >>> plugin.get_conf_version(project)
    """
    options = self.get_conf_options(project)
    return md5_text('&'.join(sorted('%s=%s' % o for o in six.iteritems(options)))).hexdigest()[:3]
def get_gravatar_url(email, size=None, default='mm'):
    if email is None:
        email = ''

    gravatar_url = "%s/avatar/%s" % (
        settings.SENTRY_GRAVATAR_BASE_URL,
        md5_text(email.lower()).hexdigest())

    properties = {}
    if size:
        properties['s'] = six.text_type(size)
    if default:
        properties['d'] = default

    if properties:
        gravatar_url += "?" + urlencode(properties)

    return gravatar_url
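A usage sketch for get_gravatar_url. The base URL comes from settings.SENTRY_GRAVATAR_BASE_URL and the query-string ordering depends on dict iteration, so both are assumptions here:

#   get_gravatar_url('User@Example.com', size=64)
#   # -> 'https://secure.gravatar.com/avatar/<md5 of "user@example.com">?s=64&d=mm'
#   # (the email is lowercased before hashing; 's' and 'd' are only added when set)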
def get(self, request):
    results = status_checks.check_all()
    return Response({
        'problems': [
            {
                'id': md5_text(problem.message).hexdigest(),
                'message': problem.message,
                'severity': problem.severity,
                'url': problem.url,
            }
            for problem in sorted(itertools.chain.from_iterable(results.values()), reverse=True)
        ],
        'healthy': {type(check).__name__: not problems for check, problems in results.items()},
    })
def get_event_payload(self, event):
    props = {
        'event_id': event.event_id,
        'project_id': event.project.slug,
        'transaction': event.get_tag('transaction') or '',
        'release': event.get_tag('sentry:release') or '',
        'environment': event.get_tag('environment') or '',
        'type': event.get_event_type(),
    }
    props['tags'] = [
        [k.format(tagstore.get_standardized_key(k)), v]
        for k, v in event.get_tags()
    ]
    for key, value in six.iteritems(event.interfaces):
        if key == 'request':
            headers = value.headers
            if not isinstance(headers, dict):
                headers = dict(headers or ())

            props.update({
                'request_url': value.url,
                'request_method': value.method,
                'request_referer': headers.get('Referer', ''),
            })
        elif key == 'exception':
            exc = value.values[0]
            props.update({
                'exception_type': exc.type,
                'exception_value': exc.value,
            })
        elif key == 'logentry':
            props.update({
                'message': value.formatted or value.message,
            })
        elif key in ('csp', 'expectct', 'expectstable', 'hpkp'):
            props.update({
                '{}_{}'.format(key.rsplit('.', 1)[-1].lower(), k): v
                for k, v in six.iteritems(value.to_json())
            })
        elif key == 'user':
            user_payload = {}
            if value.id:
                user_payload['user_id'] = value.id
            if value.email:
                user_payload['user_email_hash'] = md5_text(value.email).hexdigest()
            if value.ip_address:
                user_payload['user_ip_trunc'] = anonymize_ip(value.ip_address)
            if user_payload:
                props.update(user_payload)
    return props
def is_limited(self, key, limit, project=None, window=None):
    if window is None:
        window = self.window

    key_hex = md5_text(key).hexdigest()
    bucket = int(time() / window)

    if project:
        key = 'rl:%s:%s:%s' % (key_hex, project.id, bucket)
    else:
        key = 'rl:%s:%s' % (key_hex, bucket)

    with self.cluster.map() as client:
        result = client.incr(key)
        client.expire(key, window)

    return result.value > limit
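For illustration, the same fixed-window counting scheme with an in-memory dict standing in for the Redis cluster; the names and structure below are a sketch, not Sentry's API:

import time
from hashlib import md5

_counters = {}  # hypothetical in-memory stand-in for Redis

def is_limited_sketch(key, limit, window=60):
    # Hash the key, bucket time into fixed windows, and count hits per
    # (key, bucket); the request is limited once the count exceeds `limit`.
    key_hex = md5(key.encode('utf-8')).hexdigest()
    bucket = int(time.time() / window)
    counter_key = 'rl:%s:%s' % (key_hex, bucket)
    _counters[counter_key] = _counters.get(counter_key, 0) + 1
    return _counters[counter_key] > limit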
def __init__(self, request, organization, provider_key, provider_model=None, config=None):
    if config is None:
        config = {}

    self.request = request
    self.organization = organization
    self.state = RedisSessionStore(request, self.pipeline_name)
    self.provider = self.provider_manager.get(provider_key)
    self.provider_model = provider_model

    self.config = config
    self.provider.set_config(config)

    self.pipeline = self.get_pipeline_views()

    # we serialize the pipeline to be ['fqn.PipelineView', ...] which
    # allows us to determine if the pipeline has changed during the auth
    # flow or if the user is somehow circumventing a chunk of it
    pipe_ids = ['{}.{}'.format(type(v).__module__, type(v).__name__) for v in self.pipeline]
    self.signature = md5_text(*pipe_ids).hexdigest()
def get_fingerprinting_config_for_project(project):
    from sentry.grouping.fingerprinting import FingerprintingRules, \
        InvalidFingerprintingConfig
    rules = project.get_option('sentry:fingerprinting_rules')
    if not rules:
        return FingerprintingRules([])

    from sentry.utils.cache import cache
    from sentry.utils.hashlib import md5_text
    cache_key = 'fingerprinting-rules:' + md5_text(rules).hexdigest()
    rv = cache.get(cache_key)
    if rv is not None:
        return FingerprintingRules.from_json(rv)

    try:
        rv = FingerprintingRules.from_config_string(rules or '')
    except InvalidFingerprintingConfig:
        rv = FingerprintingRules([])
    cache.set(cache_key, rv.to_json())
    return rv
def _make_cache_key(key):
    return 'o:%s' % md5_text(key).hexdigest()
def fetch_release_file(filename, release, dist=None): cache_key = 'releasefile:v1:%s:%s' % (release.id, md5_text(filename).hexdigest(), ) logger.debug('Checking cache for release artifact %r (release_id=%s)', filename, release.id) result = cache.get(cache_key) dist_name = dist and dist.name or None if result is None: filename_choices = ReleaseFile.normalize(filename) filename_idents = [ReleaseFile.get_ident(f, dist_name) for f in filename_choices] logger.debug( 'Checking database for release artifact %r (release_id=%s)', filename, release.id ) possible_files = list( ReleaseFile.objects.filter( release=release, dist=dist, ident__in=filename_idents, ).select_related('file') ) if len(possible_files) == 0: logger.debug( 'Release artifact %r not found in database (release_id=%s)', filename, release.id ) cache.set(cache_key, -1, 60) return None elif len(possible_files) == 1: releasefile = possible_files[0] else: # Pick first one that matches in priority order. # This is O(N*M) but there are only ever at most 4 things here # so not really worth optimizing. releasefile = next(( rf for ident in filename_idents for rf in possible_files if rf.ident == ident )) logger.debug( 'Found release artifact %r (id=%s, release_id=%s)', filename, releasefile.id, release.id ) try: with metrics.timer('sourcemaps.release_file_read'): with releasefile.file.getfile() as fp: z_body, body = compress_file(fp) except Exception: logger.error('sourcemap.compress_read_failed', exc_info=sys.exc_info()) result = None else: headers = {k.lower(): v for k, v in releasefile.file.headers.items()} encoding = get_encoding_from_headers(headers) result = http.UrlResult(filename, headers, body, 200, encoding) cache.set(cache_key, (headers, z_body, 200, encoding), 3600) elif result == -1: # We cached an error, so normalize # it down to None result = None else: # Previous caches would be a 3-tuple instead of a 4-tuple, # so this is being maintained for backwards compatibility try: encoding = result[3] except IndexError: encoding = None result = http.UrlResult( filename, result[0], zlib.decompress(result[1]), result[2], encoding ) return result
def get_rl_key(self, event):
    return "{}:{}".format(self.conf_key, md5_text(self.project_token).hexdigest())
def get_cache_key(cls, project_id, name):
    return 'env:1:%s:%s' % (project_id, md5_text(name).hexdigest())
def fetch_release_file(filename, release): cache_key = 'releasefile:v1:%s:%s' % ( release.id, md5_text(filename).hexdigest(), ) filename_path = None if filename is not None: # Reconstruct url without protocol + host # e.g. http://example.com/foo?bar => ~/foo?bar parsed_url = urlparse(filename) filename_path = '~' + parsed_url.path if parsed_url.query: filename_path += '?' + parsed_url.query logger.debug('Checking cache for release artifact %r (release_id=%s)', filename, release.id) result = cache.get(cache_key) if result is None: logger.debug('Checking database for release artifact %r (release_id=%s)', filename, release.id) filename_idents = [ReleaseFile.get_ident(filename)] if filename_path is not None and filename_path != filename: filename_idents.append(ReleaseFile.get_ident(filename_path)) possible_files = list(ReleaseFile.objects.filter( release=release, ident__in=filename_idents, ).select_related('file')) if len(possible_files) == 0: logger.debug('Release artifact %r not found in database (release_id=%s)', filename, release.id) cache.set(cache_key, -1, 60) return None elif len(possible_files) == 1: releasefile = possible_files[0] else: # Prioritize releasefile that matches full url (w/ host) # over hostless releasefile target_ident = filename_idents[0] releasefile = next((f for f in possible_files if f.ident == target_ident)) logger.debug('Found release artifact %r (id=%s, release_id=%s)', filename, releasefile.id, release.id) try: with releasefile.file.getfile() as fp: z_body, body = compress_file(fp) except Exception as e: logger.exception(six.text_type(e)) cache.set(cache_key, -1, 3600) result = None else: try: result = (releasefile.file.headers, body.decode('utf-8'), 200) except UnicodeDecodeError: error = { 'type': EventError.JS_INVALID_SOURCE_ENCODING, 'value': 'utf8', 'url': expose_url(releasefile.name), } raise CannotFetchSource(error) else: # Write the compressed version to cache, but return the deflated version cache.set(cache_key, (releasefile.file.headers, z_body, 200), 3600) elif result == -1: # We cached an error, so normalize # it down to None result = None else: # We got a cache hit, but the body is compressed, so we # need to decompress it before handing it off body = zlib.decompress(result[1]) try: result = (result[0], body.decode('utf-8'), result[2]) except UnicodeDecodeError: error = { 'type': EventError.JS_INVALID_SOURCE_ENCODING, 'value': 'utf8', 'url': expose_url(releasefile.name), } raise CannotFetchSource(error) return result
def handle_basic_auth(self, request): can_register = auth.has_user_registration() or request.session.get('can_register') op = request.POST.get('op') # Detect that we are on the register page by url /register/ and # then activate the register tab by default. if not op and '/register' in request.path_info and can_register: op = 'register' login_form = self.get_login_form(request) if can_register: register_form = self.get_register_form(request, initial={ 'username': request.session.get('invite_email', '') }) else: register_form = None if can_register and register_form.is_valid(): user = register_form.save() user.send_confirm_emails(is_new_user=True) # HACK: grab whatever the first backend is and assume it works user.backend = settings.AUTHENTICATION_BACKENDS[0] auth.login(request, user) # can_register should only allow a single registration request.session.pop('can_register', None) request.session.pop('invite_email', None) return self.redirect(auth.get_login_redirect(request)) elif request.method == 'POST': from sentry.app import ratelimiter from sentry.utils.hashlib import md5_text login_attempt = op == 'login' and request.POST.get('username') and request.POST.get('password') if login_attempt and ratelimiter.is_limited( u'auth:login:username:{}'.format(md5_text(request.POST['username'].lower()).hexdigest()), limit=10, window=60, # 10 per minute should be enough for anyone ): login_form.errors['__all__'] = [u'You have made too many login attempts. Please try again later.'] elif login_form.is_valid(): user = login_form.get_user() auth.login(request, user) if not user.is_active: return self.redirect(reverse('sentry-reactivate-account')) return self.redirect(auth.get_login_redirect(request)) context = { 'op': op or 'login', 'server_hostname': get_server_hostname(), 'login_form': login_form, 'register_form': register_form, 'CAN_REGISTER': can_register, } return self.respond('sentry/login.html', context)
def get_cache_key(cls, project_id, _key_id, value):
    return 'tagvalue:1:%s:%s:%s' % (project_id, _key_id, md5_text(value).hexdigest())
def post_process(self, event, **kwargs):
    token = self.get_option("token", event.project)
    index = self.get_option("index", event.project)
    instance = self.get_option("instance", event.project)
    if not (token and index and instance):
        metrics.incr(
            "integrations.splunk.forward-event.unconfigured",
            tags={
                "project_id": event.project_id,
                "organization_id": event.project.organization_id,
                "event_type": event.get_event_type(),
            },
        )
        return

    if not instance.endswith("/services/collector"):
        instance = instance.rstrip("/") + "/services/collector"

    source = self.get_option("source", event.project) or "sentry"

    rl_key = "splunk:{}".format(md5_text(token).hexdigest())
    # limit splunk to 50 requests/second
    if ratelimiter.is_limited(rl_key, limit=1000, window=1):
        metrics.incr(
            "integrations.splunk.forward-event.rate-limited",
            tags={
                "project_id": event.project_id,
                "organization_id": event.project.organization_id,
                "event_type": event.get_event_type(),
            },
        )
        return

    payload = {
        "time": int(event.datetime.strftime("%s")),
        "source": source,
        "index": index,
        "event": self.get_event_payload(event),
    }
    host = self.get_host_for_splunk(event)
    if host:
        payload["host"] = host

    session = http.build_session()
    try:
        # https://docs.splunk.com/Documentation/Splunk/7.2.3/Data/TroubleshootHTTPEventCollector
        resp = session.post(
            instance,
            json=payload,
            # Splunk cloud instances' certificates don't play nicely
            verify=False,
            headers={"Authorization": "Splunk {}".format(token)},
            timeout=5,
        )
        if resp.status_code != 200:
            raise SplunkError.from_response(resp)
    except Exception as exc:
        metric = "integrations.splunk.forward-event.error"
        metrics.incr(
            metric,
            tags={
                "project_id": event.project_id,
                "organization_id": event.project.organization_id,
                "event_type": event.get_event_type(),
                "error_code": getattr(exc, "code", None),
            },
        )
        logger.info(
            metric,
            extra={
                "instance": instance,
                "project_id": event.project_id,
                "organization_id": event.project.organization_id,
            },
        )
        if isinstance(exc, ReadTimeout):
            # If we get a ReadTimeout we don't need to raise an error here.
            # Just log and return.
            return
        raise

    metrics.incr(
        "integrations.splunk.forward-event.success",
        tags={
            "project_id": event.project_id,
            "organization_id": event.project.organization_id,
            "event_type": event.get_event_type(),
        },
    )
def fetch_file(url, project=None, release=None, dist=None, allow_scraping=True): """ Pull down a URL, returning a UrlResult object. Attempts to fetch from the cache. """ # If our url has been truncated, it'd be impossible to fetch # so we check for this early and bail if url[-3:] == '...': raise http.CannotFetch({ 'type': EventError.JS_MISSING_SOURCE, 'url': http.expose_url(url), }) if release: with metrics.timer('sourcemaps.release_file'): result = fetch_release_file(url, release, dist) else: result = None cache_key = 'source:cache:v4:%s' % (md5_text(url).hexdigest(), ) if result is None: if not allow_scraping or not url.startswith(('http:', 'https:')): error = { 'type': EventError.JS_MISSING_SOURCE, 'url': http.expose_url(url), } raise http.CannotFetch(error) logger.debug('Checking cache for url %r', url) result = cache.get(cache_key) if result is not None: # Previous caches would be a 3-tuple instead of a 4-tuple, # so this is being maintained for backwards compatibility try: encoding = result[4] except IndexError: encoding = None # We got a cache hit, but the body is compressed, so we # need to decompress it before handing it off result = http.UrlResult(result[0], result[1], zlib.decompress(result[2]), result[3], encoding) if result is None: headers = {} verify_ssl = False if project and is_valid_origin(url, project=project): verify_ssl = bool(project.get_option('sentry:verify_ssl', False)) token = project.get_option('sentry:token') if token: token_header = project.get_option( 'sentry:token_header') or 'X-Sentry-Token' headers[token_header] = token with metrics.timer('sourcemaps.fetch'): result = http.fetch_file(url, headers=headers, verify_ssl=verify_ssl) z_body = zlib.compress(result.body) cache.set( cache_key, (url, result.headers, z_body, result.status, result.encoding), get_max_age(result.headers)) # If we did not get a 200 OK we just raise a cannot fetch here. if result.status != 200: raise http.CannotFetch({ 'type': EventError.FETCH_INVALID_HTTP_CODE, 'value': result.status, 'url': http.expose_url(url), }) # Make sure the file we're getting back is six.binary_type. The only # reason it'd not be binary would be from old cached blobs, so # for compatibility with current cached files, let's coerce back to # binary and say utf8 encoding. if not isinstance(result.body, six.binary_type): try: result = http.UrlResult(result.url, result.headers, result.body.encode('utf8'), result.status, result.encoding) except UnicodeEncodeError: error = { 'type': EventError.FETCH_INVALID_ENCODING, 'value': 'utf8', 'url': http.expose_url(url), } raise http.CannotFetch(error) # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML) # NOTE: possible to have JS files that don't actually end w/ ".js", but # this should catch 99% of cases if url.endswith('.js'): # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<'). # This cannot parse as valid JS/JSON. # NOTE: not relying on Content-Type header because apps often don't set this correctly # Discard leading whitespace (often found before doctype) body_start = result.body[:20].lstrip() if body_start[:1] == u'<': error = { 'type': EventError.JS_INVALID_CONTENT, 'url': url, } raise http.CannotFetch(error) return result
def fetch_file(url, domain_lock_enabled=True, outfile=None, headers=None, allow_redirects=True, verify_ssl=False, timeout=settings.SENTRY_SOURCE_FETCH_SOCKET_TIMEOUT, **kwargs): """ Pull down a URL, returning a UrlResult object. """ # lock down domains that are problematic if domain_lock_enabled: domain = urlparse(url).netloc domain_key = 'source:blacklist:v2:%s' % ( md5_text(domain).hexdigest(), ) domain_result = cache.get(domain_key) if domain_result: domain_result['url'] = url raise CannotFetch(domain_result) logger.debug('Fetching %r from the internet', url) http_session = build_session() response = None try: try: start = time.time() response = http_session.get(url, allow_redirects=allow_redirects, verify=verify_ssl, headers=headers, timeout=timeout, stream=True, **kwargs) try: cl = int(response.headers['content-length']) except (LookupError, ValueError): cl = 0 if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE: raise OverflowError() return_body = False if outfile is None: outfile = six.BytesIO() return_body = True cl = 0 # Only need to even attempt to read the response body if we # got a 200 OK if response.status_code == 200: for chunk in response.iter_content(16 * 1024): if time.time( ) - start > settings.SENTRY_SOURCE_FETCH_TIMEOUT: raise Timeout() outfile.write(chunk) cl += len(chunk) if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE: raise OverflowError() except Exception as exc: logger.debug('Unable to fetch %r', url, exc_info=True) if isinstance(exc, RestrictedIPAddress): error = { 'type': EventError.RESTRICTED_IP, 'url': expose_url(url), } elif isinstance(exc, SuspiciousOperation): error = { 'type': EventError.SECURITY_VIOLATION, 'url': expose_url(url), } elif isinstance(exc, (Timeout, ReadTimeout)): error = { 'type': EventError.FETCH_TIMEOUT, 'url': expose_url(url), 'timeout': settings.SENTRY_SOURCE_FETCH_TIMEOUT, } elif isinstance(exc, OverflowError): error = { 'type': EventError.FETCH_TOO_LARGE, 'url': expose_url(url), # We want size in megabytes to format nicely 'max_size': float(settings.SENTRY_SOURCE_FETCH_MAX_SIZE) / 1024 / 1024, } elif isinstance(exc, (RequestException, ZeroReturnError)): error = { 'type': EventError.FETCH_GENERIC_ERROR, 'value': six.text_type(type(exc)), 'url': expose_url(url), } else: logger.exception(six.text_type(exc)) error = { 'type': EventError.UNKNOWN_ERROR, 'url': expose_url(url), } # TODO(dcramer): we want to be less aggressive on disabling domains if domain_lock_enabled: cache.set(domain_key, error or '', 300) logger.warning('source.disabled', extra=error) raise CannotFetch(error) headers = {k.lower(): v for k, v in response.headers.items()} encoding = response.encoding body = None if return_body: body = outfile.getvalue() outfile.close() # we only want to close StringIO result = (headers, body, response.status_code, encoding) finally: if response is not None: response.close() if result[2] != 200: logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True) error = { 'type': EventError.FETCH_INVALID_HTTP_CODE, 'value': result[2], 'url': expose_url(url), } raise CannotFetch(error) return UrlResult(url, result[0], result[1], result[2], result[3])
def fetch_release_file(filename, release, dist=None):
    cache_key = 'releasefile:v1:%s:%s' % (
        release.id,
        md5_text(filename).hexdigest(),
    )

    logger.debug('Checking cache for release artifact %r (release_id=%s)',
                 filename, release.id)
    result = cache.get(cache_key)

    dist_name = dist and dist.name or None

    if result is None:
        filename_choices = ReleaseFile.normalize(filename)
        filename_idents = [
            ReleaseFile.get_ident(f, dist_name) for f in filename_choices
        ]

        logger.debug(
            'Checking database for release artifact %r (release_id=%s)',
            filename, release.id)

        possible_files = list(
            ReleaseFile.objects.filter(
                release=release,
                dist=dist,
                ident__in=filename_idents,
            ).select_related('file'))

        if len(possible_files) == 0:
            logger.debug(
                'Release artifact %r not found in database (release_id=%s)',
                filename, release.id)
            cache.set(cache_key, -1, 60)
            return None
        elif len(possible_files) == 1:
            releasefile = possible_files[0]
        else:
            # Pick first one that matches in priority order.
            # This is O(N*M) but there are only ever at most 4 things here
            # so not really worth optimizing.
            releasefile = next((rf for ident in filename_idents
                                for rf in possible_files if rf.ident == ident))

        logger.debug('Found release artifact %r (id=%s, release_id=%s)',
                     filename, releasefile.id, release.id)
        try:
            with metrics.timer('sourcemaps.release_file_read'):
                with releasefile.file.getfile() as fp:
                    z_body, body = compress_file(fp)
        except Exception:
            logger.error('sourcemap.compress_read_failed', exc_info=sys.exc_info())
            result = None
        else:
            headers = {
                k.lower(): v for k, v in releasefile.file.headers.items()
            }
            encoding = get_encoding_from_headers(headers)
            result = http.UrlResult(filename, headers, body, 200, encoding)
            cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

    elif result == -1:
        # We cached an error, so normalize
        # it down to None
        result = None
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None
        result = http.UrlResult(filename, result[0],
                                zlib.decompress(result[1]), result[2], encoding)

    return result
def _user_to_author_cache_key(organization_id, author):
    author_hash = md5_text(author.email.lower()).hexdigest()
    return f"get_users_for_authors:{organization_id}:{author_hash}"
def get_hash(self):
    value = self.ident or self.username or self.email or self.ip_address
    return md5_text(value).hexdigest()
def get_cache_key(cls, group_id, release_id, environment):
    return "grouprelease:1:{}:{}".format(
        group_id, md5_text(f"{release_id}:{environment}").hexdigest())
def _generate_cache_version(self):
    return md5_text('&'.join(
        sorted(f.attname for f in self.model._meta.fields))).hexdigest()[:3]
def fetch_file(url, project=None, release=None, allow_scraping=True): """ Pull down a URL, returning a UrlResult object. Attempts to fetch from the cache. """ if release: with metrics.timer('sourcemaps.release_file'): result = fetch_release_file(url, release) else: result = None cache_key = 'source:cache:v3:%s' % (md5_text(url).hexdigest(), ) if result is None: if not allow_scraping or not url.startswith(('http:', 'https:')): error = { 'type': EventError.JS_MISSING_SOURCE, 'url': expose_url(url), } raise CannotFetchSource(error) logger.debug('Checking cache for url %r', url) result = cache.get(cache_key) if result is not None: # Previous caches would be a 3-tuple instead of a 4-tuple, # so this is being maintained for backwards compatibility try: encoding = result[3] except IndexError: encoding = None # We got a cache hit, but the body is compressed, so we # need to decompress it before handing it off result = (result[0], zlib.decompress(result[1]), result[2], encoding) if result is None: # lock down domains that are problematic domain = urlparse(url).netloc domain_key = 'source:blacklist:v2:%s' % ( md5_text(domain).hexdigest(), ) domain_result = cache.get(domain_key) if domain_result: domain_result['url'] = url raise CannotFetchSource(domain_result) headers = {} if project and is_valid_origin(url, project=project): token = project.get_option('sentry:token') if token: headers['X-Sentry-Token'] = token logger.debug('Fetching %r from the internet', url) with metrics.timer('sourcemaps.fetch'): http_session = http.build_session() response = None try: try: start = time.time() response = http_session.get( url, allow_redirects=True, verify=False, headers=headers, timeout=settings.SENTRY_SOURCE_FETCH_SOCKET_TIMEOUT, stream=True, ) try: cl = int(response.headers['content-length']) except (LookupError, ValueError): cl = 0 if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE: raise OverflowError() contents = [] cl = 0 # Only need to even attempt to read the response body if we # got a 200 OK if response.status_code == 200: for chunk in response.iter_content(16 * 1024): if time.time( ) - start > settings.SENTRY_SOURCE_FETCH_TIMEOUT: raise Timeout() contents.append(chunk) cl += len(chunk) if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE: raise OverflowError() except Exception as exc: logger.debug('Unable to fetch %r', url, exc_info=True) if isinstance(exc, RestrictedIPAddress): error = { 'type': EventError.RESTRICTED_IP, 'url': expose_url(url), } elif isinstance(exc, SuspiciousOperation): error = { 'type': EventError.SECURITY_VIOLATION, 'url': expose_url(url), } elif isinstance(exc, Timeout): error = { 'type': EventError.JS_FETCH_TIMEOUT, 'url': expose_url(url), 'timeout': settings.SENTRY_SOURCE_FETCH_TIMEOUT, } elif isinstance(exc, OverflowError): error = { 'type': EventError.JS_TOO_LARGE, 'url': expose_url(url), # We want size in megabytes to format nicely 'max_size': float(settings.SENTRY_SOURCE_FETCH_MAX_SIZE) / 1024 / 1024, } elif isinstance(exc, (RequestException, ZeroReturnError)): error = { 'type': EventError.JS_GENERIC_FETCH_ERROR, 'value': six.text_type(type(exc)), 'url': expose_url(url), } else: logger.exception(six.text_type(exc)) error = { 'type': EventError.UNKNOWN_ERROR, 'url': expose_url(url), } # TODO(dcramer): we want to be less aggressive on disabling domains cache.set(domain_key, error or '', 300) logger.warning('Disabling sources to %s for %ss', domain, 300, exc_info=True) raise CannotFetchSource(error) body = b''.join(contents) z_body = zlib.compress(body) headers = {k.lower(): v for k, v 
in response.headers.items()} encoding = response.encoding cache.set(cache_key, (headers, z_body, response.status_code, encoding), 60) result = (headers, body, response.status_code, encoding) finally: if response is not None: response.close() if result[2] != 200: logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True) error = { 'type': EventError.JS_INVALID_HTTP_CODE, 'value': result[2], 'url': expose_url(url), } raise CannotFetchSource(error) # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML) # NOTE: possible to have JS files that don't actually end w/ ".js", but this should catch 99% of cases if url.endswith('.js'): # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<'). # This cannot parse as valid JS/JSON. # NOTE: not relying on Content-Type header because apps often don't set this correctly body_start = result[1][:20].lstrip( ) # Discard leading whitespace (often found before doctype) if body_start[:1] == u'<': error = { 'type': EventError.JS_INVALID_CONTENT, 'url': url, } raise CannotFetchSource(error) # Make sure the file we're getting back is six.binary_type. The only # reason it'd not be binary would be from old cached blobs, so # for compatibility with current cached files, let's coerce back to # binary and say utf8 encoding. if not isinstance(result[1], six.binary_type): try: result = (result[0], result[1].encode('utf8'), None) except UnicodeEncodeError: error = { 'type': EventError.JS_INVALID_SOURCE_ENCODING, 'value': 'utf8', 'url': expose_url(url), } raise CannotFetchSource(error) return UrlResult(url, result[0], result[1], result[3])
def cache_version(self) -> str:
    if self._cache_version is None:
        self._cache_version = md5_text("&".join(
            sorted(f.attname for f in self.model._meta.fields))).hexdigest()[:3]
    return self._cache_version
def fetch_file(url, domain_lock_enabled=True, outfile=None, headers=None, allow_redirects=True, verify_ssl=False, timeout=settings.SENTRY_SOURCE_FETCH_SOCKET_TIMEOUT, **kwargs): """ Pull down a URL, returning a UrlResult object. """ # lock down domains that are problematic if domain_lock_enabled: domain = urlparse(url).netloc domain_key = 'source:blacklist:v2:%s' % ( md5_text(domain).hexdigest(), ) domain_result = cache.get(domain_key) if domain_result: domain_result['url'] = url raise CannotFetch(domain_result) logger.debug('Fetching %r from the internet', url) http_session = build_session() response = None try: try: start = time.time() response = http_session.get( url, allow_redirects=allow_redirects, verify=verify_ssl, headers=headers, timeout=timeout, stream=True, **kwargs ) try: cl = int(response.headers['content-length']) except (LookupError, ValueError): cl = 0 if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE: raise OverflowError() return_body = False if outfile is None: outfile = six.BytesIO() return_body = True cl = 0 # Only need to even attempt to read the response body if we # got a 200 OK if response.status_code == 200: for chunk in response.iter_content(16 * 1024): if time.time() - start > settings.SENTRY_SOURCE_FETCH_TIMEOUT: raise Timeout() outfile.write(chunk) cl += len(chunk) if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE: raise OverflowError() except Exception as exc: logger.debug('Unable to fetch %r', url, exc_info=True) if isinstance(exc, RestrictedIPAddress): error = { 'type': EventError.RESTRICTED_IP, 'url': expose_url(url), } elif isinstance(exc, SuspiciousOperation): error = { 'type': EventError.SECURITY_VIOLATION, 'url': expose_url(url), } elif isinstance(exc, (Timeout, ReadTimeout)): error = { 'type': EventError.FETCH_TIMEOUT, 'url': expose_url(url), 'timeout': settings.SENTRY_SOURCE_FETCH_TIMEOUT, } elif isinstance(exc, OverflowError): error = { 'type': EventError.FETCH_TOO_LARGE, 'url': expose_url(url), # We want size in megabytes to format nicely 'max_size': float(settings.SENTRY_SOURCE_FETCH_MAX_SIZE) / 1024 / 1024, } elif isinstance(exc, (RequestException, ZeroReturnError)): error = { 'type': EventError.FETCH_GENERIC_ERROR, 'value': six.text_type(type(exc)), 'url': expose_url(url), } else: logger.exception(six.text_type(exc)) error = { 'type': EventError.UNKNOWN_ERROR, 'url': expose_url(url), } # TODO(dcramer): we want to be less aggressive on disabling domains if domain_lock_enabled: cache.set(domain_key, error or '', 300) logger.warning('source.disabled', extra=error) raise CannotFetch(error) headers = {k.lower(): v for k, v in response.headers.items()} encoding = response.encoding body = None if return_body: body = outfile.getvalue() outfile.close() # we only want to close StringIO result = (headers, body, response.status_code, encoding) finally: if response is not None: response.close() if result[2] != 200: logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True) error = { 'type': EventError.FETCH_INVALID_HTTP_CODE, 'value': result[2], 'url': expose_url(url), } raise CannotFetch(error) return UrlResult(url, result[0], result[1], result[2], result[3])
def fetch_release_file(filename, release):
    cache_key = 'releasefile:v1:%s:%s' % (
        release.id,
        md5_text(filename).hexdigest(),
    )

    filename_path = None
    if filename is not None:
        # Reconstruct url without protocol + host
        # e.g. http://example.com/foo?bar => ~/foo?bar
        parsed_url = urlparse(filename)
        filename_path = '~' + parsed_url.path
        if parsed_url.query:
            filename_path += '?' + parsed_url.query

    logger.debug('Checking cache for release artifact %r (release_id=%s)',
                 filename, release.id)
    result = cache.get(cache_key)

    if result is None:
        logger.debug(
            'Checking database for release artifact %r (release_id=%s)',
            filename, release.id)

        filename_idents = [ReleaseFile.get_ident(filename)]
        if filename_path is not None and filename_path != filename:
            filename_idents.append(ReleaseFile.get_ident(filename_path))

        possible_files = list(
            ReleaseFile.objects.filter(
                release=release,
                ident__in=filename_idents,
            ).select_related('file'))

        if len(possible_files) == 0:
            logger.debug(
                'Release artifact %r not found in database (release_id=%s)',
                filename, release.id)
            cache.set(cache_key, -1, 60)
            return None
        elif len(possible_files) == 1:
            releasefile = possible_files[0]
        else:
            # Prioritize releasefile that matches full url (w/ host)
            # over hostless releasefile
            target_ident = filename_idents[0]
            releasefile = next(
                (f for f in possible_files if f.ident == target_ident))

        logger.debug('Found release artifact %r (id=%s, release_id=%s)',
                     filename, releasefile.id, release.id)
        try:
            with metrics.timer('sourcemaps.release_file_read'):
                with releasefile.file.getfile() as fp:
                    z_body, body = compress_file(fp)
        except Exception as e:
            logger.exception(six.text_type(e))
            cache.set(cache_key, -1, 3600)
            result = None
        else:
            headers = {
                k.lower(): v for k, v in releasefile.file.headers.items()
            }
            encoding = get_encoding_from_headers(headers)
            result = (headers, body, 200, encoding)
            cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

    elif result == -1:
        # We cached an error, so normalize
        # it down to None
        result = None
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None
        result = (result[0], zlib.decompress(result[1]), result[2], encoding)

    return result
def fetch_file(url, project=None, release=None, allow_scraping=True): """ Pull down a URL, returning a UrlResult object. Attempts to fetch from the cache. """ if release: result = fetch_release_file(url, release) else: result = None cache_key = 'source:cache:v3:%s' % ( md5_text(url).hexdigest(), ) if result is None: if not allow_scraping or not url.startswith(('http:', 'https:')): error = { 'type': EventError.JS_MISSING_SOURCE, 'url': expose_url(url), } raise CannotFetchSource(error) logger.debug('Checking cache for url %r', url) result = cache.get(cache_key) if result is not None: # We got a cache hit, but the body is compressed, so we # need to decompress it before handing it off body = zlib.decompress(result[1]) result = (result[0], force_text(body), result[2]) if result is None: # lock down domains that are problematic domain = urlparse(url).netloc domain_key = 'source:blacklist:v2:%s' % ( md5_text(domain).hexdigest(), ) domain_result = cache.get(domain_key) if domain_result: domain_result['url'] = url raise CannotFetchSource(domain_result) headers = {} if project and is_valid_origin(url, project=project): token = project.get_option('sentry:token') if token: headers['X-Sentry-Token'] = token logger.debug('Fetching %r from the internet', url) http_session = http.build_session() try: response = http_session.get( url, allow_redirects=True, verify=False, headers=headers, timeout=settings.SENTRY_SOURCE_FETCH_TIMEOUT, ) except Exception as exc: logger.debug('Unable to fetch %r', url, exc_info=True) if isinstance(exc, RestrictedIPAddress): error = { 'type': EventError.RESTRICTED_IP, 'url': expose_url(url), } elif isinstance(exc, SuspiciousOperation): error = { 'type': EventError.SECURITY_VIOLATION, 'url': expose_url(url), } elif isinstance(exc, (RequestException, ZeroReturnError)): error = { 'type': EventError.JS_GENERIC_FETCH_ERROR, 'value': six.text_type(type(exc)), 'url': expose_url(url), } else: logger.exception(six.text_type(exc)) error = { 'type': EventError.UNKNOWN_ERROR, 'url': expose_url(url), } # TODO(dcramer): we want to be less aggressive on disabling domains cache.set(domain_key, error or '', 300) logger.warning('Disabling sources to %s for %ss', domain, 300, exc_info=True) raise CannotFetchSource(error) # requests' attempts to use chardet internally when no encoding is found # and we want to avoid that slow behavior if not response.encoding: response.encoding = 'utf-8' body = response.text z_body = zlib.compress(force_bytes(body)) headers = {k.lower(): v for k, v in response.headers.items()} cache.set(cache_key, (headers, z_body, response.status_code), 60) result = (headers, body, response.status_code) if result[2] != 200: logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True) error = { 'type': EventError.JS_INVALID_HTTP_CODE, 'value': result[2], 'url': expose_url(url), } raise CannotFetchSource(error) # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML) # NOTE: possible to have JS files that don't actually end w/ ".js", but this should catch 99% of cases if url.endswith('.js'): # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<'). # This cannot parse as valid JS/JSON. 
# NOTE: not relying on Content-Type header because apps often don't set this correctly body_start = result[1][:20].lstrip() # Discard leading whitespace (often found before doctype) if body_start[:1] == u'<': error = { 'type': EventError.JS_INVALID_CONTENT, 'url': url, } raise CannotFetchSource(error) # Make sure the file we're getting back is six.text_type, if it's not, # it's either some encoding that we don't understand, or it's binary # data which we can't process. if not isinstance(result[1], six.text_type): try: result = (result[0], result[1].decode('utf8'), result[2]) except UnicodeDecodeError: error = { 'type': EventError.JS_INVALID_SOURCE_ENCODING, 'value': 'utf8', 'url': expose_url(url), } raise CannotFetchSource(error) return UrlResult(url, result[0], result[1])
def _finish_login_pipeline(self, identity):
    """
    The login flow executes both with anonymous and authenticated users.

    Upon completion a few branches exist:

    If the identity is already linked, the user should be logged in
    and redirected immediately.

    Otherwise, the user is presented with a confirmation window. That window
    will show them the new account that will be created, and if they're
    already authenticated an optional button to associate the identity with
    their account.
    """
    auth_provider = self.auth_provider
    user_id = identity['id']

    lock = locks.get(
        u'sso:auth:{}:{}'.format(
            auth_provider.id,
            md5_text(user_id).hexdigest(),
        ),
        duration=5,
    )
    with TimedRetryPolicy(5)(lock.acquire):
        try:
            auth_identity = AuthIdentity.objects.select_related('user').get(
                auth_provider=auth_provider,
                ident=user_id,
            )
        except AuthIdentity.DoesNotExist:
            auth_identity = None

        # Handle migration of identity keys
        if not auth_identity and isinstance(user_id, MigratingIdentityId):
            try:
                auth_identity = AuthIdentity.objects.select_related('user').get(
                    auth_provider=auth_provider,
                    ident=user_id.legacy_id,
                )
                auth_identity.update(ident=user_id.id)
            except AuthIdentity.DoesNotExist:
                auth_identity = None

        if not auth_identity:
            return handle_unknown_identity(
                self.request,
                self.organization,
                self.auth_provider,
                self.provider,
                self.state,
                identity,
            )

        # If the User attached to this AuthIdentity is not active,
        # we want to clobber the old account and take it over, rather than
        # getting logged into the inactive account.
        if not auth_identity.user.is_active:
            # Current user is also not logged in, so we have to
            # assume unknown.
            if not self.request.user.is_authenticated():
                return handle_unknown_identity(
                    self.request,
                    self.organization,
                    self.auth_provider,
                    self.provider,
                    self.state,
                    identity,
                )

            auth_identity = handle_attach_identity(
                self.auth_provider,
                self.request,
                self.organization,
                self.provider,
                identity,
            )

        return handle_existing_identity(
            self.auth_provider,
            self.provider,
            self.organization,
            self.request,
            self.state,
            auth_identity,
            identity,
        )
def post(self, request: Request, organization=None, *args, **kwargs) -> Response:
    """
    Process a login request via username/password. SSO login is handled
    elsewhere.
    """
    login_form = AuthenticationForm(request, request.data)

    # Rate limit logins
    is_limited = ratelimiter.is_limited(
        "auth:login:username:{}".format(
            md5_text(login_form.clean_username(request.data.get("username"))).hexdigest()
        ),
        limit=10,
        window=60,  # 10 per minute should be enough for anyone
    )

    if is_limited:
        errors = {"__all__": [login_form.error_messages["rate_limited"]]}
        metrics.incr("login.attempt", instance="rate_limited", skip_internal=True, sample_rate=1.0)
        return self.respond_with_error(errors)

    if not login_form.is_valid():
        metrics.incr("login.attempt", instance="failure", skip_internal=True, sample_rate=1.0)
        return self.respond_with_error(login_form.errors)

    user = login_form.get_user()

    auth.login(request, user, organization_id=organization.id if organization else None)
    metrics.incr("login.attempt", instance="success", skip_internal=True, sample_rate=1.0)

    if not user.is_active:
        return Response({
            "nextUri": "/auth/reactivate/",
            "user": serialize(user, user, DetailedUserSerializer()),
        })

    active_org = self.get_active_organization(request)
    redirect_url = auth.get_org_redirect_url(request, active_org)

    return Response({
        "nextUri": auth.get_login_redirect(request, redirect_url),
        "user": serialize(user, user, DetailedUserSerializer()),
    })
def hash_from_tag(cls, value):
    return md5_text(value.split(':', 1)[-1]).hexdigest()
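A short illustration of the tag-value split; the 'id:123' value is hypothetical. Only the portion after the first ':' is hashed, and a value with no prefix is hashed as-is:

#   hash_from_tag('id:123') == md5_text('123').hexdigest()
#   hash_from_tag('123')    == md5_text('123').hexdigest()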
def get_cache_key(cls, project_id, version):
    return 'release:2:%s:%s' % (project_id, md5_text(version).hexdigest())
def build_hash(self):
    for key, value in self.iter_attributes():
        if value:
            return md5_text(value).hexdigest()
def get_cache_key(cls, organization_id, name):
    return 'env:2:%s:%s' % (organization_id, md5_text(name).hexdigest())
def get_cache_key(cls, group_id, release_id, environment):
    return 'grouprelease:1:{}:{}'.format(
        group_id,
        md5_text(u'{}:{}'.format(release_id, environment)).hexdigest(),
    )
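Example key shape, with hypothetical IDs:

#   get_cache_key(1, 7, 'production')
#   # -> 'grouprelease:1:1:' + md5_text(u'7:production').hexdigest()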
def build_hash(self):
    value = self.ident or self.username or self.email or self.ip_address
    if not value:
        return None
    return md5_text(value).hexdigest()
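The identity precedence here mirrors get_hash above: ident wins over username, then email, then ip_address, and unlike get_hash this variant returns None when no identifying field is set. A hypothetical example:

#   EventUser(email='a@example.com').build_hash() == md5_text('a@example.com').hexdigest()
#   EventUser().build_hash() is None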
def get_cache_key(cls, organization_id, version):
    return 'release:3:%s:%s' % (organization_id, md5_text(version).hexdigest())
def fetch_file(url, project=None, release=None, dist=None, allow_scraping=True): """ Pull down a URL, returning a UrlResult object. Attempts to fetch from the cache. """ # If our url has been truncated, it'd be impossible to fetch # so we check for this early and bail if url[-3:] == '...': raise http.CannotFetch( { 'type': EventError.JS_MISSING_SOURCE, 'url': http.expose_url(url), } ) if release: with metrics.timer('sourcemaps.release_file'): result = fetch_release_file(url, release, dist) else: result = None cache_key = 'source:cache:v4:%s' % (md5_text(url).hexdigest(), ) if result is None: if not allow_scraping or not url.startswith(('http:', 'https:')): error = { 'type': EventError.JS_MISSING_SOURCE, 'url': http.expose_url(url), } raise http.CannotFetch(error) logger.debug('Checking cache for url %r', url) result = cache.get(cache_key) if result is not None: # Previous caches would be a 3-tuple instead of a 4-tuple, # so this is being maintained for backwards compatibility try: encoding = result[4] except IndexError: encoding = None # We got a cache hit, but the body is compressed, so we # need to decompress it before handing it off result = http.UrlResult( result[0], result[1], zlib.decompress(result[2]), result[3], encoding ) if result is None: headers = {} verify_ssl = False if project and is_valid_origin(url, project=project): verify_ssl = bool(project.get_option('sentry:verify_ssl', False)) token = project.get_option('sentry:token') if token: token_header = project.get_option('sentry:token_header') or 'X-Sentry-Token' headers[token_header] = token with metrics.timer('sourcemaps.fetch'): result = http.fetch_file(url, headers=headers, verify_ssl=verify_ssl) z_body = zlib.compress(result.body) cache.set( cache_key, (url, result.headers, z_body, result.status, result.encoding), get_max_age(result.headers)) # If we did not get a 200 OK we just raise a cannot fetch here. if result.status != 200: raise http.CannotFetch( { 'type': EventError.FETCH_INVALID_HTTP_CODE, 'value': result.status, 'url': http.expose_url(url), } ) # Make sure the file we're getting back is six.binary_type. The only # reason it'd not be binary would be from old cached blobs, so # for compatibility with current cached files, let's coerce back to # binary and say utf8 encoding. if not isinstance(result.body, six.binary_type): try: result = http.UrlResult( result.url, result.headers, result.body.encode('utf8'), result.status, result.encoding ) except UnicodeEncodeError: error = { 'type': EventError.FETCH_INVALID_ENCODING, 'value': 'utf8', 'url': http.expose_url(url), } raise http.CannotFetch(error) # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML) # NOTE: possible to have JS files that don't actually end w/ ".js", but # this should catch 99% of cases if url.endswith('.js'): # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<'). # This cannot parse as valid JS/JSON. # NOTE: not relying on Content-Type header because apps often don't set this correctly # Discard leading whitespace (often found before doctype) body_start = result.body[:20].lstrip() if body_start[:1] == u'<': error = { 'type': EventError.JS_INVALID_CONTENT, 'url': url, } raise http.CannotFetch(error) return result
def fetch_release_file(filename, release):
    cache_key = 'releasefile:v1:%s:%s' % (
        release.id,
        md5_text(filename).hexdigest(),
    )

    filename_path = None
    if filename is not None:
        # Reconstruct url without protocol + host
        # e.g. http://example.com/foo?bar => ~/foo?bar
        parsed_url = urlparse(filename)
        filename_path = '~' + parsed_url.path
        if parsed_url.query:
            filename_path += '?' + parsed_url.query

    logger.debug('Checking cache for release artifact %r (release_id=%s)',
                 filename, release.id)
    result = cache.get(cache_key)

    if result is None:
        logger.debug('Checking database for release artifact %r (release_id=%s)',
                     filename, release.id)

        filename_idents = [ReleaseFile.get_ident(filename)]
        if filename_path is not None and filename_path != filename:
            filename_idents.append(ReleaseFile.get_ident(filename_path))

        possible_files = list(ReleaseFile.objects.filter(
            release=release,
            ident__in=filename_idents,
        ).select_related('file'))

        if len(possible_files) == 0:
            logger.debug('Release artifact %r not found in database (release_id=%s)',
                         filename, release.id)
            cache.set(cache_key, -1, 60)
            return None
        elif len(possible_files) == 1:
            releasefile = possible_files[0]
        else:
            # Prioritize releasefile that matches full url (w/ host)
            # over hostless releasefile
            target_ident = filename_idents[0]
            releasefile = next((f for f in possible_files if f.ident == target_ident))

        logger.debug('Found release artifact %r (id=%s, release_id=%s)',
                     filename, releasefile.id, release.id)
        try:
            with metrics.timer('sourcemaps.release_file_read'):
                with releasefile.file.getfile() as fp:
                    z_body, body = compress_file(fp)
        except Exception as e:
            logger.exception(six.text_type(e))
            cache.set(cache_key, -1, 3600)
            result = None
        else:
            headers = {k.lower(): v for k, v in releasefile.file.headers.items()}
            encoding = get_encoding_from_headers(headers)
            result = (headers, body, 200, encoding)
            cache.set(cache_key, (headers, z_body, 200, encoding), 3600)

    elif result == -1:
        # We cached an error, so normalize
        # it down to None
        result = None
    else:
        # Previous caches would be a 3-tuple instead of a 4-tuple,
        # so this is being maintained for backwards compatibility
        try:
            encoding = result[3]
        except IndexError:
            encoding = None
        result = (result[0], zlib.decompress(result[1]), result[2], encoding)

    return result
def _generate_cache_version(self):
    return md5_text("&".join(sorted(f.attname for f in self.model._meta.fields))).hexdigest()[:3]
def fetch_file(url, project=None, release=None, allow_scraping=True): """ Pull down a URL, returning a UrlResult object. Attempts to fetch from the cache. """ # If our url has been truncated, it'd be impossible to fetch # so we check for this early and bail if url[-3:] == '...': raise CannotFetchSource({ 'type': EventError.JS_MISSING_SOURCE, 'url': expose_url(url), }) if release: with metrics.timer('sourcemaps.release_file'): result = fetch_release_file(url, release) else: result = None cache_key = 'source:cache:v3:%s' % ( md5_text(url).hexdigest(), ) if result is None: if not allow_scraping or not url.startswith(('http:', 'https:')): error = { 'type': EventError.JS_MISSING_SOURCE, 'url': expose_url(url), } raise CannotFetchSource(error) logger.debug('Checking cache for url %r', url) result = cache.get(cache_key) if result is not None: # Previous caches would be a 3-tuple instead of a 4-tuple, # so this is being maintained for backwards compatibility try: encoding = result[3] except IndexError: encoding = None # We got a cache hit, but the body is compressed, so we # need to decompress it before handing it off result = (result[0], zlib.decompress(result[1]), result[2], encoding) if result is None: # lock down domains that are problematic domain = urlparse(url).netloc domain_key = 'source:blacklist:v2:%s' % ( md5_text(domain).hexdigest(), ) domain_result = cache.get(domain_key) if domain_result: domain_result['url'] = url raise CannotFetchSource(domain_result) headers = {} if project and is_valid_origin(url, project=project): token = project.get_option('sentry:token') if token: token_header = project.get_option( 'sentry:token_header', 'X-Sentry-Token', ) headers[token_header] = token logger.debug('Fetching %r from the internet', url) with metrics.timer('sourcemaps.fetch'): http_session = http.build_session() response = None try: try: start = time.time() response = http_session.get( url, allow_redirects=True, verify=False, headers=headers, timeout=settings.SENTRY_SOURCE_FETCH_SOCKET_TIMEOUT, stream=True, ) try: cl = int(response.headers['content-length']) except (LookupError, ValueError): cl = 0 if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE: raise OverflowError() contents = [] cl = 0 # Only need to even attempt to read the response body if we # got a 200 OK if response.status_code == 200: for chunk in response.iter_content(16 * 1024): if time.time() - start > settings.SENTRY_SOURCE_FETCH_TIMEOUT: raise Timeout() contents.append(chunk) cl += len(chunk) if cl > settings.SENTRY_SOURCE_FETCH_MAX_SIZE: raise OverflowError() except Exception as exc: logger.debug('Unable to fetch %r', url, exc_info=True) if isinstance(exc, RestrictedIPAddress): error = { 'type': EventError.RESTRICTED_IP, 'url': expose_url(url), } elif isinstance(exc, SuspiciousOperation): error = { 'type': EventError.SECURITY_VIOLATION, 'url': expose_url(url), } elif isinstance(exc, Timeout): error = { 'type': EventError.JS_FETCH_TIMEOUT, 'url': expose_url(url), 'timeout': settings.SENTRY_SOURCE_FETCH_TIMEOUT, } elif isinstance(exc, OverflowError): error = { 'type': EventError.JS_TOO_LARGE, 'url': expose_url(url), # We want size in megabytes to format nicely 'max_size': float(settings.SENTRY_SOURCE_FETCH_MAX_SIZE) / 1024 / 1024, } elif isinstance(exc, (RequestException, ZeroReturnError)): error = { 'type': EventError.JS_GENERIC_FETCH_ERROR, 'value': six.text_type(type(exc)), 'url': expose_url(url), } else: logger.exception(six.text_type(exc)) error = { 'type': EventError.UNKNOWN_ERROR, 'url': expose_url(url), } # TODO(dcramer): we 
want to be less aggressive on disabling domains cache.set(domain_key, error or '', 300) logger.warning('source.disabled', extra=error) raise CannotFetchSource(error) body = b''.join(contents) z_body = zlib.compress(body) headers = {k.lower(): v for k, v in response.headers.items()} encoding = response.encoding cache.set(cache_key, (headers, z_body, response.status_code, encoding), 60) result = (headers, body, response.status_code, encoding) finally: if response is not None: response.close() if result[2] != 200: logger.debug('HTTP %s when fetching %r', result[2], url, exc_info=True) error = { 'type': EventError.JS_INVALID_HTTP_CODE, 'value': result[2], 'url': expose_url(url), } raise CannotFetchSource(error) # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML) # NOTE: possible to have JS files that don't actually end w/ ".js", but this should catch 99% of cases if url.endswith('.js'): # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<'). # This cannot parse as valid JS/JSON. # NOTE: not relying on Content-Type header because apps often don't set this correctly body_start = result[1][:20].lstrip() # Discard leading whitespace (often found before doctype) if body_start[:1] == u'<': error = { 'type': EventError.JS_INVALID_CONTENT, 'url': url, } raise CannotFetchSource(error) # Make sure the file we're getting back is six.binary_type. The only # reason it'd not be binary would be from old cached blobs, so # for compatibility with current cached files, let's coerce back to # binary and say utf8 encoding. if not isinstance(result[1], six.binary_type): try: result = (result[0], result[1].encode('utf8'), None) except UnicodeEncodeError: error = { 'type': EventError.JS_INVALID_SOURCE_ENCODING, 'value': 'utf8', 'url': expose_url(url), } raise CannotFetchSource(error) return UrlResult(url, result[0], result[1], result[3])
def post_process(self, event, **kwargs):
    token = self.get_option('token', event.project)
    index = self.get_option('index', event.project)
    instance = self.get_option('instance', event.project)
    if not (token and index and instance):
        metrics.incr('integrations.splunk.forward-event.unconfigured', tags={
            'project_id': event.project_id,
            'organization_id': event.project.organization_id,
            'event_type': event.get_event_type(),
        })
        return

    if not instance.endswith('/services/collector'):
        instance = instance.rstrip('/') + '/services/collector'

    source = self.get_option('source', event.project) or 'sentry'

    rl_key = 'splunk:{}'.format(md5_text(token).hexdigest())
    # limit splunk to 50 requests/second
    if ratelimiter.is_limited(rl_key, limit=1000, window=1):
        metrics.incr('integrations.splunk.forward-event.rate-limited', tags={
            'project_id': event.project_id,
            'organization_id': event.project.organization_id,
            'event_type': event.get_event_type(),
        })
        return

    payload = {
        'time': int(event.datetime.strftime('%s')),
        'source': source,
        'index': index,
        'event': self.get_event_payload(event),
    }
    host = self.get_host_for_splunk(event)
    if host:
        payload['host'] = host

    session = http.build_session()
    try:
        # https://docs.splunk.com/Documentation/Splunk/7.2.3/Data/TroubleshootHTTPEventCollector
        resp = session.post(
            instance,
            json=payload,
            # Splunk cloud instances' certificates don't play nicely
            verify=False,
            headers={
                'Authorization': 'Splunk {}'.format(token)
            },
            timeout=5,
        )
        if resp.status_code != 200:
            raise SplunkError.from_response(resp)
    except Exception as exc:
        metric = 'integrations.splunk.forward-event.error'
        metrics.incr(metric, tags={
            'project_id': event.project_id,
            'organization_id': event.project.organization_id,
            'event_type': event.get_event_type(),
            'error_code': getattr(exc, 'code', None),
        })
        logger.info(
            metric,
            extra={
                'instance': instance,
                'project_id': event.project_id,
                'organization_id': event.project.organization_id,
            },
        )
        if isinstance(exc, ReadTimeout):
            # If we get a ReadTimeout we don't need to raise an error here.
            # Just log and return.
            return
        raise

    metrics.incr('integrations.splunk.forward-event.success', tags={
        'project_id': event.project_id,
        'organization_id': event.project.organization_id,
        'event_type': event.get_event_type(),
    })
def get_cache_key(cls, organization_id, version):
    return "release:3:%s:%s" % (organization_id, md5_text(version).hexdigest())