Example #1
    def test_fetch_file(self):
        responses.add(
            responses.GET, 'http://example.com', body='foo bar', content_type='application/json'
        )

        temp = tempfile.TemporaryFile()
        result = http.fetch_file(url='http://example.com', domain_lock_enabled=False, outfile=temp)
        temp.seek(0)
        assert result.body is None
        assert temp.read() == 'foo bar'
        temp.close()
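The only substantive difference from Example #2 below is the expected read-back value: tempfile.TemporaryFile() opens the file in binary mode, so on Python 3 temp.read() returns bytes and the expectation has to be written as b"foo bar"; Example #1 presumably dates from a Python 2-era revision, where the plain-string comparison still passed. A minimal illustration of the Python 3 behavior:

import tempfile

with tempfile.TemporaryFile() as temp:  # opened as "w+b" (binary) by default
    temp.write(b"foo bar")
    temp.seek(0)
    assert temp.read() == b"foo bar"  # bytes, not str, on Python 3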
Example #2
def test_fetch_file():
    responses.add(
        responses.GET, "http://example.com", body="foo bar", content_type="application/json"
    )

    temp = tempfile.TemporaryFile()
    result = http.fetch_file(url="http://example.com", domain_lock_enabled=False, outfile=temp)
    temp.seek(0)
    assert result.body is None
    assert temp.read() == b"foo bar"
    temp.close()
Example #3
def test_fetch_file_brotli():
    body = brotli.compress(b"foo bar")
    responses.add(
        responses.GET,
        "http://example.com",
        body=body,
        content_type="application/json",
        adding_headers={"Content-Encoding": "br"},
    )

    temp = tempfile.TemporaryFile()
    result = http.fetch_file(url="http://example.com",
                             domain_lock_enabled=False,
                             outfile=temp)
    temp.seek(0)
    assert result.body is None
    assert temp.read() == b"foo bar"
    temp.close()
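All three tests stub the HTTP layer with the responses library, so http.fetch_file is answered from a canned response instead of the network; the assertions show that when an outfile is passed, the payload is written to that file and result.body is left as None. The excerpts omit the activation step, which in the original suite presumably comes from an @responses.activate decorator or a fixture. A self-contained sketch of the same mocking pattern, using plain requests rather than Sentry's http.fetch_file:

import requests
import responses

@responses.activate
def test_mocked_get():
    # Register a canned response; no real network I/O happens inside the test.
    responses.add(
        responses.GET, "http://example.com", body="foo bar", content_type="application/json"
    )

    resp = requests.get("http://example.com")
    assert resp.text == "foo bar"

test_mocked_get()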
Example #4
def fetch_file(url, project=None, release=None, dist=None, allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the cache.
    """
    # If our url has been truncated, it'd be impossible to fetch
    # so we check for this early and bail
    if url[-3:] == '...':
        raise http.CannotFetch(
            {
                'type': EventError.JS_MISSING_SOURCE,
                'url': http.expose_url(url),
            }
        )
    if release:
        with metrics.timer('sourcemaps.release_file'):
            result = fetch_release_file(url, release, dist)
    else:
        result = None

    cache_key = 'source:cache:v4:%s' % (md5_text(url).hexdigest(), )

    if result is None:
        if not allow_scraping or not url.startswith(('http:', 'https:')):
            error = {
                'type': EventError.JS_MISSING_SOURCE,
                'url': http.expose_url(url),
            }
            raise http.CannotFetch(error)

        logger.debug('Checking cache for url %r', url)
        result = cache.get(cache_key)
        if result is not None:
            # Previous cache entries were a 4-tuple without the trailing
            # encoding, so fall back to None for backwards compatibility
            try:
                encoding = result[4]
            except IndexError:
                encoding = None
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            result = http.UrlResult(
                result[0], result[1], zlib.decompress(result[2]), result[3], encoding
            )

    if result is None:
        headers = {}
        verify_ssl = False
        if project and is_valid_origin(url, project=project):
            verify_ssl = bool(project.get_option('sentry:verify_ssl', False))
            token = project.get_option('sentry:token')
            if token:
                token_header = project.get_option('sentry:token_header') or 'X-Sentry-Token'
                headers[token_header] = token

        with metrics.timer('sourcemaps.fetch'):
            result = http.fetch_file(url, headers=headers, verify_ssl=verify_ssl)
            z_body = zlib.compress(result.body)
            cache.set(
                cache_key,
                (url,
                 result.headers,
                 z_body,
                 result.status,
                 result.encoding),
                get_max_age(result.headers))

    # If we did not get a 200 OK we just raise a cannot fetch here.
    if result.status != 200:
        raise http.CannotFetch(
            {
                'type': EventError.FETCH_INVALID_HTTP_CODE,
                'value': result.status,
                'url': http.expose_url(url),
            }
        )

    # Make sure the file we're getting back is six.binary_type. The only
    # reason it'd not be binary would be from old cached blobs, so
    # for compatibility with current cached files, let's coerce back to
    # binary and say utf8 encoding.
    if not isinstance(result.body, six.binary_type):
        try:
            result = http.UrlResult(
                result.url, result.headers,
                result.body.encode('utf8'), result.status, result.encoding
            )
        except UnicodeEncodeError:
            error = {
                'type': EventError.FETCH_INVALID_ENCODING,
                'value': 'utf8',
                'url': http.expose_url(url),
            }
            raise http.CannotFetch(error)

    # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML)
    # NOTE: possible to have JS files that don't actually end w/ ".js", but
    # this should catch 99% of cases
    if url.endswith('.js'):
        # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<').
        # This cannot parse as valid JS/JSON.
        # NOTE: not relying on Content-Type header because apps often don't set this correctly
        # Discard leading whitespace (often found before doctype)
        body_start = result.body[:20].lstrip()

        if body_start[:1] == u'<':
            error = {
                'type': EventError.JS_INVALID_CONTENT,
                'url': url,
            }
            raise http.CannotFetch(error)

    return result
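The scraping branch above caches each fetch under a versioned key built from the md5 of the URL, and it stores the body zlib-compressed inside a 5-tuple of (url, headers, body, status, encoding); on a cache hit the body is decompressed before the UrlResult is rebuilt. A minimal sketch of that round-trip, with a plain dict standing in for the real cache backend and a hypothetical namedtuple standing in for http.UrlResult:

import hashlib
import zlib
from collections import namedtuple

UrlResult = namedtuple("UrlResult", "url headers body status encoding")  # stand-in
cache = {}  # stand-in for the real cache backend

def cache_key(url):
    # Versioned key derived from the md5 of the URL, as in the function above.
    return "source:cache:v4:%s" % hashlib.md5(url.encode("utf-8")).hexdigest()

def store(result):
    # Only the body is zlib-compressed; the rest of the tuple is stored as-is.
    cache[cache_key(result.url)] = (
        result.url, result.headers, zlib.compress(result.body), result.status, result.encoding
    )

def load(url):
    hit = cache.get(cache_key(url))
    if hit is None:
        return None
    url, headers, z_body, status, encoding = hit
    # Decompress the body before rebuilding the result, as the cache-hit branch does.
    return UrlResult(url, headers, zlib.decompress(z_body), status, encoding)

store(UrlResult("http://example.com/app.js", {}, b"console.log(1)", 200, None))
assert load("http://example.com/app.js").body == b"console.log(1)"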
Example #5
def fetch_file(url,
               project=None,
               release=None,
               dist=None,
               allow_scraping=True):
    """
    Pull down a URL, returning a UrlResult object.

    Attempts to fetch from the database first (assuming there's a release on the
    event), then the internet. Caches the result of each of those two attempts
    separately, whether or not those attempts are successful. Used for both
    source files and source maps.
    """

    # If our url has been truncated, it'd be impossible to fetch
    # so we check for this early and bail
    if url[-3:] == "...":
        raise http.CannotFetch({
            "type": EventError.JS_MISSING_SOURCE,
            "url": http.expose_url(url)
        })

    # if we've got a release to look in, try that first (including its associated cache)
    if release:
        with metrics.timer("sourcemaps.release_file"):
            result = fetch_release_file(url, release, dist)
    else:
        result = None

    # otherwise, try the web-scraping cache and then the web itself

    cache_key = f"source:cache:v4:{md5_text(url).hexdigest()}"

    if result is None:
        if not allow_scraping or not url.startswith(("http:", "https:")):
            error = {
                "type": EventError.JS_MISSING_SOURCE,
                "url": http.expose_url(url)
            }
            raise http.CannotFetch(error)

        logger.debug("Checking cache for url %r", url)
        result = cache.get(cache_key)
        if result is not None:
            # Previous cache entries were a 4-tuple without the trailing
            # encoding, so fall back to None for backwards compatibility
            try:
                encoding = result[4]
            except IndexError:
                encoding = None
            # We got a cache hit, but the body is compressed, so we
            # need to decompress it before handing it off
            result = http.UrlResult(result[0], result[1],
                                    zlib.decompress(result[2]), result[3],
                                    encoding)

    if result is None:
        headers = {}
        verify_ssl = False
        if project and is_valid_origin(url, project=project):
            verify_ssl = bool(project.get_option("sentry:verify_ssl", False))
            token = project.get_option("sentry:token")
            if token:
                token_header = project.get_option(
                    "sentry:token_header") or "X-Sentry-Token"
                headers[token_header] = token

        with metrics.timer("sourcemaps.fetch"):
            result = http.fetch_file(url,
                                     headers=headers,
                                     verify_ssl=verify_ssl)
            z_body = zlib.compress(result.body)
            cache.set(
                cache_key,
                (url, result.headers, z_body, result.status, result.encoding),
                get_max_age(result.headers),
            )

            # The cache.set above can fail silently when the file is too
            # large for the cache. In that case we abort the fetch, record
            # the error, and lock the domain against future http fetches.
            if cache.get(cache_key) is None:
                error = {
                    "type": EventError.TOO_LARGE_FOR_CACHE,
                    "url": http.expose_url(url),
                }
                http.lock_domain(url, error=error)
                raise http.CannotFetch(error)

    # If we did not get a 200 OK we just raise a cannot fetch here.
    if result.status != 200:
        raise http.CannotFetch({
            "type": EventError.FETCH_INVALID_HTTP_CODE,
            "value": result.status,
            "url": http.expose_url(url),
        })

    # Make sure the file we're getting back is bytes. The only
    # reason it'd not be binary would be from old cached blobs, so
    # for compatibility with current cached files, let's coerce back to
    # binary and say utf8 encoding.
    if not isinstance(result.body, bytes):
        try:
            result = http.UrlResult(
                result.url,
                result.headers,
                result.body.encode("utf8"),
                result.status,
                result.encoding,
            )
        except UnicodeEncodeError:
            error = {
                "type": EventError.FETCH_INVALID_ENCODING,
                "value": "utf8",
                "url": http.expose_url(url),
            }
            raise http.CannotFetch(error)

    # For JavaScript files, check if content is something other than JavaScript/JSON (i.e. HTML)
    # NOTE: possible to have JS files that don't actually end w/ ".js", but
    # this should catch 99% of cases
    if urlsplit(url).path.endswith(".js"):
        # Check if response is HTML by looking if the first non-whitespace character is an open tag ('<').
        # This cannot parse as valid JS/JSON.
        # NOTE: not relying on Content-Type header because apps often don't set this correctly
        # Discard leading whitespace (often found before doctype)
        body_start = result.body[:20].lstrip()

        if body_start[:1] == b"<":
            error = {"type": EventError.JS_INVALID_CONTENT, "url": url}
            raise http.CannotFetch(error)

    return result
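The final guard is a cheap HTML sniff for JavaScript URLs: instead of trusting the Content-Type header, it strips leading whitespace from the first 20 bytes of the body and treats a leading "<" as HTML (typically an error page or index document served in place of the script). This version also tests urlsplit(url).path rather than the raw URL, so a query string after ".js" does not defeat the check. A standalone sketch of the heuristic:

def looks_like_html(body):
    # An HTML document (error page, SPA index, doctype, ...) cannot parse as
    # JS/JSON, and its first non-whitespace byte is an opening tag.
    return body[:20].lstrip()[:1] == b"<"

assert looks_like_html(b"  \n<!DOCTYPE html><html>")
assert not looks_like_html(b"!function(e,t){return 1}()")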