示例#1
0
 def tests(self):
     # Run get_filename_from_headers over every TEST_CASES entry: each key is
     # sent as the only Content-Disposition value, and the mapped value is
     # the result we expect back.
     for header_value, want in self.TEST_CASES.items():
         headers = {b'Content-Disposition': [header_value]}
         got = get_filename_from_headers(headers)
         failure_msg = "expected output for %s to be %s but was %s" % (
             header_value,
             want,
             got,
         )
         self.assertEqual(got, want, failure_msg)
示例#2
0
 def tests(self) -> None:
     # Run get_filename_from_headers over every TEST_CASES entry: each key is
     # sent as the only Content-Disposition value, and the mapped value is
     # the result we expect back.
     for hdr, expected in self.TEST_CASES.items():
         request_headers = {b"Content-Disposition": [hdr]}
         res = get_filename_from_headers(request_headers)
         failure_msg = f"expected output for {hdr!r} to be {expected} but was {res}"
         self.assertEqual(res, expected, failure_msg)
示例#3
0
 def tests(self):
     # Run get_filename_from_headers over every TEST_CASES entry: each key is
     # sent as the only Content-Disposition value, and the mapped value is
     # the result we expect back.
     for header_value, want in self.TEST_CASES.items():
         headers = {b"Content-Disposition": [header_value]}
         got = get_filename_from_headers(headers)
         failure_msg = "expected output for %s to be %s but was %s" % (
             header_value,
             want,
             got,
         )
         self.assertEqual(got, want, failure_msg)
示例#4
0
    def _download_url(self, url, user):
        """Download ``url`` into the URL-cache media store and record it.

        This is a generator: intermediate results are obtained with ``yield``
        and the final value is delivered through ``defer.returnValue``
        (presumably the method is wrapped by an inlineCallbacks-style
        decorator outside this view -- confirm).

        Args:
            url: The URL to fetch.
            user: Passed to ``store_local_media`` as ``user_id``.

        Returns:
            (via ``defer.returnValue``) a dict with the keys ``media_type``,
            ``media_length``, ``download_name``, ``created_ts``,
            ``filesystem_id``, ``filename``, ``uri``, ``response_code``,
            ``expires`` and ``etag``.

        Raises:
            SynapseError: 502 when the DNS lookup fails, 500 for any other
                download failure.  SynapseErrors raised by the HTTP client
                are passed through unchanged.
        """
        # TODO: we should probably honour robots.txt... except in practice
        # we're most likely being explicitly triggered by a human rather than a
        # bot, so are we really a robot?

        file_id = datetime.date.today().isoformat() + "_" + random_string(16)

        file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)

        with self.media_storage.store_into_file(file_info) as (f, fname, finish):
            try:
                # Let the logging framework do the interpolation lazily.
                logger.debug("Trying to get url '%s'", url)
                length, headers, uri, code = yield self.client.get_file(
                    url, output_stream=f, max_size=self.max_spider_size
                )
            except SynapseError:
                # Pass SynapseErrors through directly, so that the servlet
                # handler will return a SynapseError to the client instead of
                # blank data or a 500.
                raise
            except DNSLookupError:
                # DNS lookup returned no results
                # Note: This will also be the case if one of the resolved IP
                # addresses is blacklisted
                raise SynapseError(
                    502,
                    "DNS resolution failure during URL preview generation",
                    Codes.UNKNOWN,
                )
            except Exception as e:
                # FIXME: pass through 404s and other error messages nicely
                # (logger.warn is a deprecated alias of logger.warning)
                logger.warning("Error downloading %s: %r", url, e)

                raise SynapseError(
                    500,
                    "Failed to download content: %s"
                    % (traceback.format_exception_only(sys.exc_info()[0], e),),
                    Codes.UNKNOWN,
                )
            yield finish()

        try:
            if b"Content-Type" in headers:
                media_type = headers[b"Content-Type"][0].decode("ascii")
            else:
                media_type = "application/octet-stream"
            time_now_ms = self.clock.time_msec()

            download_name = get_filename_from_headers(headers)

            yield self.store.store_local_media(
                media_id=file_id,
                media_type=media_type,
                # Reuse the timestamp taken above so the stored value and the
                # returned "created_ts" always agree.
                time_now_ms=time_now_ms,
                upload_name=download_name,
                media_length=length,
                user_id=user,
                url_cache=url,
            )

        except Exception as e:
            logger.error("Error handling downloaded %s: %r", url, e)
            # TODO: we really ought to delete the downloaded file in this
            # case, since we won't have recorded it in the db, and will
            # therefore not expire it.
            raise

        # The response headers are keyed by bytes (see the Content-Type lookup
        # above), so the ETag must be looked up with a bytes key as well; a
        # str key never matches on Python 3 and the etag would always be None.
        if b"ETag" in headers:
            etag = headers[b"ETag"][0].decode("ascii")
        else:
            etag = None

        defer.returnValue({
            "media_type": media_type,
            "media_length": length,
            "download_name": download_name,
            "created_ts": time_now_ms,
            "filesystem_id": file_id,
            "filename": fname,
            "uri": uri,
            "response_code": code,
            # FIXME: we should calculate a proper expiration based on the
            # Cache-Control and Expire headers.  But for now, assume 1 hour.
            "expires": 60 * 60 * 1000,
            "etag": etag,
        })
    async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult:
        """
        Download a remote URL into a stream and summarise the response headers.

        Args:
             url: The URL to fetch.
             output_stream: The stream to write the content to.

        Returns:
            A DownloadResult built, in order, from: the media length, the URI
            reported by the HTTP client, the HTTP response code, the media
            type, the downloaded file name, the number of milliseconds the
            result is valid for, and the etag header (None when absent).

        Raises:
            SynapseError: 502 when the DNS lookup fails, 500 for any other
                download failure.  SynapseErrors raised by the HTTP client
                propagate unchanged.
        """

        try:
            logger.debug("Trying to get preview for url '%s'", url)
            request_headers = {"Accept-Language": self.url_preview_accept_language}
            media_length, response_headers, final_uri, status = await self.client.get_file(
                url,
                output_stream=output_stream,
                max_size=self.max_spider_size,
                headers=request_headers,
            )
        except SynapseError:
            # Already in the shape the servlet handler wants to report to the
            # client, so let it through rather than masking it as a 500.
            raise
        except DNSLookupError:
            # No DNS results. This is also what happens when one of the
            # resolved IP addresses is blacklisted.
            raise SynapseError(
                502,
                "DNS resolution failure during URL preview generation",
                Codes.UNKNOWN,
            )
        except Exception as exc:
            # FIXME: pass through 404s and other error messages nicely
            logger.warning("Error downloading %s: %r", url, exc)

            reason = traceback.format_exception_only(sys.exc_info()[0], exc)
            raise SynapseError(
                500,
                "Failed to download content: %s" % (reason,),
                Codes.UNKNOWN,
            )

        # Fall back to a generic binary type when the server did not say.
        media_type = (
            response_headers[b"Content-Type"][0].decode("ascii")
            if b"Content-Type" in response_headers
            else "application/octet-stream"
        )

        download_name = get_filename_from_headers(response_headers)

        # FIXME: we should calculate a proper expiration based on the
        # Cache-Control and Expire headers.  But for now, assume 1 hour.
        expires = ONE_HOUR

        if b"ETag" in response_headers:
            etag = response_headers[b"ETag"][0].decode("ascii")
        else:
            etag = None

        return DownloadResult(
            media_length, final_uri, status, media_type, download_name, expires, etag
        )
    async def _download_url(self, url: str, user):
        """Fetch the content to preview for ``url`` and record it as local media.

        When ``_get_oembed_url`` yields an endpoint for the URL, that endpoint
        is queried first. Its result may point at a different URL to download
        or supply HTML directly. If the oEmbed request raises OEmbedError the
        original URL is downloaded instead.

        Args:
            url: The URL being previewed.
            user: Passed to ``store_local_media`` as ``user_id``.

        Returns:
            A dict with the keys ``media_type``, ``media_length``,
            ``download_name``, ``created_ts``, ``filesystem_id``,
            ``filename``, ``uri``, ``response_code``, ``expires`` and
            ``etag``.

        Raises:
            SynapseError: 502 when the DNS lookup fails, 500 for any other
                download failure.  SynapseErrors raised by the HTTP client
                propagate unchanged.
        """
        # TODO: we should probably honour robots.txt... except in practice
        # we're most likely being explicitly triggered by a human rather than a
        # bot, so are we really a robot?

        date_prefix = datetime.date.today().isoformat()
        file_id = date_prefix + "_" + random_string(16)

        file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)

        # Prefer oEmbed whenever an endpoint is known for this URL.
        target_url = url  # type: Optional[str]
        oembed_url = self._get_oembed_url(url)
        if oembed_url:
            # oEmbed either redirects us to another URL or hands back HTML.
            try:
                embed = await self._get_oembed_content(oembed_url, url)
                if embed.url:
                    target_url = embed.url
                elif embed.html:
                    target_url = None
            except OEmbedError:
                # Fall back to an ordinary preview of the original URL.
                pass

        if not target_url:
            # Only reachable after an oEmbed request that produced HTML.
            assert embed.html is not None
            assert oembed_url is not None

            html_bytes = embed.html.encode("utf-8")
            with self.media_storage.store_into_file(file_info) as (
                out_file,
                fname,
                finish,
            ):
                out_file.write(html_bytes)
                await finish()

            media_type = "text/html"
            download_name = embed.title
            length = len(html_bytes)
            # If a specific cache age was not given, assume 1 hour.
            expires = embed.cache_age or ONE_HOUR
            uri = oembed_url
            code = 200
            etag = None
        else:
            with self.media_storage.store_into_file(file_info) as (
                out_file,
                fname,
                finish,
            ):
                try:
                    logger.debug("Trying to get preview for url '%s'", target_url)
                    request_headers = {
                        "Accept-Language": self.url_preview_accept_language
                    }
                    length, headers, uri, code = await self.client.get_file(
                        target_url,
                        output_stream=out_file,
                        max_size=self.max_spider_size,
                        headers=request_headers,
                    )
                except SynapseError:
                    # Already in the shape the servlet handler wants to report
                    # to the client, so let it through rather than masking it
                    # as a 500.
                    raise
                except DNSLookupError:
                    # No DNS results. This is also what happens when one of
                    # the resolved IP addresses is blacklisted.
                    raise SynapseError(
                        502,
                        "DNS resolution failure during URL preview generation",
                        Codes.UNKNOWN,
                    )
                except Exception as exc:
                    # FIXME: pass through 404s and other error messages nicely
                    logger.warning("Error downloading %s: %r", target_url, exc)

                    reason = traceback.format_exception_only(sys.exc_info()[0], exc)
                    raise SynapseError(
                        500,
                        "Failed to download content: %s" % (reason,),
                        Codes.UNKNOWN,
                    )
                await finish()

                # Fall back to a generic binary type when the server did not
                # say.
                media_type = (
                    headers[b"Content-Type"][0].decode("ascii")
                    if b"Content-Type" in headers
                    else "application/octet-stream"
                )

                download_name = get_filename_from_headers(headers)

                # FIXME: we should calculate a proper expiration based on the
                # Cache-Control and Expire headers.  But for now, assume 1 hour.
                expires = ONE_HOUR
                if b"ETag" in headers:
                    etag = headers[b"ETag"][0].decode("ascii")
                else:
                    etag = None

        try:
            time_now_ms = self.clock.time_msec()

            await self.store.store_local_media(
                media_id=file_id,
                media_type=media_type,
                time_now_ms=time_now_ms,
                upload_name=download_name,
                media_length=length,
                user_id=user,
                url_cache=url,
            )

        except Exception as exc:
            logger.error("Error handling downloaded %s: %r", url, exc)
            # TODO: we really ought to delete the downloaded file in this
            # case, since we won't have recorded it in the db, and will
            # therefore not expire it.
            raise

        preview_info = {
            "media_type": media_type,
            "media_length": length,
            "download_name": download_name,
            "created_ts": time_now_ms,
            "filesystem_id": file_id,
            "filename": fname,
            "uri": uri,
            "response_code": code,
            "expires": expires,
            "etag": etag,
        }
        return preview_info
    def _download_url(self, url, user):
        """Download ``url`` into the URL-cache media store and record it.

        This is a generator: intermediate results are obtained with ``yield``
        and the final value is delivered through ``defer.returnValue``
        (presumably the method is wrapped by an inlineCallbacks-style
        decorator outside this view -- confirm).

        Args:
            url: The URL to fetch.
            user: Passed to ``store_local_media`` as ``user_id``.

        Returns:
            (via ``defer.returnValue``) a dict with the keys ``media_type``,
            ``media_length``, ``download_name``, ``created_ts``,
            ``filesystem_id``, ``filename``, ``uri``, ``response_code``,
            ``expires`` and ``etag``.

        Raises:
            SynapseError: 502 when the DNS lookup fails, 500 for any other
                download failure.  SynapseErrors raised by the HTTP client
                are passed through unchanged.
        """
        # TODO: we should probably honour robots.txt... except in practice
        # we're most likely being explicitly triggered by a human rather than a
        # bot, so are we really a robot?

        file_id = datetime.date.today().isoformat() + '_' + random_string(16)

        file_info = FileInfo(
            server_name=None,
            file_id=file_id,
            url_cache=True,
        )

        with self.media_storage.store_into_file(file_info) as (f, fname, finish):
            try:
                # Let the logging framework do the interpolation lazily.
                logger.debug("Trying to get url '%s'", url)
                length, headers, uri, code = yield self.client.get_file(
                    url, output_stream=f, max_size=self.max_spider_size,
                )
            except SynapseError:
                # Pass SynapseErrors through directly, so that the servlet
                # handler will return a SynapseError to the client instead of
                # blank data or a 500.
                raise
            except DNSLookupError:
                # DNS lookup returned no results
                # Note: This will also be the case if one of the resolved IP
                # addresses is blacklisted
                raise SynapseError(
                    502, "DNS resolution failure during URL preview generation",
                    Codes.UNKNOWN
                )
            except Exception as e:
                # FIXME: pass through 404s and other error messages nicely
                # (logger.warn is a deprecated alias of logger.warning)
                logger.warning("Error downloading %s: %r", url, e)

                raise SynapseError(
                    500, "Failed to download content: %s" % (
                        traceback.format_exception_only(sys.exc_info()[0], e),
                    ),
                    Codes.UNKNOWN,
                )
            yield finish()

        try:
            if b"Content-Type" in headers:
                media_type = headers[b"Content-Type"][0].decode('ascii')
            else:
                media_type = "application/octet-stream"
            time_now_ms = self.clock.time_msec()

            download_name = get_filename_from_headers(headers)

            yield self.store.store_local_media(
                media_id=file_id,
                media_type=media_type,
                # Reuse the timestamp taken above so the stored value and the
                # returned "created_ts" always agree.
                time_now_ms=time_now_ms,
                upload_name=download_name,
                media_length=length,
                user_id=user,
                url_cache=url,
            )

        except Exception as e:
            logger.error("Error handling downloaded %s: %r", url, e)
            # TODO: we really ought to delete the downloaded file in this
            # case, since we won't have recorded it in the db, and will
            # therefore not expire it.
            raise

        # The response headers are keyed by bytes (see the Content-Type lookup
        # above), so the ETag must be looked up with a bytes key as well; a
        # str key never matches on Python 3 and the etag would always be None.
        if b"ETag" in headers:
            etag = headers[b"ETag"][0].decode('ascii')
        else:
            etag = None

        defer.returnValue({
            "media_type": media_type,
            "media_length": length,
            "download_name": download_name,
            "created_ts": time_now_ms,
            "filesystem_id": file_id,
            "filename": fname,
            "uri": uri,
            "response_code": code,
            # FIXME: we should calculate a proper expiration based on the
            # Cache-Control and Expire headers.  But for now, assume 1 hour.
            "expires": 60 * 60 * 1000,
            "etag": etag,
        })