Пример #1
0
    def test_time_eviction(self):
        clock = MockClock()
        cache = ExpiringCache("test", clock, expiry_ms=1000)
        cache.start()

        cache["key"] = 1
        clock.advance_time(0.5)
        cache["key2"] = 2

        self.assertEquals(cache.get("key"), 1)
        self.assertEquals(cache.get("key2"), 2)

        clock.advance_time(0.9)
        self.assertEquals(cache.get("key"), None)
        self.assertEquals(cache.get("key2"), 2)

        clock.advance_time(1)
        self.assertEquals(cache.get("key"), None)
        self.assertEquals(cache.get("key2"), None)
Пример #2
0
    def test_time_eviction(self):
        clock = MockClock()
        cache = ExpiringCache("test", clock, expiry_ms=1000)
        cache.start()

        cache["key"] = 1
        clock.advance_time(0.5)
        cache["key2"] = 2

        self.assertEquals(cache.get("key"), 1)
        self.assertEquals(cache.get("key2"), 2)

        clock.advance_time(0.9)
        self.assertEquals(cache.get("key"), None)
        self.assertEquals(cache.get("key2"), 2)

        clock.advance_time(1)
        self.assertEquals(cache.get("key"), None)
        self.assertEquals(cache.get("key2"), None)
class PreviewUrlResource(Resource):
    isLeaf = True

    def __init__(self, hs, media_repo):
        Resource.__init__(self)

        self.auth = hs.get_auth()
        self.clock = hs.get_clock()
        self.version_string = hs.version_string
        self.filepaths = media_repo.filepaths
        self.max_spider_size = hs.config.max_spider_size
        self.server_name = hs.hostname
        self.store = hs.get_datastore()
        self.client = SpiderHttpClient(hs)
        self.media_repo = media_repo

        self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist

        # simple memory cache mapping urls to OG metadata
        self.cache = ExpiringCache(
            cache_name="url_previews",
            clock=self.clock,
            # don't spider URLs more often than once an hour
            expiry_ms=60 * 60 * 1000,
        )
        self.cache.start()

        self.downloads = {}

    def render_GET(self, request):
        self._async_render_GET(request)
        return NOT_DONE_YET

    @request_handler()
    @defer.inlineCallbacks
    def _async_render_GET(self, request):

        # XXX: if get_user_by_req fails, what should we do in an async render?
        requester = yield self.auth.get_user_by_req(request)
        url = request.args.get("url")[0]
        if "ts" in request.args:
            ts = int(request.args.get("ts")[0])
        else:
            ts = self.clock.time_msec()

        url_tuple = urlparse.urlsplit(url)
        for entry in self.url_preview_url_blacklist:
            match = True
            for attrib in entry:
                pattern = entry[attrib]
                value = getattr(url_tuple, attrib)
                logger.debug(("Matching attrib '%s' with value '%s' against"
                              " pattern '%s'") % (attrib, value, pattern))

                if value is None:
                    match = False
                    continue

                if pattern.startswith('^'):
                    if not re.match(pattern, getattr(url_tuple, attrib)):
                        match = False
                        continue
                else:
                    if not fnmatch.fnmatch(getattr(url_tuple, attrib),
                                           pattern):
                        match = False
                        continue
            if match:
                logger.warn("URL %s blocked by url_blacklist entry %s", url,
                            entry)
                raise SynapseError(
                    403, "URL blocked by url pattern blacklist entry",
                    Codes.UNKNOWN)

        # first check the memory cache - good to handle all the clients on this
        # HS thundering away to preview the same URL at the same time.
        og = self.cache.get(url)
        if og:
            respond_with_json_bytes(request,
                                    200,
                                    json.dumps(og),
                                    send_cors=True)
            return

        # then check the URL cache in the DB (which will also provide us with
        # historical previews, if we have any)
        cache_result = yield self.store.get_url_cache(url, ts)
        if (cache_result
                and cache_result["download_ts"] + cache_result["expires"] > ts
                and cache_result["response_code"] / 100 == 2):
            respond_with_json_bytes(request,
                                    200,
                                    cache_result["og"].encode('utf-8'),
                                    send_cors=True)
            return

        # Ensure only one download for a given URL is active at a time
        download = self.downloads.get(url)
        if download is None:
            download = self._download_url(url, requester.user)
            download = ObservableDeferred(download, consumeErrors=True)
            self.downloads[url] = download

            @download.addBoth
            def callback(media_info):
                del self.downloads[url]
                return media_info

        media_info = yield download.observe()

        # FIXME: we should probably update our cache now anyway, so that
        # even if the OG calculation raises, we don't keep hammering on the
        # remote server.  For now, leave it uncached to aid debugging OG
        # calculation problems

        logger.debug("got media_info of '%s'" % media_info)

        if _is_media(media_info['media_type']):
            dims = yield self.media_repo._generate_local_thumbnails(
                media_info['filesystem_id'],
                media_info,
                url_cache=True,
            )

            og = {
                "og:description":
                media_info['download_name'],
                "og:image":
                "mxc://%s/%s" %
                (self.server_name, media_info['filesystem_id']),
                "og:image:type":
                media_info['media_type'],
                "matrix:image:size":
                media_info['media_length'],
            }

            if dims:
                og["og:image:width"] = dims['width']
                og["og:image:height"] = dims['height']
            else:
                logger.warn("Couldn't get dims for %s" % url)

            # define our OG response for this media
        elif _is_html(media_info['media_type']):
            # TODO: somehow stop a big HTML tree from exploding synapse's RAM

            file = open(media_info['filename'])
            body = file.read()
            file.close()

            # clobber the encoding from the content-type, or default to utf-8
            # XXX: this overrides any <meta/> or XML charset headers in the body
            # which may pose problems, but so far seems to work okay.
            match = re.match(r'.*; *charset=(.*?)(;|$)',
                             media_info['media_type'], re.I)
            encoding = match.group(1) if match else "utf-8"

            og = decode_and_calc_og(body, media_info['uri'], encoding)

            # pre-cache the image for posterity
            # FIXME: it might be cleaner to use the same flow as the main /preview_url
            # request itself and benefit from the same caching etc.  But for now we
            # just rely on the caching on the master request to speed things up.
            if 'og:image' in og and og['og:image']:
                image_info = yield self._download_url(
                    _rebase_url(og['og:image'], media_info['uri']),
                    requester.user)

                if _is_media(image_info['media_type']):
                    # TODO: make sure we don't choke on white-on-transparent images
                    dims = yield self.media_repo._generate_local_thumbnails(
                        image_info['filesystem_id'],
                        image_info,
                        url_cache=True,
                    )
                    if dims:
                        og["og:image:width"] = dims['width']
                        og["og:image:height"] = dims['height']
                    else:
                        logger.warn("Couldn't get dims for %s" %
                                    og["og:image"])

                    og["og:image"] = "mxc://%s/%s" % (
                        self.server_name, image_info['filesystem_id'])
                    og["og:image:type"] = image_info['media_type']
                    og["matrix:image:size"] = image_info['media_length']
                else:
                    del og["og:image"]
        else:
            logger.warn("Failed to find any OG data in %s", url)
            og = {}

        logger.debug("Calculated OG for %s as %s" % (url, og))

        # store OG in ephemeral in-memory cache
        self.cache[url] = og

        # store OG in history-aware DB cache
        yield self.store.store_url_cache(
            url,
            media_info["response_code"],
            media_info["etag"],
            media_info["expires"],
            json.dumps(og),
            media_info["filesystem_id"],
            media_info["created_ts"],
        )

        respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)

    @defer.inlineCallbacks
    def _download_url(self, url, user):
        # TODO: we should probably honour robots.txt... except in practice
        # we're most likely being explicitly triggered by a human rather than a
        # bot, so are we really a robot?

        # XXX: horrible duplication with base_resource's _download_remote_file()
        file_id = random_string(24)

        fname = self.filepaths.url_cache_filepath(file_id)
        self.media_repo._makedirs(fname)

        try:
            with open(fname, "wb") as f:
                logger.debug("Trying to get url '%s'" % url)
                length, headers, uri, code = yield self.client.get_file(
                    url,
                    output_stream=f,
                    max_size=self.max_spider_size,
                )
                # FIXME: pass through 404s and other error messages nicely

            media_type = headers["Content-Type"][0]
            time_now_ms = self.clock.time_msec()

            content_disposition = headers.get("Content-Disposition", None)
            if content_disposition:
                _, params = cgi.parse_header(content_disposition[0], )
                download_name = None

                # First check if there is a valid UTF-8 filename
                download_name_utf8 = params.get("filename*", None)
                if download_name_utf8:
                    if download_name_utf8.lower().startswith("utf-8''"):
                        download_name = download_name_utf8[7:]

                # If there isn't check for an ascii name.
                if not download_name:
                    download_name_ascii = params.get("filename", None)
                    if download_name_ascii and is_ascii(download_name_ascii):
                        download_name = download_name_ascii

                if download_name:
                    download_name = urlparse.unquote(download_name)
                    try:
                        download_name = download_name.decode("utf-8")
                    except UnicodeDecodeError:
                        download_name = None
            else:
                download_name = None

            yield self.store.store_local_media(
                media_id=file_id,
                media_type=media_type,
                time_now_ms=self.clock.time_msec(),
                upload_name=download_name,
                media_length=length,
                user_id=user,
                url_cache=url,
            )

        except Exception as e:
            os.remove(fname)
            raise SynapseError(500, ("Failed to download content: %s" % e),
                               Codes.UNKNOWN)

        defer.returnValue({
            "media_type":
            media_type,
            "media_length":
            length,
            "download_name":
            download_name,
            "created_ts":
            time_now_ms,
            "filesystem_id":
            file_id,
            "filename":
            fname,
            "uri":
            uri,
            "response_code":
            code,
            # FIXME: we should calculate a proper expiration based on the
            # Cache-Control and Expire headers.  But for now, assume 1 hour.
            "expires":
            60 * 60 * 1000,
            "etag":
            headers["ETag"][0] if "ETag" in headers else None,
        })
Пример #4
0
class PreviewUrlResource(Resource):
    isLeaf = True

    def __init__(self, hs, media_repo):
        Resource.__init__(self)

        self.auth = hs.get_auth()
        self.clock = hs.get_clock()
        self.version_string = hs.version_string
        self.filepaths = media_repo.filepaths
        self.max_spider_size = hs.config.max_spider_size
        self.server_name = hs.hostname
        self.store = hs.get_datastore()
        self.client = SpiderHttpClient(hs)
        self.media_repo = media_repo

        self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist

        # simple memory cache mapping urls to OG metadata
        self.cache = ExpiringCache(
            cache_name="url_previews",
            clock=self.clock,
            # don't spider URLs more often than once an hour
            expiry_ms=60 * 60 * 1000,
        )
        self.cache.start()

        self.downloads = {}

    def render_GET(self, request):
        self._async_render_GET(request)
        return NOT_DONE_YET

    @request_handler()
    @defer.inlineCallbacks
    def _async_render_GET(self, request):

        # XXX: if get_user_by_req fails, what should we do in an async render?
        requester = yield self.auth.get_user_by_req(request)
        url = request.args.get("url")[0]
        if "ts" in request.args:
            ts = int(request.args.get("ts")[0])
        else:
            ts = self.clock.time_msec()

        url_tuple = urlparse.urlsplit(url)
        for entry in self.url_preview_url_blacklist:
            match = True
            for attrib in entry:
                pattern = entry[attrib]
                value = getattr(url_tuple, attrib)
                logger.debug((
                    "Matching attrib '%s' with value '%s' against"
                    " pattern '%s'"
                ) % (attrib, value, pattern))

                if value is None:
                    match = False
                    continue

                if pattern.startswith('^'):
                    if not re.match(pattern, getattr(url_tuple, attrib)):
                        match = False
                        continue
                else:
                    if not fnmatch.fnmatch(getattr(url_tuple, attrib), pattern):
                        match = False
                        continue
            if match:
                logger.warn(
                    "URL %s blocked by url_blacklist entry %s", url, entry
                )
                raise SynapseError(
                    403, "URL blocked by url pattern blacklist entry",
                    Codes.UNKNOWN
                )

        # first check the memory cache - good to handle all the clients on this
        # HS thundering away to preview the same URL at the same time.
        og = self.cache.get(url)
        if og:
            respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
            return

        # then check the URL cache in the DB (which will also provide us with
        # historical previews, if we have any)
        cache_result = yield self.store.get_url_cache(url, ts)
        if (
            cache_result and
            cache_result["download_ts"] + cache_result["expires"] > ts and
            cache_result["response_code"] / 100 == 2
        ):
            respond_with_json_bytes(
                request, 200, cache_result["og"].encode('utf-8'),
                send_cors=True
            )
            return

        # Ensure only one download for a given URL is active at a time
        download = self.downloads.get(url)
        if download is None:
            download = self._download_url(url, requester.user)
            download = ObservableDeferred(
                download,
                consumeErrors=True
            )
            self.downloads[url] = download

            @download.addBoth
            def callback(media_info):
                del self.downloads[url]
                return media_info
        media_info = yield download.observe()

        # FIXME: we should probably update our cache now anyway, so that
        # even if the OG calculation raises, we don't keep hammering on the
        # remote server.  For now, leave it uncached to aid debugging OG
        # calculation problems

        logger.debug("got media_info of '%s'" % media_info)

        if self._is_media(media_info['media_type']):
            dims = yield self.media_repo._generate_local_thumbnails(
                media_info['filesystem_id'], media_info
            )

            og = {
                "og:description": media_info['download_name'],
                "og:image": "mxc://%s/%s" % (
                    self.server_name, media_info['filesystem_id']
                ),
                "og:image:type": media_info['media_type'],
                "matrix:image:size": media_info['media_length'],
            }

            if dims:
                og["og:image:width"] = dims['width']
                og["og:image:height"] = dims['height']
            else:
                logger.warn("Couldn't get dims for %s" % url)

            # define our OG response for this media
        elif self._is_html(media_info['media_type']):
            # TODO: somehow stop a big HTML tree from exploding synapse's RAM

            from lxml import etree

            file = open(media_info['filename'])
            body = file.read()
            file.close()

            # clobber the encoding from the content-type, or default to utf-8
            # XXX: this overrides any <meta/> or XML charset headers in the body
            # which may pose problems, but so far seems to work okay.
            match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
            encoding = match.group(1) if match else "utf-8"

            try:
                parser = etree.HTMLParser(recover=True, encoding=encoding)
                tree = etree.fromstring(body, parser)
                og = yield self._calc_og(tree, media_info, requester)
            except UnicodeDecodeError:
                # blindly try decoding the body as utf-8, which seems to fix
                # the charset mismatches on https://google.com
                parser = etree.HTMLParser(recover=True, encoding=encoding)
                tree = etree.fromstring(body.decode('utf-8', 'ignore'), parser)
                og = yield self._calc_og(tree, media_info, requester)

        else:
            logger.warn("Failed to find any OG data in %s", url)
            og = {}

        logger.debug("Calculated OG for %s as %s" % (url, og))

        # store OG in ephemeral in-memory cache
        self.cache[url] = og

        # store OG in history-aware DB cache
        yield self.store.store_url_cache(
            url,
            media_info["response_code"],
            media_info["etag"],
            media_info["expires"],
            json.dumps(og),
            media_info["filesystem_id"],
            media_info["created_ts"],
        )

        respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)

    @defer.inlineCallbacks
    def _calc_og(self, tree, media_info, requester):
        # suck our tree into lxml and define our OG response.

        # if we see any image URLs in the OG response, then spider them
        # (although the client could choose to do this by asking for previews of those
        # URLs to avoid DoSing the server)

        # "og:type"         : "video",
        # "og:url"          : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
        # "og:site_name"    : "YouTube",
        # "og:video:type"   : "application/x-shockwave-flash",
        # "og:description"  : "Fun stuff happening here",
        # "og:title"        : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon",
        # "og:image"        : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg",
        # "og:video:url"    : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
        # "og:video:width"  : "1280"
        # "og:video:height" : "720",
        # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",

        og = {}
        for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
            if 'content' in tag.attrib:
                og[tag.attrib['property']] = tag.attrib['content']

        # TODO: grab article: meta tags too, e.g.:

        # "article:publisher" : "https://www.facebook.com/thethudonline" />
        # "article:author" content="https://www.facebook.com/thethudonline" />
        # "article:tag" content="baby" />
        # "article:section" content="Breaking News" />
        # "article:published_time" content="2016-03-31T19:58:24+00:00" />
        # "article:modified_time" content="2016-04-01T18:31:53+00:00" />

        if 'og:title' not in og:
            # do some basic spidering of the HTML
            title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
            og['og:title'] = title[0].text.strip() if title else None

        if 'og:image' not in og:
            # TODO: extract a favicon failing all else
            meta_image = tree.xpath(
                "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
            )
            if meta_image:
                og['og:image'] = self._rebase_url(meta_image[0], media_info['uri'])
            else:
                # TODO: consider inlined CSS styles as well as width & height attribs
                images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
                images = sorted(images, key=lambda i: (
                    -1 * float(i.attrib['width']) * float(i.attrib['height'])
                ))
                if not images:
                    images = tree.xpath("//img[@src]")
                if images:
                    og['og:image'] = images[0].attrib['src']

        # pre-cache the image for posterity
        # FIXME: it might be cleaner to use the same flow as the main /preview_url
        # request itself and benefit from the same caching etc.  But for now we
        # just rely on the caching on the master request to speed things up.
        if 'og:image' in og and og['og:image']:
            image_info = yield self._download_url(
                self._rebase_url(og['og:image'], media_info['uri']), requester.user
            )

            if self._is_media(image_info['media_type']):
                # TODO: make sure we don't choke on white-on-transparent images
                dims = yield self.media_repo._generate_local_thumbnails(
                    image_info['filesystem_id'], image_info
                )
                if dims:
                    og["og:image:width"] = dims['width']
                    og["og:image:height"] = dims['height']
                else:
                    logger.warn("Couldn't get dims for %s" % og["og:image"])

                og["og:image"] = "mxc://%s/%s" % (
                    self.server_name, image_info['filesystem_id']
                )
                og["og:image:type"] = image_info['media_type']
                og["matrix:image:size"] = image_info['media_length']
            else:
                del og["og:image"]

        if 'og:description' not in og:
            meta_description = tree.xpath(
                "//*/meta"
                "[translate(@name, 'DESCRIPTION', 'description')='description']"
                "/@content")
            if meta_description:
                og['og:description'] = meta_description[0]
            else:
                # grab any text nodes which are inside the <body/> tag...
                # unless they are within an HTML5 semantic markup tag...
                # <header/>, <nav/>, <aside/>, <footer/>
                # ...or if they are within a <script/> or <style/> tag.
                # This is a very very very coarse approximation to a plain text
                # render of the page.
                text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | "
                                        "ancestor::aside | ancestor::footer | "
                                        "ancestor::script | ancestor::style)]" +
                                        "[ancestor::body]")
                text = ''
                for text_node in text_nodes:
                    if len(text) < 500:
                        text += text_node + ' '
                    else:
                        break
                text = re.sub(r'[\t ]+', ' ', text)
                text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text)
                text = text.strip()[:500]
                og['og:description'] = text if text else None

        # TODO: delete the url downloads to stop diskfilling,
        # as we only ever cared about its OG
        defer.returnValue(og)

    def _rebase_url(self, url, base):
        base = list(urlparse.urlparse(base))
        url = list(urlparse.urlparse(url))
        if not url[0]:  # fix up schema
            url[0] = base[0] or "http"
        if not url[1]:  # fix up hostname
            url[1] = base[1]
            if not url[2].startswith('/'):
                url[2] = re.sub(r'/[^/]+$', '/', base[2]) + url[2]
        return urlparse.urlunparse(url)

    @defer.inlineCallbacks
    def _download_url(self, url, user):
        # TODO: we should probably honour robots.txt... except in practice
        # we're most likely being explicitly triggered by a human rather than a
        # bot, so are we really a robot?

        # XXX: horrible duplication with base_resource's _download_remote_file()
        file_id = random_string(24)

        fname = self.filepaths.local_media_filepath(file_id)
        self.media_repo._makedirs(fname)

        try:
            with open(fname, "wb") as f:
                logger.debug("Trying to get url '%s'" % url)
                length, headers, uri, code = yield self.client.get_file(
                    url, output_stream=f, max_size=self.max_spider_size,
                )
                # FIXME: pass through 404s and other error messages nicely

            media_type = headers["Content-Type"][0]
            time_now_ms = self.clock.time_msec()

            content_disposition = headers.get("Content-Disposition", None)
            if content_disposition:
                _, params = cgi.parse_header(content_disposition[0],)
                download_name = None

                # First check if there is a valid UTF-8 filename
                download_name_utf8 = params.get("filename*", None)
                if download_name_utf8:
                    if download_name_utf8.lower().startswith("utf-8''"):
                        download_name = download_name_utf8[7:]

                # If there isn't check for an ascii name.
                if not download_name:
                    download_name_ascii = params.get("filename", None)
                    if download_name_ascii and is_ascii(download_name_ascii):
                        download_name = download_name_ascii

                if download_name:
                    download_name = urlparse.unquote(download_name)
                    try:
                        download_name = download_name.decode("utf-8")
                    except UnicodeDecodeError:
                        download_name = None
            else:
                download_name = None

            yield self.store.store_local_media(
                media_id=file_id,
                media_type=media_type,
                time_now_ms=self.clock.time_msec(),
                upload_name=download_name,
                media_length=length,
                user_id=user,
            )

        except Exception as e:
            os.remove(fname)
            raise SynapseError(
                500, ("Failed to download content: %s" % e),
                Codes.UNKNOWN
            )

        defer.returnValue({
            "media_type": media_type,
            "media_length": length,
            "download_name": download_name,
            "created_ts": time_now_ms,
            "filesystem_id": file_id,
            "filename": fname,
            "uri": uri,
            "response_code": code,
            # FIXME: we should calculate a proper expiration based on the
            # Cache-Control and Expire headers.  But for now, assume 1 hour.
            "expires": 60 * 60 * 1000,
            "etag": headers["ETag"][0] if "ETag" in headers else None,
        })

    def _is_media(self, content_type):
        if content_type.lower().startswith("image/"):
            return True

    def _is_html(self, content_type):
        content_type = content_type.lower()
        if (
            content_type.startswith("text/html") or
            content_type.startswith("application/xhtml")
        ):
            return True
Пример #5
0
class StateHandler(object):
    """ Responsible for doing state conflict resolution.
    """

    def __init__(self, hs):
        self.clock = hs.get_clock()
        self.store = hs.get_datastore()
        self.hs = hs

        # dict of set of event_ids -> _StateCacheEntry.
        self._state_cache = None
        self.resolve_linearizer = Linearizer()

    def start_caching(self):
        logger.debug("start_caching")

        self._state_cache = ExpiringCache(
            cache_name="state_cache",
            clock=self.clock,
            max_len=SIZE_OF_CACHE,
            expiry_ms=EVICTION_TIMEOUT_SECONDS * 1000,
            reset_expiry_on_get=True,
        )

        self._state_cache.start()

    @defer.inlineCallbacks
    def get_current_state(self, room_id, event_type=None, state_key="",
                          latest_event_ids=None):
        """ Retrieves the current state for the room. This is done by
        calling `get_latest_events_in_room` to get the leading edges of the
        event graph and then resolving any of the state conflicts.

        This is equivalent to getting the state of an event that were to send
        next before receiving any new events.

        If `event_type` is specified, then the method returns only the one
        event (or None) with that `event_type` and `state_key`.

        Returns:
            map from (type, state_key) to event
        """
        if not latest_event_ids:
            latest_event_ids = yield self.store.get_latest_event_ids_in_room(room_id)

        ret = yield self.resolve_state_groups(room_id, latest_event_ids)
        state = ret.state

        if event_type:
            event_id = state.get((event_type, state_key))
            event = None
            if event_id:
                event = yield self.store.get_event(event_id, allow_none=True)
            defer.returnValue(event)
            return

        state_map = yield self.store.get_events(state.values(), get_prev_content=False)
        state = {
            key: state_map[e_id] for key, e_id in state.items() if e_id in state_map
        }

        defer.returnValue(state)

    @defer.inlineCallbacks
    def get_current_state_ids(self, room_id, event_type=None, state_key="",
                              latest_event_ids=None):
        if not latest_event_ids:
            latest_event_ids = yield self.store.get_latest_event_ids_in_room(room_id)

        ret = yield self.resolve_state_groups(room_id, latest_event_ids)
        state = ret.state

        if event_type:
            defer.returnValue(state.get((event_type, state_key)))
            return

        defer.returnValue(state)

    @defer.inlineCallbacks
    def get_current_user_in_room(self, room_id, latest_event_ids=None):
        if not latest_event_ids:
            latest_event_ids = yield self.store.get_latest_event_ids_in_room(room_id)
        entry = yield self.resolve_state_groups(room_id, latest_event_ids)
        joined_users = yield self.store.get_joined_users_from_state(
            room_id, entry.state_id, entry.state
        )
        defer.returnValue(joined_users)

    @defer.inlineCallbacks
    def compute_event_context(self, event, old_state=None):
        """ Fills out the context with the `current state` of the graph. The
        `current state` here is defined to be the state of the event graph
        just before the event - i.e. it never includes `event`

        If `event` has `auth_events` then this will also fill out the
        `auth_events` field on `context` from the `current_state`.

        Args:
            event (EventBase)
        Returns:
            an EventContext
        """
        context = EventContext()

        if event.internal_metadata.is_outlier():
            # If this is an outlier, then we know it shouldn't have any current
            # state. Certainly store.get_current_state won't return any, and
            # persisting the event won't store the state group.
            if old_state:
                context.prev_state_ids = {
                    (s.type, s.state_key): s.event_id for s in old_state
                }
                if event.is_state():
                    context.current_state_events = dict(context.prev_state_ids)
                    key = (event.type, event.state_key)
                    context.current_state_events[key] = event.event_id
                else:
                    context.current_state_events = context.prev_state_ids
            else:
                context.current_state_ids = {}
                context.prev_state_ids = {}
            context.prev_state_events = []
            context.state_group = self.store.get_next_state_group()
            defer.returnValue(context)

        if old_state:
            context.prev_state_ids = {
                (s.type, s.state_key): s.event_id for s in old_state
            }
            context.state_group = self.store.get_next_state_group()

            if event.is_state():
                key = (event.type, event.state_key)
                if key in context.prev_state_ids:
                    replaces = context.prev_state_ids[key]
                    if replaces != event.event_id:  # Paranoia check
                        event.unsigned["replaces_state"] = replaces
                context.current_state_ids = dict(context.prev_state_ids)
                context.current_state_ids[key] = event.event_id
            else:
                context.current_state_ids = context.prev_state_ids

            context.prev_state_events = []
            defer.returnValue(context)

        if event.is_state():
            entry = yield self.resolve_state_groups(
                event.room_id, [e for e, _ in event.prev_events],
                event_type=event.type,
                state_key=event.state_key,
            )
        else:
            entry = yield self.resolve_state_groups(
                event.room_id, [e for e, _ in event.prev_events],
            )

        curr_state = entry.state

        context.prev_state_ids = curr_state
        if event.is_state():
            context.state_group = self.store.get_next_state_group()

            key = (event.type, event.state_key)
            if key in context.prev_state_ids:
                replaces = context.prev_state_ids[key]
                event.unsigned["replaces_state"] = replaces

            context.current_state_ids = dict(context.prev_state_ids)
            context.current_state_ids[key] = event.event_id

            context.prev_group = entry.prev_group
            context.delta_ids = entry.delta_ids
            if context.delta_ids is not None:
                context.delta_ids = dict(context.delta_ids)
                context.delta_ids[key] = event.event_id
        else:
            if entry.state_group is None:
                entry.state_group = self.store.get_next_state_group()
                entry.state_id = entry.state_group

            context.state_group = entry.state_group
            context.current_state_ids = context.prev_state_ids
            context.prev_group = entry.prev_group
            context.delta_ids = entry.delta_ids

        context.prev_state_events = []
        defer.returnValue(context)

    @defer.inlineCallbacks
    @log_function
    def resolve_state_groups(self, room_id, event_ids, event_type=None, state_key=""):
        """ Given a list of event_ids this method fetches the state at each
        event, resolves conflicts between them and returns them.

        Returns:
            a Deferred tuple of (`state_group`, `state`, `prev_state`).
            `state_group` is the name of a state group if one and only one is
            involved. `state` is a map from (type, state_key) to event, and
            `prev_state` is a list of event ids.
        """
        logger.debug("resolve_state_groups event_ids %s", event_ids)

        state_groups_ids = yield self.store.get_state_groups_ids(
            room_id, event_ids
        )

        logger.debug(
            "resolve_state_groups state_groups %s",
            state_groups_ids.keys()
        )

        group_names = frozenset(state_groups_ids.keys())
        if len(group_names) == 1:
            name, state_list = state_groups_ids.items().pop()

            defer.returnValue(_StateCacheEntry(
                state=state_list,
                state_group=name,
                prev_group=name,
                delta_ids={},
            ))

        with (yield self.resolve_linearizer.queue(group_names)):
            if self._state_cache is not None:
                cache = self._state_cache.get(group_names, None)
                if cache:
                    defer.returnValue(cache)

            logger.info(
                "Resolving state for %s with %d groups", room_id, len(state_groups_ids)
            )

            state = {}
            for st in state_groups_ids.values():
                for key, e_id in st.items():
                    state.setdefault(key, set()).add(e_id)

            conflicted_state = {
                k: list(v)
                for k, v in state.items()
                if len(v) > 1
            }

            if conflicted_state:
                logger.info("Resolving conflicted state for %r", room_id)
                state_map = yield self.store.get_events(
                    [e_id for st in state_groups_ids.values() for e_id in st.values()],
                    get_prev_content=False
                )
                state_sets = [
                    [state_map[e_id] for key, e_id in st.items() if e_id in state_map]
                    for st in state_groups_ids.values()
                ]
                new_state, _ = self._resolve_events(
                    state_sets, event_type, state_key
                )
                new_state = {
                    key: e.event_id for key, e in new_state.items()
                }
            else:
                new_state = {
                    key: e_ids.pop() for key, e_ids in state.items()
                }

            state_group = None
            new_state_event_ids = frozenset(new_state.values())
            for sg, events in state_groups_ids.items():
                if new_state_event_ids == frozenset(e_id for e_id in events):
                    state_group = sg
                    break
            if state_group is None:
                # Worker instances don't have access to this method, but we want
                # to set the state_group on the main instance to increase cache
                # hits.
                if hasattr(self.store, "get_next_state_group"):
                    state_group = self.store.get_next_state_group()

            prev_group = None
            delta_ids = None
            for old_group, old_ids in state_groups_ids.items():
                if not set(new_state.iterkeys()) - set(old_ids.iterkeys()):
                    n_delta_ids = {
                        k: v
                        for k, v in new_state.items()
                        if old_ids.get(k) != v
                    }
                    if not delta_ids or len(n_delta_ids) < len(delta_ids):
                        prev_group = old_group
                        delta_ids = n_delta_ids

            cache = _StateCacheEntry(
                state=new_state,
                state_group=state_group,
                prev_group=prev_group,
                delta_ids=delta_ids,
            )

            if self._state_cache is not None:
                self._state_cache[group_names] = cache

            defer.returnValue(cache)

    def resolve_events(self, state_sets, event):
        logger.info(
            "Resolving state for %s with %d groups", event.room_id, len(state_sets)
        )
        if event.is_state():
            return self._resolve_events(
                state_sets, event.type, event.state_key
            )
        else:
            return self._resolve_events(state_sets)

    def _resolve_events(self, state_sets, event_type=None, state_key=""):
        """
        Returns
            (dict[(str, str), synapse.events.FrozenEvent], list[str]): a tuple
            (new_state, prev_states). new_state is a map from (type, state_key)
            to event. prev_states is a list of event_ids.
        """
        with Measure(self.clock, "state._resolve_events"):
            state = {}
            for st in state_sets:
                for e in st:
                    state.setdefault(
                        (e.type, e.state_key),
                        {}
                    )[e.event_id] = e

            unconflicted_state = {
                k: v.values()[0] for k, v in state.items()
                if len(v.values()) == 1
            }

            conflicted_state = {
                k: v.values()
                for k, v in state.items()
                if len(v.values()) > 1
            }

            if event_type:
                prev_states_events = conflicted_state.get(
                    (event_type, state_key), []
                )
                prev_states = [s.event_id for s in prev_states_events]
            else:
                prev_states = []

            auth_events = {
                k: e for k, e in unconflicted_state.items()
                if k[0] in AuthEventTypes
            }

            try:
                resolved_state = self._resolve_state_events(
                    conflicted_state, auth_events
                )
            except:
                logger.exception("Failed to resolve state")
                raise

            new_state = unconflicted_state
            new_state.update(resolved_state)

        return new_state, prev_states

    @log_function
    def _resolve_state_events(self, conflicted_state, auth_events):
        """ This is where we actually decide which of the conflicted state to
        use.

        We resolve conflicts in the following order:
            1. power levels
            2. join rules
            3. memberships
            4. other events.
        """
        resolved_state = {}
        power_key = (EventTypes.PowerLevels, "")
        if power_key in conflicted_state:
            events = conflicted_state[power_key]
            logger.debug("Resolving conflicted power levels %r", events)
            resolved_state[power_key] = self._resolve_auth_events(
                events, auth_events)

        auth_events.update(resolved_state)

        for key, events in conflicted_state.items():
            if key[0] == EventTypes.JoinRules:
                logger.debug("Resolving conflicted join rules %r", events)
                resolved_state[key] = self._resolve_auth_events(
                    events,
                    auth_events
                )

        auth_events.update(resolved_state)

        for key, events in conflicted_state.items():
            if key[0] == EventTypes.Member:
                logger.debug("Resolving conflicted member lists %r", events)
                resolved_state[key] = self._resolve_auth_events(
                    events,
                    auth_events
                )

        auth_events.update(resolved_state)

        for key, events in conflicted_state.items():
            if key not in resolved_state:
                logger.debug("Resolving conflicted state %r:%r", key, events)
                resolved_state[key] = self._resolve_normal_events(
                    events, auth_events
                )

        return resolved_state

    def _resolve_auth_events(self, events, auth_events):
        reverse = [i for i in reversed(self._ordered_events(events))]

        auth_events = dict(auth_events)

        prev_event = reverse[0]
        for event in reverse[1:]:
            auth_events[(prev_event.type, prev_event.state_key)] = prev_event
            try:
                # FIXME: hs.get_auth() is bad style, but we need to do it to
                # get around circular deps.
                # The signatures have already been checked at this point
                self.hs.get_auth().check(event, auth_events, do_sig_check=False)
                prev_event = event
            except AuthError:
                return prev_event

        return event

    def _resolve_normal_events(self, events, auth_events):
        for event in self._ordered_events(events):
            try:
                # FIXME: hs.get_auth() is bad style, but we need to do it to
                # get around circular deps.
                # The signatures have already been checked at this point
                self.hs.get_auth().check(event, auth_events, do_sig_check=False)
                return event
            except AuthError:
                pass

        # Use the last event (the one with the least depth) if they all fail
        # the auth check.
        return event

    def _ordered_events(self, events):
        def key_func(e):
            return -int(e.depth), hashlib.sha1(e.event_id).hexdigest()

        return sorted(events, key=key_func)
class FederationClient(FederationBase):
    def __init__(self, hs):
        super(FederationClient, self).__init__(hs)

        self.pdu_destination_tried = {}
        self._clock.looping_call(
            self._clear_tried_cache,
            60 * 1000,
        )
        self.state = hs.get_state_handler()
        self.transport_layer = hs.get_federation_transport_client()

    def _clear_tried_cache(self):
        """Clear pdu_destination_tried cache"""
        now = self._clock.time_msec()

        old_dict = self.pdu_destination_tried
        self.pdu_destination_tried = {}

        for event_id, destination_dict in old_dict.items():
            destination_dict = {
                dest: time
                for dest, time in destination_dict.items()
                if time + PDU_RETRY_TIME_MS > now
            }
            if destination_dict:
                self.pdu_destination_tried[event_id] = destination_dict

    def start_get_pdu_cache(self):
        self._get_pdu_cache = ExpiringCache(
            cache_name="get_pdu_cache",
            clock=self._clock,
            max_len=1000,
            expiry_ms=120 * 1000,
            reset_expiry_on_get=False,
        )

        self._get_pdu_cache.start()

    @log_function
    def make_query(self,
                   destination,
                   query_type,
                   args,
                   retry_on_dns_fail=False,
                   ignore_backoff=False):
        """Sends a federation Query to a remote homeserver of the given type
        and arguments.

        Args:
            destination (str): Domain name of the remote homeserver
            query_type (str): Category of the query type; should match the
                handler name used in register_query_handler().
            args (dict): Mapping of strings to strings containing the details
                of the query request.
            ignore_backoff (bool): true to ignore the historical backoff data
                and try the request anyway.

        Returns:
            a Deferred which will eventually yield a JSON object from the
            response
        """
        sent_queries_counter.labels(query_type).inc()

        return self.transport_layer.make_query(
            destination,
            query_type,
            args,
            retry_on_dns_fail=retry_on_dns_fail,
            ignore_backoff=ignore_backoff,
        )

    @log_function
    def query_client_keys(self, destination, content, timeout):
        """Query device keys for a device hosted on a remote server.

        Args:
            destination (str): Domain name of the remote homeserver
            content (dict): The query content.

        Returns:
            a Deferred which will eventually yield a JSON object from the
            response
        """
        sent_queries_counter.labels("client_device_keys").inc()
        return self.transport_layer.query_client_keys(destination, content,
                                                      timeout)

    @log_function
    def query_user_devices(self, destination, user_id, timeout=30000):
        """Query the device keys for a list of user ids hosted on a remote
        server.
        """
        sent_queries_counter.labels("user_devices").inc()
        return self.transport_layer.query_user_devices(destination, user_id,
                                                       timeout)

    @log_function
    def claim_client_keys(self, destination, content, timeout):
        """Claims one-time keys for a device hosted on a remote server.

        Args:
            destination (str): Domain name of the remote homeserver
            content (dict): The query content.

        Returns:
            a Deferred which will eventually yield a JSON object from the
            response
        """
        sent_queries_counter.labels("client_one_time_keys").inc()
        return self.transport_layer.claim_client_keys(destination, content,
                                                      timeout)

    @defer.inlineCallbacks
    @log_function
    def backfill(self, dest, context, limit, extremities):
        """Requests some more historic PDUs for the given context from the
        given destination server.

        Args:
            dest (str): The remote home server to ask.
            context (str): The context to backfill.
            limit (int): The maximum number of PDUs to return.
            extremities (list): List of PDU id and origins of the first pdus
                we have seen from the context

        Returns:
            Deferred: Results in the received PDUs.
        """
        logger.debug("backfill extrem=%s", extremities)

        # If there are no extremeties then we've (probably) reached the start.
        if not extremities:
            return

        transaction_data = yield self.transport_layer.backfill(
            dest, context, extremities, limit)

        logger.debug("backfill transaction_data=%s", repr(transaction_data))

        pdus = [
            event_from_pdu_json(p, outlier=False)
            for p in transaction_data["pdus"]
        ]

        # FIXME: We should handle signature failures more gracefully.
        pdus[:] = yield logcontext.make_deferred_yieldable(
            defer.gatherResults(
                self._check_sigs_and_hashes(pdus),
                consumeErrors=True,
            ).addErrback(unwrapFirstError))

        defer.returnValue(pdus)

    @defer.inlineCallbacks
    @log_function
    def get_pdu(self, destinations, event_id, outlier=False, timeout=None):
        """Requests the PDU with given origin and ID from the remote home
        servers.

        Will attempt to get the PDU from each destination in the list until
        one succeeds.

        This will persist the PDU locally upon receipt.

        Args:
            destinations (list): Which home servers to query
            event_id (str): event to fetch
            outlier (bool): Indicates whether the PDU is an `outlier`, i.e. if
                it's from an arbitary point in the context as opposed to part
                of the current block of PDUs. Defaults to `False`
            timeout (int): How long to try (in ms) each destination for before
                moving to the next destination. None indicates no timeout.

        Returns:
            Deferred: Results in the requested PDU.
        """

        # TODO: Rate limit the number of times we try and get the same event.

        if self._get_pdu_cache:
            ev = self._get_pdu_cache.get(event_id)
            if ev:
                defer.returnValue(ev)

        pdu_attempts = self.pdu_destination_tried.setdefault(event_id, {})

        signed_pdu = None
        for destination in destinations:
            now = self._clock.time_msec()
            last_attempt = pdu_attempts.get(destination, 0)
            if last_attempt + PDU_RETRY_TIME_MS > now:
                continue

            try:
                transaction_data = yield self.transport_layer.get_event(
                    destination,
                    event_id,
                    timeout=timeout,
                )

                logger.debug("transaction_data %r", transaction_data)

                pdu_list = [
                    event_from_pdu_json(p, outlier=outlier)
                    for p in transaction_data["pdus"]
                ]

                if pdu_list and pdu_list[0]:
                    pdu = pdu_list[0]

                    # Check signatures are correct.
                    signed_pdu = yield self._check_sigs_and_hash(pdu)

                    break

                pdu_attempts[destination] = now

            except SynapseError as e:
                logger.info(
                    "Failed to get PDU %s from %s because %s",
                    event_id,
                    destination,
                    e,
                )
            except NotRetryingDestination as e:
                logger.info(e.message)
                continue
            except FederationDeniedError as e:
                logger.info(e.message)
                continue
            except Exception as e:
                pdu_attempts[destination] = now

                logger.info(
                    "Failed to get PDU %s from %s because %s",
                    event_id,
                    destination,
                    e,
                )
                continue

        if self._get_pdu_cache is not None and signed_pdu:
            self._get_pdu_cache[event_id] = signed_pdu

        defer.returnValue(signed_pdu)

    @defer.inlineCallbacks
    @log_function
    def get_state_for_room(self, destination, room_id, event_id):
        """Requests all of the `current` state PDUs for a given room from
        a remote home server.

        Args:
            destination (str): The remote homeserver to query for the state.
            room_id (str): The id of the room we're interested in.
            event_id (str): The id of the event we want the state at.

        Returns:
            Deferred: Results in a list of PDUs.
        """

        try:
            # First we try and ask for just the IDs, as thats far quicker if
            # we have most of the state and auth_chain already.
            # However, this may 404 if the other side has an old synapse.
            result = yield self.transport_layer.get_room_state_ids(
                destination,
                room_id,
                event_id=event_id,
            )

            state_event_ids = result["pdu_ids"]
            auth_event_ids = result.get("auth_chain_ids", [])

            fetched_events, failed_to_fetch = yield self.get_events(
                [destination], room_id, set(state_event_ids + auth_event_ids))

            if failed_to_fetch:
                logger.warn("Failed to get %r", failed_to_fetch)

            event_map = {ev.event_id: ev for ev in fetched_events}

            pdus = [
                event_map[e_id] for e_id in state_event_ids
                if e_id in event_map
            ]
            auth_chain = [
                event_map[e_id] for e_id in auth_event_ids if e_id in event_map
            ]

            auth_chain.sort(key=lambda e: e.depth)

            defer.returnValue((pdus, auth_chain))
        except HttpResponseException as e:
            if e.code == 400 or e.code == 404:
                logger.info(
                    "Failed to use get_room_state_ids API, falling back")
            else:
                raise e

        result = yield self.transport_layer.get_room_state(
            destination,
            room_id,
            event_id=event_id,
        )

        pdus = [event_from_pdu_json(p, outlier=True) for p in result["pdus"]]

        auth_chain = [
            event_from_pdu_json(p, outlier=True)
            for p in result.get("auth_chain", [])
        ]

        seen_events = yield self.store.get_events(
            [ev.event_id for ev in itertools.chain(pdus, auth_chain)])

        signed_pdus = yield self._check_sigs_and_hash_and_fetch(
            destination, [p for p in pdus if p.event_id not in seen_events],
            outlier=True)
        signed_pdus.extend(seen_events[p.event_id] for p in pdus
                           if p.event_id in seen_events)

        signed_auth = yield self._check_sigs_and_hash_and_fetch(
            destination,
            [p for p in auth_chain if p.event_id not in seen_events],
            outlier=True)
        signed_auth.extend(seen_events[p.event_id] for p in auth_chain
                           if p.event_id in seen_events)

        signed_auth.sort(key=lambda e: e.depth)

        defer.returnValue((signed_pdus, signed_auth))

    @defer.inlineCallbacks
    def get_events(self, destinations, room_id, event_ids, return_local=True):
        """Fetch events from some remote destinations, checking if we already
        have them.

        Args:
            destinations (list)
            room_id (str)
            event_ids (list)
            return_local (bool): Whether to include events we already have in
                the DB in the returned list of events

        Returns:
            Deferred: A deferred resolving to a 2-tuple where the first is a list of
            events and the second is a list of event ids that we failed to fetch.
        """
        if return_local:
            seen_events = yield self.store.get_events(event_ids,
                                                      allow_rejected=True)
            signed_events = list(seen_events.values())
        else:
            seen_events = yield self.store.have_seen_events(event_ids)
            signed_events = []

        failed_to_fetch = set()

        missing_events = set(event_ids)
        for k in seen_events:
            missing_events.discard(k)

        if not missing_events:
            defer.returnValue((signed_events, failed_to_fetch))

        def random_server_list():
            srvs = list(destinations)
            random.shuffle(srvs)
            return srvs

        batch_size = 20
        missing_events = list(missing_events)
        for i in range(0, len(missing_events), batch_size):
            batch = set(missing_events[i:i + batch_size])

            deferreds = [
                run_in_background(
                    self.get_pdu,
                    destinations=random_server_list(),
                    event_id=e_id,
                ) for e_id in batch
            ]

            res = yield make_deferred_yieldable(
                defer.DeferredList(deferreds, consumeErrors=True))
            for success, result in res:
                if success and result:
                    signed_events.append(result)
                    batch.discard(result.event_id)

            # We removed all events we successfully fetched from `batch`
            failed_to_fetch.update(batch)

        defer.returnValue((signed_events, failed_to_fetch))

    @defer.inlineCallbacks
    @log_function
    def get_event_auth(self, destination, room_id, event_id):
        res = yield self.transport_layer.get_event_auth(
            destination,
            room_id,
            event_id,
        )

        auth_chain = [
            event_from_pdu_json(p, outlier=True) for p in res["auth_chain"]
        ]

        signed_auth = yield self._check_sigs_and_hash_and_fetch(destination,
                                                                auth_chain,
                                                                outlier=True)

        signed_auth.sort(key=lambda e: e.depth)

        defer.returnValue(signed_auth)

    @defer.inlineCallbacks
    def _try_destination_list(self, description, destinations, callback):
        """Try an operation on a series of servers, until it succeeds

        Args:
            description (unicode): description of the operation we're doing, for logging

            destinations (Iterable[unicode]): list of server_names to try

            callback (callable):  Function to run for each server. Passed a single
                argument: the server_name to try. May return a deferred.

                If the callback raises a CodeMessageException with a 300/400 code,
                attempts to perform the operation stop immediately and the exception is
                reraised.

                Otherwise, if the callback raises an Exception the error is logged and the
                next server tried. Normally the stacktrace is logged but this is
                suppressed if the exception is an InvalidResponseError.

        Returns:
            The [Deferred] result of callback, if it succeeds

        Raises:
            SynapseError if the chosen remote server returns a 300/400 code.

            RuntimeError if no servers were reachable.
        """
        for destination in destinations:
            if destination == self.server_name:
                continue

            try:
                res = yield callback(destination)
                defer.returnValue(res)
            except InvalidResponseError as e:
                logger.warn(
                    "Failed to %s via %s: %s",
                    description,
                    destination,
                    e,
                )
            except HttpResponseException as e:
                if not 500 <= e.code < 600:
                    raise e.to_synapse_error()
                else:
                    logger.warn(
                        "Failed to %s via %s: %i %s",
                        description,
                        destination,
                        e.code,
                        e.message,
                    )
            except Exception:
                logger.warn(
                    "Failed to %s via %s",
                    description,
                    destination,
                    exc_info=1,
                )

        raise RuntimeError("Failed to %s via any server" % (description, ))

    def make_membership_event(self, destinations, room_id, user_id, membership,
                              content, params):
        """
        Creates an m.room.member event, with context, without participating in the room.

        Does so by asking one of the already participating servers to create an
        event with proper context.

        Note that this does not append any events to any graphs.

        Args:
            destinations (str): Candidate homeservers which are probably
                participating in the room.
            room_id (str): The room in which the event will happen.
            user_id (str): The user whose membership is being evented.
            membership (str): The "membership" property of the event. Must be
                one of "join" or "leave".
            content (dict): Any additional data to put into the content field
                of the event.
            params (dict[str, str|Iterable[str]]): Query parameters to include in the
                request.
        Return:
            Deferred: resolves to a tuple of (origin (str), event (object))
            where origin is the remote homeserver which generated the event.

            Fails with a ``SynapseError`` if the chosen remote server
            returns a 300/400 code.

            Fails with a ``RuntimeError`` if no servers were reachable.
        """
        valid_memberships = {Membership.JOIN, Membership.LEAVE}
        if membership not in valid_memberships:
            raise RuntimeError(
                "make_membership_event called with membership='%s', must be one of %s"
                % (membership, ",".join(valid_memberships)))

        @defer.inlineCallbacks
        def send_request(destination):
            ret = yield self.transport_layer.make_membership_event(
                destination,
                room_id,
                user_id,
                membership,
                params,
            )

            pdu_dict = ret.get("event", None)
            if not isinstance(pdu_dict, dict):
                raise InvalidResponseError("Bad 'event' field in response")

            logger.debug("Got response to make_%s: %s", membership, pdu_dict)

            pdu_dict["content"].update(content)

            # The protoevent received over the JSON wire may not have all
            # the required fields. Lets just gloss over that because
            # there's some we never care about
            if "prev_state" not in pdu_dict:
                pdu_dict["prev_state"] = []

            ev = builder.EventBuilder(pdu_dict)

            defer.returnValue((destination, ev))

        return self._try_destination_list(
            "make_" + membership,
            destinations,
            send_request,
        )

    def send_join(self, destinations, pdu):
        """Sends a join event to one of a list of homeservers.

        Doing so will cause the remote server to add the event to the graph,
        and send the event out to the rest of the federation.

        Args:
            destinations (str): Candidate homeservers which are probably
                participating in the room.
            pdu (BaseEvent): event to be sent

        Return:
            Deferred: resolves to a dict with members ``origin`` (a string
            giving the serer the event was sent to, ``state`` (?) and
            ``auth_chain``.

            Fails with a ``SynapseError`` if the chosen remote server
            returns a 300/400 code.

            Fails with a ``RuntimeError`` if no servers were reachable.
        """
        def check_authchain_validity(signed_auth_chain):
            for e in signed_auth_chain:
                if e.type == EventTypes.Create:
                    create_event = e
                    break
            else:
                raise InvalidResponseError(
                    "no %s in auth chain" % (EventTypes.Create, ), )

            # the room version should be sane.
            room_version = create_event.content.get("room_version", "1")
            if room_version not in KNOWN_ROOM_VERSIONS:
                # This shouldn't be possible, because the remote server should have
                # rejected the join attempt during make_join.
                raise InvalidResponseError(
                    "room appears to have unsupported version %s" %
                    (room_version, ))

        @defer.inlineCallbacks
        def send_request(destination):
            time_now = self._clock.time_msec()
            _, content = yield self.transport_layer.send_join(
                destination=destination,
                room_id=pdu.room_id,
                event_id=pdu.event_id,
                content=pdu.get_pdu_json(time_now),
            )

            logger.debug("Got content: %s", content)

            state = [
                event_from_pdu_json(p, outlier=True)
                for p in content.get("state", [])
            ]

            auth_chain = [
                event_from_pdu_json(p, outlier=True)
                for p in content.get("auth_chain", [])
            ]

            pdus = {p.event_id: p for p in itertools.chain(state, auth_chain)}

            valid_pdus = yield self._check_sigs_and_hash_and_fetch(
                destination,
                list(pdus.values()),
                outlier=True,
            )

            valid_pdus_map = {p.event_id: p for p in valid_pdus}

            # NB: We *need* to copy to ensure that we don't have multiple
            # references being passed on, as that causes... issues.
            signed_state = [
                copy.copy(valid_pdus_map[p.event_id]) for p in state
                if p.event_id in valid_pdus_map
            ]

            signed_auth = [
                valid_pdus_map[p.event_id] for p in auth_chain
                if p.event_id in valid_pdus_map
            ]

            # NB: We *need* to copy to ensure that we don't have multiple
            # references being passed on, as that causes... issues.
            for s in signed_state:
                s.internal_metadata = copy.deepcopy(s.internal_metadata)

            check_authchain_validity(signed_auth)

            defer.returnValue({
                "state": signed_state,
                "auth_chain": signed_auth,
                "origin": destination,
            })

        return self._try_destination_list("send_join", destinations,
                                          send_request)

    @defer.inlineCallbacks
    def send_invite(self, destination, room_id, event_id, pdu):
        time_now = self._clock.time_msec()
        try:
            code, content = yield self.transport_layer.send_invite(
                destination=destination,
                room_id=room_id,
                event_id=event_id,
                content=pdu.get_pdu_json(time_now),
            )
        except HttpResponseException as e:
            if e.code == 403:
                raise e.to_synapse_error()
            raise

        pdu_dict = content["event"]

        logger.debug("Got response to send_invite: %s", pdu_dict)

        pdu = event_from_pdu_json(pdu_dict)

        # Check signatures are correct.
        pdu = yield self._check_sigs_and_hash(pdu)

        # FIXME: We should handle signature failures more gracefully.

        defer.returnValue(pdu)

    def send_leave(self, destinations, pdu):
        """Sends a leave event to one of a list of homeservers.

        Doing so will cause the remote server to add the event to the graph,
        and send the event out to the rest of the federation.

        This is mostly useful to reject received invites.

        Args:
            destinations (str): Candidate homeservers which are probably
                participating in the room.
            pdu (BaseEvent): event to be sent

        Return:
            Deferred: resolves to None.

            Fails with a ``SynapseError`` if the chosen remote server
            returns a 300/400 code.

            Fails with a ``RuntimeError`` if no servers were reachable.
        """
        @defer.inlineCallbacks
        def send_request(destination):
            time_now = self._clock.time_msec()
            _, content = yield self.transport_layer.send_leave(
                destination=destination,
                room_id=pdu.room_id,
                event_id=pdu.event_id,
                content=pdu.get_pdu_json(time_now),
            )

            logger.debug("Got content: %s", content)
            defer.returnValue(None)

        return self._try_destination_list("send_leave", destinations,
                                          send_request)

    def get_public_rooms(self,
                         destination,
                         limit=None,
                         since_token=None,
                         search_filter=None,
                         include_all_networks=False,
                         third_party_instance_id=None):
        if destination == self.server_name:
            return

        return self.transport_layer.get_public_rooms(
            destination,
            limit,
            since_token,
            search_filter,
            include_all_networks=include_all_networks,
            third_party_instance_id=third_party_instance_id,
        )

    @defer.inlineCallbacks
    def query_auth(self, destination, room_id, event_id, local_auth):
        """
        Params:
            destination (str)
            event_it (str)
            local_auth (list)
        """
        time_now = self._clock.time_msec()

        send_content = {
            "auth_chain": [e.get_pdu_json(time_now) for e in local_auth],
        }

        code, content = yield self.transport_layer.send_query_auth(
            destination=destination,
            room_id=room_id,
            event_id=event_id,
            content=send_content,
        )

        auth_chain = [event_from_pdu_json(e) for e in content["auth_chain"]]

        signed_auth = yield self._check_sigs_and_hash_and_fetch(destination,
                                                                auth_chain,
                                                                outlier=True)

        signed_auth.sort(key=lambda e: e.depth)

        ret = {
            "auth_chain": signed_auth,
            "rejects": content.get("rejects", []),
            "missing": content.get("missing", []),
        }

        defer.returnValue(ret)

    @defer.inlineCallbacks
    def get_missing_events(self, destination, room_id, earliest_events_ids,
                           latest_events, limit, min_depth, timeout):
        """Tries to fetch events we are missing. This is called when we receive
        an event without having received all of its ancestors.

        Args:
            destination (str)
            room_id (str)
            earliest_events_ids (list): List of event ids. Effectively the
                events we expected to receive, but haven't. `get_missing_events`
                should only return events that didn't happen before these.
            latest_events (list): List of events we have received that we don't
                have all previous events for.
            limit (int): Maximum number of events to return.
            min_depth (int): Minimum depth of events tor return.
            timeout (int): Max time to wait in ms
        """
        try:
            content = yield self.transport_layer.get_missing_events(
                destination=destination,
                room_id=room_id,
                earliest_events=earliest_events_ids,
                latest_events=[e.event_id for e in latest_events],
                limit=limit,
                min_depth=min_depth,
                timeout=timeout,
            )

            events = [
                event_from_pdu_json(e) for e in content.get("events", [])
            ]

            signed_events = yield self._check_sigs_and_hash_and_fetch(
                destination, events, outlier=False)
        except HttpResponseException as e:
            if not e.code == 400:
                raise

            # We are probably hitting an old server that doesn't support
            # get_missing_events
            signed_events = []

        defer.returnValue(signed_events)

    @defer.inlineCallbacks
    def forward_third_party_invite(self, destinations, room_id, event_dict):
        for destination in destinations:
            if destination == self.server_name:
                continue

            try:
                yield self.transport_layer.exchange_third_party_invite(
                    destination=destination,
                    room_id=room_id,
                    event_dict=event_dict,
                )
                defer.returnValue(None)
            except CodeMessageException:
                raise
            except Exception as e:
                logger.exception(
                    "Failed to send_third_party_invite via %s: %s",
                    destination, e.message)

        raise RuntimeError("Failed to send to any server.")
Пример #7
0
class StateHandler(object):
    """ Responsible for doing state conflict resolution.
    """
    def __init__(self, hs):
        self.clock = hs.get_clock()
        self.store = hs.get_datastore()
        self.hs = hs

        # dict of set of event_ids -> _StateCacheEntry.
        self._state_cache = None
        self.resolve_linearizer = Linearizer()

    def start_caching(self):
        logger.debug("start_caching")

        self._state_cache = ExpiringCache(
            cache_name="state_cache",
            clock=self.clock,
            max_len=SIZE_OF_CACHE,
            expiry_ms=EVICTION_TIMEOUT_SECONDS * 1000,
            reset_expiry_on_get=True,
        )

        self._state_cache.start()

    @defer.inlineCallbacks
    def get_current_state(self,
                          room_id,
                          event_type=None,
                          state_key="",
                          latest_event_ids=None):
        """ Retrieves the current state for the room. This is done by
        calling `get_latest_events_in_room` to get the leading edges of the
        event graph and then resolving any of the state conflicts.

        This is equivalent to getting the state of an event that were to send
        next before receiving any new events.

        If `event_type` is specified, then the method returns only the one
        event (or None) with that `event_type` and `state_key`.

        Returns:
            map from (type, state_key) to event
        """
        if not latest_event_ids:
            latest_event_ids = yield self.store.get_latest_event_ids_in_room(
                room_id)

        ret = yield self.resolve_state_groups(room_id, latest_event_ids)
        state = ret.state

        if event_type:
            event_id = state.get((event_type, state_key))
            event = None
            if event_id:
                event = yield self.store.get_event(event_id, allow_none=True)
            defer.returnValue(event)
            return

        state_map = yield self.store.get_events(state.values(),
                                                get_prev_content=False)
        state = {
            key: state_map[e_id]
            for key, e_id in state.items() if e_id in state_map
        }

        defer.returnValue(state)

    @defer.inlineCallbacks
    def get_current_state_ids(self,
                              room_id,
                              event_type=None,
                              state_key="",
                              latest_event_ids=None):
        if not latest_event_ids:
            latest_event_ids = yield self.store.get_latest_event_ids_in_room(
                room_id)

        ret = yield self.resolve_state_groups(room_id, latest_event_ids)
        state = ret.state

        if event_type:
            defer.returnValue(state.get((event_type, state_key)))
            return

        defer.returnValue(state)

    @defer.inlineCallbacks
    def get_current_user_in_room(self, room_id):
        latest_event_ids = yield self.store.get_latest_event_ids_in_room(
            room_id)
        entry = yield self.resolve_state_groups(room_id, latest_event_ids)
        joined_users = yield self.store.get_joined_users_from_state(
            room_id, entry.state_id, entry.state)
        defer.returnValue(joined_users)

    @defer.inlineCallbacks
    def compute_event_context(self, event, old_state=None):
        """ Fills out the context with the `current state` of the graph. The
        `current state` here is defined to be the state of the event graph
        just before the event - i.e. it never includes `event`

        If `event` has `auth_events` then this will also fill out the
        `auth_events` field on `context` from the `current_state`.

        Args:
            event (EventBase)
        Returns:
            an EventContext
        """
        context = EventContext()

        if event.internal_metadata.is_outlier():
            # If this is an outlier, then we know it shouldn't have any current
            # state. Certainly store.get_current_state won't return any, and
            # persisting the event won't store the state group.
            if old_state:
                context.prev_state_ids = {(s.type, s.state_key): s.event_id
                                          for s in old_state}
                if event.is_state():
                    context.current_state_events = dict(context.prev_state_ids)
                    key = (event.type, event.state_key)
                    context.current_state_events[key] = event.event_id
                else:
                    context.current_state_events = context.prev_state_ids
            else:
                context.current_state_ids = {}
                context.prev_state_ids = {}
            context.prev_state_events = []
            context.state_group = self.store.get_next_state_group()
            defer.returnValue(context)

        if old_state:
            context.prev_state_ids = {(s.type, s.state_key): s.event_id
                                      for s in old_state}
            context.state_group = self.store.get_next_state_group()

            if event.is_state():
                key = (event.type, event.state_key)
                if key in context.prev_state_ids:
                    replaces = context.prev_state_ids[key]
                    if replaces != event.event_id:  # Paranoia check
                        event.unsigned["replaces_state"] = replaces
                context.current_state_ids = dict(context.prev_state_ids)
                context.current_state_ids[key] = event.event_id
            else:
                context.current_state_ids = context.prev_state_ids

            context.prev_state_events = []
            defer.returnValue(context)

        if event.is_state():
            entry = yield self.resolve_state_groups(
                event.room_id,
                [e for e, _ in event.prev_events],
                event_type=event.type,
                state_key=event.state_key,
            )
        else:
            entry = yield self.resolve_state_groups(
                event.room_id,
                [e for e, _ in event.prev_events],
            )

        curr_state = entry.state

        context.prev_state_ids = curr_state
        if event.is_state():
            context.state_group = self.store.get_next_state_group()
        else:
            if entry.state_group is None:
                entry.state_group = self.store.get_next_state_group()
                entry.state_id = entry.state_group
            context.state_group = entry.state_group

        if event.is_state():
            key = (event.type, event.state_key)
            if key in context.prev_state_ids:
                replaces = context.prev_state_ids[key]
                event.unsigned["replaces_state"] = replaces
            context.current_state_ids = dict(context.prev_state_ids)
            context.current_state_ids[key] = event.event_id
        else:
            context.current_state_ids = context.prev_state_ids

        context.prev_state_events = []
        defer.returnValue(context)

    @defer.inlineCallbacks
    @log_function
    def resolve_state_groups(self,
                             room_id,
                             event_ids,
                             event_type=None,
                             state_key=""):
        """ Given a list of event_ids this method fetches the state at each
        event, resolves conflicts between them and returns them.

        Returns:
            a Deferred tuple of (`state_group`, `state`, `prev_state`).
            `state_group` is the name of a state group if one and only one is
            involved. `state` is a map from (type, state_key) to event, and
            `prev_state` is a list of event ids.
        """
        logger.debug("resolve_state_groups event_ids %s", event_ids)

        state_groups_ids = yield self.store.get_state_groups_ids(
            room_id, event_ids)

        logger.debug("resolve_state_groups state_groups %s",
                     state_groups_ids.keys())

        group_names = frozenset(state_groups_ids.keys())
        if len(group_names) == 1:
            name, state_list = state_groups_ids.items().pop()

            defer.returnValue(
                _StateCacheEntry(
                    state=state_list,
                    state_group=name,
                ))

        with (yield self.resolve_linearizer.queue(group_names)):
            if self._state_cache is not None:
                cache = self._state_cache.get(group_names, None)
                if cache:
                    defer.returnValue(cache)

            logger.info("Resolving state for %s with %d groups", room_id,
                        len(state_groups_ids))

            state = {}
            for st in state_groups_ids.values():
                for key, e_id in st.items():
                    state.setdefault(key, set()).add(e_id)

            conflicted_state = {
                k: list(v)
                for k, v in state.items() if len(v) > 1
            }

            if conflicted_state:
                logger.info("Resolving conflicted state for %r", room_id)
                state_map = yield self.store.get_events([
                    e_id for st in state_groups_ids.values()
                    for e_id in st.values()
                ],
                                                        get_prev_content=False)
                state_sets = [[
                    state_map[e_id] for key, e_id in st.items()
                    if e_id in state_map
                ] for st in state_groups_ids.values()]
                new_state, _ = self._resolve_events(state_sets, event_type,
                                                    state_key)
                new_state = {key: e.event_id for key, e in new_state.items()}
            else:
                new_state = {key: e_ids.pop() for key, e_ids in state.items()}

            state_group = None
            new_state_event_ids = frozenset(new_state.values())
            for sg, events in state_groups_ids.items():
                if new_state_event_ids == frozenset(e_id for e_id in events):
                    state_group = sg
                    break
            if state_group is None:
                # Worker instances don't have access to this method, but we want
                # to set the state_group on the main instance to increase cache
                # hits.
                if hasattr(self.store, "get_next_state_group"):
                    state_group = self.store.get_next_state_group()

            cache = _StateCacheEntry(
                state=new_state,
                state_group=state_group,
            )

            if self._state_cache is not None:
                self._state_cache[group_names] = cache

            defer.returnValue(cache)

    def resolve_events(self, state_sets, event):
        logger.info("Resolving state for %s with %d groups", event.room_id,
                    len(state_sets))
        if event.is_state():
            return self._resolve_events(state_sets, event.type,
                                        event.state_key)
        else:
            return self._resolve_events(state_sets)

    def _resolve_events(self, state_sets, event_type=None, state_key=""):
        """
        Returns
            (dict[(str, str), synapse.events.FrozenEvent], list[str]): a tuple
            (new_state, prev_states). new_state is a map from (type, state_key)
            to event. prev_states is a list of event_ids.
        """
        with Measure(self.clock, "state._resolve_events"):
            state = {}
            for st in state_sets:
                for e in st:
                    state.setdefault((e.type, e.state_key), {})[e.event_id] = e

            unconflicted_state = {
                k: v.values()[0]
                for k, v in state.items() if len(v.values()) == 1
            }

            conflicted_state = {
                k: v.values()
                for k, v in state.items() if len(v.values()) > 1
            }

            if event_type:
                prev_states_events = conflicted_state.get(
                    (event_type, state_key), [])
                prev_states = [s.event_id for s in prev_states_events]
            else:
                prev_states = []

            auth_events = {
                k: e
                for k, e in unconflicted_state.items()
                if k[0] in AuthEventTypes
            }

            try:
                resolved_state = self._resolve_state_events(
                    conflicted_state, auth_events)
            except:
                logger.exception("Failed to resolve state")
                raise

            new_state = unconflicted_state
            new_state.update(resolved_state)

        return new_state, prev_states

    @log_function
    def _resolve_state_events(self, conflicted_state, auth_events):
        """ This is where we actually decide which of the conflicted state to
        use.

        We resolve conflicts in the following order:
            1. power levels
            2. join rules
            3. memberships
            4. other events.
        """
        resolved_state = {}
        power_key = (EventTypes.PowerLevels, "")
        if power_key in conflicted_state:
            events = conflicted_state[power_key]
            logger.debug("Resolving conflicted power levels %r", events)
            resolved_state[power_key] = self._resolve_auth_events(
                events, auth_events)

        auth_events.update(resolved_state)

        for key, events in conflicted_state.items():
            if key[0] == EventTypes.JoinRules:
                logger.debug("Resolving conflicted join rules %r", events)
                resolved_state[key] = self._resolve_auth_events(
                    events, auth_events)

        auth_events.update(resolved_state)

        for key, events in conflicted_state.items():
            if key[0] == EventTypes.Member:
                logger.debug("Resolving conflicted member lists %r", events)
                resolved_state[key] = self._resolve_auth_events(
                    events, auth_events)

        auth_events.update(resolved_state)

        for key, events in conflicted_state.items():
            if key not in resolved_state:
                logger.debug("Resolving conflicted state %r:%r", key, events)
                resolved_state[key] = self._resolve_normal_events(
                    events, auth_events)

        return resolved_state

    def _resolve_auth_events(self, events, auth_events):
        reverse = [i for i in reversed(self._ordered_events(events))]

        auth_events = dict(auth_events)

        prev_event = reverse[0]
        for event in reverse[1:]:
            auth_events[(prev_event.type, prev_event.state_key)] = prev_event
            try:
                # FIXME: hs.get_auth() is bad style, but we need to do it to
                # get around circular deps.
                # The signatures have already been checked at this point
                self.hs.get_auth().check(event,
                                         auth_events,
                                         do_sig_check=False)
                prev_event = event
            except AuthError:
                return prev_event

        return event

    def _resolve_normal_events(self, events, auth_events):
        for event in self._ordered_events(events):
            try:
                # FIXME: hs.get_auth() is bad style, but we need to do it to
                # get around circular deps.
                # The signatures have already been checked at this point
                self.hs.get_auth().check(event,
                                         auth_events,
                                         do_sig_check=False)
                return event
            except AuthError:
                pass

        # Use the last event (the one with the least depth) if they all fail
        # the auth check.
        return event

    def _ordered_events(self, events):
        def key_func(e):
            return -int(e.depth), hashlib.sha1(e.event_id).hexdigest()

        return sorted(events, key=key_func)
Пример #8
0
class PreviewUrlResource(Resource):
    isLeaf = True

    def __init__(self, hs, media_repo):
        Resource.__init__(self)

        self.auth = hs.get_auth()
        self.clock = hs.get_clock()
        self.version_string = hs.version_string
        self.filepaths = media_repo.filepaths
        self.max_spider_size = hs.config.max_spider_size
        self.server_name = hs.hostname
        self.store = hs.get_datastore()
        self.client = SpiderHttpClient(hs)
        self.media_repo = media_repo

        self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist

        # simple memory cache mapping urls to OG metadata
        self.cache = ExpiringCache(
            cache_name="url_previews",
            clock=self.clock,
            # don't spider URLs more often than once an hour
            expiry_ms=60 * 60 * 1000,
        )
        self.cache.start()

        self.downloads = {}

    def render_GET(self, request):
        self._async_render_GET(request)
        return NOT_DONE_YET

    @request_handler()
    @defer.inlineCallbacks
    def _async_render_GET(self, request):

        # XXX: if get_user_by_req fails, what should we do in an async render?
        requester = yield self.auth.get_user_by_req(request)
        url = request.args.get("url")[0]
        if "ts" in request.args:
            ts = int(request.args.get("ts")[0])
        else:
            ts = self.clock.time_msec()

        url_tuple = urlparse.urlsplit(url)
        for entry in self.url_preview_url_blacklist:
            match = True
            for attrib in entry:
                pattern = entry[attrib]
                value = getattr(url_tuple, attrib)
                logger.debug((
                    "Matching attrib '%s' with value '%s' against"
                    " pattern '%s'"
                ) % (attrib, value, pattern))

                if value is None:
                    match = False
                    continue

                if pattern.startswith('^'):
                    if not re.match(pattern, getattr(url_tuple, attrib)):
                        match = False
                        continue
                else:
                    if not fnmatch.fnmatch(getattr(url_tuple, attrib), pattern):
                        match = False
                        continue
            if match:
                logger.warn(
                    "URL %s blocked by url_blacklist entry %s", url, entry
                )
                raise SynapseError(
                    403, "URL blocked by url pattern blacklist entry",
                    Codes.UNKNOWN
                )

        # first check the memory cache - good to handle all the clients on this
        # HS thundering away to preview the same URL at the same time.
        og = self.cache.get(url)
        if og:
            respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
            return

        # then check the URL cache in the DB (which will also provide us with
        # historical previews, if we have any)
        cache_result = yield self.store.get_url_cache(url, ts)
        if (
            cache_result and
            cache_result["download_ts"] + cache_result["expires"] > ts and
            cache_result["response_code"] / 100 == 2
        ):
            respond_with_json_bytes(
                request, 200, cache_result["og"].encode('utf-8'),
                send_cors=True
            )
            return

        # Ensure only one download for a given URL is active at a time
        download = self.downloads.get(url)
        if download is None:
            download = self._download_url(url, requester.user)
            download = ObservableDeferred(
                download,
                consumeErrors=True
            )
            self.downloads[url] = download

            @download.addBoth
            def callback(media_info):
                del self.downloads[url]
                return media_info
        media_info = yield download.observe()

        # FIXME: we should probably update our cache now anyway, so that
        # even if the OG calculation raises, we don't keep hammering on the
        # remote server.  For now, leave it uncached to aid debugging OG
        # calculation problems

        logger.debug("got media_info of '%s'" % media_info)

        if _is_media(media_info['media_type']):
            dims = yield self.media_repo._generate_local_thumbnails(
                media_info['filesystem_id'], media_info
            )

            og = {
                "og:description": media_info['download_name'],
                "og:image": "mxc://%s/%s" % (
                    self.server_name, media_info['filesystem_id']
                ),
                "og:image:type": media_info['media_type'],
                "matrix:image:size": media_info['media_length'],
            }

            if dims:
                og["og:image:width"] = dims['width']
                og["og:image:height"] = dims['height']
            else:
                logger.warn("Couldn't get dims for %s" % url)

            # define our OG response for this media
        elif _is_html(media_info['media_type']):
            # TODO: somehow stop a big HTML tree from exploding synapse's RAM

            file = open(media_info['filename'])
            body = file.read()
            file.close()

            # clobber the encoding from the content-type, or default to utf-8
            # XXX: this overrides any <meta/> or XML charset headers in the body
            # which may pose problems, but so far seems to work okay.
            match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
            encoding = match.group(1) if match else "utf-8"

            og = decode_and_calc_og(body, media_info['uri'], encoding)

            # pre-cache the image for posterity
            # FIXME: it might be cleaner to use the same flow as the main /preview_url
            # request itself and benefit from the same caching etc.  But for now we
            # just rely on the caching on the master request to speed things up.
            if 'og:image' in og and og['og:image']:
                image_info = yield self._download_url(
                    _rebase_url(og['og:image'], media_info['uri']), requester.user
                )

                if _is_media(image_info['media_type']):
                    # TODO: make sure we don't choke on white-on-transparent images
                    dims = yield self.media_repo._generate_local_thumbnails(
                        image_info['filesystem_id'], image_info
                    )
                    if dims:
                        og["og:image:width"] = dims['width']
                        og["og:image:height"] = dims['height']
                    else:
                        logger.warn("Couldn't get dims for %s" % og["og:image"])

                    og["og:image"] = "mxc://%s/%s" % (
                        self.server_name, image_info['filesystem_id']
                    )
                    og["og:image:type"] = image_info['media_type']
                    og["matrix:image:size"] = image_info['media_length']
                else:
                    del og["og:image"]
        else:
            logger.warn("Failed to find any OG data in %s", url)
            og = {}

        logger.debug("Calculated OG for %s as %s" % (url, og))

        # store OG in ephemeral in-memory cache
        self.cache[url] = og

        # store OG in history-aware DB cache
        yield self.store.store_url_cache(
            url,
            media_info["response_code"],
            media_info["etag"],
            media_info["expires"],
            json.dumps(og),
            media_info["filesystem_id"],
            media_info["created_ts"],
        )

        respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)

    @defer.inlineCallbacks
    def _download_url(self, url, user):
        # TODO: we should probably honour robots.txt... except in practice
        # we're most likely being explicitly triggered by a human rather than a
        # bot, so are we really a robot?

        # XXX: horrible duplication with base_resource's _download_remote_file()
        file_id = random_string(24)

        fname = self.filepaths.local_media_filepath(file_id)
        self.media_repo._makedirs(fname)

        try:
            with open(fname, "wb") as f:
                logger.debug("Trying to get url '%s'" % url)
                length, headers, uri, code = yield self.client.get_file(
                    url, output_stream=f, max_size=self.max_spider_size,
                )
                # FIXME: pass through 404s and other error messages nicely

            media_type = headers["Content-Type"][0]
            time_now_ms = self.clock.time_msec()

            content_disposition = headers.get("Content-Disposition", None)
            if content_disposition:
                _, params = cgi.parse_header(content_disposition[0],)
                download_name = None

                # First check if there is a valid UTF-8 filename
                download_name_utf8 = params.get("filename*", None)
                if download_name_utf8:
                    if download_name_utf8.lower().startswith("utf-8''"):
                        download_name = download_name_utf8[7:]

                # If there isn't check for an ascii name.
                if not download_name:
                    download_name_ascii = params.get("filename", None)
                    if download_name_ascii and is_ascii(download_name_ascii):
                        download_name = download_name_ascii

                if download_name:
                    download_name = urlparse.unquote(download_name)
                    try:
                        download_name = download_name.decode("utf-8")
                    except UnicodeDecodeError:
                        download_name = None
            else:
                download_name = None

            yield self.store.store_local_media(
                media_id=file_id,
                media_type=media_type,
                time_now_ms=self.clock.time_msec(),
                upload_name=download_name,
                media_length=length,
                user_id=user,
            )

        except Exception as e:
            os.remove(fname)
            raise SynapseError(
                500, ("Failed to download content: %s" % e),
                Codes.UNKNOWN
            )

        defer.returnValue({
            "media_type": media_type,
            "media_length": length,
            "download_name": download_name,
            "created_ts": time_now_ms,
            "filesystem_id": file_id,
            "filename": fname,
            "uri": uri,
            "response_code": code,
            # FIXME: we should calculate a proper expiration based on the
            # Cache-Control and Expire headers.  But for now, assume 1 hour.
            "expires": 60 * 60 * 1000,
            "etag": headers["ETag"][0] if "ETag" in headers else None,
        })
Пример #9
0
class StateResolutionHandler(object):
    """Responsible for doing state conflict resolution.

    Note that the storage layer depends on this handler, so all functions must
    be storage-independent.
    """
    def __init__(self, hs):
        self.clock = hs.get_clock()

        # dict of set of event_ids -> _StateCacheEntry.
        self._state_cache = None
        self.resolve_linearizer = Linearizer(name="state_resolve_lock")

    def start_caching(self):
        logger.debug("start_caching")

        self._state_cache = ExpiringCache(
            cache_name="state_cache",
            clock=self.clock,
            max_len=SIZE_OF_CACHE,
            expiry_ms=EVICTION_TIMEOUT_SECONDS * 1000,
            iterable=True,
            reset_expiry_on_get=True,
        )

        self._state_cache.start()

    @defer.inlineCallbacks
    @log_function
    def resolve_state_groups(
        self,
        room_id,
        state_groups_ids,
        event_map,
        state_map_factory,
    ):
        """Resolves conflicts between a set of state groups

        Always generates a new state group (unless we hit the cache), so should
        not be called for a single state group

        Args:
            room_id (str): room we are resolving for (used for logging)
            state_groups_ids (dict[int, dict[(str, str), str]]):
                 map from state group id to the state in that state group
                (where 'state' is a map from state key to event id)

            event_map(dict[str,FrozenEvent]|None):
                a dict from event_id to event, for any events that we happen to
                have in flight (eg, those currently being persisted). This will be
                used as a starting point fof finding the state we need; any missing
                events will be requested via state_map_factory.

                If None, all events will be fetched via state_map_factory.

        Returns:
            Deferred[_StateCacheEntry]: resolved state
        """
        logger.debug("resolve_state_groups state_groups %s",
                     state_groups_ids.keys())

        group_names = frozenset(state_groups_ids.keys())

        with (yield self.resolve_linearizer.queue(group_names)):
            if self._state_cache is not None:
                cache = self._state_cache.get(group_names, None)
                if cache:
                    defer.returnValue(cache)

            logger.info("Resolving state for %s with %d groups", room_id,
                        len(state_groups_ids))

            # build a map from state key to the event_ids which set that state.
            # dict[(str, str), set[str])
            state = {}
            for st in state_groups_ids.itervalues():
                for key, e_id in st.iteritems():
                    state.setdefault(key, set()).add(e_id)

            # build a map from state key to the event_ids which set that state,
            # including only those where there are state keys in conflict.
            conflicted_state = {
                k: list(v)
                for k, v in state.iteritems() if len(v) > 1
            }

            if conflicted_state:
                logger.info("Resolving conflicted state for %r", room_id)
                with Measure(self.clock, "state._resolve_events"):
                    new_state = yield resolve_events_with_factory(
                        state_groups_ids.values(),
                        event_map=event_map,
                        state_map_factory=state_map_factory,
                    )
            else:
                new_state = {
                    key: e_ids.pop()
                    for key, e_ids in state.iteritems()
                }

            with Measure(self.clock, "state.create_group_ids"):
                # if the new state matches any of the input state groups, we can
                # use that state group again. Otherwise we will generate a state_id
                # which will be used as a cache key for future resolutions, but
                # not get persisted.
                state_group = None
                new_state_event_ids = frozenset(new_state.itervalues())
                for sg, events in state_groups_ids.iteritems():
                    if new_state_event_ids == frozenset(e_id
                                                        for e_id in events):
                        state_group = sg
                        break

                # TODO: We want to create a state group for this set of events, to
                # increase cache hits, but we need to make sure that it doesn't
                # end up as a prev_group without being added to the database

                prev_group = None
                delta_ids = None
                for old_group, old_ids in state_groups_ids.iteritems():
                    if not set(new_state) - set(old_ids):
                        n_delta_ids = {
                            k: v
                            for k, v in new_state.iteritems()
                            if old_ids.get(k) != v
                        }
                        if not delta_ids or len(n_delta_ids) < len(delta_ids):
                            prev_group = old_group
                            delta_ids = n_delta_ids

            cache = _StateCacheEntry(
                state=new_state,
                state_group=state_group,
                prev_group=prev_group,
                delta_ids=delta_ids,
            )

            if self._state_cache is not None:
                self._state_cache[group_names] = cache

            defer.returnValue(cache)
Пример #10
0
class StateHandler(object):
    """ Responsible for doing state conflict resolution.
    """
    def __init__(self, hs):
        self.clock = hs.get_clock()
        self.store = hs.get_datastore()
        self.hs = hs

        # dict of set of event_ids -> _StateCacheEntry.
        self._state_cache = None
        self.resolve_linearizer = Linearizer(name="state_resolve_lock")

    def start_caching(self):
        logger.debug("start_caching")

        self._state_cache = ExpiringCache(
            cache_name="state_cache",
            clock=self.clock,
            max_len=SIZE_OF_CACHE,
            expiry_ms=EVICTION_TIMEOUT_SECONDS * 1000,
            iterable=True,
            reset_expiry_on_get=True,
        )

        self._state_cache.start()

    @defer.inlineCallbacks
    def get_current_state(self,
                          room_id,
                          event_type=None,
                          state_key="",
                          latest_event_ids=None):
        """ Retrieves the current state for the room. This is done by
        calling `get_latest_events_in_room` to get the leading edges of the
        event graph and then resolving any of the state conflicts.

        This is equivalent to getting the state of an event that were to send
        next before receiving any new events.

        If `event_type` is specified, then the method returns only the one
        event (or None) with that `event_type` and `state_key`.

        Returns:
            map from (type, state_key) to event
        """
        if not latest_event_ids:
            latest_event_ids = yield self.store.get_latest_event_ids_in_room(
                room_id)

        logger.debug("calling resolve_state_groups from get_current_state")
        ret = yield self.resolve_state_groups(room_id, latest_event_ids)
        state = ret.state

        if event_type:
            event_id = state.get((event_type, state_key))
            event = None
            if event_id:
                event = yield self.store.get_event(event_id, allow_none=True)
            defer.returnValue(event)
            return

        state_map = yield self.store.get_events(state.values(),
                                                get_prev_content=False)
        state = {
            key: state_map[e_id]
            for key, e_id in state.items() if e_id in state_map
        }

        defer.returnValue(state)

    @defer.inlineCallbacks
    def get_current_state_ids(self,
                              room_id,
                              event_type=None,
                              state_key="",
                              latest_event_ids=None):
        if not latest_event_ids:
            latest_event_ids = yield self.store.get_latest_event_ids_in_room(
                room_id)

        logger.debug("calling resolve_state_groups from get_current_state_ids")
        ret = yield self.resolve_state_groups(room_id, latest_event_ids)
        state = ret.state

        if event_type:
            defer.returnValue(state.get((event_type, state_key)))
            return

        defer.returnValue(state)

    @defer.inlineCallbacks
    def get_current_user_in_room(self, room_id, latest_event_ids=None):
        if not latest_event_ids:
            latest_event_ids = yield self.store.get_latest_event_ids_in_room(
                room_id)
        logger.debug(
            "calling resolve_state_groups from get_current_user_in_room")
        entry = yield self.resolve_state_groups(room_id, latest_event_ids)
        joined_users = yield self.store.get_joined_users_from_state(
            room_id, entry.state_id, entry.state)
        defer.returnValue(joined_users)

    @defer.inlineCallbacks
    def compute_event_context(self, event, old_state=None):
        """ Fills out the context with the `current state` of the graph. The
        `current state` here is defined to be the state of the event graph
        just before the event - i.e. it never includes `event`

        If `event` has `auth_events` then this will also fill out the
        `auth_events` field on `context` from the `current_state`.

        Args:
            event (EventBase)
        Returns:
            an EventContext
        """
        context = EventContext()

        if event.internal_metadata.is_outlier():
            # If this is an outlier, then we know it shouldn't have any current
            # state. Certainly store.get_current_state won't return any, and
            # persisting the event won't store the state group.
            if old_state:
                context.prev_state_ids = {(s.type, s.state_key): s.event_id
                                          for s in old_state}
                if event.is_state():
                    context.current_state_events = dict(context.prev_state_ids)
                    key = (event.type, event.state_key)
                    context.current_state_events[key] = event.event_id
                else:
                    context.current_state_events = context.prev_state_ids
            else:
                context.current_state_ids = {}
                context.prev_state_ids = {}
            context.prev_state_events = []
            context.state_group = self.store.get_next_state_group()
            defer.returnValue(context)

        if old_state:
            context.prev_state_ids = {(s.type, s.state_key): s.event_id
                                      for s in old_state}
            context.state_group = self.store.get_next_state_group()

            if event.is_state():
                key = (event.type, event.state_key)
                if key in context.prev_state_ids:
                    replaces = context.prev_state_ids[key]
                    if replaces != event.event_id:  # Paranoia check
                        event.unsigned["replaces_state"] = replaces
                context.current_state_ids = dict(context.prev_state_ids)
                context.current_state_ids[key] = event.event_id
            else:
                context.current_state_ids = context.prev_state_ids

            context.prev_state_events = []
            defer.returnValue(context)

        logger.debug("calling resolve_state_groups from compute_event_context")
        if event.is_state():
            entry = yield self.resolve_state_groups(
                event.room_id,
                [e for e, _ in event.prev_events],
                event_type=event.type,
                state_key=event.state_key,
            )
        else:
            entry = yield self.resolve_state_groups(
                event.room_id,
                [e for e, _ in event.prev_events],
            )

        curr_state = entry.state

        context.prev_state_ids = curr_state
        if event.is_state():
            context.state_group = self.store.get_next_state_group()

            key = (event.type, event.state_key)
            if key in context.prev_state_ids:
                replaces = context.prev_state_ids[key]
                event.unsigned["replaces_state"] = replaces

            context.current_state_ids = dict(context.prev_state_ids)
            context.current_state_ids[key] = event.event_id

            context.prev_group = entry.prev_group
            context.delta_ids = entry.delta_ids
            if context.delta_ids is not None:
                context.delta_ids = dict(context.delta_ids)
                context.delta_ids[key] = event.event_id
        else:
            if entry.state_group is None:
                entry.state_group = self.store.get_next_state_group()
                entry.state_id = entry.state_group

            context.state_group = entry.state_group
            context.current_state_ids = context.prev_state_ids
            context.prev_group = entry.prev_group
            context.delta_ids = entry.delta_ids

        context.prev_state_events = []
        defer.returnValue(context)

    @defer.inlineCallbacks
    @log_function
    def resolve_state_groups(self,
                             room_id,
                             event_ids,
                             event_type=None,
                             state_key=""):
        """ Given a list of event_ids this method fetches the state at each
        event, resolves conflicts between them and returns them.

        Returns:
            a Deferred tuple of (`state_group`, `state`, `prev_state`).
            `state_group` is the name of a state group if one and only one is
            involved. `state` is a map from (type, state_key) to event, and
            `prev_state` is a list of event ids.
        """
        logger.debug("resolve_state_groups event_ids %s", event_ids)

        state_groups_ids = yield self.store.get_state_groups_ids(
            room_id, event_ids)

        logger.debug("resolve_state_groups state_groups %s",
                     state_groups_ids.keys())

        group_names = frozenset(state_groups_ids.keys())
        if len(group_names) == 1:
            name, state_list = state_groups_ids.items().pop()

            defer.returnValue(
                _StateCacheEntry(
                    state=state_list,
                    state_group=name,
                    prev_group=name,
                    delta_ids={},
                ))

        with (yield self.resolve_linearizer.queue(group_names)):
            if self._state_cache is not None:
                cache = self._state_cache.get(group_names, None)
                if cache:
                    defer.returnValue(cache)

            logger.info("Resolving state for %s with %d groups", room_id,
                        len(state_groups_ids))

            state = {}
            for st in state_groups_ids.values():
                for key, e_id in st.items():
                    state.setdefault(key, set()).add(e_id)

            conflicted_state = {
                k: list(v)
                for k, v in state.items() if len(v) > 1
            }

            if conflicted_state:
                logger.info("Resolving conflicted state for %r", room_id)
                with Measure(self.clock, "state._resolve_events"):
                    new_state = yield resolve_events(
                        state_groups_ids.values(),
                        state_map_factory=lambda ev_ids: self.store.get_events(
                            ev_ids,
                            get_prev_content=False,
                            check_redacted=False,
                        ),
                    )
            else:
                new_state = {key: e_ids.pop() for key, e_ids in state.items()}

            state_group = None
            new_state_event_ids = frozenset(new_state.values())
            for sg, events in state_groups_ids.items():
                if new_state_event_ids == frozenset(e_id for e_id in events):
                    state_group = sg
                    break
            if state_group is None:
                # Worker instances don't have access to this method, but we want
                # to set the state_group on the main instance to increase cache
                # hits.
                if hasattr(self.store, "get_next_state_group"):
                    state_group = self.store.get_next_state_group()

            prev_group = None
            delta_ids = None
            for old_group, old_ids in state_groups_ids.items():
                if not set(new_state.iterkeys()) - set(old_ids.iterkeys()):
                    n_delta_ids = {
                        k: v
                        for k, v in new_state.items() if old_ids.get(k) != v
                    }
                    if not delta_ids or len(n_delta_ids) < len(delta_ids):
                        prev_group = old_group
                        delta_ids = n_delta_ids

            cache = _StateCacheEntry(
                state=new_state,
                state_group=state_group,
                prev_group=prev_group,
                delta_ids=delta_ids,
            )

            if self._state_cache is not None:
                self._state_cache[group_names] = cache

            defer.returnValue(cache)

    def resolve_events(self, state_sets, event):
        logger.info("Resolving state for %s with %d groups", event.room_id,
                    len(state_sets))
        state_set_ids = [{(ev.type, ev.state_key): ev.event_id
                          for ev in st} for st in state_sets]

        state_map = {ev.event_id: ev for st in state_sets for ev in st}

        with Measure(self.clock, "state._resolve_events"):
            new_state = resolve_events(state_set_ids, state_map)

        new_state = {key: state_map[ev_id] for key, ev_id in new_state.items()}

        return new_state
Пример #11
0
class StateResolutionHandler(object):
    """Responsible for doing state conflict resolution.

    Note that the storage layer depends on this handler, so all functions must
    be storage-independent.
    """
    def __init__(self, hs):
        self.clock = hs.get_clock()

        # dict of set of event_ids -> _StateCacheEntry.
        self._state_cache = None
        self.resolve_linearizer = Linearizer(name="state_resolve_lock")

    def start_caching(self):
        logger.debug("start_caching")

        self._state_cache = ExpiringCache(
            cache_name="state_cache",
            clock=self.clock,
            max_len=SIZE_OF_CACHE,
            expiry_ms=EVICTION_TIMEOUT_SECONDS * 1000,
            iterable=True,
            reset_expiry_on_get=True,
        )

        self._state_cache.start()

    @defer.inlineCallbacks
    @log_function
    def resolve_state_groups(
        self, room_id, state_groups_ids, event_map, state_map_factory,
    ):
        """Resolves conflicts between a set of state groups

        Always generates a new state group (unless we hit the cache), so should
        not be called for a single state group

        Args:
            room_id (str): room we are resolving for (used for logging)
            state_groups_ids (dict[int, dict[(str, str), str]]):
                 map from state group id to the state in that state group
                (where 'state' is a map from state key to event id)

            event_map(dict[str,FrozenEvent]|None):
                a dict from event_id to event, for any events that we happen to
                have in flight (eg, those currently being persisted). This will be
                used as a starting point fof finding the state we need; any missing
                events will be requested via state_map_factory.

                If None, all events will be fetched via state_map_factory.

        Returns:
            Deferred[_StateCacheEntry]: resolved state
        """
        logger.debug(
            "resolve_state_groups state_groups %s",
            state_groups_ids.keys()
        )

        group_names = frozenset(state_groups_ids.keys())

        with (yield self.resolve_linearizer.queue(group_names)):
            if self._state_cache is not None:
                cache = self._state_cache.get(group_names, None)
                if cache:
                    defer.returnValue(cache)

            logger.info(
                "Resolving state for %s with %d groups", room_id, len(state_groups_ids)
            )

            # build a map from state key to the event_ids which set that state.
            # dict[(str, str), set[str])
            state = {}
            for st in state_groups_ids.itervalues():
                for key, e_id in st.iteritems():
                    state.setdefault(key, set()).add(e_id)

            # build a map from state key to the event_ids which set that state,
            # including only those where there are state keys in conflict.
            conflicted_state = {
                k: list(v)
                for k, v in state.iteritems()
                if len(v) > 1
            }

            if conflicted_state:
                logger.info("Resolving conflicted state for %r", room_id)
                with Measure(self.clock, "state._resolve_events"):
                    new_state = yield resolve_events_with_factory(
                        state_groups_ids.values(),
                        event_map=event_map,
                        state_map_factory=state_map_factory,
                    )
            else:
                new_state = {
                    key: e_ids.pop() for key, e_ids in state.iteritems()
                }

            with Measure(self.clock, "state.create_group_ids"):
                # if the new state matches any of the input state groups, we can
                # use that state group again. Otherwise we will generate a state_id
                # which will be used as a cache key for future resolutions, but
                # not get persisted.
                state_group = None
                new_state_event_ids = frozenset(new_state.itervalues())
                for sg, events in state_groups_ids.iteritems():
                    if new_state_event_ids == frozenset(e_id for e_id in events):
                        state_group = sg
                        break

                # TODO: We want to create a state group for this set of events, to
                # increase cache hits, but we need to make sure that it doesn't
                # end up as a prev_group without being added to the database

                prev_group = None
                delta_ids = None
                for old_group, old_ids in state_groups_ids.iteritems():
                    if not set(new_state) - set(old_ids):
                        n_delta_ids = {
                            k: v
                            for k, v in new_state.iteritems()
                            if old_ids.get(k) != v
                        }
                        if not delta_ids or len(n_delta_ids) < len(delta_ids):
                            prev_group = old_group
                            delta_ids = n_delta_ids

            cache = _StateCacheEntry(
                state=new_state,
                state_group=state_group,
                prev_group=prev_group,
                delta_ids=delta_ids,
            )

            if self._state_cache is not None:
                self._state_cache[group_names] = cache

            defer.returnValue(cache)
Пример #12
0
class FederationClient(FederationBase):

    def start_get_pdu_cache(self):
        self._get_pdu_cache = ExpiringCache(
            cache_name="get_pdu_cache",
            clock=self._clock,
            max_len=1000,
            expiry_ms=120 * 1000,
            reset_expiry_on_get=False,
        )

        self._get_pdu_cache.start()

    @log_function
    def send_pdu(self, pdu, destinations):
        """Informs the replication layer about a new PDU generated within the
        home server that should be transmitted to others.

        TODO: Figure out when we should actually resolve the deferred.

        Args:
            pdu (Pdu): The new Pdu.

        Returns:
            Deferred: Completes when we have successfully processed the PDU
            and replicated it to any interested remote home servers.
        """
        order = self._order
        self._order += 1

        sent_pdus_destination_dist.inc_by(len(destinations))

        logger.debug("[%s] transaction_layer.enqueue_pdu... ", pdu.event_id)

        # TODO, add errback, etc.
        self._transaction_queue.enqueue_pdu(pdu, destinations, order)

        logger.debug(
            "[%s] transaction_layer.enqueue_pdu... done",
            pdu.event_id
        )

    @log_function
    def send_edu(self, destination, edu_type, content):
        edu = Edu(
            origin=self.server_name,
            destination=destination,
            edu_type=edu_type,
            content=content,
        )

        sent_edus_counter.inc()

        # TODO, add errback, etc.
        self._transaction_queue.enqueue_edu(edu)
        return defer.succeed(None)

    @log_function
    def send_failure(self, failure, destination):
        self._transaction_queue.enqueue_failure(failure, destination)
        return defer.succeed(None)

    @log_function
    def make_query(self, destination, query_type, args,
                   retry_on_dns_fail=False):
        """Sends a federation Query to a remote homeserver of the given type
        and arguments.

        Args:
            destination (str): Domain name of the remote homeserver
            query_type (str): Category of the query type; should match the
                handler name used in register_query_handler().
            args (dict): Mapping of strings to strings containing the details
                of the query request.

        Returns:
            a Deferred which will eventually yield a JSON object from the
            response
        """
        sent_queries_counter.inc(query_type)

        return self.transport_layer.make_query(
            destination, query_type, args, retry_on_dns_fail=retry_on_dns_fail
        )

    @log_function
    def query_client_keys(self, destination, content):
        """Query device keys for a device hosted on a remote server.

        Args:
            destination (str): Domain name of the remote homeserver
            content (dict): The query content.

        Returns:
            a Deferred which will eventually yield a JSON object from the
            response
        """
        sent_queries_counter.inc("client_device_keys")
        return self.transport_layer.query_client_keys(destination, content)

    @log_function
    def claim_client_keys(self, destination, content):
        """Claims one-time keys for a device hosted on a remote server.

        Args:
            destination (str): Domain name of the remote homeserver
            content (dict): The query content.

        Returns:
            a Deferred which will eventually yield a JSON object from the
            response
        """
        sent_queries_counter.inc("client_one_time_keys")
        return self.transport_layer.claim_client_keys(destination, content)

    @defer.inlineCallbacks
    @log_function
    def backfill(self, dest, context, limit, extremities):
        """Requests some more historic PDUs for the given context from the
        given destination server.

        Args:
            dest (str): The remote home server to ask.
            context (str): The context to backfill.
            limit (int): The maximum number of PDUs to return.
            extremities (list): List of PDU id and origins of the first pdus
                we have seen from the context

        Returns:
            Deferred: Results in the received PDUs.
        """
        logger.debug("backfill extrem=%s", extremities)

        # If there are no extremeties then we've (probably) reached the start.
        if not extremities:
            return

        transaction_data = yield self.transport_layer.backfill(
            dest, context, extremities, limit)

        logger.debug("backfill transaction_data=%s", repr(transaction_data))

        pdus = [
            self.event_from_pdu_json(p, outlier=False)
            for p in transaction_data["pdus"]
        ]

        # FIXME: We should handle signature failures more gracefully.
        pdus[:] = yield defer.gatherResults(
            self._check_sigs_and_hashes(pdus),
            consumeErrors=True,
        ).addErrback(unwrapFirstError)

        defer.returnValue(pdus)

    @defer.inlineCallbacks
    @log_function
    def get_pdu(self, destinations, event_id, outlier=False, timeout=None):
        """Requests the PDU with given origin and ID from the remote home
        servers.

        Will attempt to get the PDU from each destination in the list until
        one succeeds.

        This will persist the PDU locally upon receipt.

        Args:
            destinations (list): Which home servers to query
            pdu_origin (str): The home server that originally sent the pdu.
            event_id (str)
            outlier (bool): Indicates whether the PDU is an `outlier`, i.e. if
                it's from an arbitary point in the context as opposed to part
                of the current block of PDUs. Defaults to `False`
            timeout (int): How long to try (in ms) each destination for before
                moving to the next destination. None indicates no timeout.

        Returns:
            Deferred: Results in the requested PDU.
        """

        # TODO: Rate limit the number of times we try and get the same event.

        if self._get_pdu_cache:
            e = self._get_pdu_cache.get(event_id)
            if e:
                defer.returnValue(e)

        pdu = None
        for destination in destinations:
            try:
                limiter = yield get_retry_limiter(
                    destination,
                    self._clock,
                    self.store,
                )

                with limiter:
                    transaction_data = yield self.transport_layer.get_event(
                        destination, event_id, timeout=timeout,
                    )

                    logger.debug("transaction_data %r", transaction_data)

                    pdu_list = [
                        self.event_from_pdu_json(p, outlier=outlier)
                        for p in transaction_data["pdus"]
                    ]

                    if pdu_list and pdu_list[0]:
                        pdu = pdu_list[0]

                        # Check signatures are correct.
                        pdu = yield self._check_sigs_and_hashes([pdu])[0]

                        break

            except SynapseError:
                logger.info(
                    "Failed to get PDU %s from %s because %s",
                    event_id, destination, e,
                )
                continue
            except CodeMessageException as e:
                if 400 <= e.code < 500:
                    raise

                logger.info(
                    "Failed to get PDU %s from %s because %s",
                    event_id, destination, e,
                )
                continue
            except NotRetryingDestination as e:
                logger.info(e.message)
                continue
            except Exception as e:
                logger.info(
                    "Failed to get PDU %s from %s because %s",
                    event_id, destination, e,
                )
                continue

        if self._get_pdu_cache is not None and pdu:
            self._get_pdu_cache[event_id] = pdu

        defer.returnValue(pdu)

    @defer.inlineCallbacks
    @log_function
    def get_state_for_room(self, destination, room_id, event_id):
        """Requests all of the `current` state PDUs for a given room from
        a remote home server.

        Args:
            destination (str): The remote homeserver to query for the state.
            room_id (str): The id of the room we're interested in.
            event_id (str): The id of the event we want the state at.

        Returns:
            Deferred: Results in a list of PDUs.
        """

        result = yield self.transport_layer.get_room_state(
            destination, room_id, event_id=event_id,
        )

        pdus = [
            self.event_from_pdu_json(p, outlier=True) for p in result["pdus"]
        ]

        auth_chain = [
            self.event_from_pdu_json(p, outlier=True)
            for p in result.get("auth_chain", [])
        ]

        signed_pdus = yield self._check_sigs_and_hash_and_fetch(
            destination, pdus, outlier=True
        )

        signed_auth = yield self._check_sigs_and_hash_and_fetch(
            destination, auth_chain, outlier=True
        )

        signed_auth.sort(key=lambda e: e.depth)

        defer.returnValue((signed_pdus, signed_auth))

    @defer.inlineCallbacks
    @log_function
    def get_event_auth(self, destination, room_id, event_id):
        res = yield self.transport_layer.get_event_auth(
            destination, room_id, event_id,
        )

        auth_chain = [
            self.event_from_pdu_json(p, outlier=True)
            for p in res["auth_chain"]
        ]

        signed_auth = yield self._check_sigs_and_hash_and_fetch(
            destination, auth_chain, outlier=True
        )

        signed_auth.sort(key=lambda e: e.depth)

        defer.returnValue(signed_auth)

    @defer.inlineCallbacks
    def make_membership_event(self, destinations, room_id, user_id, membership,
                              content={},):
        """
        Creates an m.room.member event, with context, without participating in the room.

        Does so by asking one of the already participating servers to create an
        event with proper context.

        Note that this does not append any events to any graphs.

        Args:
            destinations (str): Candidate homeservers which are probably
                participating in the room.
            room_id (str): The room in which the event will happen.
            user_id (str): The user whose membership is being evented.
            membership (str): The "membership" property of the event. Must be
                one of "join" or "leave".
            content (object): Any additional data to put into the content field
                of the event.
        Return:
            A tuple of (origin (str), event (object)) where origin is the remote
            homeserver which generated the event.
        """
        valid_memberships = {Membership.JOIN, Membership.LEAVE}
        if membership not in valid_memberships:
            raise RuntimeError(
                "make_membership_event called with membership='%s', must be one of %s" %
                (membership, ",".join(valid_memberships))
            )
        for destination in destinations:
            if destination == self.server_name:
                continue

            try:
                ret = yield self.transport_layer.make_membership_event(
                    destination, room_id, user_id, membership
                )

                pdu_dict = ret["event"]

                logger.debug("Got response to make_%s: %s", membership, pdu_dict)

                pdu_dict["content"].update(content)

                # The protoevent received over the JSON wire may not have all
                # the required fields. Lets just gloss over that because
                # there's some we never care about
                if "prev_state" not in pdu_dict:
                    pdu_dict["prev_state"] = []

                defer.returnValue(
                    (destination, self.event_from_pdu_json(pdu_dict))
                )
                break
            except CodeMessageException:
                raise
            except Exception as e:
                logger.warn(
                    "Failed to make_%s via %s: %s",
                    membership, destination, e.message
                )
                raise

        raise RuntimeError("Failed to send to any server.")

    @defer.inlineCallbacks
    def send_join(self, destinations, pdu):
        for destination in destinations:
            if destination == self.server_name:
                continue

            try:
                time_now = self._clock.time_msec()
                _, content = yield self.transport_layer.send_join(
                    destination=destination,
                    room_id=pdu.room_id,
                    event_id=pdu.event_id,
                    content=pdu.get_pdu_json(time_now),
                )

                logger.debug("Got content: %s", content)

                state = [
                    self.event_from_pdu_json(p, outlier=True)
                    for p in content.get("state", [])
                ]

                auth_chain = [
                    self.event_from_pdu_json(p, outlier=True)
                    for p in content.get("auth_chain", [])
                ]

                pdus = {
                    p.event_id: p
                    for p in itertools.chain(state, auth_chain)
                }

                valid_pdus = yield self._check_sigs_and_hash_and_fetch(
                    destination, pdus.values(),
                    outlier=True,
                )

                valid_pdus_map = {
                    p.event_id: p
                    for p in valid_pdus
                }

                # NB: We *need* to copy to ensure that we don't have multiple
                # references being passed on, as that causes... issues.
                signed_state = [
                    copy.copy(valid_pdus_map[p.event_id])
                    for p in state
                    if p.event_id in valid_pdus_map
                ]

                signed_auth = [
                    valid_pdus_map[p.event_id]
                    for p in auth_chain
                    if p.event_id in valid_pdus_map
                ]

                # NB: We *need* to copy to ensure that we don't have multiple
                # references being passed on, as that causes... issues.
                for s in signed_state:
                    s.internal_metadata = copy.deepcopy(s.internal_metadata)

                auth_chain.sort(key=lambda e: e.depth)

                defer.returnValue({
                    "state": signed_state,
                    "auth_chain": signed_auth,
                    "origin": destination,
                })
            except CodeMessageException:
                raise
            except Exception as e:
                logger.exception(
                    "Failed to send_join via %s: %s",
                    destination, e.message
                )

        raise RuntimeError("Failed to send to any server.")

    @defer.inlineCallbacks
    def send_invite(self, destination, room_id, event_id, pdu):
        time_now = self._clock.time_msec()
        code, content = yield self.transport_layer.send_invite(
            destination=destination,
            room_id=room_id,
            event_id=event_id,
            content=pdu.get_pdu_json(time_now),
        )

        pdu_dict = content["event"]

        logger.debug("Got response to send_invite: %s", pdu_dict)

        pdu = self.event_from_pdu_json(pdu_dict)

        # Check signatures are correct.
        pdu = yield self._check_sigs_and_hash(pdu)

        # FIXME: We should handle signature failures more gracefully.

        defer.returnValue(pdu)

    @defer.inlineCallbacks
    def send_leave(self, destinations, pdu):
        for destination in destinations:
            if destination == self.server_name:
                continue

            try:
                time_now = self._clock.time_msec()
                _, content = yield self.transport_layer.send_leave(
                    destination=destination,
                    room_id=pdu.room_id,
                    event_id=pdu.event_id,
                    content=pdu.get_pdu_json(time_now),
                )

                logger.debug("Got content: %s", content)
                defer.returnValue(None)
            except CodeMessageException:
                raise
            except Exception as e:
                logger.exception(
                    "Failed to send_leave via %s: %s",
                    destination, e.message
                )

        raise RuntimeError("Failed to send to any server.")

    @defer.inlineCallbacks
    def query_auth(self, destination, room_id, event_id, local_auth):
        """
        Params:
            destination (str)
            event_it (str)
            local_auth (list)
        """
        time_now = self._clock.time_msec()

        send_content = {
            "auth_chain": [e.get_pdu_json(time_now) for e in local_auth],
        }

        code, content = yield self.transport_layer.send_query_auth(
            destination=destination,
            room_id=room_id,
            event_id=event_id,
            content=send_content,
        )

        auth_chain = [
            self.event_from_pdu_json(e)
            for e in content["auth_chain"]
        ]

        signed_auth = yield self._check_sigs_and_hash_and_fetch(
            destination, auth_chain, outlier=True
        )

        signed_auth.sort(key=lambda e: e.depth)

        ret = {
            "auth_chain": signed_auth,
            "rejects": content.get("rejects", []),
            "missing": content.get("missing", []),
        }

        defer.returnValue(ret)

    @defer.inlineCallbacks
    def get_missing_events(self, destination, room_id, earliest_events_ids,
                           latest_events, limit, min_depth):
        """Tries to fetch events we are missing. This is called when we receive
        an event without having received all of its ancestors.

        Args:
            destination (str)
            room_id (str)
            earliest_events_ids (list): List of event ids. Effectively the
                events we expected to receive, but haven't. `get_missing_events`
                should only return events that didn't happen before these.
            latest_events (list): List of events we have received that we don't
                have all previous events for.
            limit (int): Maximum number of events to return.
            min_depth (int): Minimum depth of events tor return.
        """
        try:
            content = yield self.transport_layer.get_missing_events(
                destination=destination,
                room_id=room_id,
                earliest_events=earliest_events_ids,
                latest_events=[e.event_id for e in latest_events],
                limit=limit,
                min_depth=min_depth,
            )

            events = [
                self.event_from_pdu_json(e)
                for e in content.get("events", [])
            ]

            signed_events = yield self._check_sigs_and_hash_and_fetch(
                destination, events, outlier=False
            )

            have_gotten_all_from_destination = True
        except HttpResponseException as e:
            if not e.code == 400:
                raise

            # We are probably hitting an old server that doesn't support
            # get_missing_events
            signed_events = []
            have_gotten_all_from_destination = False

        if len(signed_events) >= limit:
            defer.returnValue(signed_events)

        servers = yield self.store.get_joined_hosts_for_room(room_id)

        servers = set(servers)
        servers.discard(self.server_name)

        failed_to_fetch = set()

        while len(signed_events) < limit:
            # Are we missing any?

            seen_events = set(earliest_events_ids)
            seen_events.update(e.event_id for e in signed_events if e)

            missing_events = {}
            for e in itertools.chain(latest_events, signed_events):
                if e.depth > min_depth:
                    missing_events.update({
                        e_id: e.depth for e_id, _ in e.prev_events
                        if e_id not in seen_events
                        and e_id not in failed_to_fetch
                    })

            if not missing_events:
                break

            have_seen = yield self.store.have_events(missing_events)

            for k in have_seen:
                missing_events.pop(k, None)

            if not missing_events:
                break

            # Okay, we haven't gotten everything yet. Lets get them.
            ordered_missing = sorted(missing_events.items(), key=lambda x: x[0])

            if have_gotten_all_from_destination:
                servers.discard(destination)

            def random_server_list():
                srvs = list(servers)
                random.shuffle(srvs)
                return srvs

            deferreds = [
                self.get_pdu(
                    destinations=random_server_list(),
                    event_id=e_id,
                )
                for e_id, depth in ordered_missing[:limit - len(signed_events)]
            ]

            res = yield defer.DeferredList(deferreds, consumeErrors=True)
            for (result, val), (e_id, _) in zip(res, ordered_missing):
                if result and val:
                    signed_events.append(val)
                else:
                    failed_to_fetch.add(e_id)

        defer.returnValue(signed_events)

    def event_from_pdu_json(self, pdu_json, outlier=False):
        event = FrozenEvent(
            pdu_json
        )

        event.internal_metadata.outlier = outlier

        return event

    @defer.inlineCallbacks
    def forward_third_party_invite(self, destinations, room_id, event_dict):
        for destination in destinations:
            if destination == self.server_name:
                continue

            try:
                yield self.transport_layer.exchange_third_party_invite(
                    destination=destination,
                    room_id=room_id,
                    event_dict=event_dict,
                )
                defer.returnValue(None)
            except CodeMessageException:
                raise
            except Exception as e:
                logger.exception(
                    "Failed to send_third_party_invite via %s: %s",
                    destination, e.message
                )

        raise RuntimeError("Failed to send to any server.")
Пример #13
0
class StateHandler(object):
    """ Responsible for doing state conflict resolution.
    """

    def __init__(self, hs):
        self.clock = hs.get_clock()
        self.store = hs.get_datastore()
        self.hs = hs

        # dict of set of event_ids -> _StateCacheEntry.
        self._state_cache = None

    def start_caching(self):
        logger.debug("start_caching")

        self._state_cache = ExpiringCache(
            cache_name="state_cache",
            clock=self.clock,
            max_len=SIZE_OF_CACHE,
            expiry_ms=EVICTION_TIMEOUT_SECONDS*1000,
            reset_expiry_on_get=True,
        )

        self._state_cache.start()

    @defer.inlineCallbacks
    def get_current_state(self, room_id, event_type=None, state_key=""):
        """ Returns the current state for the room as a list. This is done by
        calling `get_latest_events_in_room` to get the leading edges of the
        event graph and then resolving any of the state conflicts.

        This is equivalent to getting the state of an event that were to send
        next before receiving any new events.

        If `event_type` is specified, then the method returns only the one
        event (or None) with that `event_type` and `state_key`.
        """
        event_ids = yield self.store.get_latest_event_ids_in_room(room_id)

        cache = None
        if self._state_cache is not None:
            cache = self._state_cache.get(frozenset(event_ids), None)

        if cache:
            cache.ts = self.clock.time_msec()
            state = cache.state
        else:
            res = yield self.resolve_state_groups(room_id, event_ids)
            state = res[1]

        if event_type:
            defer.returnValue(state.get((event_type, state_key)))
            return

        defer.returnValue(state)

    @defer.inlineCallbacks
    def compute_event_context(self, event, old_state=None, outlier=False):
        """ Fills out the context with the `current state` of the graph. The
        `current state` here is defined to be the state of the event graph
        just before the event - i.e. it never includes `event`

        If `event` has `auth_events` then this will also fill out the
        `auth_events` field on `context` from the `current_state`.

        Args:
            event (EventBase)
        Returns:
            an EventContext
        """
        yield run_on_reactor()

        context = EventContext()

        if outlier:
            # If this is an outlier, then we know it shouldn't have any current
            # state. Certainly store.get_current_state won't return any, and
            # persisting the event won't store the state group.
            if old_state:
                context.current_state = {
                    (s.type, s.state_key): s for s in old_state
                }
            else:
                context.current_state = {}
            context.prev_state_events = []
            context.state_group = None
            defer.returnValue(context)

        if old_state:
            context.current_state = {
                (s.type, s.state_key): s for s in old_state
            }
            context.state_group = None

            if event.is_state():
                key = (event.type, event.state_key)
                if key in context.current_state:
                    replaces = context.current_state[key]
                    if replaces.event_id != event.event_id:  # Paranoia check
                        event.unsigned["replaces_state"] = replaces.event_id

            context.prev_state_events = []
            defer.returnValue(context)

        if event.is_state():
            ret = yield self.resolve_state_groups(
                event.room_id, [e for e, _ in event.prev_events],
                event_type=event.type,
                state_key=event.state_key,
            )
        else:
            ret = yield self.resolve_state_groups(
                event.room_id, [e for e, _ in event.prev_events],
            )

        group, curr_state, prev_state = ret

        context.current_state = curr_state
        context.state_group = group if not event.is_state() else None

        if event.is_state():
            key = (event.type, event.state_key)
            if key in context.current_state:
                replaces = context.current_state[key]
                event.unsigned["replaces_state"] = replaces.event_id

        context.prev_state_events = prev_state
        defer.returnValue(context)

    @defer.inlineCallbacks
    @log_function
    def resolve_state_groups(self, room_id, event_ids, event_type=None, state_key=""):
        """ Given a list of event_ids this method fetches the state at each
        event, resolves conflicts between them and returns them.

        Return format is a tuple: (`state_group`, `state_events`), where the
        first is the name of a state group if one and only one is involved,
        otherwise `None`.
        """
        logger.debug("resolve_state_groups event_ids %s", event_ids)

        if self._state_cache is not None:
            cache = self._state_cache.get(frozenset(event_ids), None)
            if cache and cache.state_group:
                cache.ts = self.clock.time_msec()
                prev_state = cache.state.get((event_type, state_key), None)
                if prev_state:
                    prev_state = prev_state.event_id
                    prev_states = [prev_state]
                else:
                    prev_states = []
                defer.returnValue(
                    (cache.state_group, cache.state, prev_states)
                )

        state_groups = yield self.store.get_state_groups(
            room_id, event_ids
        )

        logger.debug(
            "resolve_state_groups state_groups %s",
            state_groups.keys()
        )

        group_names = set(state_groups.keys())
        if len(group_names) == 1:
            name, state_list = state_groups.items().pop()
            state = {
                (e.type, e.state_key): e
                for e in state_list
            }
            prev_state = state.get((event_type, state_key), None)
            if prev_state:
                prev_state = prev_state.event_id
                prev_states = [prev_state]
            else:
                prev_states = []

            if self._state_cache is not None:
                cache = _StateCacheEntry(
                    state=state,
                    state_group=name,
                    ts=self.clock.time_msec()
                )

                self._state_cache[frozenset(event_ids)] = cache

            defer.returnValue((name, state, prev_states))

        new_state, prev_states = self._resolve_events(
            state_groups.values(), event_type, state_key
        )

        if self._state_cache is not None:
            cache = _StateCacheEntry(
                state=new_state,
                state_group=None,
                ts=self.clock.time_msec()
            )

            self._state_cache[frozenset(event_ids)] = cache

        defer.returnValue((None, new_state, prev_states))

    def resolve_events(self, state_sets, event):
        if event.is_state():
            return self._resolve_events(
                state_sets, event.type, event.state_key
            )
        else:
            return self._resolve_events(state_sets)

    def _resolve_events(self, state_sets, event_type=None, state_key=""):
        state = {}
        for st in state_sets:
            for e in st:
                state.setdefault(
                    (e.type, e.state_key),
                    {}
                )[e.event_id] = e

        unconflicted_state = {
            k: v.values()[0] for k, v in state.items()
            if len(v.values()) == 1
        }

        conflicted_state = {
            k: v.values()
            for k, v in state.items()
            if len(v.values()) > 1
        }

        if event_type:
            prev_states_events = conflicted_state.get(
                (event_type, state_key), []
            )
            prev_states = [s.event_id for s in prev_states_events]
        else:
            prev_states = []

        auth_events = {
            k: e for k, e in unconflicted_state.items()
            if k[0] in AuthEventTypes
        }

        try:
            resolved_state = self._resolve_state_events(
                conflicted_state, auth_events
            )
        except:
            logger.exception("Failed to resolve state")
            raise

        new_state = unconflicted_state
        new_state.update(resolved_state)

        return new_state, prev_states

    @log_function
    def _resolve_state_events(self, conflicted_state, auth_events):
        """ This is where we actually decide which of the conflicted state to
        use.

        We resolve conflicts in the following order:
            1. power levels
            2. memberships
            3. other events.
        """
        resolved_state = {}
        power_key = (EventTypes.PowerLevels, "")
        if power_key in conflicted_state.items():
            power_levels = conflicted_state[power_key]
            resolved_state[power_key] = self._resolve_auth_events(power_levels)

        auth_events.update(resolved_state)

        for key, events in conflicted_state.items():
            if key[0] == EventTypes.JoinRules:
                resolved_state[key] = self._resolve_auth_events(
                    events,
                    auth_events
                )

        auth_events.update(resolved_state)

        for key, events in conflicted_state.items():
            if key[0] == EventTypes.Member:
                resolved_state[key] = self._resolve_auth_events(
                    events,
                    auth_events
                )

        auth_events.update(resolved_state)

        for key, events in conflicted_state.items():
            if key not in resolved_state:
                resolved_state[key] = self._resolve_normal_events(
                    events, auth_events
                )

        return resolved_state

    def _resolve_auth_events(self, events, auth_events):
        reverse = [i for i in reversed(self._ordered_events(events))]

        auth_events = dict(auth_events)

        prev_event = reverse[0]
        for event in reverse[1:]:
            auth_events[(prev_event.type, prev_event.state_key)] = prev_event
            try:
                # FIXME: hs.get_auth() is bad style, but we need to do it to
                # get around circular deps.
                self.hs.get_auth().check(event, auth_events)
                prev_event = event
            except AuthError:
                return prev_event

        return event

    def _resolve_normal_events(self, events, auth_events):
        for event in self._ordered_events(events):
            try:
                # FIXME: hs.get_auth() is bad style, but we need to do it to
                # get around circular deps.
                self.hs.get_auth().check(event, auth_events)
                return event
            except AuthError:
                pass

        # Use the last event (the one with the least depth) if they all fail
        # the auth check.
        return event

    def _ordered_events(self, events):
        def key_func(e):
            return -int(e.depth), hashlib.sha1(e.event_id).hexdigest()

        return sorted(events, key=key_func)
Пример #14
0
class FederationClient(FederationBase):
    def __init__(self, hs):
        super(FederationClient, self).__init__(hs)

    def start_get_pdu_cache(self):
        self._get_pdu_cache = ExpiringCache(
            cache_name="get_pdu_cache",
            clock=self._clock,
            max_len=1000,
            expiry_ms=120 * 1000,
            reset_expiry_on_get=False,
        )

        self._get_pdu_cache.start()

    @log_function
    def send_pdu(self, pdu, destinations):
        """Informs the replication layer about a new PDU generated within the
        home server that should be transmitted to others.

        TODO: Figure out when we should actually resolve the deferred.

        Args:
            pdu (Pdu): The new Pdu.

        Returns:
            Deferred: Completes when we have successfully processed the PDU
            and replicated it to any interested remote home servers.
        """
        order = self._order
        self._order += 1

        sent_pdus_destination_dist.inc_by(len(destinations))

        logger.debug("[%s] transaction_layer.enqueue_pdu... ", pdu.event_id)

        # TODO, add errback, etc.
        self._transaction_queue.enqueue_pdu(pdu, destinations, order)

        logger.debug("[%s] transaction_layer.enqueue_pdu... done",
                     pdu.event_id)

    @log_function
    def send_edu(self, destination, edu_type, content):
        edu = Edu(
            origin=self.server_name,
            destination=destination,
            edu_type=edu_type,
            content=content,
        )

        sent_edus_counter.inc()

        # TODO, add errback, etc.
        self._transaction_queue.enqueue_edu(edu)
        return defer.succeed(None)

    @log_function
    def send_failure(self, failure, destination):
        self._transaction_queue.enqueue_failure(failure, destination)
        return defer.succeed(None)

    @log_function
    def make_query(self,
                   destination,
                   query_type,
                   args,
                   retry_on_dns_fail=False):
        """Sends a federation Query to a remote homeserver of the given type
        and arguments.

        Args:
            destination (str): Domain name of the remote homeserver
            query_type (str): Category of the query type; should match the
                handler name used in register_query_handler().
            args (dict): Mapping of strings to strings containing the details
                of the query request.

        Returns:
            a Deferred which will eventually yield a JSON object from the
            response
        """
        sent_queries_counter.inc(query_type)

        return self.transport_layer.make_query(
            destination, query_type, args, retry_on_dns_fail=retry_on_dns_fail)

    @log_function
    def query_client_keys(self, destination, content):
        """Query device keys for a device hosted on a remote server.

        Args:
            destination (str): Domain name of the remote homeserver
            content (dict): The query content.

        Returns:
            a Deferred which will eventually yield a JSON object from the
            response
        """
        sent_queries_counter.inc("client_device_keys")
        return self.transport_layer.query_client_keys(destination, content)

    @log_function
    def claim_client_keys(self, destination, content):
        """Claims one-time keys for a device hosted on a remote server.

        Args:
            destination (str): Domain name of the remote homeserver
            content (dict): The query content.

        Returns:
            a Deferred which will eventually yield a JSON object from the
            response
        """
        sent_queries_counter.inc("client_one_time_keys")
        return self.transport_layer.claim_client_keys(destination, content)

    @defer.inlineCallbacks
    @log_function
    def backfill(self, dest, context, limit, extremities):
        """Requests some more historic PDUs for the given context from the
        given destination server.

        Args:
            dest (str): The remote home server to ask.
            context (str): The context to backfill.
            limit (int): The maximum number of PDUs to return.
            extremities (list): List of PDU id and origins of the first pdus
                we have seen from the context

        Returns:
            Deferred: Results in the received PDUs.
        """
        logger.debug("backfill extrem=%s", extremities)

        # If there are no extremeties then we've (probably) reached the start.
        if not extremities:
            return

        transaction_data = yield self.transport_layer.backfill(
            dest, context, extremities, limit)

        logger.debug("backfill transaction_data=%s", repr(transaction_data))

        pdus = [
            self.event_from_pdu_json(p, outlier=False)
            for p in transaction_data["pdus"]
        ]

        # FIXME: We should handle signature failures more gracefully.
        pdus[:] = yield defer.gatherResults(
            self._check_sigs_and_hashes(pdus),
            consumeErrors=True,
        ).addErrback(unwrapFirstError)

        defer.returnValue(pdus)

    @defer.inlineCallbacks
    @log_function
    def get_pdu(self, destinations, event_id, outlier=False, timeout=None):
        """Requests the PDU with given origin and ID from the remote home
        servers.

        Will attempt to get the PDU from each destination in the list until
        one succeeds.

        This will persist the PDU locally upon receipt.

        Args:
            destinations (list): Which home servers to query
            pdu_origin (str): The home server that originally sent the pdu.
            event_id (str)
            outlier (bool): Indicates whether the PDU is an `outlier`, i.e. if
                it's from an arbitary point in the context as opposed to part
                of the current block of PDUs. Defaults to `False`
            timeout (int): How long to try (in ms) each destination for before
                moving to the next destination. None indicates no timeout.

        Returns:
            Deferred: Results in the requested PDU.
        """

        # TODO: Rate limit the number of times we try and get the same event.

        if self._get_pdu_cache:
            e = self._get_pdu_cache.get(event_id)
            if e:
                defer.returnValue(e)

        pdu = None
        for destination in destinations:
            try:
                limiter = yield get_retry_limiter(
                    destination,
                    self._clock,
                    self.store,
                )

                with limiter:
                    transaction_data = yield self.transport_layer.get_event(
                        destination,
                        event_id,
                        timeout=timeout,
                    )

                    logger.debug("transaction_data %r", transaction_data)

                    pdu_list = [
                        self.event_from_pdu_json(p, outlier=outlier)
                        for p in transaction_data["pdus"]
                    ]

                    if pdu_list and pdu_list[0]:
                        pdu = pdu_list[0]

                        # Check signatures are correct.
                        pdu = yield self._check_sigs_and_hashes([pdu])[0]

                        break

            except SynapseError:
                logger.info(
                    "Failed to get PDU %s from %s because %s",
                    event_id,
                    destination,
                    e,
                )
                continue
            except CodeMessageException as e:
                if 400 <= e.code < 500:
                    raise

                logger.info(
                    "Failed to get PDU %s from %s because %s",
                    event_id,
                    destination,
                    e,
                )
                continue
            except NotRetryingDestination as e:
                logger.info(e.message)
                continue
            except Exception as e:
                logger.info(
                    "Failed to get PDU %s from %s because %s",
                    event_id,
                    destination,
                    e,
                )
                continue

        if self._get_pdu_cache is not None and pdu:
            self._get_pdu_cache[event_id] = pdu

        defer.returnValue(pdu)

    @defer.inlineCallbacks
    @log_function
    def get_state_for_room(self, destination, room_id, event_id):
        """Requests all of the `current` state PDUs for a given room from
        a remote home server.

        Args:
            destination (str): The remote homeserver to query for the state.
            room_id (str): The id of the room we're interested in.
            event_id (str): The id of the event we want the state at.

        Returns:
            Deferred: Results in a list of PDUs.
        """

        result = yield self.transport_layer.get_room_state(
            destination,
            room_id,
            event_id=event_id,
        )

        pdus = [
            self.event_from_pdu_json(p, outlier=True) for p in result["pdus"]
        ]

        auth_chain = [
            self.event_from_pdu_json(p, outlier=True)
            for p in result.get("auth_chain", [])
        ]

        signed_pdus = yield self._check_sigs_and_hash_and_fetch(destination,
                                                                pdus,
                                                                outlier=True)

        signed_auth = yield self._check_sigs_and_hash_and_fetch(destination,
                                                                auth_chain,
                                                                outlier=True)

        signed_auth.sort(key=lambda e: e.depth)

        defer.returnValue((signed_pdus, signed_auth))

    @defer.inlineCallbacks
    @log_function
    def get_event_auth(self, destination, room_id, event_id):
        res = yield self.transport_layer.get_event_auth(
            destination,
            room_id,
            event_id,
        )

        auth_chain = [
            self.event_from_pdu_json(p, outlier=True)
            for p in res["auth_chain"]
        ]

        signed_auth = yield self._check_sigs_and_hash_and_fetch(destination,
                                                                auth_chain,
                                                                outlier=True)

        signed_auth.sort(key=lambda e: e.depth)

        defer.returnValue(signed_auth)

    @defer.inlineCallbacks
    def make_membership_event(
        self,
        destinations,
        room_id,
        user_id,
        membership,
        content={},
    ):
        """
        Creates an m.room.member event, with context, without participating in the room.

        Does so by asking one of the already participating servers to create an
        event with proper context.

        Note that this does not append any events to any graphs.

        Args:
            destinations (str): Candidate homeservers which are probably
                participating in the room.
            room_id (str): The room in which the event will happen.
            user_id (str): The user whose membership is being evented.
            membership (str): The "membership" property of the event. Must be
                one of "join" or "leave".
            content (object): Any additional data to put into the content field
                of the event.
        Return:
            A tuple of (origin (str), event (object)) where origin is the remote
            homeserver which generated the event.
        """
        valid_memberships = {Membership.JOIN, Membership.LEAVE}
        if membership not in valid_memberships:
            raise RuntimeError(
                "make_membership_event called with membership='%s', must be one of %s"
                % (membership, ",".join(valid_memberships)))
        for destination in destinations:
            if destination == self.server_name:
                continue

            try:
                ret = yield self.transport_layer.make_membership_event(
                    destination, room_id, user_id, membership)

                pdu_dict = ret["event"]

                logger.debug("Got response to make_%s: %s", membership,
                             pdu_dict)

                pdu_dict["content"].update(content)

                # The protoevent received over the JSON wire may not have all
                # the required fields. Lets just gloss over that because
                # there's some we never care about
                if "prev_state" not in pdu_dict:
                    pdu_dict["prev_state"] = []

                defer.returnValue(
                    (destination, self.event_from_pdu_json(pdu_dict)))
                break
            except CodeMessageException:
                raise
            except Exception as e:
                logger.warn("Failed to make_%s via %s: %s", membership,
                            destination, e.message)
                raise

        raise RuntimeError("Failed to send to any server.")

    @defer.inlineCallbacks
    def send_join(self, destinations, pdu):
        for destination in destinations:
            if destination == self.server_name:
                continue

            try:
                time_now = self._clock.time_msec()
                _, content = yield self.transport_layer.send_join(
                    destination=destination,
                    room_id=pdu.room_id,
                    event_id=pdu.event_id,
                    content=pdu.get_pdu_json(time_now),
                )

                logger.debug("Got content: %s", content)

                state = [
                    self.event_from_pdu_json(p, outlier=True)
                    for p in content.get("state", [])
                ]

                auth_chain = [
                    self.event_from_pdu_json(p, outlier=True)
                    for p in content.get("auth_chain", [])
                ]

                pdus = {
                    p.event_id: p
                    for p in itertools.chain(state, auth_chain)
                }

                valid_pdus = yield self._check_sigs_and_hash_and_fetch(
                    destination,
                    pdus.values(),
                    outlier=True,
                )

                valid_pdus_map = {p.event_id: p for p in valid_pdus}

                # NB: We *need* to copy to ensure that we don't have multiple
                # references being passed on, as that causes... issues.
                signed_state = [
                    copy.copy(valid_pdus_map[p.event_id]) for p in state
                    if p.event_id in valid_pdus_map
                ]

                signed_auth = [
                    valid_pdus_map[p.event_id] for p in auth_chain
                    if p.event_id in valid_pdus_map
                ]

                # NB: We *need* to copy to ensure that we don't have multiple
                # references being passed on, as that causes... issues.
                for s in signed_state:
                    s.internal_metadata = copy.deepcopy(s.internal_metadata)

                auth_chain.sort(key=lambda e: e.depth)

                defer.returnValue({
                    "state": signed_state,
                    "auth_chain": signed_auth,
                    "origin": destination,
                })
            except CodeMessageException:
                raise
            except Exception as e:
                logger.exception("Failed to send_join via %s: %s", destination,
                                 e.message)

        raise RuntimeError("Failed to send to any server.")

    @defer.inlineCallbacks
    def send_invite(self, destination, room_id, event_id, pdu):
        time_now = self._clock.time_msec()
        code, content = yield self.transport_layer.send_invite(
            destination=destination,
            room_id=room_id,
            event_id=event_id,
            content=pdu.get_pdu_json(time_now),
        )

        pdu_dict = content["event"]

        logger.debug("Got response to send_invite: %s", pdu_dict)

        pdu = self.event_from_pdu_json(pdu_dict)

        # Check signatures are correct.
        pdu = yield self._check_sigs_and_hash(pdu)

        # FIXME: We should handle signature failures more gracefully.

        defer.returnValue(pdu)

    @defer.inlineCallbacks
    def send_leave(self, destinations, pdu):
        for destination in destinations:
            if destination == self.server_name:
                continue

            try:
                time_now = self._clock.time_msec()
                _, content = yield self.transport_layer.send_leave(
                    destination=destination,
                    room_id=pdu.room_id,
                    event_id=pdu.event_id,
                    content=pdu.get_pdu_json(time_now),
                )

                logger.debug("Got content: %s", content)
                defer.returnValue(None)
            except CodeMessageException:
                raise
            except Exception as e:
                logger.exception("Failed to send_leave via %s: %s",
                                 destination, e.message)

        raise RuntimeError("Failed to send to any server.")

    @defer.inlineCallbacks
    def get_public_rooms(self, destinations):
        results_by_server = {}

        @defer.inlineCallbacks
        def _get_result(s):
            if s == self.server_name:
                defer.returnValue()

            try:
                result = yield self.transport_layer.get_public_rooms(s)
                results_by_server[s] = result
            except:
                logger.exception("Error getting room list from server %r", s)

        yield concurrently_execute(_get_result, destinations, 3)

        defer.returnValue(results_by_server)

    @defer.inlineCallbacks
    def query_auth(self, destination, room_id, event_id, local_auth):
        """
        Params:
            destination (str)
            event_it (str)
            local_auth (list)
        """
        time_now = self._clock.time_msec()

        send_content = {
            "auth_chain": [e.get_pdu_json(time_now) for e in local_auth],
        }

        code, content = yield self.transport_layer.send_query_auth(
            destination=destination,
            room_id=room_id,
            event_id=event_id,
            content=send_content,
        )

        auth_chain = [
            self.event_from_pdu_json(e) for e in content["auth_chain"]
        ]

        signed_auth = yield self._check_sigs_and_hash_and_fetch(destination,
                                                                auth_chain,
                                                                outlier=True)

        signed_auth.sort(key=lambda e: e.depth)

        ret = {
            "auth_chain": signed_auth,
            "rejects": content.get("rejects", []),
            "missing": content.get("missing", []),
        }

        defer.returnValue(ret)

    @defer.inlineCallbacks
    def get_missing_events(self, destination, room_id, earliest_events_ids,
                           latest_events, limit, min_depth):
        """Tries to fetch events we are missing. This is called when we receive
        an event without having received all of its ancestors.

        Args:
            destination (str)
            room_id (str)
            earliest_events_ids (list): List of event ids. Effectively the
                events we expected to receive, but haven't. `get_missing_events`
                should only return events that didn't happen before these.
            latest_events (list): List of events we have received that we don't
                have all previous events for.
            limit (int): Maximum number of events to return.
            min_depth (int): Minimum depth of events tor return.
        """
        try:
            content = yield self.transport_layer.get_missing_events(
                destination=destination,
                room_id=room_id,
                earliest_events=earliest_events_ids,
                latest_events=[e.event_id for e in latest_events],
                limit=limit,
                min_depth=min_depth,
            )

            events = [
                self.event_from_pdu_json(e) for e in content.get("events", [])
            ]

            signed_events = yield self._check_sigs_and_hash_and_fetch(
                destination, events, outlier=False)

            have_gotten_all_from_destination = True
        except HttpResponseException as e:
            if not e.code == 400:
                raise

            # We are probably hitting an old server that doesn't support
            # get_missing_events
            signed_events = []
            have_gotten_all_from_destination = False

        if len(signed_events) >= limit:
            defer.returnValue(signed_events)

        servers = yield self.store.get_joined_hosts_for_room(room_id)

        servers = set(servers)
        servers.discard(self.server_name)

        failed_to_fetch = set()

        while len(signed_events) < limit:
            # Are we missing any?

            seen_events = set(earliest_events_ids)
            seen_events.update(e.event_id for e in signed_events if e)

            missing_events = {}
            for e in itertools.chain(latest_events, signed_events):
                if e.depth > min_depth:
                    missing_events.update({
                        e_id: e.depth
                        for e_id, _ in e.prev_events if e_id not in seen_events
                        and e_id not in failed_to_fetch
                    })

            if not missing_events:
                break

            have_seen = yield self.store.have_events(missing_events)

            for k in have_seen:
                missing_events.pop(k, None)

            if not missing_events:
                break

            # Okay, we haven't gotten everything yet. Lets get them.
            ordered_missing = sorted(missing_events.items(),
                                     key=lambda x: x[0])

            if have_gotten_all_from_destination:
                servers.discard(destination)

            def random_server_list():
                srvs = list(servers)
                random.shuffle(srvs)
                return srvs

            deferreds = [
                self.get_pdu(
                    destinations=random_server_list(),
                    event_id=e_id,
                )
                for e_id, depth in ordered_missing[:limit - len(signed_events)]
            ]

            res = yield defer.DeferredList(deferreds, consumeErrors=True)
            for (result, val), (e_id, _) in zip(res, ordered_missing):
                if result and val:
                    signed_events.append(val)
                else:
                    failed_to_fetch.add(e_id)

        defer.returnValue(signed_events)

    def event_from_pdu_json(self, pdu_json, outlier=False):
        event = FrozenEvent(pdu_json)

        event.internal_metadata.outlier = outlier

        return event

    @defer.inlineCallbacks
    def forward_third_party_invite(self, destinations, room_id, event_dict):
        for destination in destinations:
            if destination == self.server_name:
                continue

            try:
                yield self.transport_layer.exchange_third_party_invite(
                    destination=destination,
                    room_id=room_id,
                    event_dict=event_dict,
                )
                defer.returnValue(None)
            except CodeMessageException:
                raise
            except Exception as e:
                logger.exception(
                    "Failed to send_third_party_invite via %s: %s",
                    destination, e.message)

        raise RuntimeError("Failed to send to any server.")
Пример #15
0
class PreviewUrlResource(Resource):
    isLeaf = True

    def __init__(self, hs, media_repo):
        Resource.__init__(self)

        self.auth = hs.get_auth()
        self.clock = hs.get_clock()
        self.version_string = hs.version_string
        self.filepaths = media_repo.filepaths
        self.max_spider_size = hs.config.max_spider_size
        self.server_name = hs.hostname
        self.store = hs.get_datastore()
        self.client = SpiderHttpClient(hs)
        self.media_repo = media_repo
        self.primary_base_path = media_repo.primary_base_path

        self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist

        # memory cache mapping urls to an ObservableDeferred returning
        # JSON-encoded OG metadata
        self._cache = ExpiringCache(
            cache_name="url_previews",
            clock=self.clock,
            # don't spider URLs more often than once an hour
            expiry_ms=60 * 60 * 1000,
        )
        self._cache.start()

        self._cleaner_loop = self.clock.looping_call(
            self._expire_url_cache_data, 10 * 1000
        )

    def render_OPTIONS(self, request):
        return respond_with_json(request, 200, {}, send_cors=True)

    def render_GET(self, request):
        self._async_render_GET(request)
        return NOT_DONE_YET

    @request_handler()
    @defer.inlineCallbacks
    def _async_render_GET(self, request):

        # XXX: if get_user_by_req fails, what should we do in an async render?
        requester = yield self.auth.get_user_by_req(request)
        url = request.args.get("url")[0]
        if "ts" in request.args:
            ts = int(request.args.get("ts")[0])
        else:
            ts = self.clock.time_msec()

        # XXX: we could move this into _do_preview if we wanted.
        url_tuple = urlparse.urlsplit(url)
        for entry in self.url_preview_url_blacklist:
            match = True
            for attrib in entry:
                pattern = entry[attrib]
                value = getattr(url_tuple, attrib)
                logger.debug((
                    "Matching attrib '%s' with value '%s' against"
                    " pattern '%s'"
                ) % (attrib, value, pattern))

                if value is None:
                    match = False
                    continue

                if pattern.startswith('^'):
                    if not re.match(pattern, getattr(url_tuple, attrib)):
                        match = False
                        continue
                else:
                    if not fnmatch.fnmatch(getattr(url_tuple, attrib), pattern):
                        match = False
                        continue
            if match:
                logger.warn(
                    "URL %s blocked by url_blacklist entry %s", url, entry
                )
                raise SynapseError(
                    403, "URL blocked by url pattern blacklist entry",
                    Codes.UNKNOWN
                )

        # the in-memory cache:
        # * ensures that only one request is active at a time
        # * takes load off the DB for the thundering herds
        # * also caches any failures (unlike the DB) so we don't keep
        #    requesting the same endpoint

        observable = self._cache.get(url)

        if not observable:
            download = preserve_fn(self._do_preview)(
                url, requester.user, ts,
            )
            observable = ObservableDeferred(
                download,
                consumeErrors=True
            )
            self._cache[url] = observable
        else:
            logger.info("Returning cached response")

        og = yield make_deferred_yieldable(observable.observe())
        respond_with_json_bytes(request, 200, og, send_cors=True)

    @defer.inlineCallbacks
    def _do_preview(self, url, user, ts):
        """Check the db, and download the URL and build a preview

        Args:
            url (str):
            user (str):
            ts (int):

        Returns:
            Deferred[str]: json-encoded og data
        """
        # check the URL cache in the DB (which will also provide us with
        # historical previews, if we have any)
        cache_result = yield self.store.get_url_cache(url, ts)
        if (
            cache_result and
            cache_result["expires_ts"] > ts and
            cache_result["response_code"] / 100 == 2
        ):
            defer.returnValue(cache_result["og"])
            return

        media_info = yield self._download_url(url, user)

        logger.debug("got media_info of '%s'" % media_info)

        if _is_media(media_info['media_type']):
            dims = yield self.media_repo._generate_thumbnails(
                None, media_info['filesystem_id'], media_info, url_cache=True,
            )

            og = {
                "og:description": media_info['download_name'],
                "og:image": "mxc://%s/%s" % (
                    self.server_name, media_info['filesystem_id']
                ),
                "og:image:type": media_info['media_type'],
                "matrix:image:size": media_info['media_length'],
            }

            if dims:
                og["og:image:width"] = dims['width']
                og["og:image:height"] = dims['height']
            else:
                logger.warn("Couldn't get dims for %s" % url)

            # define our OG response for this media
        elif _is_html(media_info['media_type']):
            # TODO: somehow stop a big HTML tree from exploding synapse's RAM

            file = open(media_info['filename'])
            body = file.read()
            file.close()

            # clobber the encoding from the content-type, or default to utf-8
            # XXX: this overrides any <meta/> or XML charset headers in the body
            # which may pose problems, but so far seems to work okay.
            match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
            encoding = match.group(1) if match else "utf-8"

            og = decode_and_calc_og(body, media_info['uri'], encoding)

            # pre-cache the image for posterity
            # FIXME: it might be cleaner to use the same flow as the main /preview_url
            # request itself and benefit from the same caching etc.  But for now we
            # just rely on the caching on the master request to speed things up.
            if 'og:image' in og and og['og:image']:
                image_info = yield self._download_url(
                    _rebase_url(og['og:image'], media_info['uri']), user
                )

                if _is_media(image_info['media_type']):
                    # TODO: make sure we don't choke on white-on-transparent images
                    dims = yield self.media_repo._generate_thumbnails(
                        None, image_info['filesystem_id'], image_info, url_cache=True,
                    )
                    if dims:
                        og["og:image:width"] = dims['width']
                        og["og:image:height"] = dims['height']
                    else:
                        logger.warn("Couldn't get dims for %s" % og["og:image"])

                    og["og:image"] = "mxc://%s/%s" % (
                        self.server_name, image_info['filesystem_id']
                    )
                    og["og:image:type"] = image_info['media_type']
                    og["matrix:image:size"] = image_info['media_length']
                else:
                    del og["og:image"]
        else:
            logger.warn("Failed to find any OG data in %s", url)
            og = {}

        logger.debug("Calculated OG for %s as %s" % (url, og))

        jsonog = json.dumps(og)

        # store OG in history-aware DB cache
        yield self.store.store_url_cache(
            url,
            media_info["response_code"],
            media_info["etag"],
            media_info["expires"] + media_info["created_ts"],
            jsonog,
            media_info["filesystem_id"],
            media_info["created_ts"],
        )

        defer.returnValue(jsonog)

    @defer.inlineCallbacks
    def _download_url(self, url, user):
        # TODO: we should probably honour robots.txt... except in practice
        # we're most likely being explicitly triggered by a human rather than a
        # bot, so are we really a robot?

        file_id = datetime.date.today().isoformat() + '_' + random_string(16)

        fpath = self.filepaths.url_cache_filepath_rel(file_id)
        fname = os.path.join(self.primary_base_path, fpath)
        self.media_repo._makedirs(fname)

        try:
            with open(fname, "wb") as f:
                logger.debug("Trying to get url '%s'" % url)
                length, headers, uri, code = yield self.client.get_file(
                    url, output_stream=f, max_size=self.max_spider_size,
                )
                # FIXME: pass through 404s and other error messages nicely

            yield self.media_repo.copy_to_backup(fpath)

            media_type = headers["Content-Type"][0]
            time_now_ms = self.clock.time_msec()

            content_disposition = headers.get("Content-Disposition", None)
            if content_disposition:
                _, params = cgi.parse_header(content_disposition[0],)
                download_name = None

                # First check if there is a valid UTF-8 filename
                download_name_utf8 = params.get("filename*", None)
                if download_name_utf8:
                    if download_name_utf8.lower().startswith("utf-8''"):
                        download_name = download_name_utf8[7:]

                # If there isn't check for an ascii name.
                if not download_name:
                    download_name_ascii = params.get("filename", None)
                    if download_name_ascii and is_ascii(download_name_ascii):
                        download_name = download_name_ascii

                if download_name:
                    download_name = urlparse.unquote(download_name)
                    try:
                        download_name = download_name.decode("utf-8")
                    except UnicodeDecodeError:
                        download_name = None
            else:
                download_name = None

            yield self.store.store_local_media(
                media_id=file_id,
                media_type=media_type,
                time_now_ms=self.clock.time_msec(),
                upload_name=download_name,
                media_length=length,
                user_id=user,
                url_cache=url,
            )

        except Exception as e:
            os.remove(fname)
            raise SynapseError(
                500, ("Failed to download content: %s" % e),
                Codes.UNKNOWN
            )

        defer.returnValue({
            "media_type": media_type,
            "media_length": length,
            "download_name": download_name,
            "created_ts": time_now_ms,
            "filesystem_id": file_id,
            "filename": fname,
            "uri": uri,
            "response_code": code,
            # FIXME: we should calculate a proper expiration based on the
            # Cache-Control and Expire headers.  But for now, assume 1 hour.
            "expires": 60 * 60 * 1000,
            "etag": headers["ETag"][0] if "ETag" in headers else None,
        })

    @defer.inlineCallbacks
    def _expire_url_cache_data(self):
        """Clean up expired url cache content, media and thumbnails.
        """
        # TODO: Delete from backup media store

        now = self.clock.time_msec()

        logger.info("Running url preview cache expiry")

        if not (yield self.store.has_completed_background_updates()):
            logger.info("Still running DB updates; skipping expiry")
            return

        # First we delete expired url cache entries
        media_ids = yield self.store.get_expired_url_cache(now)

        removed_media = []
        for media_id in media_ids:
            fname = self.filepaths.url_cache_filepath(media_id)
            try:
                os.remove(fname)
            except OSError as e:
                # If the path doesn't exist, meh
                if e.errno != errno.ENOENT:
                    logger.warn("Failed to remove media: %r: %s", media_id, e)
                    continue

            removed_media.append(media_id)

            try:
                dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
                for dir in dirs:
                    os.rmdir(dir)
            except Exception:
                pass

        yield self.store.delete_url_cache(removed_media)

        if removed_media:
            logger.info("Deleted %d entries from url cache", len(removed_media))

        # Now we delete old images associated with the url cache.
        # These may be cached for a bit on the client (i.e., they
        # may have a room open with a preview url thing open).
        # So we wait a couple of days before deleting, just in case.
        expire_before = now - 2 * 24 * 60 * 60 * 1000
        media_ids = yield self.store.get_url_cache_media_before(expire_before)

        removed_media = []
        for media_id in media_ids:
            fname = self.filepaths.url_cache_filepath(media_id)
            try:
                os.remove(fname)
            except OSError as e:
                # If the path doesn't exist, meh
                if e.errno != errno.ENOENT:
                    logger.warn("Failed to remove media: %r: %s", media_id, e)
                    continue

            try:
                dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
                for dir in dirs:
                    os.rmdir(dir)
            except Exception:
                pass

            thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id)
            try:
                shutil.rmtree(thumbnail_dir)
            except OSError as e:
                # If the path doesn't exist, meh
                if e.errno != errno.ENOENT:
                    logger.warn("Failed to remove media: %r: %s", media_id, e)
                    continue

            removed_media.append(media_id)

            try:
                dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id)
                for dir in dirs:
                    os.rmdir(dir)
            except Exception:
                pass

        yield self.store.delete_url_cache_media(removed_media)

        logger.info("Deleted %d media from url cache", len(removed_media))
Пример #16
0
class PreviewUrlResource(Resource):
    isLeaf = True

    def __init__(self, hs, media_repo):
        Resource.__init__(self)

        self.auth = hs.get_auth()
        self.clock = hs.get_clock()
        self.version_string = hs.version_string
        self.filepaths = media_repo.filepaths
        self.max_spider_size = hs.config.max_spider_size
        self.server_name = hs.hostname
        self.store = hs.get_datastore()
        self.client = SpiderHttpClient(hs)
        self.media_repo = media_repo

        self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist

        # simple memory cache mapping urls to OG metadata
        self.cache = ExpiringCache(
            cache_name="url_previews",
            clock=self.clock,
            # don't spider URLs more often than once an hour
            expiry_ms=60 * 60 * 1000,
        )
        self.cache.start()

        self.downloads = {}

    def render_GET(self, request):
        self._async_render_GET(request)
        return NOT_DONE_YET

    @request_handler()
    @defer.inlineCallbacks
    def _async_render_GET(self, request):

        # XXX: if get_user_by_req fails, what should we do in an async render?
        requester = yield self.auth.get_user_by_req(request)
        url = request.args.get("url")[0]
        if "ts" in request.args:
            ts = int(request.args.get("ts")[0])
        else:
            ts = self.clock.time_msec()

        url_tuple = urlparse.urlsplit(url)
        for entry in self.url_preview_url_blacklist:
            match = True
            for attrib in entry:
                pattern = entry[attrib]
                value = getattr(url_tuple, attrib)
                logger.debug((
                    "Matching attrib '%s' with value '%s' against"
                    " pattern '%s'"
                ) % (attrib, value, pattern))

                if value is None:
                    match = False
                    continue

                if pattern.startswith('^'):
                    if not re.match(pattern, getattr(url_tuple, attrib)):
                        match = False
                        continue
                else:
                    if not fnmatch.fnmatch(getattr(url_tuple, attrib), pattern):
                        match = False
                        continue
            if match:
                logger.warn(
                    "URL %s blocked by url_blacklist entry %s", url, entry
                )
                raise SynapseError(
                    403, "URL blocked by url pattern blacklist entry",
                    Codes.UNKNOWN
                )

        # first check the memory cache - good to handle all the clients on this
        # HS thundering away to preview the same URL at the same time.
        og = self.cache.get(url)
        if og:
            respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
            return

        # then check the URL cache in the DB (which will also provide us with
        # historical previews, if we have any)
        cache_result = yield self.store.get_url_cache(url, ts)
        if (
            cache_result and
            cache_result["download_ts"] + cache_result["expires"] > ts and
            cache_result["response_code"] / 100 == 2
        ):
            respond_with_json_bytes(
                request, 200, cache_result["og"].encode('utf-8'),
                send_cors=True
            )
            return

        # Ensure only one download for a given URL is active at a time
        download = self.downloads.get(url)
        if download is None:
            download = self._download_url(url, requester.user)
            download = ObservableDeferred(
                download,
                consumeErrors=True
            )
            self.downloads[url] = download

            @download.addBoth
            def callback(media_info):
                del self.downloads[url]
                return media_info
        media_info = yield download.observe()

        # FIXME: we should probably update our cache now anyway, so that
        # even if the OG calculation raises, we don't keep hammering on the
        # remote server.  For now, leave it uncached to aid debugging OG
        # calculation problems

        logger.debug("got media_info of '%s'" % media_info)

        if self._is_media(media_info['media_type']):
            dims = yield self.media_repo._generate_local_thumbnails(
                media_info['filesystem_id'], media_info
            )

            og = {
                "og:description": media_info['download_name'],
                "og:image": "mxc://%s/%s" % (
                    self.server_name, media_info['filesystem_id']
                ),
                "og:image:type": media_info['media_type'],
                "matrix:image:size": media_info['media_length'],
            }

            if dims:
                og["og:image:width"] = dims['width']
                og["og:image:height"] = dims['height']
            else:
                logger.warn("Couldn't get dims for %s" % url)

            # define our OG response for this media
        elif self._is_html(media_info['media_type']):
            # TODO: somehow stop a big HTML tree from exploding synapse's RAM

            from lxml import etree

            file = open(media_info['filename'])
            body = file.read()
            file.close()

            # clobber the encoding from the content-type, or default to utf-8
            # XXX: this overrides any <meta/> or XML charset headers in the body
            # which may pose problems, but so far seems to work okay.
            match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
            encoding = match.group(1) if match else "utf-8"

            try:
                parser = etree.HTMLParser(recover=True, encoding=encoding)
                tree = etree.fromstring(body, parser)
                og = yield self._calc_og(tree, media_info, requester)
            except UnicodeDecodeError:
                # blindly try decoding the body as utf-8, which seems to fix
                # the charset mismatches on https://google.com
                parser = etree.HTMLParser(recover=True, encoding=encoding)
                tree = etree.fromstring(body.decode('utf-8', 'ignore'), parser)
                og = yield self._calc_og(tree, media_info, requester)

        else:
            logger.warn("Failed to find any OG data in %s", url)
            og = {}

        logger.debug("Calculated OG for %s as %s" % (url, og))

        # store OG in ephemeral in-memory cache
        self.cache[url] = og

        # store OG in history-aware DB cache
        yield self.store.store_url_cache(
            url,
            media_info["response_code"],
            media_info["etag"],
            media_info["expires"],
            json.dumps(og),
            media_info["filesystem_id"],
            media_info["created_ts"],
        )

        respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)

    @defer.inlineCallbacks
    def _calc_og(self, tree, media_info, requester):
        # suck our tree into lxml and define our OG response.

        # if we see any image URLs in the OG response, then spider them
        # (although the client could choose to do this by asking for previews of those
        # URLs to avoid DoSing the server)

        # "og:type"         : "video",
        # "og:url"          : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
        # "og:site_name"    : "YouTube",
        # "og:video:type"   : "application/x-shockwave-flash",
        # "og:description"  : "Fun stuff happening here",
        # "og:title"        : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon",
        # "og:image"        : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg",
        # "og:video:url"    : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1",
        # "og:video:width"  : "1280"
        # "og:video:height" : "720",
        # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",

        og = {}
        for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
            og[tag.attrib['property']] = tag.attrib['content']

        # TODO: grab article: meta tags too, e.g.:

        # "article:publisher" : "https://www.facebook.com/thethudonline" />
        # "article:author" content="https://www.facebook.com/thethudonline" />
        # "article:tag" content="baby" />
        # "article:section" content="Breaking News" />
        # "article:published_time" content="2016-03-31T19:58:24+00:00" />
        # "article:modified_time" content="2016-04-01T18:31:53+00:00" />

        if 'og:title' not in og:
            # do some basic spidering of the HTML
            title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
            og['og:title'] = title[0].text.strip() if title else None

        if 'og:image' not in og:
            # TODO: extract a favicon failing all else
            meta_image = tree.xpath(
                "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
            )
            if meta_image:
                og['og:image'] = self._rebase_url(meta_image[0], media_info['uri'])
            else:
                # TODO: consider inlined CSS styles as well as width & height attribs
                images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
                images = sorted(images, key=lambda i: (
                    -1 * int(i.attrib['width']) * int(i.attrib['height'])
                ))
                if not images:
                    images = tree.xpath("//img[@src]")
                if images:
                    og['og:image'] = images[0].attrib['src']

        # pre-cache the image for posterity
        # FIXME: it might be cleaner to use the same flow as the main /preview_url request
        # itself and benefit from the same caching etc.  But for now we just rely on the
        # caching on the master request to speed things up.
        if 'og:image' in og and og['og:image']:
            image_info = yield self._download_url(
                self._rebase_url(og['og:image'], media_info['uri']), requester.user
            )

            if self._is_media(image_info['media_type']):
                # TODO: make sure we don't choke on white-on-transparent images
                dims = yield self.media_repo._generate_local_thumbnails(
                    image_info['filesystem_id'], image_info
                )
                if dims:
                    og["og:image:width"] = dims['width']
                    og["og:image:height"] = dims['height']
                else:
                    logger.warn("Couldn't get dims for %s" % og["og:image"])

                og["og:image"] = "mxc://%s/%s" % (
                    self.server_name, image_info['filesystem_id']
                )
                og["og:image:type"] = image_info['media_type']
                og["matrix:image:size"] = image_info['media_length']
            else:
                del og["og:image"]

        if 'og:description' not in og:
            meta_description = tree.xpath(
                "//*/meta"
                "[translate(@name, 'DESCRIPTION', 'description')='description']"
                "/@content")
            if meta_description:
                og['og:description'] = meta_description[0]
            else:
                # grab any text nodes which are inside the <body/> tag...
                # unless they are within an HTML5 semantic markup tag...
                # <header/>, <nav/>, <aside/>, <footer/>
                # ...or if they are within a <script/> or <style/> tag.
                # This is a very very very coarse approximation to a plain text
                # render of the page.
                text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | "
                                        "ancestor::aside | ancestor::footer | "
                                        "ancestor::script | ancestor::style)]" +
                                        "[ancestor::body]")
                text = ''
                for text_node in text_nodes:
                    if len(text) < 500:
                        text += text_node + ' '
                    else:
                        break
                text = re.sub(r'[\t ]+', ' ', text)
                text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text)
                text = text.strip()[:500]
                og['og:description'] = text if text else None

        # TODO: delete the url downloads to stop diskfilling,
        # as we only ever cared about its OG
        defer.returnValue(og)

    def _rebase_url(self, url, base):
        base = list(urlparse.urlparse(base))
        url = list(urlparse.urlparse(url))
        if not url[0]:  # fix up schema
            url[0] = base[0] or "http"
        if not url[1]:  # fix up hostname
            url[1] = base[1]
            if not url[2].startswith('/'):
                url[2] = re.sub(r'/[^/]+$', '/', base[2]) + url[2]
        return urlparse.urlunparse(url)

    @defer.inlineCallbacks
    def _download_url(self, url, user):
        # TODO: we should probably honour robots.txt... except in practice
        # we're most likely being explicitly triggered by a human rather than a
        # bot, so are we really a robot?

        # XXX: horrible duplication with base_resource's _download_remote_file()
        file_id = random_string(24)

        fname = self.filepaths.local_media_filepath(file_id)
        self.media_repo._makedirs(fname)

        try:
            with open(fname, "wb") as f:
                logger.debug("Trying to get url '%s'" % url)
                length, headers, uri, code = yield self.client.get_file(
                    url, output_stream=f, max_size=self.max_spider_size,
                )
                # FIXME: pass through 404s and other error messages nicely

            media_type = headers["Content-Type"][0]
            time_now_ms = self.clock.time_msec()

            content_disposition = headers.get("Content-Disposition", None)
            if content_disposition:
                _, params = cgi.parse_header(content_disposition[0],)
                download_name = None

                # First check if there is a valid UTF-8 filename
                download_name_utf8 = params.get("filename*", None)
                if download_name_utf8:
                    if download_name_utf8.lower().startswith("utf-8''"):
                        download_name = download_name_utf8[7:]

                # If there isn't check for an ascii name.
                if not download_name:
                    download_name_ascii = params.get("filename", None)
                    if download_name_ascii and is_ascii(download_name_ascii):
                        download_name = download_name_ascii

                if download_name:
                    download_name = urlparse.unquote(download_name)
                    try:
                        download_name = download_name.decode("utf-8")
                    except UnicodeDecodeError:
                        download_name = None
            else:
                download_name = None

            yield self.store.store_local_media(
                media_id=file_id,
                media_type=media_type,
                time_now_ms=self.clock.time_msec(),
                upload_name=download_name,
                media_length=length,
                user_id=user,
            )

        except Exception as e:
            os.remove(fname)
            raise SynapseError(
                500, ("Failed to download content: %s" % e),
                Codes.UNKNOWN
            )

        defer.returnValue({
            "media_type": media_type,
            "media_length": length,
            "download_name": download_name,
            "created_ts": time_now_ms,
            "filesystem_id": file_id,
            "filename": fname,
            "uri": uri,
            "response_code": code,
            # FIXME: we should calculate a proper expiration based on the
            # Cache-Control and Expire headers.  But for now, assume 1 hour.
            "expires": 60 * 60 * 1000,
            "etag": headers["ETag"][0] if "ETag" in headers else None,
        })

    def _is_media(self, content_type):
        if content_type.lower().startswith("image/"):
            return True

    def _is_html(self, content_type):
        content_type = content_type.lower()
        if (
            content_type.startswith("text/html") or
            content_type.startswith("application/xhtml")
        ):
            return True
Пример #17
0
class StateResolutionHandler(object):
    """Responsible for doing state conflict resolution.

    Note that the storage layer depends on this handler, so all functions must
    be storage-independent.
    """
    def __init__(self, hs):
        self.clock = hs.get_clock()

        # dict of set of event_ids -> _StateCacheEntry.
        self._state_cache = None
        self.resolve_linearizer = Linearizer(name="state_resolve_lock")

    def start_caching(self):
        logger.debug("start_caching")

        self._state_cache = ExpiringCache(
            cache_name="state_cache",
            clock=self.clock,
            max_len=SIZE_OF_CACHE,
            expiry_ms=EVICTION_TIMEOUT_SECONDS * 1000,
            iterable=True,
            reset_expiry_on_get=True,
        )

        self._state_cache.start()

    @defer.inlineCallbacks
    @log_function
    def resolve_state_groups(
        self, room_id, state_groups_ids, event_map, state_map_factory,
    ):
        """Resolves conflicts between a set of state groups

        Always generates a new state group (unless we hit the cache), so should
        not be called for a single state group

        Args:
            room_id (str): room we are resolving for (used for logging)
            state_groups_ids (dict[int, dict[(str, str), str]]):
                 map from state group id to the state in that state group
                (where 'state' is a map from state key to event id)

            event_map(dict[str,FrozenEvent]|None):
                a dict from event_id to event, for any events that we happen to
                have in flight (eg, those currently being persisted). This will be
                used as a starting point fof finding the state we need; any missing
                events will be requested via state_map_factory.

                If None, all events will be fetched via state_map_factory.

        Returns:
            Deferred[_StateCacheEntry]: resolved state
        """
        logger.debug(
            "resolve_state_groups state_groups %s",
            state_groups_ids.keys()
        )

        group_names = frozenset(state_groups_ids.keys())

        with (yield self.resolve_linearizer.queue(group_names)):
            if self._state_cache is not None:
                cache = self._state_cache.get(group_names, None)
                if cache:
                    defer.returnValue(cache)

            logger.info(
                "Resolving state for %s with %d groups", room_id, len(state_groups_ids)
            )

            # start by assuming we won't have any conflicted state, and build up the new
            # state map by iterating through the state groups. If we discover a conflict,
            # we give up and instead use `resolve_events_with_factory`.
            #
            # XXX: is this actually worthwhile, or should we just let
            # resolve_events_with_factory do it?
            new_state = {}
            conflicted_state = False
            for st in itervalues(state_groups_ids):
                for key, e_id in iteritems(st):
                    if key in new_state:
                        conflicted_state = True
                        break
                    new_state[key] = e_id
                if conflicted_state:
                    break

            if conflicted_state:
                logger.info("Resolving conflicted state for %r", room_id)
                with Measure(self.clock, "state._resolve_events"):
                    new_state = yield resolve_events_with_factory(
                        list(itervalues(state_groups_ids)),
                        event_map=event_map,
                        state_map_factory=state_map_factory,
                    )

            # if the new state matches any of the input state groups, we can
            # use that state group again. Otherwise we will generate a state_id
            # which will be used as a cache key for future resolutions, but
            # not get persisted.

            with Measure(self.clock, "state.create_group_ids"):
                cache = _make_state_cache_entry(new_state, state_groups_ids)

            if self._state_cache is not None:
                self._state_cache[group_names] = cache

            defer.returnValue(cache)