Example #1
def test_on_close():
    io = CocaineIO.instance()
    # disconnecting a locator that has never been connected must not raise
    locator = Locator("localhost", 10053, loop=io)
    locator.disconnect()

    # a repeated connect() on an already connected locator must be a no-op
    locator = Locator("localhost", 10053, loop=io)
    locator.connect().wait(4)
    locator.connect().wait(4)
    locator.disconnect()
Example #2
    def reelect_app(self, request, app):
        """tries to connect to the same app on differnet host from dist-info"""

        # disconnect app explicitly to break possibly existing connection
        app.disconnect()
        endpoints_size = len(app.locator.endpoints)

        # try x times, where x is the number of different endpoints in the app's locator.
        for _ in xrange(0, endpoints_size + 1):
            # last chance to take app from common pool
            if len(app.locator.endpoints) == 0:
                request.logger.info(
                    "giving up on connecting to dist-info hosts, falling back to common pool processing"
                )
                app = yield self.proxy.reelect_app(request, app)
                raise gen.Return(app)

            try:
                # always create a new locator to prevent locking, as we connect with a timeout;
                # however, the lock can still be held during a TCP timeout
                locator = Locator(endpoints=app.locator.endpoints)
                request.logger.info("connecting to locator %s",
                                    locator.endpoints[0])

                # first try to connect to locator only on remote host with timeout
                yield gen.with_timeout(self.service_connect_timeout,
                                       locator.connect())
                request.logger.debug("connected to locator %s for %s",
                                     locator.endpoints[0], app.name)
                app = Service(app.name,
                              locator=locator,
                              timeout=RESOLVE_TIMEOUT)

                # try to resolve and connect to application itself
                yield gen.with_timeout(self.service_connect_timeout,
                                       app.connect())
                request.logger.debug("connected to application %s via %s",
                                     app.name, app.endpoints)
            except gen.TimeoutError:
                # on timeout, try the next endpoint
                request.logger.warning(
                    "timed out while connecting to application")
                continue
            except ServiceError as err:
                request.logger.warning("got error while resolving app - %s",
                                       err)
                if err.category in LOCATORCATEGORY and err.code == ESERVICENOTAVAILABLE:
                    # if the application is down - also try next endpoint
                    continue
                else:
                    raise err
            finally:
                # drop the first endpoint so the next connection attempt starts from a different one;
                # we do this because the locator's default connection-attempt logic does not fit here
                app.locator.endpoints = app.locator.endpoints[1:]
            # return connected app
            raise gen.Return(app)
        raise PluginApplicationError(42, 42,
                                     "could not connect to application")
Example #3
    def reelect_app(self, request, app):
        """tries to connect to the same app on differnet host from dist-info"""

        # store current endpoints of locator
        locator_endpoints = app.locator.endpoints

        # disconnect app explicitly to break possibly existing connection
        app.disconnect()
        endpoints_size = len(locator_endpoints)

        # try x times, where x is the number of different endpoints in the app's locator.
        for _ in xrange(0, endpoints_size):
            try:
                # move the first endpoint to the end so the new connection starts from a different endpoint;
                # we do this because the locator's default connection-attempt logic does not fit here
                locator_endpoints = locator_endpoints[1:] + locator_endpoints[:1]

                # always create new locator to prevent locking as we do connect with timeout
                # however lock can be still held during TCP timeout
                locator = Locator(endpoints=locator_endpoints)
                request.logger.info("connecting to locator %s", locator.endpoints[0])

                # first try to connect to locator only on remote host with timeout
                yield gen.with_timeout(self.service_connect_timeout, locator.connect())
                request.logger.debug("connected to locator %s for %s", locator.endpoints[0], app.name)
                app = Service(app.name, locator=locator, timeout=RESOLVE_TIMEOUT)

                # try to resolve and connect to application itself
                yield gen.with_timeout(self.service_connect_timeout, app.connect())
                request.logger.debug("connected to application %s via %s", app.name, app.endpoints)
            except gen.TimeoutError:
                # on timeout, try the next endpoint
                request.logger.warning("timed out while connecting to application")
                continue
            except ServiceError as err:
                request.logger.warning("got error while resolving app - %s", err)
                if err.category in LOCATORCATEGORY and err.code == ESERVICENOTAVAILABLE:
                    # if the application is down - also try next endpoint
                    continue
                else:
                    raise err
            # return connected app
            raise gen.Return(app)
        raise PluginApplicationError(42, 42, "could not connect to application")
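The rotation in Example #3 is plain list slicing combined with tornado's gen.with_timeout. Below is a minimal standalone sketch of the same pattern; connect_any and its connect argument are illustrative names, not part of the cocaine API:

from datetime import timedelta

from tornado import gen


@gen.coroutine
def connect_any(endpoints, connect, timeout_sec=1.0):
    """Try each endpoint once; `connect` is a caller-supplied coroutine
    that takes a single endpoint and yields a connected object."""
    for _ in range(len(endpoints)):
        # rotate: move the head endpoint to the tail, as in Example #3
        endpoints = endpoints[1:] + endpoints[:1]
        try:
            result = yield gen.with_timeout(timedelta(seconds=timeout_sec),
                                            connect(endpoints[0]))
        except gen.TimeoutError:
            # this endpoint did not answer in time, try the next one
            continue
        raise gen.Return(result)
    raise RuntimeError("no endpoint answered within %.1f seconds" % timeout_sec)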
Example #4
class CocaineProxy(object):
    def __init__(self, locators=("localhost:10053",),
                 cache=DEFAULT_SERVICE_CACHE_COUNT,
                 request_id_header="", sticky_header="X-Cocaine-Sticky",
                 ioloop=None, **config):
        # stats
        self.requests_in_progress = 0
        self.requests_disconnections = 0
        self.requests_total = 0

        self.io_loop = ioloop or tornado.ioloop.IOLoop.current()
        self.serviceCacheCount = cache
        self.spoolSize = int(self.serviceCacheCount * 1.5)
        self.refreshPeriod = config.get("refresh_timeout", DEFAULT_REFRESH_PERIOD)
        self.timeouts = config.get("timeouts", {})
        self.locator_endpoints = map(parse_locators_endpoints, locators)
        # it's initialized after start
        # to avoid creating an io_loop before fork
        self.locator = Locator(endpoints=self.locator_endpoints)

        # active applications
        self.cache = collections.defaultdict(list)

        self.logger = ContextAdapter(logging.getLogger("cocaine.proxy"), {"id": "0" * 16})
        self.tracking_logger = logging.getLogger("cocaine.proxy.tracking")
        self.logger.info("locators %s", ','.join("%s:%d" % (h, p) for h, p in self.locator_endpoints))

        self.sticky_header = sticky_header

        if request_id_header:
            self.get_request_id = functools.partial(get_request_id, request_id_header)
        else:
            self.get_request_id = generate_request_id

        # post the watcher for routing groups
        self.io_loop.add_future(self.on_routing_groups_update(),
                                lambda x: self.logger.error("the updater must not exit"))

    @gen.coroutine
    def on_routing_groups_update(self):
        uid = gen_uid()
        self.logger.info("generate new uniqque id %s", uid)
        maximum_timeout = 32  # sec
        timeout = 1  # sec
        while True:
            current = {}
            try:
                self.logger.info("subscribe to updates with id %s", uid)
                channel = yield self.locator.routing(uid, True)
                timeout = 1
                while True:
                    new = yield channel.rx.get()
                    if isinstance(new, EmptyResponse):
                        # it means that cocaine has been stopped
                        self.logger.info("locator sends close")
                        break
                    updates = scan_for_updates(current, new)
                    # replace current
                    current = new
                    if len(updates) == 0:
                        self.logger.info("locator sends an update message, "
                                         "but no updates have been found")
                        continue

                    self.logger.info("%d routing groups have been refreshed %s",
                                     len(updates), updates)
                    for group in updates:
                        # if we have not created an instance of
                        # the group, it is absent from the cache
                        if group not in self.cache:
                            self.logger.info("nothing to update in group %s", group)
                            continue

                        for app in self.cache[group]:
                            self.logger.info("%d: move %s to the inactive queue to refresh"
                                             " routing group", app.id, app.name)
                            self.migrate_from_cache_to_inactive(app, group)
            except Exception as err:
                timeout = min(timeout << 1, maximum_timeout)
                self.logger.error("error occured while watching for group updates %s. Sleep %d",
                                  err, timeout)
                yield gen.sleep(timeout)

    def get_timeout(self, name):
        return self.timeouts.get(name, DEFAULT_TIMEOUT)

    def migrate_from_cache_to_inactive(self, app, name):
        try:
            self.cache[name].remove(app)
        except ValueError as err:
            self.logger.error("broken cache: %s", err)
        except KeyError as err:
            self.logger.error("broken cache: no such key %s", err)

        self.io_loop.call_later(self.get_timeout(name) * 3,
                                functools.partial(self.dispose, app, name))

    def move_to_inactive(self, app, name):
        def wrapper():
            active_apps = len(self.cache[name])
            if active_apps < self.serviceCacheCount:
                self.io_loop.call_later(self.get_timeout(name), self.move_to_inactive(app, name))
                return

            self.logger.info("%s: move %s %s to an inactive queue (active %d)",
                             app.id, app.name, "{0}:{1}".format(*app.address), active_apps)
            self.migrate_from_cache_to_inactive(app, name)
        return wrapper

    def dispose(self, app, name):
        self.logger.info("dispose service %s %s", name, app.id)
        app.disconnect()

    @context
    @gen.coroutine
    def __call__(self, request):
        if "X-Cocaine-Service" in request.headers and "X-Cocaine-Event" in request.headers:
            request.logger.debug('dispatch by headers')
            name = request.headers['X-Cocaine-Service']
            event = request.headers['X-Cocaine-Event']
        else:
            request.logger.debug('dispatch by uri')
            match = URL_REGEX.match(request.uri)
            if match is None:
                if request.path == "/ping":
                    try:
                        yield self.locator.connect()
                        fill_response_in(request, httplib.OK, "OK", "OK")
                    except Exception as err:
                        request.logger.error("unable to conenct to the locator: %s", err)
                        fill_response_in(request, httplib.SERVICE_UNAVAILABLE,
                                         httplib.responses[httplib.SERVICE_UNAVAILABLE],
                                         "locator is unavailable")
                elif request.path == '/__info':
                    # TODO: maybe we should remove keys with zero-length values from the cache
                    # to avoid memory consumption for the strings and the dict
                    body = json.dumps({
                        'services': {
                            'cache': dict(((k, len(v)) for k, v in self.cache.items())),
                        },
                        'requests': {
                            'inprogress': self.requests_in_progress,
                            'total': self.requests_total,
                        },
                        'errors': {
                            'disconnections': self.requests_disconnections,
                        }
                    }, sort_keys=True)
                    headers = httputil.HTTPHeaders({"Content-Type": "application/json"})
                    fill_response_in(request, httplib.OK, httplib.responses[httplib.OK],
                                     body, headers)
                else:
                    fill_response_in(request, httplib.NOT_FOUND,
                                     httplib.responses[httplib.NOT_FOUND], "Invalid url")
                return

            name, event, other = match.groups()
            if name == '' or event == '':
                fill_response_in(request, httplib.BAD_REQUEST,
                                 httplib.responses[httplib.BAD_REQUEST], "Proxy invalid request")
                return

            # Drop the app name and the event name from the query
            if not other.startswith('/'):
                other = "/" + other
            request.uri = other
            request.path, _, _ = other.partition("?")

        if self.sticky_header not in request.headers:
            app = yield self.get_service(name, request)
        else:
            seed = request.headers.get(self.sticky_header)
            request.logger.info('sticky_header has been found: %s', seed)
            app = yield self.get_service_with_seed(name, seed, request)

        if app is None:
            message = "current application %s is unavailable" % name
            fill_response_in(request, NO_SUCH_APP,
                             "No Such Application", message)
            return

        try:
            request.logger.debug("%s: processing request app: `%s`, event `%s`",
                                 app.id, app.name, event)
            yield self.process(request, name, app, event, pack_httprequest(request))
        except Exception as err:
            request.logger.error("error during processing request %s", err)
            fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                             httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                             "UID %s: %s" % (request.traceid, str(err)))

        request.logger.info("exit from process")

    @gen.coroutine
    def process(self, request, name, app, event, data):
        request.logger.info("start processing request after %.3f ms", request.request_time() * 1000)
        timeout = self.get_timeout(name)
        # allow reconnecting this many times.
        attempts = 2  # make it configurable

        parentid = 0

        if request.traceid is not None:
            traceid = int(request.traceid, 16)
            trace = Trace(traceid=traceid, spanid=traceid, parentid=parentid)
        else:
            trace = None

        while attempts > 0:
            headers = {}
            body_parts = []
            attempts -= 1
            try:
                request.logger.info("%s: enqueue event (attempt %d)", app.id, attempts)
                channel = yield app.enqueue(event, trace=trace)
                request.logger.debug("%s: send event data (attempt %d)", app.id, attempts)
                yield channel.tx.write(msgpack.packb(data), trace=trace)
                yield channel.tx.close(trace=trace)
                request.logger.debug("%s: waiting for a code and headers (attempt %d)",
                                     app.id, attempts)
                code_and_headers = yield channel.rx.get(timeout=timeout)
                request.logger.debug("%s: code and headers have been received (attempt %d)",
                                     app.id, attempts)
                code, raw_headers = msgpack.unpackb(code_and_headers)
                headers = tornado.httputil.HTTPHeaders(raw_headers)
                while True:
                    body = yield channel.rx.get(timeout=timeout)
                    if isinstance(body, EmptyResponse):
                        request.logger.info("%s: body finished (attempt %d)", app.id, attempts)
                        break

                    request.logger.debug("%s: received %d bytes as a body chunk (attempt %d)",
                                         app.id, len(body), attempts)
                    body_parts.append(body)
            except gen.TimeoutError as err:
                request.logger.error("%s %s:  %s", app.id, name, err)
                message = "UID %s: application `%s` error: %s" % (request.traceid, name, str(err))
                fill_response_in(request, httplib.GATEWAY_TIMEOUT,
                                 httplib.responses[httplib.GATEWAY_TIMEOUT], message)

            except (DisconnectionError, StreamClosedError) as err:
                self.requests_disconnections += 1
                # Probably it's dangerous to retry requests all the time.
                # I must find a way to determine whether it failed while writing
                # or while reading the reply, and retry only write failures.
                request.logger.error("%s: %s", app.id, err)
                if attempts <= 0:
                    request.logger.info("%s: no more attempts", app.id)
                    fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                     httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                                     "UID %s: Connection problem" % request.traceid)
                    return

                # It seems the on_close callback is not called when connecting through IPVS.
                # We detect the disconnection here to avoid unnecessary errors.
                # Try to reconnect here and give the request another go
                try:
                    start_time = time.time()
                    reconn_timeout = timeout - request.request_time()
                    request.logger.info("%s: connecting with timeout %.fms", app.id, reconn_timeout * 1000)
                    yield gen.with_timeout(start_time + reconn_timeout, app.connect(request.logger.traceid))
                    reconn_time = time.time() - start_time
                    request.logger.info("%s: connecting took %.3fms", app.id, reconn_time * 1000)
                except Exception as err:
                    if attempts <= 0:
                        # no attempts left, so quit here
                        request.logger.error("%s: %s (no attempts left)", app.id, err)
                        message = "UID %s: application `%s` error: %s" % (request.traceid, name, str(err))
                        fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                         httplib.responses[httplib.INTERNAL_SERVER_ERROR], message)
                        return

                    request.logger.error("%s: unable to reconnect: %s (%d attempts left)", err, attempts)
                # We still have an attempt to process the request again.
                # Jump to the beginning of `while attempts > 0`, whether we connected
                # successfully or failed to connect
                continue

            except ServiceError as err:
                # if the application has been restarted, we get a broken pipe code
                # and the system category
                if err.code == errno.EPIPE and err.category == ESYSTEMCATEGORY:
                    request.logger.error("%s: the application has been restarted", app.id)
                    app.disconnect()
                    continue

                request.logger.error("%s: %s", app.id, err)
                message = "UID %s: application `%s` error: %s" % (request.traceid, name, str(err))
                fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                 httplib.responses[httplib.INTERNAL_SERVER_ERROR], message)

            except Exception as err:
                request.logger.error("%s: %s", app.id, err)
                message = "UID %s: unknown `%s` error: %s" % (request.traceid, name, str(err))
                fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                 httplib.responses[httplib.INTERNAL_SERVER_ERROR], message)
            else:
                message = ''.join(body_parts)
                fill_response_in(request, code,
                                 httplib.responses.get(code, httplib.OK),
                                 message, headers)
            # return on any error except disconnection,
            # or after receiving a good reply
            return

    @gen.coroutine
    def get_service(self, name, request):
        # cache isn't full for the current application
        if len(self.cache[name]) < self.spoolSize:
            logger = request.logger
            # create the service outside the try block so `app` is always
            # defined when the error handler below references app.id
            app = Service(name, locator=self.locator, timeout=RESOLVE_TIMEOUT)
            try:
                logger.info("%s: creating an instance of %s", app.id, name)
                self.cache[name].append(app)
                yield app.connect(request.traceid)
                logger.info("%s: connect to an app %s endpoint %s ",
                            app.id, app.name, "{0}:{1}".format(*app.address))

                timeout = (1 + random.random()) * self.refreshPeriod
                self.io_loop.call_later(timeout, self.move_to_inactive(app, name))
            except Exception as err:
                logger.error("%s: unable to connect to `%s`: %s", app.id, name, err)
                if app in self.cache[name]:
                    self.cache[name].remove(app)
                raise gen.Return()
            else:
                raise gen.Return(app)

        # get an instance from cache
        chosen = random.choice(self.cache[name])
        raise gen.Return(chosen)

    @gen.coroutine
    def get_service_with_seed(self, name, seed, request):
        logger = request.logger
        app = Service(name, seed=seed, locator=self.locator)
        try:
            logger.info("%s: creating an instance of %s, seed %s", app.id, name, seed)
            yield app.connect(logger.traceid)
        except Exception as err:
            logger.error("%s: unable to connect to `%s`: %s", app.id, name, err)
            raise gen.Return()

        raise gen.Return(app)
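
The watcher in Example #4 (on_routing_groups_update) retries failed subscriptions with a doubling delay capped at maximum_timeout, and resets the delay to one second after each successful subscription. Here is the same backoff loop in isolation; watch_forever and subscribe are illustrative names, not part of the cocaine API:

from tornado import gen


@gen.coroutine
def watch_forever(subscribe, maximum_timeout=32):
    """Run `subscribe` (a caller-supplied coroutine) forever,
    backing off exponentially on failures."""
    timeout = 1  # seconds
    while True:
        try:
            yield subscribe()
            timeout = 1  # a successful round resets the backoff
        except Exception:
            timeout = min(timeout << 1, maximum_timeout)  # 1, 2, 4, ... 32
            yield gen.sleep(timeout)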