def test_on_close():
    """Smoke-test Locator disconnect/connect sequences on a shared loop.

    Covers two scenarios against ``localhost:10053``:
    disconnecting a locator that was never connected, and connecting
    twice in a row before disconnecting.
    """
    host, port = "localhost", 10053
    service_loop = CocaineIO.instance()

    # Scenario 1: disconnect on a freshly created, never connected locator.
    locator = Locator(host, port, loop=service_loop)
    locator.disconnect()

    # Scenario 2: two consecutive connects (each waited on for up to 4),
    # followed by a disconnect.
    locator = Locator(host, port, loop=service_loop)
    for _ in range(2):
        locator.connect().wait(4)
    locator.disconnect()
def reelect_app(self, request, app):
    """Try to reach the same application through another dist-info host.

    Generator-style coroutine body (it yields futures and returns through
    ``gen.Return``); the decorator, if any, is not visible here.

    Each pass builds a new ``Locator`` from the endpoints still stored on
    ``app.locator``, connects to it and then to the application, both under
    ``self.service_connect_timeout``. After every pass - successful or
    not - the first endpoint is dropped from ``app.locator.endpoints``.
    Once no endpoints are left the call is delegated to
    ``self.proxy.reelect_app``.

    :param request: request object; only ``request.logger`` is used here.
    :param app: service to re-elect; it is disconnected first.
    :returns: (via ``gen.Return``) a connected service.
    :raises ServiceError: for any error other than "service not available"
        in the locator category.
    :raises PluginApplicationError: if the loop ends without a result.
    """
    # Explicitly break a possibly existing connection first.
    app.disconnect()

    # One pass per endpoint known right now, plus a final pass that hands
    # the request over to the common pool.
    passes = len(app.locator.endpoints) + 1
    for _attempt in xrange(passes):
        # Last chance: nothing left to try, take the app from the common pool.
        if len(app.locator.endpoints) == 0:
            request.logger.info(
                "giving up on connecting to dist-info hosts, falling back to common pool processing"
            )
            app = yield self.proxy.reelect_app(request, app)
            raise gen.Return(app)

        try:
            # A new locator is created every time to prevent locking, as the
            # connect is done with a timeout (the lock can still be held
            # during a TCP timeout).
            locator = Locator(endpoints=app.locator.endpoints)
            request.logger.info("connecting to locator %s", locator.endpoints[0])

            # Step 1: connect to the locator on the remote host only.
            yield gen.with_timeout(self.service_connect_timeout, locator.connect())
            request.logger.debug("connected to locator %s for %s", locator.endpoints[0], app.name)

            # Step 2: resolve and connect to the application itself.
            app = Service(app.name, locator=locator, timeout=RESOLVE_TIMEOUT)
            yield gen.with_timeout(self.service_connect_timeout, app.connect())
            request.logger.debug("connected to application %s via %s", app.name, app.endpoints)
        except gen.TimeoutError:
            # Timed out: move on to the next endpoint.
            request.logger.warning("timed out while connecting to application")
            continue
        except ServiceError as err:
            request.logger.warning("got error while resolving app - %s", err)
            app_is_down = err.category in LOCATORCATEGORY and err.code == ESERVICENOTAVAILABLE
            if not app_is_down:
                raise err
            # The application is down on this host: try the next endpoint.
            continue
        finally:
            # Drop the first endpoint so that the next connection starts from
            # a different one; the locator's default connection logic does
            # not fit here.
            app.locator.endpoints = app.locator.endpoints[1:]

        # Connected: hand the service back to the caller.
        raise gen.Return(app)

    raise PluginApplicationError(42, 42, "could not connect to application")
def reelect_app(self, request, app):
    """Try to reach the same application through another dist-info host.

    Generator-style coroutine body (it yields futures and returns through
    ``gen.Return``); the decorator, if any, is not visible here.

    The endpoints of ``app.locator`` are captured once and rotated by one
    position before every attempt, so each attempt starts from a different
    endpoint. The number of attempts equals the number of endpoints.

    :param request: request object; only ``request.logger`` is used here.
    :param app: service to re-elect; it is disconnected first.
    :returns: (via ``gen.Return``) a connected service.
    :raises ServiceError: for any error other than "service not available"
        in the locator category.
    :raises PluginApplicationError: if every attempt failed.
    """
    # Remember the endpoints before touching the connection.
    candidates = app.locator.endpoints

    # Explicitly break a possibly existing connection.
    app.disconnect()

    attempts = len(candidates)
    for _attempt in xrange(attempts):
        # Move the first endpoint to the end so the new connection starts
        # from a different endpoint; the locator's default connection logic
        # does not fit here.
        candidates = candidates[1:] + candidates[:1]

        try:
            # A new locator is created every time to prevent locking, as the
            # connect is done with a timeout (the lock can still be held
            # during a TCP timeout).
            locator = Locator(endpoints=candidates)
            request.logger.info("connecting to locator %s", locator.endpoints[0])

            # Step 1: connect to the locator on the remote host only.
            yield gen.with_timeout(self.service_connect_timeout, locator.connect())
            request.logger.debug("connected to locator %s for %s", locator.endpoints[0], app.name)

            # Step 2: resolve and connect to the application itself.
            app = Service(app.name, locator=locator, timeout=RESOLVE_TIMEOUT)
            yield gen.with_timeout(self.service_connect_timeout, app.connect())
            request.logger.debug("connected to application %s via %s", app.name, app.endpoints)
        except gen.TimeoutError:
            # Timed out: move on to the next endpoint.
            request.logger.warning("timed out while connecting to application")
            continue
        except ServiceError as err:
            request.logger.warning("got error while resolving app - %s", err)
            app_is_down = err.category in LOCATORCATEGORY and err.code == ESERVICENOTAVAILABLE
            if not app_is_down:
                raise err
            # The application is down on this host: try the next endpoint.
            continue

        # Connected: hand the service back to the caller.
        raise gen.Return(app)

    raise PluginApplicationError(42, 42, "could not connect to application")
class CocaineProxy(object):
    """Callable that dispatches HTTP requests to Cocaine applications.

    Instances keep a per-application list of connected ``Service`` objects
    in ``self.cache`` and watch the locator for routing group updates in
    order to retire services whose routing group has changed.
    """

    def __init__(self, locators=("localhost:10053",),
                 cache=DEFAULT_SERVICE_CACHE_COUNT,
                 request_id_header="", sticky_header="X-Cocaine-Sticky",
                 ioloop=None, **config):
        """Set up counters, the locator, the service cache and the watcher.

        :param locators: iterable of locator endpoints, each one parsed by
            ``parse_locators_endpoints``.
        :param cache: stored as ``serviceCacheCount``; ``spoolSize`` is
            derived from it as ``int(cache * 1.5)``.
        :param request_id_header: if non-empty, request ids are produced by
            ``get_request_id`` bound to this header name, otherwise by
            ``generate_request_id``.
        :param sticky_header: header whose presence selects
            ``get_service_with_seed`` instead of ``get_service``.
        :param ioloop: IOLoop to use; defaults to the current tornado loop.
        :param config: optional ``refresh_timeout`` and ``timeouts`` keys
            are read; everything else is ignored.
        """
        # stats
        self.requests_in_progress = 0
        self.requests_disconnections = 0
        self.requests_total = 0

        self.io_loop = ioloop or tornado.ioloop.IOLoop.current()
        self.serviceCacheCount = cache
        self.spoolSize = int(self.serviceCacheCount * 1.5)
        self.refreshPeriod = config.get("refresh_timeout", DEFAULT_REFRESH_PERIOD)
        self.timeouts = config.get("timeouts", {})
        # Materialize as a list: the endpoints are consumed twice below
        # (by the Locator and by the log line).
        self.locator_endpoints = [parse_locators_endpoints(item) for item in locators]
        # it's initialized after start
        # to avoid an io_loop creation before fork
        self.locator = Locator(endpoints=self.locator_endpoints)

        # active applications
        self.cache = collections.defaultdict(list)

        self.logger = ContextAdapter(logging.getLogger("cocaine.proxy"), {"id": "0" * 16})
        self.tracking_logger = logging.getLogger("cocaine.proxy.tracking")
        self.logger.info("locators %s", ','.join("%s:%d" % (h, p) for h, p in self.locator_endpoints))

        self.sticky_header = sticky_header

        if request_id_header:
            self.get_request_id = functools.partial(get_request_id, request_id_header)
        else:
            self.get_request_id = generate_request_id

        # post the watcher for routing groups
        self.io_loop.add_future(self.on_routing_groups_update(),
                                lambda x: self.logger.error("the updater must not exit"))

    @gen.coroutine
    def on_routing_groups_update(self):
        """Watch the locator for routing group updates, forever.

        Subscribes via ``self.locator.routing`` and, for every refreshed
        group that is present in ``self.cache``, moves all cached services
        of that group to the inactive queue. On any exception the loop
        sleeps with a doubling back-off (capped at 32 seconds) and then
        subscribes again.
        """
        uid = gen_uid()
        self.logger.info("generate new uniqque id %s", uid)
        maximum_timeout = 32  # sec
        timeout = 1  # sec
        while True:
            current = {}
            try:
                self.logger.info("subscribe to updates with id %s", uid)
                channel = yield self.locator.routing(uid, True)
                # a successful subscription resets the back-off
                timeout = 1
                while True:
                    new = yield channel.rx.get()
                    if isinstance(new, EmptyResponse):
                        # it means that the cocaine has been stopped
                        self.logger.info("locator sends close")
                        break

                    updates = scan_for_updates(current, new)
                    # replace current
                    current = new
                    if len(updates) == 0:
                        self.logger.info("locator sends an update message, "
                                         "but no updates have been found")
                        continue

                    self.logger.info("%d routing groups have been refreshed %s",
                                     len(updates), updates)
                    for group in updates:
                        # if we have not created an instance of
                        # the group it is absent in cache
                        if group not in self.cache:
                            self.logger.info("nothing to update in group %s", group)
                            continue

                        # Iterate over a snapshot: migrate_from_cache_to_inactive
                        # removes the app from self.cache[group], and removing
                        # from a list while iterating it skips elements.
                        for app in list(self.cache[group]):
                            # %s (not %d) for the id, as in every other log line.
                            self.logger.info("%s: move %s to the inactive queue to refresh"
                                             " routing group", app.id, app.name)
                            self.migrate_from_cache_to_inactive(app, group)
            except Exception as err:
                timeout = min(timeout << 1, maximum_timeout)
                self.logger.error("error occured while watching for group updates %s. Sleep %d",
                                  err, timeout)
                yield gen.sleep(timeout)

    def get_timeout(self, name):
        """Return the configured timeout for ``name`` or ``DEFAULT_TIMEOUT``."""
        return self.timeouts.get(name, DEFAULT_TIMEOUT)

    def migrate_from_cache_to_inactive(self, app, name):
        """Remove ``app`` from the cache and schedule its disposal.

        A missing entry is only logged; ``dispose`` is scheduled in any case
        after three timeouts of the application.
        """
        try:
            self.cache[name].remove(app)
        except ValueError as err:
            self.logger.error("broken cache: %s", err)
        except KeyError as err:
            self.logger.error("broken cache: no such key %s", err)

        self.io_loop.call_later(self.get_timeout(name) * 3,
                                functools.partial(self.dispose, app, name))

    def move_to_inactive(self, app, name):
        """Build a callback that retires ``app`` from the cache.

        When invoked, the callback re-schedules itself while fewer than
        ``serviceCacheCount`` services are cached for ``name``; otherwise it
        moves ``app`` to the inactive queue.

        :returns: a zero-argument callable suitable for ``call_later``.
        """
        def wrapper():
            active_apps = len(self.cache[name])
            if active_apps < self.serviceCacheCount:
                # Too few active services: keep this one and check again later.
                self.io_loop.call_later(self.get_timeout(name),
                                        self.move_to_inactive(app, name))
                return
            self.logger.info("%s: move %s %s to an inactive queue (active %d)",
                             app.id, app.name, "{0}:{1}".format(*app.address), active_apps)
            self.migrate_from_cache_to_inactive(app, name)
        return wrapper

    def dispose(self, app, name):
        """Log and disconnect a service that has left the cache."""
        self.logger.info("dispose service %s %s", name, app.id)
        app.disconnect()

    @context
    @gen.coroutine
    def __call__(self, request):
        """Handle a single HTTP request.

        The target application and event are taken from the
        ``X-Cocaine-Service`` / ``X-Cocaine-Event`` headers or, failing that,
        from the URI matched against ``URL_REGEX``. URIs that do not match
        are served locally: ``/ping``, ``/__info``, or a 404 otherwise.
        All responses are written through ``fill_response_in``.
        """
        if "X-Cocaine-Service" in request.headers and "X-Cocaine-Event" in request.headers:
            request.logger.debug('dispatch by headers')
            name = request.headers['X-Cocaine-Service']
            event = request.headers['X-Cocaine-Event']
        else:
            request.logger.debug('dispatch by uri')
            match = URL_REGEX.match(request.uri)
            if match is None:
                if request.path == "/ping":
                    try:
                        yield self.locator.connect()
                        fill_response_in(request, httplib.OK, "OK", "OK")
                    except Exception as err:
                        request.logger.error("unable to conenct to the locator: %s", err)
                        fill_response_in(request, httplib.SERVICE_UNAVAILABLE,
                                         httplib.responses[httplib.SERVICE_UNAVAILABLE],
                                         "locator is unavailable")
                elif request.path == '/__info':
                    # ToDo: may we should remove keys with len == 0 values from cache
                    # to avoid memory consumption for strings and the dict
                    body = json.dumps({
                        'services': {
                            'cache': dict((k, len(v)) for k, v in self.cache.items()),
                        },
                        'requests': {
                            'inprogress': self.requests_in_progress,
                            'total': self.requests_total,
                        },
                        'errors': {
                            'disconnections': self.requests_disconnections,
                        }
                    }, sort_keys=True)
                    headers = httputil.HTTPHeaders({"Content-Type": "application/json"})
                    fill_response_in(request, httplib.OK, httplib.responses[httplib.OK],
                                     body, headers)
                else:
                    fill_response_in(request, httplib.NOT_FOUND,
                                     httplib.responses[httplib.NOT_FOUND], "Invalid url")
                return

            name, event, other = match.groups()
            if name == '' or event == '':
                fill_response_in(request, httplib.BAD_REQUEST,
                                 httplib.responses[httplib.BAD_REQUEST], "Proxy invalid request")
                return

            # Drop from query appname and event's name
            if not other.startswith('/'):
                other = "/" + other
            request.uri = other
            request.path, _, _ = other.partition("?")

        if self.sticky_header not in request.headers:
            app = yield self.get_service(name, request)
        else:
            seed = request.headers.get(self.sticky_header)
            request.logger.info('sticky_header has been found: %s', seed)
            app = yield self.get_service_with_seed(name, seed, request)

        if app is None:
            message = "current application %s is unavailable" % name
            fill_response_in(request, NO_SUCH_APP, "No Such Application", message)
            return

        try:
            request.logger.debug("%s: processing request app: `%s`, event `%s`",
                                 app.id, app.name, event)
            yield self.process(request, name, app, event, pack_httprequest(request))
        except Exception as err:
            request.logger.error("error during processing request %s", err)
            fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                             httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                             "UID %s: %s" % (request.traceid, str(err)))

        request.logger.info("exit from process")

    @gen.coroutine
    def process(self, request, name, app, event, data):
        """Send ``data`` to ``event`` of ``app`` and fill in the HTTP response.

        The request is enqueued, the first reply chunk is unpacked as
        ``(code, headers)`` and the following chunks are joined into the
        body. Disconnections and "application restarted" errors are retried
        while attempts remain; every other outcome ends the call after a
        response has been filled in.

        :param request: HTTP request; its logger and traceid are used.
        :param name: application name (used for the timeout lookup and in
            messages).
        :param app: service to talk to.
        :param event: event name to enqueue.
        :param data: payload, packed with ``msgpack.packb`` before sending.
        """
        request.logger.info("start processing request after %.3f ms",
                            request.request_time() * 1000)
        timeout = self.get_timeout(name)
        # allow to reconnect this amount of times.
        attempts = 2  # make it configurable

        parentid = 0

        if request.traceid is not None:
            traceid = int(request.traceid, 16)
            trace = Trace(traceid=traceid, spanid=traceid, parentid=parentid)
        else:
            trace = None

        while attempts > 0:
            headers = {}
            body_parts = []
            attempts -= 1
            try:
                request.logger.info("%s: enqueue event (attempt %d)", app.id, attempts)
                channel = yield app.enqueue(event, trace=trace)
                request.logger.debug("%s: send event data (attempt %d)", app.id, attempts)
                yield channel.tx.write(msgpack.packb(data), trace=trace)
                yield channel.tx.close(trace=trace)
                request.logger.debug("%s: waiting for a code and headers (attempt %d)",
                                     app.id, attempts)
                code_and_headers = yield channel.rx.get(timeout=timeout)
                request.logger.debug("%s: code and headers have been received (attempt %d)",
                                     app.id, attempts)
                code, raw_headers = msgpack.unpackb(code_and_headers)
                headers = tornado.httputil.HTTPHeaders(raw_headers)
                while True:
                    body = yield channel.rx.get(timeout=timeout)
                    if isinstance(body, EmptyResponse):
                        request.logger.info("%s: body finished (attempt %d)", app.id, attempts)
                        break

                    request.logger.debug("%s: received %d bytes as a body chunk (attempt %d)",
                                         app.id, len(body), attempts)
                    body_parts.append(body)
            except gen.TimeoutError as err:
                request.logger.error("%s %s: %s", app.id, name, err)
                message = "UID %s: application `%s` error: %s" % (request.traceid, name, str(err))
                fill_response_in(request, httplib.GATEWAY_TIMEOUT,
                                 httplib.responses[httplib.GATEWAY_TIMEOUT], message)
            except (DisconnectionError, StreamClosedError) as err:
                self.requests_disconnections += 1
                # Probably it's dangerous to retry requests all the time.
                # I must find the way to determine whether it failed during writing
                # or reading a reply. And retry only writing fails.
                request.logger.error("%s: %s", app.id, err)
                if attempts <= 0:
                    request.logger.info("%s: no more attempts", app.id)
                    fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                     httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                                     "UID %s: Connection problem" % request.traceid)
                    return

                # Seems on_close callback is not called in case of connecting through IPVS
                # We detect disconnection here to avoid unnecessary errors.
                # Try to reconnect here and give the request a go
                try:
                    start_time = time.time()
                    reconn_timeout = timeout - request.request_time()
                    request.logger.info("%s: connecting with timeout %.fms",
                                        app.id, reconn_timeout * 1000)
                    # with_timeout is given an absolute deadline here
                    yield gen.with_timeout(start_time + reconn_timeout,
                                           app.connect(request.logger.traceid))
                    reconn_time = time.time() - start_time
                    request.logger.info("%s: connecting took %.3fms", app.id, reconn_time * 1000)
                except Exception as err:
                    if attempts <= 0:
                        # we have no attempts more, so quit here
                        request.logger.error("%s: %s (no attempts left)", app.id, err)
                        message = "UID %s: application `%s` error: %s" % (request.traceid,
                                                                          name, str(err))
                        fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                         httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                                         message)
                        return
                    # app.id was missing from the arguments: three placeholders
                    # need three values.
                    request.logger.error("%s: unable to reconnect: %s (%d attempts left)",
                                         app.id, err, attempts)
                # We have an attempt to process request again.
                # Jump to the begining of `while attempts > 0`, either we connected successfully
                # or we were failed to connect
                continue
            except ServiceError as err:
                # if the application has been restarted, we get broken pipe code
                # and system category
                if err.code == errno.EPIPE and err.category == ESYSTEMCATEGORY:
                    request.logger.error("%s: the application has been restarted", app.id)
                    app.disconnect()
                    if attempts > 0:
                        continue
                    # No attempts left: do not leave the loop silently, fall
                    # through so that the client gets an error response.

                request.logger.error("%s: %s", app.id, err)
                message = "UID %s: application `%s` error: %s" % (request.traceid, name, str(err))
                fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                 httplib.responses[httplib.INTERNAL_SERVER_ERROR], message)
            except Exception as err:
                request.logger.error("%s: %s", app.id, err)
                message = "UID %s: unknown `%s` error: %s" % (request.traceid, name, str(err))
                fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                 httplib.responses[httplib.INTERNAL_SERVER_ERROR], message)
            else:
                message = ''.join(body_parts)
                # NOTE(review): the fallback reason is the integer httplib.OK,
                # not a reason phrase - confirm that this is intended.
                fill_response_in(request, code, httplib.responses.get(code, httplib.OK),
                                 message, headers)
            # to return from all errors except Disconnection
            # or receiving a good reply
            return

    @gen.coroutine
    def get_service(self, name, request):
        """Return a service for ``name``, creating one while the pool has room.

        While fewer than ``spoolSize`` services are cached for ``name`` a new
        ``Service`` is created, cached, connected and scheduled for
        retirement after a randomized refresh period. Otherwise a random
        cached instance is returned.

        :returns: (via ``gen.Return``) a service, or ``None`` if a new
            service could not be created or connected.
        """
        # cache isn't full for the current application
        if len(self.cache[name]) < self.spoolSize:
            logger = request.logger
            # Pre-bind so the error handler is safe even if the constructor
            # itself raises.
            app = None
            try:
                app = Service(name, locator=self.locator, timeout=RESOLVE_TIMEOUT)
                logger.info("%s: creating an instance of %s", app.id, name)
                self.cache[name].append(app)
                yield app.connect(request.traceid)
                logger.info("%s: connect to an app %s endpoint %s ",
                            app.id, app.name, "{0}:{1}".format(*app.address))

                timeout = (1 + random.random()) * self.refreshPeriod
                self.io_loop.call_later(timeout, self.move_to_inactive(app, name))
            except Exception as err:
                app_id = app.id if app is not None else None
                logger.error("%s: unable to connect to `%s`: %s", app_id, name, err)
                if app is not None and app in self.cache[name]:
                    self.cache[name].remove(app)
                raise gen.Return()
            else:
                raise gen.Return(app)

        # get an instance from cache
        chosen = random.choice(self.cache[name])
        raise gen.Return(chosen)

    @gen.coroutine
    def get_service_with_seed(self, name, seed, request):
        """Create and connect a service for ``name`` bound to ``seed``.

        The service is not put into the cache.

        :returns: (via ``gen.Return``) the connected service, or ``None`` if
            the connection failed.
        """
        logger = request.logger
        app = Service(name, seed=seed, locator=self.locator)
        try:
            logger.info("%s: creating an instance of %s, seed %s", app.id, name, seed)
            yield app.connect(logger.traceid)
        except Exception as err:
            logger.error("%s: unable to connect to `%s`: %s", app.id, name, err)
            raise gen.Return()

        raise gen.Return(app)