def reelect_app(self, request, app):
    """Try to connect to the same app on a different host from dist-info.

    The endpoints of ``app.locator`` are tried one at a time. After every
    attempt (successful or not) the first endpoint is dropped, so the next
    attempt starts from a different endpoint. Once no endpoints are left the
    decision is delegated to ``self.proxy.reelect_app`` (the common pool).

    :param request: the request being served; only ``request.logger`` is used here.
    :param app: the application whose connection must be re-established.
    :returns: via ``gen.Return`` - the newly connected application, or
        whatever ``self.proxy.reelect_app`` yields on fallback.
    :raises ServiceError: for resolve errors other than "service not
        available" in the locator category.
    :raises PluginApplicationError: if the loop ends without a connection.
    """
    # disconnect app explicitly to break possibly existing connection
    app.disconnect()
    endpoints_size = len(app.locator.endpoints)
    # try x times, where x is the number of different endpoints in app locator,
    # plus one extra iteration that reaches the common pool fallback below
    for _ in xrange(0, endpoints_size + 1):
        # last chance to take app from common pool
        if len(app.locator.endpoints) == 0:
            request.logger.info(
                "giving up on connecting to dist-info hosts, falling back to common pool processing"
            )
            app = yield self.proxy.reelect_app(request, app)
            raise gen.Return(app)
        try:
            # always create new locator to prevent locking as we do connect with timeout
            # however lock can be still held during TCP timeout
            locator = Locator(endpoints=app.locator.endpoints)
            request.logger.info("connecting to locator %s", locator.endpoints[0])
            # first try to connect to locator only on remote host with timeout
            yield gen.with_timeout(self.service_connect_timeout, locator.connect())
            request.logger.debug("connected to locator %s for %s",
                                 locator.endpoints[0], app.name)
            app = Service(app.name, locator=locator, timeout=RESOLVE_TIMEOUT)
            # try to resolve and connect to application itself
            yield gen.with_timeout(self.service_connect_timeout, app.connect())
            request.logger.debug("connected to application %s via %s",
                                 app.name, app.endpoints)
        except gen.TimeoutError:
            # on timeout try next endpoint first
            request.logger.warning(
                "timed out while connecting to application")
            continue
        except ServiceError as err:
            request.logger.warning("got error while resolving app - %s", err)
            if err.category in LOCATORCATEGORY and err.code == ESERVICENOTAVAILABLE:
                # if the application is down - also try next endpoint
                continue
            # bare ``raise`` keeps the original traceback (``raise err`` would
            # restart it from this line on Python 2)
            raise
        finally:
            # drop first endpoint to start next connection from different endpoint
            # we do this, as default logic of connection attempts in locator do not fit here
            app.locator.endpoints = app.locator.endpoints[1:]
        # return connected app
        raise gen.Return(app)
    raise PluginApplicationError(42, 42, "could not connect to application")
def test_locator():
    """Resolve ``storage`` through the local locator and validate the reply shape."""
    loop = IOLoop.current()
    service_locator = Locator(endpoints=[["localhost", 10053]], io_loop=loop)

    def do_resolve():
        return service_locator.resolve("storage")

    channel = loop.run_sync(do_resolve)
    # the reply is a triple: endpoint, protocol version and api description
    reply = loop.run_sync(channel.rx.get, timeout=4)
    endpoint, version, api = reply

    assert version == 1, "invalid version number %s" % version
    assert isinstance(endpoint, (list, tuple)), "invalid endpoint type %s" % type(endpoint)
    assert isinstance(api, dict)
def test_locator():
    """Resolve ``storage`` via a locator bound to the CocaineIO loop and check the reply."""
    loop = CocaineIO.instance()
    service_locator = Locator("localhost", 10053, loop=loop)

    # wait up to 4 units for the resolve channel, then up to 1 for the first message
    pending_channel = service_locator.resolve("storage")
    channel = pending_channel.wait(4)
    pending_reply = channel.rx.get()
    endpoint, version, api = pending_reply.wait(1)

    assert version == 1, "invalid version number %s" % version
    assert isinstance(endpoint, (list, tuple)), "invalid endpoint type %s" % type(endpoint)
    assert isinstance(api, dict)
def test_locator():
    """Check that resolving ``storage`` yields (endpoint, version, api) of the expected types."""
    ioloop = IOLoop.current()
    loc = Locator(endpoints=[["localhost", 10053]], io_loop=ioloop)
    resolve_channel = ioloop.run_sync(lambda: loc.resolve("storage"))
    endpoint, version, api = ioloop.run_sync(resolve_channel.rx.get, timeout=4)

    # protocol version first, then the container types of the other two fields
    assert version == 1, "invalid version number %s" % version
    endpoint_is_sequence = isinstance(endpoint, (list, tuple))
    assert endpoint_is_sequence, "invalid endpoint type %s" % type(endpoint)
    api_is_dict = isinstance(api, dict)
    assert api_is_dict
def test_locate():
    """Run the ``Locate`` action against the locator service and check the result keys."""
    service_locator = Locator()
    action = common.Locate(service_locator, "locator")
    result = io.run_sync(action.execute, timeout=2)

    assert isinstance(result, dict)
    # the result must describe the api, the version and the endpoints
    for expected_key in ("api", "version", "endpoints"):
        assert expected_key in result
def test_on_close():
    """Disconnect a never-connected locator, then connect another one twice and disconnect it."""
    loop = IOLoop.current()

    # disconnecting before any connect must not fail
    idle = Locator(endpoints=[["localhost", 10053]], io_loop=loop)
    idle.disconnect()

    # connecting twice in a row must not fail either
    active = Locator(endpoints=[["localhost", 10053]], io_loop=loop)
    for _ in range(2):
        loop.run_sync(active.connect)
    active.disconnect()
def test_on_close():
    """Disconnect a never-connected locator, then connect another one twice and disconnect it."""
    loop = CocaineIO.instance()

    # disconnecting before any connect must not fail
    idle = Locator("localhost", 10053, loop=loop)
    idle.disconnect()

    # connecting twice in a row must not fail either
    active = Locator("localhost", 10053, loop=loop)
    for _ in range(2):
        active.connect().wait(4)
    active.disconnect()
def create_service(self, name):
    """Return the service cached under *name*, creating and caching it on first use.

    The name ``'locator'`` produces a ``Locator``; any other name produces a
    ``Service`` with that name. Both are built with ``self._endpoints``.
    """
    cache = self._cache
    if name in cache:
        return cache[name]

    endpoints = self._endpoints
    if name == 'locator':
        created = Locator(endpoints=endpoints)
    else:
        created = Service(name, endpoints=endpoints)
    cache[name] = created
    return cache[name]
def __init__(self, locators=("localhost:10053",),
             cache=DEFAULT_SERVICE_CACHE_COUNT,
             request_id_header="", sticky_header="X-Cocaine-Sticky",
             forcegen_request_header=False,
             default_tracing_chance=DEFAULT_TRACING_CHANCE,
             configuration_service="unicorn",
             tracing_conf_path="/zipkin_sampling",
             ioloop=None, **config):
    """Set up counters, the locator, loggers and the background watchers.

    Extra keyword arguments read from ``config``: ``refresh_timeout`` and
    ``timeouts``. The sampling watcher, the routing groups watcher and the
    locator health check are all started from here.
    """
    # request counters
    self.requests_total = 0
    self.requests_in_progress = 0
    self.requests_disconnections = 0

    if ioloop:
        self.io_loop = ioloop
    else:
        self.io_loop = tornado.ioloop.IOLoop.current()

    self.service_cache_count = cache
    self.spool_size = int(cache * 1.5)
    self.refresh_period = config.get("refresh_timeout", DEFAULT_REFRESH_PERIOD)
    self.timeouts = config.get("timeouts", {})

    endpoints = []
    for raw_endpoint in locators:
        endpoints.append(parse_locators_endpoints(raw_endpoint))
    self.locator_endpoints = endpoints
    # it's initialized after start
    # to avoid an io_loop creation before fork
    self.locator = Locator(endpoints=self.locator_endpoints)
    # it's used to reply on `ping` method
    self.locator_status = False

    # active applications
    self.cache = collections.defaultdict(list)

    self.logger = logging.getLogger("cocaine.proxy.general")
    self.access_log = logging.getLogger("cocaine.proxy.access")
    self.access_log.propagate = False

    printable_endpoints = ["%s:%d" % (host, port) for host, port in self.locator_endpoints]
    self.logger.info("locators %s", ','.join(printable_endpoints))

    self.sticky_header = sticky_header

    self.logger.info("conf path in `%s` configuration service: %s",
                     configuration_service, tracing_conf_path)
    self.unicorn = Service(configuration_service, locator=self.locator)
    self.sampled_apps = {}
    self.default_tracing_chance = default_tracing_chance
    self.tracing_conf_path = tracing_conf_path

    # the watchers are endless loops, so a completed future is an error
    def on_sampler_exit(_future):
        return self.logger.error("the sample updater must not exit")

    self.io_loop.add_future(self.on_sampling_updates(), on_sampler_exit)

    if not request_id_header:
        self.get_request_id = generate_request_id
    else:
        self.get_request_id = functools.partial(get_request_id, request_id_header,
                                                force=forcegen_request_header)

    # post the watcher for routing groups
    def on_updater_exit(_future):
        return self.logger.error("the updater must not exit")

    self.io_loop.add_future(self.on_routing_groups_update(), on_updater_exit)
    # run infinity check locator health status
    self.locator_health_check()
def reelect_app(self, request, app):
    """Try to connect to the same app on a different host from dist-info.

    The locator endpoints of ``app`` are rotated by one position before each
    attempt, so every attempt starts from a different endpoint. There are as
    many attempts as there are endpoints.

    :param request: the request being served; only ``request.logger`` is used here.
    :param app: the application whose connection must be re-established.
    :returns: via ``gen.Return`` - the newly connected application.
    :raises ServiceError: for resolve errors other than "service not
        available" in the locator category.
    :raises PluginApplicationError: if no attempt ended with a connection.
    """
    # store current endpoints of locator
    locator_endpoints = app.locator.endpoints
    # disconnect app explicitly to break possibly existing connection
    app.disconnect()
    endpoints_size = len(locator_endpoints)
    # try x times, where x is the number of different endpoints in app locator.
    for _ in xrange(0, endpoints_size):
        try:
            # move first endpoint to the end to start new connection from different endpoint
            # we do this, as default logic of connection attempts in locator do not fit here
            locator_endpoints = locator_endpoints[1:] + locator_endpoints[:1]
            # always create new locator to prevent locking as we do connect with timeout
            # however lock can be still held during TCP timeout
            locator = Locator(endpoints=locator_endpoints)
            request.logger.info("connecting to locator %s", locator.endpoints[0])
            # first try to connect to locator only on remote host with timeout
            yield gen.with_timeout(self.service_connect_timeout, locator.connect())
            request.logger.debug("connected to locator %s for %s",
                                 locator.endpoints[0], app.name)
            app = Service(app.name, locator=locator, timeout=RESOLVE_TIMEOUT)
            # try to resolve and connect to application itself
            yield gen.with_timeout(self.service_connect_timeout, app.connect())
            request.logger.debug("connected to application %s via %s",
                                 app.name, app.endpoints)
        except gen.TimeoutError:
            # on timeout try next endpoint first
            request.logger.warning("timed out while connecting to application")
            continue
        except ServiceError as err:
            request.logger.warning("got error while resolving app - %s", err)
            if err.category in LOCATORCATEGORY and err.code == ESERVICENOTAVAILABLE:
                # if the application is down - also try next endpoint
                continue
            # bare ``raise`` keeps the original traceback (``raise err`` would
            # restart it from this line on Python 2)
            raise
        # return connected app
        raise gen.Return(app)
    raise PluginApplicationError(42, 42, "could not connect to application")
def locator(self):
    """Return the cached locator, creating it from ``self.endpoints`` on first use.

    Any exception raised while creating the locator is re-raised wrapped in
    ``ToolsError``.
    """
    cached = self._locator
    if cached:
        return cached

    try:
        created = Locator(endpoints=self.endpoints)
        self._locator = created
    except Exception as err:
        raise ToolsError(err)
    return created
def process(self, request):
    """Fetch the app endpoints from dist-info, connect to the app and process the request.

    The dist-info URL depends on ``self.is_stid_request(request)``. The
    fetched endpoints seed a dedicated ``Locator``; the application is then
    connected through ``self.reelect_app`` and the request is handed over to
    ``self.proxy.process``.
    """
    incoming = request.headers

    # headers forwarded to the dist-info service
    mds_request_headers = httputil.HTTPHeaders()
    if "Authorization" in incoming:
        mds_request_headers["Authorization"] = incoming["Authorization"]
    traceid = getattr(request, "traceid", None)
    if traceid is not None:
        mds_request_headers["X-Request-Id"] = traceid

    key = incoming["X-Srw-Key"]
    name, event = extract_app_and_event(request)
    proxy = self.proxy
    proxy.setup_tracing(request, name)
    timeout = proxy.get_timeout(name, event)
    name = proxy.resolve_group_to_version(name)

    # only the url and the log line differ between the two sources
    if self.is_stid_request(request):
        url = "%s/gate/dist-info/%s?primary-only" % (self.dist_info_endpoint, key)
        log_format = "fetching endpoints via mulcagate dist-info - %s"
    else:
        url = "%s/dist-info-%s/%s" % (self.mds_dist_info_endpoint,
                                      incoming["X-Srw-Namespace"], key)
        log_format = "fetching endpoints via mds dist-info - %s"
    request.logger.debug(log_format, url)
    srw_request = HTTPRequest(url, method="GET",
                              headers=mds_request_headers,
                              allow_ipv6=True,
                              request_timeout=timeout)

    endpoints = yield self.fetch_mds_endpoints(request, srw_request)
    app_locator = Locator(endpoints=endpoints)
    app = Service(name, locator=app_locator, timeout=RESOLVE_TIMEOUT)
    request.logger.info("connecting to app %s", name)
    app = yield self.reelect_app(request, app)
    # TODO: attempts should be configurable
    packed_request = pack_httprequest(request)
    yield proxy.process(request, name, app, event, packed_request,
                        self.reelect_app, 4, timeout)
def __init__(self, locators=("localhost:10053",),
             cache=DEFAULT_SERVICE_CACHE_COUNT,
             request_id_header="", sticky_header="X-Cocaine-Sticky",
             ioloop=None, **config):
    """Set up counters, the locator, loggers and the routing groups watcher.

    Extra keyword arguments read from ``config``: ``refresh_timeout`` and
    ``timeouts``.
    """
    # stats
    self.requests_in_progress = 0
    self.requests_disconnections = 0
    self.requests_total = 0

    self.io_loop = ioloop or tornado.ioloop.IOLoop.current()
    self.serviceCacheCount = cache
    self.spoolSize = int(self.serviceCacheCount * 1.5)
    self.refreshPeriod = config.get("refresh_timeout", DEFAULT_REFRESH_PERIOD)
    self.timeouts = config.get("timeouts", {})

    # a real list (not ``map``): the endpoints are consumed twice below - by
    # the Locator and by the log line - and ``map`` is a one-shot iterator
    # on Python 3
    self.locator_endpoints = [parse_locators_endpoints(i) for i in locators]
    # it's initialized after start
    # to avoid an io_loop creation before fork
    self.locator = Locator(endpoints=self.locator_endpoints)

    # active applications
    self.cache = collections.defaultdict(list)

    self.logger = ContextAdapter(logging.getLogger("cocaine.proxy"), {"id": "0" * 16})
    self.tracking_logger = logging.getLogger("cocaine.proxy.tracking")
    self.logger.info("locators %s",
                     ','.join("%s:%d" % (h, p) for h, p in self.locator_endpoints))

    self.sticky_header = sticky_header

    if request_id_header:
        self.get_request_id = functools.partial(get_request_id, request_id_header)
    else:
        self.get_request_id = generate_request_id

    # post the watcher for routing groups; it is an endless loop, so a
    # completed future is reported as an error
    self.io_loop.add_future(self.on_routing_groups_update(),
                            lambda x: self.logger.error("the updater must not exit"))
def __init__(self, locators=("localhost:10053",),
             cache=DEFAULT_SERVICE_CACHE_COUNT,
             request_id_header="", sticky_header="X-Cocaine-Sticky",
             forcegen_request_header=False,
             default_tracing_chance=DEFAULT_TRACING_CHANCE,
             configuration_service="unicorn",
             client_id=0, client_secret='',
             mapped_headers=None,
             tracing_conf_path="/zipkin_sampling",
             timeouts_conf_path="/proxy_apps_timeouts",
             srw_config=None,
             allow_json_rpc=True,
             ioloop=None, **config):
    """Set up counters, the locator, plugins, loggers and the background watchers.

    ``mapped_headers`` defaults to an empty list (a fresh one per instance).
    ``srw_config`` is an iterable of dicts with ``"type"`` and ``"args"``
    keys, one per plugin to load. The only extra keyword argument read from
    ``config`` is ``refresh_timeout``.

    Unicorn is accessed without authentication when ``client_id`` is ``0``
    or ``client_secret`` is empty. The sampling, timeouts and routing groups
    watchers and the locator health check are all started from here.
    """
    # stats
    self.requests_in_progress = 0
    self.requests_disconnections = 0
    self.requests_total = 0

    self.io_loop = ioloop or tornado.ioloop.IOLoop.current()
    self.service_cache_count = cache
    self.spool_size = int(self.service_cache_count * 1.5)
    self.refresh_period = config.get("refresh_timeout", DEFAULT_REFRESH_PERIOD)
    self.locator_endpoints = [parse_locators_endpoints(i) for i in locators]
    # it's initialized after start
    # to avoid an io_loop creation before fork
    self.locator = Locator(endpoints=self.locator_endpoints)
    # it's used to reply on `ping` method
    self.locator_status = False

    # active applications
    self.cache = collections.defaultdict(list)
    # routing groups from Locator service
    self.current_rg = {}

    self.logger = logging.getLogger("cocaine.proxy.general")
    self.access_log = logging.getLogger("cocaine.proxy.access")
    self.access_log.propagate = False

    self.logger.info("locators %s",
                     ','.join("%s:%d" % (h, p) for h, p in self.locator_endpoints))

    self.sticky_header = sticky_header
    # do not share one default list between all the instances
    self.mapped_headers = [] if mapped_headers is None else mapped_headers
    self.logger.info("mapping headers - %s", str(self.mapped_headers))

    self.plugins = []
    if srw_config:
        # named `plugin_config` so the `**config` keyword arguments are not shadowed
        for plugin_config in srw_config:
            name, cfg = plugin_config["type"], plugin_config["args"]
            self.logger.info("initialize plugin %s", name)
            self.plugins.append(load_plugin(name, self, cfg))

    if allow_json_rpc:
        self.plugins.append(load_plugin('cocaine.proxy.jsonrpc.JSONRPC', self, {}))

    self.logger.info("conf path in `%s` configuration service: %s",
                     configuration_service, tracing_conf_path)

    repo = PooledServiceFactory(self.locator_endpoints)
    repo.secure = TVM(repo, client_id, client_secret)
    if client_id == 0 or client_secret == '':
        self.logger.info("using non-authenticated unicorn access")
        self.unicorn = repo.create_service(configuration_service)
    else:
        self.logger.info("using authenticated unicorn access")
        self.unicorn = repo.create_secure_service(configuration_service)

    # the watchers below are endless loops, so a completed future is an error
    self.sampled_apps = {}
    self.default_tracing_chance = default_tracing_chance
    self.tracing_conf_path = tracing_conf_path
    self.io_loop.add_future(self.on_sampling_updates(),
                            lambda x: self.logger.error("the sample updater must not exit"))

    self.timeouts_conf_path = timeouts_conf_path
    self.timeouts = {}
    self.io_loop.add_future(self.on_timeouts_updates(),
                            lambda x: self.logger.error("the timeouts updater must not exit"))

    if request_id_header:
        self.get_request_id = functools.partial(get_request_id, request_id_header,
                                                force=forcegen_request_header)
    else:
        self.get_request_id = generate_request_id

    # post the watcher for routing groups
    self.io_loop.add_future(self.on_routing_groups_update(),
                            lambda x: self.logger.error("the updater must not exit"))
    # run infinity check locator health status
    self.locator_health_check()
class CocaineProxy(object):
    """HTTP proxy that forwards requests to cocaine applications.

    The proxy keeps a per-application pool of connected services in
    ``self.cache``, watches routing groups, sampling chances and timeouts in
    background coroutines, and lets the loaded plugins handle a request first.
    """

    def __init__(self, locators=("localhost:10053",),
                 cache=DEFAULT_SERVICE_CACHE_COUNT,
                 request_id_header="", sticky_header="X-Cocaine-Sticky",
                 forcegen_request_header=False,
                 default_tracing_chance=DEFAULT_TRACING_CHANCE,
                 configuration_service="unicorn",
                 client_id=0, client_secret='',
                 mapped_headers=None,
                 tracing_conf_path="/zipkin_sampling",
                 timeouts_conf_path="/proxy_apps_timeouts",
                 srw_config=None,
                 allow_json_rpc=True,
                 ioloop=None, **config):
        """Set up counters, the locator, plugins, loggers and the background watchers.

        ``mapped_headers`` defaults to an empty list (a fresh one per
        instance). ``srw_config`` is an iterable of dicts with ``"type"`` and
        ``"args"`` keys, one per plugin to load. The only extra keyword
        argument read from ``config`` is ``refresh_timeout``.

        Unicorn is accessed without authentication when ``client_id`` is
        ``0`` or ``client_secret`` is empty.
        """
        # stats
        self.requests_in_progress = 0
        self.requests_disconnections = 0
        self.requests_total = 0

        self.io_loop = ioloop or tornado.ioloop.IOLoop.current()
        self.service_cache_count = cache
        self.spool_size = int(self.service_cache_count * 1.5)
        self.refresh_period = config.get("refresh_timeout", DEFAULT_REFRESH_PERIOD)
        self.locator_endpoints = [parse_locators_endpoints(i) for i in locators]
        # it's initialized after start
        # to avoid an io_loop creation before fork
        self.locator = Locator(endpoints=self.locator_endpoints)
        # it's used to reply on `ping` method
        self.locator_status = False

        # active applications
        self.cache = collections.defaultdict(list)
        # routing groups from Locator service
        self.current_rg = {}

        self.logger = logging.getLogger("cocaine.proxy.general")
        self.access_log = logging.getLogger("cocaine.proxy.access")
        self.access_log.propagate = False

        self.logger.info("locators %s",
                         ','.join("%s:%d" % (h, p) for h, p in self.locator_endpoints))

        self.sticky_header = sticky_header
        # do not share one default list between all the instances
        self.mapped_headers = [] if mapped_headers is None else mapped_headers
        self.logger.info("mapping headers - %s", str(self.mapped_headers))

        self.plugins = []
        if srw_config:
            # named `plugin_config` so the `**config` keyword arguments are not shadowed
            for plugin_config in srw_config:
                name, cfg = plugin_config["type"], plugin_config["args"]
                self.logger.info("initialize plugin %s", name)
                self.plugins.append(load_plugin(name, self, cfg))

        if allow_json_rpc:
            self.plugins.append(load_plugin('cocaine.proxy.jsonrpc.JSONRPC', self, {}))

        self.logger.info("conf path in `%s` configuration service: %s",
                         configuration_service, tracing_conf_path)

        repo = PooledServiceFactory(self.locator_endpoints)
        repo.secure = TVM(repo, client_id, client_secret)
        if client_id == 0 or client_secret == '':
            self.logger.info("using non-authenticated unicorn access")
            self.unicorn = repo.create_service(configuration_service)
        else:
            self.logger.info("using authenticated unicorn access")
            self.unicorn = repo.create_secure_service(configuration_service)

        # the watchers below are endless loops, so a completed future is an error
        self.sampled_apps = {}
        self.default_tracing_chance = default_tracing_chance
        self.tracing_conf_path = tracing_conf_path
        self.io_loop.add_future(self.on_sampling_updates(),
                                lambda x: self.logger.error("the sample updater must not exit"))

        self.timeouts_conf_path = timeouts_conf_path
        self.timeouts = {}
        self.io_loop.add_future(self.on_timeouts_updates(),
                                lambda x: self.logger.error("the timeouts updater must not exit"))

        if request_id_header:
            self.get_request_id = functools.partial(get_request_id, request_id_header,
                                                    force=forcegen_request_header)
        else:
            self.get_request_id = generate_request_id

        # post the watcher for routing groups
        self.io_loop.add_future(self.on_routing_groups_update(),
                                lambda x: self.logger.error("the updater must not exit"))
        # run infinity check locator health status
        self.locator_health_check()

    @gen.coroutine
    def locator_health_check(self, period=5):
        """Poll the locator ``cluster`` method forever and keep ``locator_status`` up to date.

        ``period`` (seconds) is both the timeout of every call and the pause
        after a successful check; after a failed check the pause is 1 second.
        """
        wait_timeout = datetime.timedelta(seconds=period)
        while True:
            try:
                self.logger.debug("check health status of locator via cluster method")
                channel = yield gen.with_timeout(wait_timeout, self.locator.cluster())
                cluster = yield gen.with_timeout(wait_timeout, channel.rx.get())
                self.locator_status = True
                self.logger.debug("dumped cluster %s", cluster)
                yield gen.sleep(period)
            except Exception as err:
                self.logger.error("health status check failed: %s", err)
                self.locator_status = False
                yield gen.sleep(1)

    @gen.coroutine
    def on_routing_groups_update(self):
        """Subscribe to routing group updates and retire cached apps of changed groups.

        The subscription is restarted forever. After an error the coroutine
        sleeps with an exponentially growing timeout capped at 32 seconds.
        """
        uid = gen_uid()
        self.logger.info("generate new unique id %s", uid)
        maximum_timeout = 32  # sec
        timeout = 1  # sec
        while True:
            self.current_rg = {}
            try:
                self.logger.info("subscribe to updates with id %s", uid)
                channel = yield self.locator.routing(uid, True)
                # subscribed successfully: reset the backoff
                timeout = 1
                while True:
                    new = yield channel.rx.get()
                    if isinstance(new, EmptyResponse):
                        # it means that the cocaine has been stopped
                        self.logger.error("locator sends close")
                        break
                    updates = scan_for_updates(self.current_rg, new)
                    # replace current
                    self.current_rg = new
                    if len(updates) == 0:
                        self.logger.info("locator sends an update message, "
                                         "but no updates have been found")
                        continue

                    self.logger.info("%d routing groups have been refreshed %s",
                                     len(updates), updates)
                    for group in updates:
                        # if we have not created an instance of
                        # the group it is absent in cache
                        if group not in self.cache:
                            self.logger.debug("nothing to update in group %s", group)
                            continue

                        # iterate over a snapshot: migrate_from_cache_to_inactive
                        # calls drop_app_from_cache, which presumably removes the
                        # app from this very list
                        for app in list(self.cache[group]):
                            self.logger.debug("%s: move %s to the inactive queue to refresh"
                                              " routing group", app.id, app.name)
                            self.migrate_from_cache_to_inactive(app, group)
            except Exception as err:
                timeout = min(timeout << 1, maximum_timeout)
                self.logger.error("error occurred while watching for group updates %s. Sleep %d",
                                  err, timeout)
                yield gen.sleep(timeout)

    @gen.coroutine
    def watch_app(self, name, path):
        """Watch the sampling chance of application ``name`` stored at ``path`` in unicorn.

        While the watch is alive ``self.sampled_apps[name]`` holds the latest
        value (or the default chance if the value cannot be converted to
        float). When the watch stops, the entry is removed.
        """
        version = 0
        self.sampled_apps[name] = self.default_tracing_chance
        try:
            self.logger.info("start watching for sampling updates of %s", name)
            watch_channel = yield self.unicorn.subscribe(path, version)
            while True:
                value, version = yield watch_channel.rx.get()
                self.logger.info("got sampling updates for %s: version %d value %.2f",
                                 name, version, value)
                try:
                    weight = float(value)
                    self.sampled_apps[name] = weight
                except ValueError as err:
                    self.logger.error("sample value %s for %s can NOT be converted: %s. Use %f",
                                      value, name, err, self.default_tracing_chance)
                    self.sampled_apps[name] = self.default_tracing_chance
        except ServiceError as err:
            # verify that the err is `zookeeper: no node [-101]``
            if err.code != -101:
                self.logger.error("watching of `%s` raised an unexpected service error (cat. %d): %s",
                                  name, err.category, err)
        except Exception as err:
            self.logger.error("watching of %s error: %s", name, err)
        finally:
            self.logger.info("stop watching for sampling updates of %s", name)
            self.sampled_apps.pop(name, None)
            # best effort: the channel may not even exist if subscribe failed
            try:
                watch_channel.tx.close()
            except Exception:
                pass

    @gen.coroutine
    def on_sampling_updates(self):
        """Watch the children of ``tracing_conf_path`` and start ``watch_app`` for new ones.

        The subscription is restarted forever. After an error the coroutine
        sleeps with an exponentially growing timeout capped at 32 seconds.
        """
        maximum_timeout = 32  # sec
        timeout = 1  # sec
        listing_version = 0
        while True:
            try:
                listing_channel = yield self.unicorn.children_subscribe(self.tracing_conf_path,
                                                                        listing_version)
                while True:
                    listing_version, apps = yield listing_channel.rx.get()
                    self.logger.info("on_sampling_updates: version %d value %s",
                                     listing_version, apps)
                    # only the applications which are not watched yet
                    for app in (i for i in apps if i not in self.sampled_apps):
                        self.watch_app(app, self.tracing_conf_path + "/" + app)
            except Exception as err:
                timeout = min(timeout << 1, maximum_timeout)
                listing_version = 0
                self.logger.error("error occurred while subscribing for sampling updates %s. Sleep %d",
                                  err, timeout)
                yield gen.sleep(timeout)

    @gen.coroutine
    def watch_app_timeouts(self, name, path):
        """Watch the timeouts of application ``name`` stored at ``path`` in unicorn.

        While the watch is alive ``self.timeouts[name]`` holds the latest dict
        (or an empty dict if the value is not a dict). When the watch stops,
        the entry is removed.
        """
        version = 0
        self.timeouts[name] = {}
        try:
            self.logger.info("start watching for timeouts updates of %s", name)
            watch_channel = yield self.unicorn.subscribe(path, version)
            while True:
                value, version = yield watch_channel.rx.get()
                self.logger.info("got timeouts updates for %s: version %d value %s",
                                 name, version, value)
                if isinstance(value, dict):
                    self.timeouts[name] = value
                else:
                    self.logger.error("timeout value %s for %s is not dict", value, name)
                    self.timeouts[name] = {}
        except ServiceError as err:
            # verify that the err is `zookeeper: no node [-101]``
            if err.code != -101:
                self.logger.error("watching of `%s` raised an unexpected service error (cat. %d): %s",
                                  name, err.category, err)
        except Exception as err:
            self.logger.error("watching of %s error: %s", name, err)
        finally:
            self.logger.info("stop watching for timeouts updates of %s", name)
            self.timeouts.pop(name, None)
            # best effort: the channel may not even exist if subscribe failed
            try:
                watch_channel.tx.close()
            except Exception:
                pass

    @gen.coroutine
    def on_timeouts_updates(self):
        """Watch the children of ``timeouts_conf_path`` and start ``watch_app_timeouts`` for new ones.

        The subscription is restarted forever. After an error the coroutine
        sleeps with an exponentially growing timeout capped at 32 seconds.
        """
        maximum_timeout = 32  # sec
        timeout = 1  # sec
        listing_version = 0
        while True:
            try:
                listing_channel = yield self.unicorn.children_subscribe(self.timeouts_conf_path,
                                                                        listing_version)
                while True:
                    listing_version, apps = yield listing_channel.rx.get()
                    self.logger.info("on_timeouts_updates: version %d value %s",
                                     listing_version, apps)
                    # only the applications which are not watched yet
                    for app in (i for i in apps if i not in self.timeouts):
                        self.watch_app_timeouts(app, self.timeouts_conf_path + "/" + app)
            except Exception as err:
                timeout = min(timeout << 1, maximum_timeout)
                listing_version = 0
                self.logger.error("error occurred while subscribing for timeouts updates %s. Sleep %d",
                                  err, timeout)
                yield gen.sleep(timeout)

    def get_timeout(self, name, event=''):
        """Return the timeout for ``event`` of application ``name``.

        Lookup order: the event entry, then the ``''`` entry of the
        application, then ``DEFAULT_TIMEOUT``.
        """
        if name in self.timeouts:
            tmts = self.timeouts[name]
            return tmts.get(event) or tmts.get('', DEFAULT_TIMEOUT)
        return DEFAULT_TIMEOUT

    def migrate_from_cache_to_inactive(self, app, name):
        """Drop ``app`` from the cache and schedule its disposal."""
        try:
            drop_app_from_cache(self.cache, app, name)
        except Exception as err:
            self.logger.error("app %s %s: drop cache error %s", app, name, err)
        # dispose service after 3 x timeouts
        # assume that all requests will be finished
        self.io_loop.call_later(self.get_timeout(name) * 3,
                                functools.partial(self.dispose, app, name))
        self.logger.info("app %s %s is scheduled to dispose", app, name)

    def move_to_inactive(self, app, name):
        """Return a coroutine callback that replaces ``app`` with a freshly connected instance.

        On success the new instance is added to the cache and ``app`` is
        migrated to the inactive queue. On failure the same replacement is
        scheduled again after ``get_timeout(name)``.
        """
        @gen.coroutine
        def wrapper():
            active_apps = len(self.cache[name])
            self.logger.info("%s: preparing to moving %s %s to an inactive queue (active %d)",
                             app.id, app.name, "{0}:{1}".format(*app.address), active_apps)
            try:
                new_app = Service(name, locator=self.locator, timeout=RESOLVE_TIMEOUT)
                self.logger.info("%s: creating an instance of %s", new_app.id, name)
                yield new_app.connect()
                self.logger.info("%s: connect to an app %s endpoint %s ",
                                 new_app.id, new_app.name, "{0}:{1}".format(*new_app.address))
                # the new instance gets its own randomized refresh
                timeout = (1 + random.random()) * self.refresh_period
                self.io_loop.call_later(timeout, self.move_to_inactive(new_app, name))
                # add to cache only after successfully connected
                self.cache[name].append(new_app)
            except Exception as err:
                self.logger.error("%s: unable to connect to `%s`: %s", new_app.id, name, err)
                # schedule later
                self.io_loop.call_later(self.get_timeout(name), self.move_to_inactive(app, name))
            else:
                self.logger.info("%s: move %s %s to an inactive queue",
                                 app.id, app.name, "{0}:{1}".format(*app.address))
                # current active app will be dropped here
                self.migrate_from_cache_to_inactive(app, name)
        return wrapper

    def dispose(self, app, name):
        """Disconnect ``app``; used as the delayed disposal callback."""
        self.logger.info("dispose service %s %s", name, app.id)
        app.disconnect()

    def resolve_group_to_version(self, name, value=None):
        """ Pick a version from a routing group using a random or provided value
        A routing group looks like (weight, version):
        {"APP": [[29431330, 'A'], [82426238, 'B'], [101760716, 'C'], [118725487, 'D'], [122951927, 'E']]}

        ``name`` is returned unchanged when it is not a known routing group or
        the group is empty.
        """
        if name not in self.current_rg:
            return name

        routing_group = self.current_rg[name]
        if len(routing_group) == 0:
            self.logger.warning("empty rounting group %s", name)
            return name

        # `is None` rather than truthiness: a provided value of 0 is a valid
        # seed and must not be replaced with a random one
        if value is None:
            value = random.randint(0, 1 << 32)
        index = upper_bound(routing_group, value)
        # wrap around to the first entry if the value is past the last weight
        return routing_group[index if index < len(routing_group) else 0][1]

    def ping(self, request):
        """Reply OK if the last locator health check succeeded, 503 otherwise."""
        if self.locator_status:
            fill_response_in(request, httplib.OK, "OK", "OK")
            return
        fill_response_in(request, httplib.SERVICE_UNAVAILABLE,
                         httplib.responses[httplib.SERVICE_UNAVAILABLE],
                         "Failed", proxy_error_headers())

    def setup_tracing(self, request, name):
        """Decide whether the request keeps being traced.

        For a request with a trace id a dice in [0, 100] is rolled against the
        sampling chance of ``name``; if it loses, the request logger is
        replaced with ``NULLLOGGER`` and ``tracebit`` is cleared. A request
        without a trace id always gets ``tracebit = False``.
        """
        if getattr(request, "traceid", None) is not None:
            tracing_chance = self.sampled_apps.get(name, self.default_tracing_chance)
            rolled_dice = random.uniform(0, 100)
            request.logger.debug("tracing_chance %f, rolled dice %f", tracing_chance, rolled_dice)
            if tracing_chance < rolled_dice:
                request.logger.info('stop tracing the request')
                request.logger = NULLLOGGER
                request.tracebit = False
        else:
            request.tracebit = False

    @context
    @gen.coroutine
    def __call__(self, request):
        """Handle one HTTP request.

        The first plugin that matches the request processes it. Otherwise the
        application and the event are extracted from the request, an
        application instance is taken from the pool and the request is passed
        to ``process``. Errors are turned into HTTP error responses.
        """
        for plugin in self.plugins:
            if plugin.match(request):
                request.logger.info('processed by %s plugin', plugin.name())
                try:
                    yield plugin.process(request)
                except PluginNoSuchApplication as err:
                    fill_response_in(request, NO_SUCH_APP, "No such application",
                                     str(err), proxy_error_headers())
                except PluginApplicationError:
                    message = "application error"
                    fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                     httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                                     message, proxy_error_headers())
                except ProxyInvalidRequest:
                    if request.path == "/ping":
                        self.ping(request)
                    else:
                        fill_response_in(request, httplib.NOT_FOUND,
                                         httplib.responses[httplib.NOT_FOUND],
                                         "Invalid url", proxy_error_headers())
                except Exception as err:
                    request.logger.exception('plugin %s returned error: %s', plugin.name(), err)
                    message = "unknown error"
                    fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                     httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                                     message, proxy_error_headers())
                # only the first matching plugin handles the request
                return

        try:
            name, event = extract_app_and_event(request)
        except ProxyInvalidRequest:
            if request.path == "/ping":
                self.ping(request)
            else:
                fill_response_in(request, httplib.NOT_FOUND,
                                 httplib.responses[httplib.NOT_FOUND],
                                 "Invalid url", proxy_error_headers())
            return

        self.setup_tracing(request, name)

        if self.sticky_header in request.headers:
            seed = request.headers.get(self.sticky_header)
            seed_value = header_to_seed(seed)
            request.logger.info('sticky_header has been found: name %s, value %s, seed %d',
                                name, seed, seed_value)
            name = self.resolve_group_to_version(name, seed_value)

        app = yield self.get_service(name, request)
        if app is None:
            message = "current application %s is unavailable" % name
            fill_response_in(request, NO_SUCH_APP, "No Such Application",
                             message, proxy_error_headers(name))
            return

        try:
            # TODO: attempts should be configurable
            yield self.process(request, name, app, event, pack_httprequest(request),
                               self.reelect_app, 2)
        except Exception as err:
            request.logger.exception("error during processing request %s", err)
            fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                             httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                             "UID %s: %s" % (request.traceid, str(err)),
                             proxy_error_headers(name))

        request.logger.info("exit from process")

    def info(self):
        """Return a dict with cache sizes, request counters, error counters and sampling chances."""
        return {'services': {'cache': dict(((k, len(v)) for k, v in self.cache.items()))},
                'requests': {'inprogress': self.requests_in_progress,
                             'total': self.requests_total},
                'errors': {'disconnections': self.requests_disconnections},
                'sampling': self.sampled_apps}

    @gen.coroutine
    def reelect_app(self, request, app):
        """Pick another instance of the application ``app`` belongs to.

        A new instance is created while the pool is not full. With a single
        cached instance the same ``app`` is returned. Otherwise a cached
        instance at a different position is picked.
        """
        cache_size = len(self.cache[app.name])
        if cache_size < self.spool_size:
            request.logger.info("spool is not full. Create a new application instance")
            app = yield self.get_service(app.name, request)
        elif cache_size == 1:
            # NOTE: if we have spool_size 1, the same app will be picked
            # Probably we can create a new one and mark the old one inactive
            request.logger.warning("spool size is limited by 1, cannot pick a new instance of th app. Use the old one")
            # pass
        else:
            request.logger.info("pick a random instance of the application")
            try:
                index = self.cache[app.name].index(app)
                request.logger.info("the app is located in cache at pos %d", index)
                if cache_size == 2:
                    # shortcut
                    picked = (index + 1) % 2
                else:
                    picked = index
                    while picked == index:
                        picked = random.randint(0, cache_size - 1)
                # report the position that has been picked, not the current one
                request.logger.info("an instance at pos %d has been picked", picked)
                app = self.cache[app.name][picked]
            except ValueError:
                # the current app is not in the cache anymore: any instance fits
                app = random.choice(self.cache[app.name])

        raise gen.Return(app)

    @gen.coroutine
    def process(self, request, name, app, event, data, reelect_app_fn, attempts, timeout=None):
        """Send ``data`` as ``event`` to ``app`` and stream the reply into the response.

        :param reelect_app_fn: coroutine ``(request, app) -> app`` used to pick
            another instance when the queue of the current one is full.
        :param attempts: maximum number of tries; a try is repeated only after
            a disconnection, an application restart or a full queue.
        :param timeout: timeout passed to every read from the channel;
            defaults to ``get_timeout(name, event)``.

        Errors are reported to the client through ``fill_response_in``.
        """
        if timeout is None:
            timeout = self.get_timeout(name, event)
        request.logger.info("start processing event `%s` for an app `%s` (appid: %s) after %.3f ms with timeout %f",
                            event, app.name, app.id, request.request_time() * 1000, timeout)
        parentid = 0

        if request.traceid is not None:
            traceid = int(request.traceid, 16)
            trace = Trace(traceid=traceid, spanid=traceid, parentid=parentid)
        else:
            trace = None

        # headers sent to the application along with the event
        headers = {
            'trace_bit': '{:d}'.format(request.tracebit),
        }
        if 'authorization' in request.headers:
            headers['authorization'] = request.headers['authorization']
        for mapped in self.mapped_headers:
            if mapped in request.headers:
                headers[mapped] = request.headers[mapped]

        def on_error(app, err, extra_msg, code=httplib.INTERNAL_SERVER_ERROR):
            # log the error and reply to the client with `code`
            if len(extra_msg) > 0 and not extra_msg.endswith(' '):
                extra_msg += ' '
            request.logger.error("%s: %s%s", app.id, extra_msg, err)
            message = "UID %s: application `%s` error: %s" % (request.traceid, app.name, str(err))
            fill_response_in(request, code, httplib.responses[code],
                             message, proxy_error_headers(app.name))

        def check_attempts(app, err):
            if attempts > 0:
                return True
            # we have no attempts more, so quit here
            on_error(app, err, '(no attempts left) ')
            return False

        while attempts > 0:
            attempts -= 1
            processor = None
            try:
                request.logger.debug("%s: enqueue event (attempt %d)", app.id, attempts)
                channel = yield app.enqueue(event, trace=trace, **headers)
                request.logger.debug("%s: send event data (attempt %d)", app.id, attempts)
                yield channel.tx.write(msgpack.packb(data), trace=trace)
                yield channel.tx.close(trace=trace)

                request.logger.debug("%s: waiting for a code and headers (attempt %d)",
                                     app.id, attempts)
                code_and_headers = yield channel.rx.get(timeout=timeout)
                request.logger.debug("%s: code and headers have been received (attempt %d)",
                                     app.id, attempts)
                code, raw_headers = msgpack.unpackb(code_and_headers)
                # kept apart from `headers`: the request headers must stay
                # intact for the next attempt
                response_headers = httputil.HTTPHeaders(raw_headers)

                cocaine_http_proto_version = response_headers.get(X_COCAINE_HTTP_PROTO_VERSION)
                if cocaine_http_proto_version is None or cocaine_http_proto_version == "1.0":
                    cocaine_http_proto_version = "1.0"

                    def stop_condition(body):
                        return isinstance(body, EmptyResponse)
                elif cocaine_http_proto_version == "1.1":
                    def stop_condition(body):
                        return isinstance(body, EmptyResponse) or len(body) == 0
                else:
                    raise Exception("unsupported X-Cocaine-HTTP-Proto-Version: %s" % cocaine_http_proto_version)

                processor = BodyProcessor.make_processor(
                    response_headers.get('Content-Length'), request, name, code, response_headers)
                while True:
                    body = yield channel.rx.get(timeout=timeout)
                    if stop_condition(body):
                        request.logger.info("%s: body finished (attempt %d)", app.id, attempts)
                        break

                    request.logger.debug("%s: received %d bytes as a body chunk (attempt %d)",
                                         app.id, len(body), attempts)
                    processor.swallow(body)
            except gen.TimeoutError as err:
                on_error(app, err, '', httplib.GATEWAY_TIMEOUT)

            except (DisconnectionError, StreamClosedError) as err:
                self.requests_disconnections += 1
                # Probably it's dangerous to retry requests all the time.
                # I must find the way to determine whether it failed during writing
                # or reading a reply. And retry only writing fails.
                request.logger.error("%s: %s", app.id, err)
                if not check_attempts(app, err):
                    return

                # Seems on_close callback is not called in case of connecting through IPVS
                # We detect disconnection here to avoid unnecessary errors.
                # Try to reconnect here and give the request a go
                try:
                    start_time = time.time()
                    reconn_timeout = timeout - request.request_time()
                    request.logger.info("%s: connecting with timeout %.fms",
                                        app.id, reconn_timeout * 1000)
                    yield gen.with_timeout(start_time + reconn_timeout,
                                           app.connect(request.traceid))
                    reconn_time = time.time() - start_time
                    request.logger.info("%s: connecting took %.3fms", app.id, reconn_time * 1000)
                except Exception as err:
                    request.logger.error("%s: unable to reconnect: %s (%d attempts left)",
                                         app.id, err, attempts)
                # We have an attempt to process request again.
                # Jump to the begining of `while attempts > 0`, either we connected successfully
                # or we were failed to connect
                continue

            except ServiceError as err:
                if not check_attempts(app, err):
                    return

                # if the application has been restarted, we get broken pipe code
                # and system category
                if err.category in SYSTEMCATEGORY and err.code == EAPPSTOPPED:
                    request.logger.error("%s: the application has been restarted", app.id)
                    app.disconnect()
                    continue

                elif err.category in OVERSEERCATEGORY and err.code == EQUEUEISFULL:
                    request.logger.error("%s: queue is full. Pick another application instance", app.id)
                    try:
                        app = yield reelect_app_fn(request, app)
                    except Exception as reelect_err:
                        on_error(app, reelect_err, '(could not reelect app)')
                        return
                    request.logger.info("fetched new app from reelect_app_fn")
                    continue

                on_error(app, err, '')

            except Exception as err:
                on_error(app, err, '(unknown error) ')
            else:
                if processor:
                    processor.finish()
            # to return from all errors except Disconnection
            # or receiving a good reply
            return

    @gen.coroutine
    def get_service(self, name, request):
        """Return an instance of application ``name``.

        While the pool is not full a new instance is created and connected;
        ``None`` is returned if that fails. Otherwise a random cached instance
        is returned.
        """
        # cache isn't full for the current application
        if len(self.cache[name]) < self.spool_size:
            logger = request.logger
            try:
                app = Service(name, locator=self.locator, timeout=RESOLVE_TIMEOUT)
                logger.info("%s: creating an instance of %s", app.id, name)
                self.cache[name].append(app)
                yield app.connect(request.traceid)
                logger.info("%s: connect to an app %s endpoint %s ",
                            app.id, app.name, "{0}:{1}".format(*app.address))

                # schedule the refresh of the instance at a randomized moment
                timeout = (1 + random.random()) * self.refresh_period
                self.io_loop.call_later(timeout, self.move_to_inactive(app, name))
            except Exception as err:
                logger.error("%s: unable to connect to `%s`: %s", app.id, name, err)
                drop_app_from_cache(self.cache, app, name)
                raise gen.Return()
            else:
                raise gen.Return(app)

        # get an instance from cache
        chosen = random.choice(self.cache[name])
        raise gen.Return(chosen)
class CocaineProxy(object):
    """HTTP front-end that forwards requests to Cocaine applications.

    Keeps a per-application cache of connected ``Service`` instances
    (``self.cache``), rotates them periodically into an "inactive" state
    where they are disconnected after a grace period, and watches the
    locator for routing group updates.
    """

    def __init__(self, locators=("localhost:10053",),
                 cache=DEFAULT_SERVICE_CACHE_COUNT,
                 request_id_header="", sticky_header="X-Cocaine-Sticky",
                 ioloop=None, **config):
        """Set up counters, the locator, loggers and the routing watcher.

        :param locators: iterable of locator endpoints, each parsed with
            ``parse_locators_endpoints``.
        :param cache: target number of cached instances per application;
            the hard limit (``spoolSize``) is 1.5 times this value.
        :param request_id_header: when non-empty, request ids are taken
            from this header via ``get_request_id``.
        :param sticky_header: header whose presence switches to seeded
            service resolution.
        :param ioloop: IOLoop to use; defaults to the current one.
        :param config: extra options; ``refresh_timeout`` and ``timeouts``
            are read here.
        """
        # stats
        self.requests_in_progress = 0
        self.requests_disconnections = 0
        self.requests_total = 0

        self.io_loop = ioloop or tornado.ioloop.IOLoop.current()
        self.serviceCacheCount = cache
        self.spoolSize = int(self.serviceCacheCount * 1.5)
        self.refreshPeriod = config.get("refresh_timeout", DEFAULT_REFRESH_PERIOD)
        self.timeouts = config.get("timeouts", {})
        # a real list: it is iterated again for the log line below
        self.locator_endpoints = [parse_locators_endpoints(i) for i in locators]
        # it's initialized after start
        # to avoid an io_loop creation before fork
        self.locator = Locator(endpoints=self.locator_endpoints)

        # active applications
        self.cache = collections.defaultdict(list)

        self.logger = ContextAdapter(logging.getLogger("cocaine.proxy"), {"id": "0" * 16})
        self.tracking_logger = logging.getLogger("cocaine.proxy.tracking")
        self.logger.info("locators %s", ','.join("%s:%d" % (h, p) for h, p in self.locator_endpoints))

        self.sticky_header = sticky_header

        if request_id_header:
            self.get_request_id = functools.partial(get_request_id, request_id_header)
        else:
            self.get_request_id = generate_request_id

        # post the watcher for routing groups
        self.io_loop.add_future(self.on_routing_groups_update(),
                                lambda x: self.logger.error("the updater must not exit"))

    @gen.coroutine
    def on_routing_groups_update(self):
        """Watch locator routing updates forever and refresh affected apps.

        Every cached instance of an updated group is moved to the inactive
        queue. On errors the coroutine sleeps with an exponential backoff
        capped at 32 seconds and subscribes again.
        """
        uid = gen_uid()
        self.logger.info("generate new uniqque id %s", uid)
        maximum_timeout = 32  # sec
        timeout = 1  # sec
        while True:
            current = {}
            try:
                self.logger.info("subscribe to updates with id %s", uid)
                channel = yield self.locator.routing(uid, True)
                # successful subscription resets the backoff
                timeout = 1
                while True:
                    new = yield channel.rx.get()
                    if isinstance(new, EmptyResponse):
                        # it means that the cocaine has been stopped
                        self.logger.info("locator sends close")
                        break
                    updates = scan_for_updates(current, new)
                    # replace current
                    current = new
                    if len(updates) == 0:
                        self.logger.info("locator sends an update message, "
                                         "but no updates have been found")
                        continue

                    self.logger.info("%d routing groups have been refreshed %s",
                                     len(updates), updates)
                    for group in updates:
                        # if we have not created an instance of
                        # the group it is absent in cache
                        if group not in self.cache:
                            self.logger.info("nothing to update in group %s", group)
                            continue

                        # iterate over a snapshot: migrate_from_cache_to_inactive
                        # removes the app from self.cache[group], and mutating
                        # the list being iterated would skip every other app
                        for app in list(self.cache[group]):
                            # app.id is formatted with %s like in every
                            # other log line of this class
                            self.logger.info("%s: move %s to the inactive queue to refresh"
                                             " routing group", app.id, app.name)
                            self.migrate_from_cache_to_inactive(app, group)
            except Exception as err:
                timeout = min(timeout << 1, maximum_timeout)
                self.logger.error("error occured while watching for group updates %s. Sleep %d",
                                  err, timeout)
                yield gen.sleep(timeout)

    def get_timeout(self, name):
        """Return the configured timeout for app `name` or DEFAULT_TIMEOUT."""
        return self.timeouts.get(name, DEFAULT_TIMEOUT)

    def migrate_from_cache_to_inactive(self, app, name):
        """Remove `app` from the active cache and schedule its disposal.

        The instance is disposed after three request timeouts.
        """
        try:
            self.cache[name].remove(app)
        except ValueError as err:
            self.logger.error("broken cache: %s", err)
        except KeyError as err:
            self.logger.error("broken cache: no such key %s", err)

        self.io_loop.call_later(self.get_timeout(name) * 3,
                                functools.partial(self.dispose, app, name))

    def move_to_inactive(self, app, name):
        """Build a callback that retires `app` once the cache is big enough.

        While fewer than ``serviceCacheCount`` instances are active the
        callback only reschedules itself after ``get_timeout(name)``.
        """
        def wrapper():
            active_apps = len(self.cache[name])
            if active_apps < self.serviceCacheCount:
                self.io_loop.call_later(self.get_timeout(name),
                                        self.move_to_inactive(app, name))
                return
            self.logger.info("%s: move %s %s to an inactive queue (active %d)",
                             app.id, app.name, "{0}:{1}".format(*app.address), active_apps)
            self.migrate_from_cache_to_inactive(app, name)
        return wrapper

    def dispose(self, app, name):
        """Disconnect a retired application instance."""
        self.logger.info("dispose service %s %s", name, app.id)
        app.disconnect()

    @context
    @gen.coroutine
    def __call__(self, request):
        """Dispatch one HTTP request.

        The target app and event come either from the ``X-Cocaine-Service``
        / ``X-Cocaine-Event`` headers or from the URI matched by URL_REGEX.
        Unmatched URIs are served as ``/ping``, ``/__info`` or a 404 reply.
        """
        if "X-Cocaine-Service" in request.headers and "X-Cocaine-Event" in request.headers:
            request.logger.debug('dispatch by headers')
            name = request.headers['X-Cocaine-Service']
            event = request.headers['X-Cocaine-Event']
        else:
            request.logger.debug('dispatch by uri')
            match = URL_REGEX.match(request.uri)
            if match is None:
                if request.path == "/ping":
                    try:
                        yield self.locator.connect()
                        fill_response_in(request, httplib.OK, "OK", "OK")
                    except Exception as err:
                        request.logger.error("unable to conenct to the locator: %s", err)
                        fill_response_in(request, httplib.SERVICE_UNAVAILABLE,
                                         httplib.responses[httplib.SERVICE_UNAVAILABLE],
                                         "locator is unavailable")
                elif request.path == '/__info':
                    # ToDo: may we should remove keys with len == 0 values from cache
                    # to avoid memory consumption for strings and the dict
                    body = json.dumps({
                        'services': {
                            'cache': dict(((k, len(v)) for k, v in self.cache.items())),
                        },
                        'requests': {
                            'inprogress': self.requests_in_progress,
                            'total': self.requests_total,
                        },
                        'errors': {
                            'disconnections': self.requests_disconnections,
                        }
                    }, sort_keys=True)
                    headers = httputil.HTTPHeaders({"Content-Type": "application/json"})
                    fill_response_in(request, httplib.OK, httplib.responses[httplib.OK],
                                     body, headers)
                else:
                    fill_response_in(request, httplib.NOT_FOUND,
                                     httplib.responses[httplib.NOT_FOUND], "Invalid url")
                return

            name, event, other = match.groups()
            if name == '' or event == '':
                fill_response_in(request, httplib.BAD_REQUEST,
                                 httplib.responses[httplib.BAD_REQUEST],
                                 "Proxy invalid request")
                return

            # Drop from query appname and event's name
            if not other.startswith('/'):
                other = "/" + other
            request.uri = other
            request.path, _, _ = other.partition("?")

        if self.sticky_header not in request.headers:
            app = yield self.get_service(name, request)
        else:
            seed = request.headers.get(self.sticky_header)
            request.logger.info('sticky_header has been found: %s', seed)
            app = yield self.get_service_with_seed(name, seed, request)

        if app is None:
            message = "current application %s is unavailable" % name
            fill_response_in(request, NO_SUCH_APP, "No Such Application", message)
            return

        try:
            request.logger.debug("%s: processing request app: `%s`, event `%s`",
                                 app.id, app.name, event)
            yield self.process(request, name, app, event, pack_httprequest(request))
        except Exception as err:
            request.logger.error("error during processing request %s", err)
            fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                             httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                             "UID %s: %s" % (request.traceid, str(err)))

        request.logger.info("exit from process")

    @gen.coroutine
    def process(self, request, name, app, event, data):
        """Send `data` as `event` to `app` and relay the reply to `request`.

        Up to two attempts are made. A retry happens after a disconnection
        (following a reconnect attempt) or after a ServiceError that reports
        a restarted application; any other outcome ends the call with a
        reply filled in via ``fill_response_in``.
        """
        request.logger.info("start processing request after %.3f ms",
                            request.request_time() * 1000)
        timeout = self.get_timeout(name)
        # allow to reconnect this amount of times.
        attempts = 2  # make it configurable

        parentid = 0
        if request.traceid is not None:
            traceid = int(request.traceid, 16)
            trace = Trace(traceid=traceid, spanid=traceid, parentid=parentid)
        else:
            trace = None

        while attempts > 0:
            headers = {}
            body_parts = []
            attempts -= 1
            try:
                request.logger.info("%s: enqueue event (attempt %d)", app.id, attempts)
                channel = yield app.enqueue(event, trace=trace)
                request.logger.debug("%s: send event data (attempt %d)", app.id, attempts)
                yield channel.tx.write(msgpack.packb(data), trace=trace)
                yield channel.tx.close(trace=trace)
                request.logger.debug("%s: waiting for a code and headers (attempt %d)",
                                     app.id, attempts)
                code_and_headers = yield channel.rx.get(timeout=timeout)
                request.logger.debug("%s: code and headers have been received (attempt %d)",
                                     app.id, attempts)
                code, raw_headers = msgpack.unpackb(code_and_headers)
                headers = tornado.httputil.HTTPHeaders(raw_headers)
                while True:
                    body = yield channel.rx.get(timeout=timeout)
                    if isinstance(body, EmptyResponse):
                        request.logger.info("%s: body finished (attempt %d)", app.id, attempts)
                        break

                    request.logger.debug("%s: received %d bytes as a body chunk (attempt %d)",
                                         app.id, len(body), attempts)
                    body_parts.append(body)
            except gen.TimeoutError as err:
                request.logger.error("%s %s: %s", app.id, name, err)
                message = "UID %s: application `%s` error: %s" % (request.traceid, name, str(err))
                fill_response_in(request, httplib.GATEWAY_TIMEOUT,
                                 httplib.responses[httplib.GATEWAY_TIMEOUT], message)

            except (DisconnectionError, StreamClosedError) as err:
                self.requests_disconnections += 1
                # Probably it's dangerous to retry requests all the time.
                # I must find the way to determine whether it failed during writing
                # or reading a reply. And retry only writing fails.
                request.logger.error("%s: %s", app.id, err)
                if attempts <= 0:
                    request.logger.info("%s: no more attempts", app.id)
                    fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                     httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                                     "UID %s: Connection problem" % request.traceid)
                    return

                # Seems on_close callback is not called in case of connecting through IPVS
                # We detect disconnection here to avoid unnecessary errors.
                # Try to reconnect here and give the request a go
                try:
                    start_time = time.time()
                    reconn_timeout = timeout - request.request_time()
                    request.logger.info("%s: connecting with timeout %.fms",
                                        app.id, reconn_timeout * 1000)
                    # the deadline is absolute: now + the remaining budget
                    yield gen.with_timeout(start_time + reconn_timeout,
                                           app.connect(request.logger.traceid))
                    reconn_time = time.time() - start_time
                    request.logger.info("%s: connecting took %.3fms",
                                        app.id, reconn_time * 1000)
                except Exception as err:
                    # attempts > 0 is guaranteed here by the check above;
                    # app.id was missing from the arguments of this message
                    request.logger.error("%s: unable to reconnect: %s (%d attempts left)",
                                         app.id, err, attempts)
                # We have an attempt to process request again.
                # Jump to the beginning of `while attempts > 0`, either we connected
                # successfully or we failed to connect
                continue

            except ServiceError as err:
                # if the application has been restarted, we get broken pipe code
                # and system category
                if err.code == errno.EPIPE and err.category == ESYSTEMCATEGORY:
                    request.logger.error("%s: the application has been restarted", app.id)
                    app.disconnect()
                    if attempts > 0:
                        continue
                    # no attempts left: fall through to the error reply,
                    # otherwise the loop would end without any response

                request.logger.error("%s: %s", app.id, err)
                message = "UID %s: application `%s` error: %s" % (request.traceid, name, str(err))
                fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                 httplib.responses[httplib.INTERNAL_SERVER_ERROR], message)

            except Exception as err:
                request.logger.error("%s: %s", app.id, err)
                message = "UID %s: unknown `%s` error: %s" % (request.traceid, name, str(err))
                fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                 httplib.responses[httplib.INTERNAL_SERVER_ERROR], message)
            else:
                message = ''.join(body_parts)
                fill_response_in(request, code,
                                 httplib.responses.get(code, httplib.OK),
                                 message, headers)
            # to return from all errors except Disconnection
            # or receiving a good reply
            return

    @gen.coroutine
    def get_service(self, name, request):
        """Return a Service for `name`: a new one or a random cached one.

        A new instance is created while the cache holds fewer than
        ``spoolSize`` entries. Returns None when the new instance cannot
        be connected.
        """
        # cache isn't full for the current application
        if len(self.cache[name]) < self.spoolSize:
            logger = request.logger
            # bound before `try` so the error path can always refer to it
            app = None
            try:
                app = Service(name, locator=self.locator, timeout=RESOLVE_TIMEOUT)
                logger.info("%s: creating an instance of %s", app.id, name)
                self.cache[name].append(app)
                yield app.connect(request.traceid)
                logger.info("%s: connect to an app %s endpoint %s ",
                            app.id, app.name, "{0}:{1}".format(*app.address))

                timeout = (1 + random.random()) * self.refreshPeriod
                self.io_loop.call_later(timeout, self.move_to_inactive(app, name))
            except Exception as err:
                logger.error("%s: unable to connect to `%s`: %s",
                             getattr(app, "id", None), name, err)
                if app in self.cache[name]:
                    self.cache[name].remove(app)
                raise gen.Return()
            else:
                raise gen.Return(app)

        # get an instance from cache
        chosen = random.choice(self.cache[name])
        raise gen.Return(chosen)

    @gen.coroutine
    def get_service_with_seed(self, name, seed, request):
        """Create and connect an uncached Service resolved with `seed`.

        Returns None when the connection fails.
        """
        logger = request.logger
        app = Service(name, seed=seed, locator=self.locator)
        try:
            logger.info("%s: creating an instance of %s, seed %s", app.id, name, seed)
            yield app.connect(logger.traceid)
        except Exception as err:
            logger.error("%s: unable to connect to `%s`: %s", app.id, name, err)
            raise gen.Return()

        raise gen.Return(app)
def __init__(self,
             locators=("localhost:10053", ),
             cache=DEFAULT_SERVICE_CACHE_COUNT,
             request_id_header="",
             sticky_header="X-Cocaine-Sticky",
             forcegen_request_header=False,
             default_tracing_chance=DEFAULT_TRACING_CHANCE,
             configuration_service="unicorn",
             tracing_conf_path="/zipkin_sampling",
             timeouts_conf_path="/proxy_apps_timeouts",
             srw_config=None,
             allow_json_rpc=True,
             ioloop=None,
             **config):
    """Initialise proxy state, plugins and the background watchers.

    Counters and caches are created first, then plugins are loaded, and
    finally the sampling, timeouts and routing watchers plus the locator
    health check are started on the IOLoop.
    """

    def must_not_exit(message):
        # Callback factory for add_future: report a watcher that finished.
        def on_exit(_future):
            return self.logger.error(message)
        return on_exit

    # request statistics
    self.requests_in_progress = 0
    self.requests_disconnections = 0
    self.requests_total = 0

    self.io_loop = ioloop or tornado.ioloop.IOLoop.current()

    # the spool may grow to 1.5x of the requested cache size
    self.service_cache_count = cache
    self.spool_size = int(self.service_cache_count * 1.5)
    self.refresh_period = config.get("refresh_timeout",
                                     DEFAULT_REFRESH_PERIOD)

    endpoints = []
    for raw_endpoint in locators:
        endpoints.append(parse_locators_endpoints(raw_endpoint))
    self.locator_endpoints = endpoints

    # the locator is connected lazily, after start,
    # so no io_loop is created before fork
    self.locator = Locator(endpoints=self.locator_endpoints)
    # answer of the `ping` method
    self.locator_status = False

    # active applications
    self.cache = collections.defaultdict(list)
    # routing groups received from the Locator service
    self.current_rg = {}

    self.logger = logging.getLogger("cocaine.proxy.general")
    self.access_log = logging.getLogger("cocaine.proxy.access")
    self.access_log.propagate = False

    printable_endpoints = ','.join(
        "%s:%d" % (h, p) for h, p in self.locator_endpoints)
    self.logger.info("locators %s", printable_endpoints)

    self.sticky_header = sticky_header

    self.plugins = []
    for plugin_conf in (srw_config or ()):
        name = plugin_conf["type"]
        cfg = plugin_conf["args"]
        self.logger.info("initialize plugin %s", name)
        self.plugins.append(load_plugin(name, self, cfg))

    if allow_json_rpc:
        json_rpc = load_plugin('cocaine.proxy.jsonrpc.JSONRPC', self, {})
        self.plugins.append(json_rpc)

    self.logger.info("conf path in `%s` configuration service: %s",
                     configuration_service, tracing_conf_path)
    self.unicorn = Service(configuration_service, locator=self.locator)

    # watcher of per-application sampling settings
    self.sampled_apps = {}
    self.default_tracing_chance = default_tracing_chance
    self.tracing_conf_path = tracing_conf_path
    self.io_loop.add_future(
        self.on_sampling_updates(),
        must_not_exit("the sample updater must not exit"))

    # watcher of per-application timeouts
    self.timeouts_conf_path = timeouts_conf_path
    self.timeouts = {}
    self.io_loop.add_future(
        self.on_timeouts_updates(),
        must_not_exit("the timeouts updater must not exit"))

    if not request_id_header:
        self.get_request_id = generate_request_id
    else:
        self.get_request_id = functools.partial(
            get_request_id,
            request_id_header,
            force=forcegen_request_header)

    # post the watcher for routing groups
    self.io_loop.add_future(
        self.on_routing_groups_update(),
        must_not_exit("the updater must not exit"))
    # start the endless locator health check
    self.locator_health_check()
class CocaineProxy(object):
    """HTTP front-end that forwards requests to Cocaine applications.

    Keeps a per-application cache of connected ``Service`` instances,
    periodically replaces them with fresh ones, watches the locator for
    routing group updates and the configuration service for per-app
    sampling and timeout settings, and lets plugins handle requests first.
    """

    def __init__(self,
                 locators=("localhost:10053", ),
                 cache=DEFAULT_SERVICE_CACHE_COUNT,
                 request_id_header="",
                 sticky_header="X-Cocaine-Sticky",
                 forcegen_request_header=False,
                 default_tracing_chance=DEFAULT_TRACING_CHANCE,
                 configuration_service="unicorn",
                 tracing_conf_path="/zipkin_sampling",
                 timeouts_conf_path="/proxy_apps_timeouts",
                 srw_config=None,
                 allow_json_rpc=True,
                 ioloop=None,
                 **config):
        """Initialise proxy state, load plugins and start the watchers.

        :param locators: iterable of locator endpoints, each parsed with
            ``parse_locators_endpoints``.
        :param cache: target number of cached instances per application;
            the hard limit (``spool_size``) is 1.5 times this value.
        :param request_id_header: when non-empty, request ids are taken
            from this header via ``get_request_id``.
        :param sticky_header: header whose value seeds routing group
            resolution.
        :param forcegen_request_header: passed to ``get_request_id`` as
            ``force``.
        :param default_tracing_chance: tracing chance for apps without an
            explicit sampling value.
        :param configuration_service: name of the service used to watch
            configuration paths.
        :param tracing_conf_path: path with per-app sampling values.
        :param timeouts_conf_path: path with per-app timeouts.
        :param srw_config: optional list of plugin descriptions, each a
            mapping with ``type`` and ``args`` keys.
        :param allow_json_rpc: load the JSON-RPC plugin when true.
        :param ioloop: IOLoop to use; defaults to the current one.
        :param config: extra options; ``refresh_timeout`` is read here.
        """
        # stats
        self.requests_in_progress = 0
        self.requests_disconnections = 0
        self.requests_total = 0

        self.io_loop = ioloop or tornado.ioloop.IOLoop.current()
        self.service_cache_count = cache
        self.spool_size = int(self.service_cache_count * 1.5)
        self.refresh_period = config.get("refresh_timeout",
                                         DEFAULT_REFRESH_PERIOD)
        self.locator_endpoints = [
            parse_locators_endpoints(i) for i in locators
        ]
        # it's initialized after start
        # to avoid an io_loop creation before fork
        self.locator = Locator(endpoints=self.locator_endpoints)
        # it's used to reply on `ping` method
        self.locator_status = False

        # active applications
        self.cache = collections.defaultdict(list)
        # routing groups from Locator service
        self.current_rg = {}

        self.logger = logging.getLogger("cocaine.proxy.general")
        self.access_log = logging.getLogger("cocaine.proxy.access")
        self.access_log.propagate = False
        self.logger.info(
            "locators %s",
            ','.join("%s:%d" % (h, p) for h, p in self.locator_endpoints))

        self.sticky_header = sticky_header

        self.plugins = []
        if srw_config:
            # a dedicated name keeps the **config mapping intact
            for plugin_config in srw_config:
                name, cfg = plugin_config["type"], plugin_config["args"]
                self.logger.info("initialize plugin %s", name)
                self.plugins.append(load_plugin(name, self, cfg))

        if allow_json_rpc:
            self.plugins.append(
                load_plugin('cocaine.proxy.jsonrpc.JSONRPC', self, {}))

        self.logger.info("conf path in `%s` configuration service: %s",
                         configuration_service, tracing_conf_path)
        self.unicorn = Service(configuration_service, locator=self.locator)

        self.sampled_apps = {}
        self.default_tracing_chance = default_tracing_chance
        self.tracing_conf_path = tracing_conf_path
        self.io_loop.add_future(
            self.on_sampling_updates(),
            lambda x: self.logger.error("the sample updater must not exit"))

        self.timeouts_conf_path = timeouts_conf_path
        self.timeouts = {}
        self.io_loop.add_future(
            self.on_timeouts_updates(),
            lambda x: self.logger.error("the timeouts updater must not exit"))

        if request_id_header:
            self.get_request_id = functools.partial(
                get_request_id,
                request_id_header,
                force=forcegen_request_header)
        else:
            self.get_request_id = generate_request_id

        # post the watcher for routing groups
        self.io_loop.add_future(
            self.on_routing_groups_update(),
            lambda x: self.logger.error("the updater must not exit"))
        # run infinity check locator health status
        self.locator_health_check()

    @gen.coroutine
    def locator_health_check(self, period=5):
        """Poll ``locator.cluster()`` forever and update `locator_status`.

        :param period: seconds between successful checks; also the timeout
            of each call. After a failure the next check runs in 1 second.
        """
        wait_timeout = datetime.timedelta(seconds=period)
        while True:
            try:
                self.logger.debug(
                    "check health status of locator via cluster method")
                channel = yield gen.with_timeout(wait_timeout,
                                                 self.locator.cluster())
                cluster = yield gen.with_timeout(wait_timeout,
                                                 channel.rx.get())
                self.locator_status = True
                self.logger.debug("dumped cluster %s", cluster)
                yield gen.sleep(period)
            except Exception as err:
                self.logger.error("health status check failed: %s", err)
                self.locator_status = False
                yield gen.sleep(1)

    @gen.coroutine
    def on_routing_groups_update(self):
        """Watch locator routing updates forever and refresh affected apps.

        The latest routing groups are stored in ``self.current_rg``. Every
        cached instance of an updated group is moved to the inactive
        queue. On errors the coroutine sleeps with an exponential backoff
        capped at 32 seconds and subscribes again.
        """
        uid = gen_uid()
        self.logger.info("generate new unique id %s", uid)
        maximum_timeout = 32  # sec
        timeout = 1  # sec
        while True:
            self.current_rg = {}
            try:
                self.logger.info("subscribe to updates with id %s", uid)
                channel = yield self.locator.routing(uid, True)
                # successful subscription resets the backoff
                timeout = 1
                while True:
                    new = yield channel.rx.get()
                    if isinstance(new, EmptyResponse):
                        # it means that the cocaine has been stopped
                        self.logger.error("locator sends close")
                        break
                    updates = scan_for_updates(self.current_rg, new)
                    # replace current
                    self.current_rg = new
                    if len(updates) == 0:
                        self.logger.info("locator sends an update message, "
                                         "but no updates have been found")
                        continue

                    self.logger.info(
                        "%d routing groups have been refreshed %s",
                        len(updates), updates)
                    for group in updates:
                        # if we have not created an instance of
                        # the group it is absent in cache
                        if group not in self.cache:
                            self.logger.debug("nothing to update in group %s",
                                              group)
                            continue

                        # iterate over a snapshot: the migration is expected
                        # to drop the app from self.cache[group], and
                        # mutating the iterated list would skip entries
                        for app in list(self.cache[group]):
                            self.logger.debug(
                                "%s: move %s to the inactive queue to refresh"
                                " routing group", app.id, app.name)
                            self.migrate_from_cache_to_inactive(app, group)
            except Exception as err:
                timeout = min(timeout << 1, maximum_timeout)
                self.logger.error(
                    "error occurred while watching for group updates %s. Sleep %d",
                    err, timeout)
                yield gen.sleep(timeout)

    @gen.coroutine
    def watch_app(self, name, path):
        """Follow the sampling value of app `name` stored at `path`.

        ``self.sampled_apps[name]`` holds the latest value (or the default
        chance when the value cannot be converted to float); the entry is
        removed when watching stops.
        """
        version = 0
        self.sampled_apps[name] = self.default_tracing_chance
        # bound before `try` so that `finally` can tell whether the
        # subscription has been created at all
        watch_channel = None
        try:
            self.logger.info("start watching for sampling updates of %s", name)
            watch_channel = yield self.unicorn.subscribe(path, version)
            while True:
                value, version = yield watch_channel.rx.get()
                # the raw value is logged with %s: it is not necessarily
                # a number before the conversion below
                self.logger.info(
                    "got sampling updates for %s: version %d value %s",
                    name, version, value)
                try:
                    weight = float(value)
                    self.sampled_apps[name] = weight
                except (TypeError, ValueError) as err:
                    # TypeError covers non-string, non-numeric values
                    self.logger.error(
                        "sample value %s for %s can NOT be converted: %s. Use %f",
                        value, name, err, self.default_tracing_chance)
                    self.sampled_apps[name] = self.default_tracing_chance
        except ServiceError as err:
            # verify that the err is `zookeeper: no node [-101]``
            if err.code != -101:
                self.logger.error(
                    "watching of `%s` raised an unexpected service error (cat. %d): %s",
                    name, err.category, err)
        except Exception as err:
            self.logger.error("watching of %s error: %s", name, err)
        finally:
            self.logger.info("stop watching for sampling updates of %s", name)
            self.sampled_apps.pop(name, None)
            if watch_channel is not None:
                try:
                    watch_channel.tx.close()
                except Exception as err:
                    # best effort: the channel may already be broken
                    self.logger.debug(
                        "closing sampling watch channel of %s failed: %s",
                        name, err)

    @gen.coroutine
    def on_sampling_updates(self):
        """Watch the list of apps with sampling settings forever.

        A ``watch_app`` coroutine is started for every listed app that is
        not tracked in ``self.sampled_apps`` yet.
        """
        maximum_timeout = 32  # sec
        timeout = 1  # sec
        listing_version = 0
        while True:
            try:
                listing_channel = yield self.unicorn.children_subscribe(
                    self.tracing_conf_path, listing_version)
                # successful subscription resets the backoff, the same
                # way on_routing_groups_update does
                timeout = 1
                while True:
                    listing_version, apps = yield listing_channel.rx.get()
                    self.logger.info(
                        "on_sampling_updates: version %d value %s",
                        listing_version, apps)
                    for app in (i for i in apps if i not in self.sampled_apps):
                        self.watch_app(app, self.tracing_conf_path + "/" + app)
            except Exception as err:
                timeout = min(timeout << 1, maximum_timeout)
                listing_version = 0
                self.logger.error(
                    "error occurred while subscribing for sampling updates %s. Sleep %d",
                    err, timeout)
                yield gen.sleep(timeout)

    @gen.coroutine
    def watch_app_timeouts(self, name, path):
        """Follow the timeouts mapping of app `name` stored at `path`.

        ``self.timeouts[name]`` holds the latest dict (or an empty dict for
        a non-dict value); the entry is removed when watching stops.
        """
        version = 0
        self.timeouts[name] = {}
        # bound before `try` so that `finally` can tell whether the
        # subscription has been created at all
        watch_channel = None
        try:
            self.logger.info("start watching for timeouts updates of %s", name)
            watch_channel = yield self.unicorn.subscribe(path, version)
            while True:
                value, version = yield watch_channel.rx.get()
                self.logger.info(
                    "got timeouts updates for %s: version %d value %s",
                    name, version, value)
                if isinstance(value, dict):
                    self.timeouts[name] = value
                else:
                    self.logger.error("timeout value %s for %s is not dict",
                                      value, name)
                    self.timeouts[name] = {}
        except ServiceError as err:
            # verify that the err is `zookeeper: no node [-101]``
            if err.code != -101:
                self.logger.error(
                    "watching of `%s` raised an unexpected service error (cat. %d): %s",
                    name, err.category, err)
        except Exception as err:
            self.logger.error("watching of %s error: %s", name, err)
        finally:
            self.logger.info("stop watching for timeouts updates of %s", name)
            self.timeouts.pop(name, None)
            if watch_channel is not None:
                try:
                    watch_channel.tx.close()
                except Exception as err:
                    # best effort: the channel may already be broken
                    self.logger.debug(
                        "closing timeouts watch channel of %s failed: %s",
                        name, err)

    @gen.coroutine
    def on_timeouts_updates(self):
        """Watch the list of apps with timeout settings forever.

        A ``watch_app_timeouts`` coroutine is started for every listed app
        that is not tracked in ``self.timeouts`` yet.
        """
        maximum_timeout = 32  # sec
        timeout = 1  # sec
        listing_version = 0
        while True:
            try:
                listing_channel = yield self.unicorn.children_subscribe(
                    self.timeouts_conf_path, listing_version)
                # successful subscription resets the backoff, the same
                # way on_routing_groups_update does
                timeout = 1
                while True:
                    listing_version, apps = yield listing_channel.rx.get()
                    self.logger.info(
                        "on_timeouts_updates: version %d value %s",
                        listing_version, apps)
                    for app in (i for i in apps if i not in self.timeouts):
                        self.watch_app_timeouts(
                            app, self.timeouts_conf_path + "/" + app)
            except Exception as err:
                timeout = min(timeout << 1, maximum_timeout)
                listing_version = 0
                # this watcher handles timeouts, not sampling
                self.logger.error(
                    "error occurred while subscribing for timeouts updates %s. Sleep %d",
                    err, timeout)
                yield gen.sleep(timeout)

    def get_timeout(self, name, event=''):
        """Return the timeout for `event` of app `name`.

        Falls back to the app-wide value stored under the empty key and
        then to DEFAULT_TIMEOUT.
        """
        if name in self.timeouts:
            tmts = self.timeouts[name]
            return tmts.get(event) or tmts.get('', DEFAULT_TIMEOUT)
        return DEFAULT_TIMEOUT

    def migrate_from_cache_to_inactive(self, app, name):
        """Drop `app` from the active cache and schedule its disposal."""
        try:
            drop_app_from_cache(self.cache, app, name)
        except Exception as err:
            self.logger.error("app %s %s: drop cache error %s", app, name, err)
        # dispose service after 3 x timeouts
        # assume that all requests will be finished
        self.io_loop.call_later(
            self.get_timeout(name) * 3,
            functools.partial(self.dispose, app, name))
        self.logger.info("app %s %s is scheduled to dispose", app, name)

    def move_to_inactive(self, app, name):
        """Build a coroutine callback that replaces `app` with a fresh one.

        The callback connects a new instance first; only after that the
        old one is retired. When the new instance cannot be created or
        connected the replacement is rescheduled.
        """
        @gen.coroutine
        def wrapper():
            active_apps = len(self.cache[name])
            self.logger.info(
                "%s: preparing to moving %s %s to an inactive queue (active %d)",
                app.id, app.name, "{0}:{1}".format(*app.address), active_apps)
            # bound before `try` so the error path can always refer to it
            new_app = None
            try:
                new_app = Service(name,
                                  locator=self.locator,
                                  timeout=RESOLVE_TIMEOUT)
                self.logger.info("%s: creating an instance of %s",
                                 new_app.id, name)
                yield new_app.connect()
                self.logger.info("%s: connect to an app %s endpoint %s ",
                                 new_app.id, new_app.name,
                                 "{0}:{1}".format(*new_app.address))
                timeout = (1 + random.random()) * self.refresh_period
                self.io_loop.call_later(timeout,
                                        self.move_to_inactive(new_app, name))
                # add to cache only after successfully connected
                self.cache[name].append(new_app)
            except Exception as err:
                self.logger.error("%s: unable to connect to `%s`: %s",
                                  getattr(new_app, "id", None), name, err)
                # schedule later
                self.io_loop.call_later(self.get_timeout(name),
                                        self.move_to_inactive(app, name))
            else:
                self.logger.info("%s: move %s %s to an inactive queue",
                                 app.id, app.name,
                                 "{0}:{1}".format(*app.address))
                # current active app will be dropped here
                self.migrate_from_cache_to_inactive(app, name)

        return wrapper

    def dispose(self, app, name):
        """Disconnect a retired application instance."""
        self.logger.info("dispose service %s %s", name, app.id)
        app.disconnect()

    def resolve_group_to_version(self, name, value=None):
        """ Pick a version from a routing group using a random or provided value

        A routing group looks like (weight, version):
        {"APP": [[29431330, 'A'], [82426238, 'B'], [101760716, 'C'], [118725487, 'D'], [122951927, 'E']]}

        `name` itself is returned when it is not a routing group or when
        the group is empty.
        """
        if name not in self.current_rg:
            return name

        routing_group = self.current_rg[name]
        if len(routing_group) == 0:
            self.logger.warning("empty rounting group %s", name)
            return name

        # only a missing value is randomised; 0 is a legitimate seed
        if value is None:
            value = random.randint(0, 1 << 32)
        index = upper_bound(routing_group, value)
        return routing_group[index if index < len(routing_group) else 0][1]

    def ping(self, request):
        """Reply 200 when the last locator health check passed, else 503."""
        if self.locator_status:
            fill_response_in(request, httplib.OK, "OK", "OK")
            return
        fill_response_in(request, httplib.SERVICE_UNAVAILABLE,
                         httplib.responses[httplib.SERVICE_UNAVAILABLE],
                         "Failed", proxy_error_headers())

    @context
    @gen.coroutine
    def __call__(self, request):
        """Dispatch one HTTP request.

        The first plugin whose ``match`` accepts the request handles it.
        Otherwise the app and event are extracted from the request, the
        tracing decision is made, a sticky header (if any) resolves the
        routing group to a version, and the request is passed to
        ``process``.
        """
        for plugin in self.plugins:
            if plugin.match(request):
                request.logger.info('processed by %s plugin', plugin.name())
                try:
                    yield plugin.process(request)
                except PluginNoSuchApplication as err:
                    fill_response_in(request, NO_SUCH_APP,
                                     "No such application", str(err),
                                     proxy_error_headers())
                except PluginApplicationError:
                    message = "application error"
                    fill_response_in(
                        request, httplib.INTERNAL_SERVER_ERROR,
                        httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                        message, proxy_error_headers())
                except ProxyInvalidRequest:
                    if request.path == "/ping":
                        self.ping(request)
                    else:
                        fill_response_in(request, httplib.NOT_FOUND,
                                         httplib.responses[httplib.NOT_FOUND],
                                         "Invalid url", proxy_error_headers())
                except Exception as err:
                    request.logger.exception('plugin %s returned error: %s',
                                             plugin.name(), err)
                    message = "unknown error"
                    fill_response_in(
                        request, httplib.INTERNAL_SERVER_ERROR,
                        httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                        message, proxy_error_headers())
                return

        try:
            name, event = extract_app_and_event(request)
        except ProxyInvalidRequest:
            if request.path == "/ping":
                self.ping(request)
            else:
                fill_response_in(request, httplib.NOT_FOUND,
                                 httplib.responses[httplib.NOT_FOUND],
                                 "Invalid url", proxy_error_headers())
            return

        if getattr(request, "traceid", None) is not None:
            tracing_chance = self.sampled_apps.get(name,
                                                   self.default_tracing_chance)
            rolled_dice = random.uniform(0, 100)
            request.logger.debug("tracing_chance %f, rolled dice %f",
                                 tracing_chance, rolled_dice)
            if tracing_chance < rolled_dice:
                request.logger.info('stop tracing the request')
                request.logger = NULLLOGGER
                request.traceid = None

        if self.sticky_header in request.headers:
            seed = request.headers.get(self.sticky_header)
            seed_value = header_to_seed(seed)
            request.logger.info(
                'sticky_header has been found: name %s, value %s, seed %d',
                name, seed, seed_value)
            name = self.resolve_group_to_version(name, seed_value)

        app = yield self.get_service(name, request)

        if app is None:
            message = "current application %s is unavailable" % name
            fill_response_in(request, NO_SUCH_APP, "No Such Application",
                             message, proxy_error_headers(name))
            return

        try:
            yield self.process(request, name, app, event,
                               pack_httprequest(request), self.reelect_app)
        except Exception as err:
            request.logger.exception("error during processing request %s", err)
            fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                             httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                             "UID %s: %s" % (request.traceid, str(err)),
                             proxy_error_headers(name))

        request.logger.info("exit from process")

    def info(self):
        """Return a dict with cache sizes, request counters and sampling."""
        return {
            'services': {
                'cache': dict(((k, len(v)) for k, v in self.cache.items()))
            },
            'requests': {
                'inprogress': self.requests_in_progress,
                'total': self.requests_total
            },
            'errors': {
                'disconnections': self.requests_disconnections
            },
            'sampling': self.sampled_apps
        }

    @gen.coroutine
    def reelect_app(self, request, app):
        """Pick another instance of the application `app` belongs to.

        Creates a new instance while the spool is not full; otherwise
        picks a cached instance different from `app` when there is one.
        Falls back to `app` itself when nothing else is available.
        """
        cache_size = len(self.cache[app.name])
        if cache_size < self.spool_size:
            request.logger.info(
                "spool is not full. Create a new application instance")
            new_app = yield self.get_service(app.name, request)
            # get_service returns None when the new instance cannot be
            # connected; keep the old one rather than hand None to callers
            if new_app is not None:
                app = new_app
        elif cache_size == 1:
            # NOTE: if we have spool_size 1, the same app will be picked
            # Probably we can create a new one and mark the old one inactive
            request.logger.warning(
                "spool size is limited by 1, cannot pick a new instance of th app. Use the old one"
            )
            # pass
        else:
            request.logger.info("pick a random instance of the application")
            try:
                index = self.cache[app.name].index(app)
                request.logger.info("the app is located in cache at pos %d",
                                    index)
                if cache_size == 2:  # shortcut
                    picked = (index + 1) % 2
                else:
                    picked = index
                    while picked == index:
                        picked = random.randint(0, cache_size - 1)

                # report the picked position, not the current one
                request.logger.info("an instance at pos %d has been picked",
                                    picked)
                app = self.cache[app.name][picked]
            except ValueError:
                app = random.choice(self.cache[app.name])

        raise gen.Return(app)

    @gen.coroutine
    def process(self, request, name, app, event, data, reelect_app_fn,
                timeout=None):
        """Send `data` as `event` to `app` and relay the reply to `request`.

        :param reelect_app_fn: coroutine ``(request, app) -> app`` used to
            pick another instance when the queue of `app` is full.
        :param timeout: per-read timeout; defaults to
            ``get_timeout(name, event)``.

        Up to two attempts are made. A retry happens after a disconnection
        (following a reconnect attempt), a restarted application or a full
        queue; any other outcome ends the call with a reply filled in via
        ``fill_response_in``.
        """
        if timeout is None:
            timeout = self.get_timeout(name, event)
        request.logger.info(
            "start processing event `%s` for an app `%s` (appid: %s) after %.3f ms with timeout %f",
            event, app.name, app.id, request.request_time() * 1000, timeout)
        # allow to reconnect this amount of times.
        attempts = 2  # make it configurable

        parentid = 0

        if request.traceid is not None:
            traceid = int(request.traceid, 16)
            trace = Trace(traceid=traceid, spanid=traceid, parentid=parentid)
        else:
            trace = None

        # headers passed to the application with the event; kept apart
        # from the response headers so that a retry does not send the
        # previous (partial) response headers back to the application
        enqueue_headers = {}
        if 'authorization' in request.headers:
            enqueue_headers['authorization'] = request.headers['authorization']

        while attempts > 0:
            body_parts = []
            attempts -= 1
            try:
                request.logger.debug("%s: enqueue event (attempt %d)", app.id,
                                     attempts)
                channel = yield app.enqueue(event, trace=trace,
                                            **enqueue_headers)
                request.logger.debug("%s: send event data (attempt %d)",
                                     app.id, attempts)
                yield channel.tx.write(msgpack.packb(data), trace=trace)
                yield channel.tx.close(trace=trace)
                request.logger.debug(
                    "%s: waiting for a code and headers (attempt %d)", app.id,
                    attempts)
                code_and_headers = yield channel.rx.get(timeout=timeout)
                request.logger.debug(
                    "%s: code and headers have been received (attempt %d)",
                    app.id, attempts)
                code, raw_headers = msgpack.unpackb(code_and_headers)
                response_headers = httputil.HTTPHeaders(raw_headers)

                cocaine_http_proto_version = response_headers.get(
                    X_COCAINE_HTTP_PROTO_VERSION)
                if cocaine_http_proto_version is None or cocaine_http_proto_version == "1.0":
                    cocaine_http_proto_version = "1.0"

                    def stop_condition(body):
                        return isinstance(body, EmptyResponse)
                elif cocaine_http_proto_version == "1.1":

                    def stop_condition(body):
                        # in 1.1 an empty chunk also terminates the body
                        return isinstance(body, EmptyResponse) or len(body) == 0
                else:
                    raise Exception(
                        "unsupported X-Cocaine-HTTP-Proto-Version: %s" %
                        cocaine_http_proto_version)

                while True:
                    body = yield channel.rx.get(timeout=timeout)
                    if stop_condition(body):
                        request.logger.info("%s: body finished (attempt %d)",
                                            app.id, attempts)
                        break

                    request.logger.debug(
                        "%s: received %d bytes as a body chunk (attempt %d)",
                        app.id, len(body), attempts)
                    body_parts.append(body)
            except gen.TimeoutError as err:
                request.logger.error("%s %s: %s", app.id, name, err)
                message = "UID %s: application `%s` error: TimeoutError" % (
                    request.traceid, name)
                fill_response_in(request, httplib.GATEWAY_TIMEOUT,
                                 httplib.responses[httplib.GATEWAY_TIMEOUT],
                                 message, proxy_error_headers(name))

            except (DisconnectionError, StreamClosedError) as err:
                self.requests_disconnections += 1
                # Probably it's dangerous to retry requests all the time.
                # I must find the way to determine whether it failed during writing
                # or reading a reply. And retry only writing fails.
                request.logger.error("%s: %s", app.id, err)
                if attempts <= 0:
                    request.logger.error("%s: no more attempts", app.id)
                    fill_response_in(
                        request, httplib.INTERNAL_SERVER_ERROR,
                        httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                        "UID %s: Connection problem" % request.traceid,
                        proxy_error_headers(name))
                    return

                # Seems on_close callback is not called in case of connecting through IPVS
                # We detect disconnection here to avoid unnecessary errors.
                # Try to reconnect here and give the request a go
                try:
                    start_time = time.time()
                    reconn_timeout = timeout - request.request_time()
                    request.logger.info("%s: connecting with timeout %.fms",
                                        app.id, reconn_timeout * 1000)
                    # the deadline is absolute: now + the remaining budget
                    yield gen.with_timeout(start_time + reconn_timeout,
                                           app.connect(request.traceid))
                    reconn_time = time.time() - start_time
                    request.logger.info("%s: connecting took %.3fms", app.id,
                                        reconn_time * 1000)
                except Exception as err:
                    # attempts > 0 is guaranteed here by the check above;
                    # app.id was missing from the arguments of this message
                    request.logger.error(
                        "%s: unable to reconnect: %s (%d attempts left)",
                        app.id, err, attempts)
                # We have an attempt to process request again.
                # Jump to the beginning of `while attempts > 0`, either we connected
                # successfully or we failed to connect
                continue

            except ServiceError as err:
                # if the application has been restarted, we get broken pipe code
                # and system category
                app_restarted = (err.category in SYSTEMCATEGORY and
                                 err.code == EAPPSTOPPED)
                queue_is_full = (err.category in OVERSEERCATEGORY and
                                 err.code == EQUEUEISFULL)
                if app_restarted:
                    request.logger.error(
                        "%s: the application has been restarted", app.id)
                    app.disconnect()
                    if attempts > 0:
                        continue
                elif queue_is_full and attempts > 0:
                    request.logger.error(
                        "%s: queue is full. Pick another application instance",
                        app.id)
                    app = yield reelect_app_fn(request, app)
                    continue

                # either a non-retriable error or no attempts left: reply
                # here, otherwise the loop would end without any response
                request.logger.error("%s: service error: [%d, %d] %s", app.id,
                                     err.category, err.code, err.reason)
                message = "UID %s: application `%s` error: %s" % (
                    request.traceid, name, str(err))
                fill_response_in(
                    request, httplib.INTERNAL_SERVER_ERROR,
                    httplib.responses[httplib.INTERNAL_SERVER_ERROR], message,
                    proxy_error_headers(name))

            except Exception as err:
                request.logger.exception("%s: %s", app.id, err)
                message = "UID %s: unknown `%s` error: %s" % (request.traceid,
                                                              name, str(err))
                fill_response_in(
                    request, httplib.INTERNAL_SERVER_ERROR,
                    httplib.responses[httplib.INTERNAL_SERVER_ERROR], message,
                    proxy_error_headers(name))
            else:
                message = ''.join(body_parts)
                response_headers['X-Cocaine-Application'] = name
                fill_response_in(request, code,
                                 httplib.responses.get(code, httplib.OK),
                                 message, response_headers)
            # to return from all errors except Disconnection
            # or receiving a good reply
            return

    @gen.coroutine
    def get_service(self, name, request):
        """Return a Service for `name`: a new one or a random cached one.

        A new instance is created while the cache holds fewer than
        ``spool_size`` entries. Returns None when the new instance cannot
        be connected.
        """
        # cache isn't full for the current application
        if len(self.cache[name]) < self.spool_size:
            logger = request.logger
            # bound before `try` so the error path can always refer to it
            app = None
            try:
                app = Service(name,
                              locator=self.locator,
                              timeout=RESOLVE_TIMEOUT)
                logger.info("%s: creating an instance of %s", app.id, name)
                self.cache[name].append(app)
                yield app.connect(request.traceid)
                logger.info("%s: connect to an app %s endpoint %s ", app.id,
                            app.name, "{0}:{1}".format(*app.address))

                timeout = (1 + random.random()) * self.refresh_period
                self.io_loop.call_later(timeout,
                                        self.move_to_inactive(app, name))
            except Exception as err:
                logger.error("%s: unable to connect to `%s`: %s",
                             getattr(app, "id", None), name, err)
                if app is not None:
                    drop_app_from_cache(self.cache, app, name)
                raise gen.Return()
            else:
                raise gen.Return(app)

        # get an instance from cache
        chosen = random.choice(self.cache[name])
        raise gen.Return(chosen)
def __init__(self):
    """Create the handles this object talks through.

    Two ``Service`` handles are built, named "storage" and "node" and
    stored under attributes of the same name, followed by a ``Locator``
    created with no arguments.
    """
    # attribute name and service name are the same string for both handles
    for service_name in ("storage", "node"):
        setattr(self, service_name, Service(service_name))
    self.locator = Locator()
def test_service_attribute_error():
    """Invoke a made-up method on a Locator and wait for its result.

    NOTE(review): judging by the test name the call is expected to end in
    an attribute error; that expectation is not expressed in this function
    itself - confirm where it is asserted.
    """
    loop = IOLoop.current()
    endpoints = [("localhost", 10053)]
    Locator(endpoints, io_loop=loop).random_attribute().get()
class CocaineProxy(object):
    """Callable HTTP front-end that forwards requests to Cocaine applications.

    An instance keeps a per-application list of connected ``Service``
    objects (``self.cache``), refreshes them periodically, watches the
    locator for routing group updates and a configuration service for
    tracing sampling rates.
    """

    def __init__(self, locators=("localhost:10053",),
                 cache=DEFAULT_SERVICE_CACHE_COUNT,
                 request_id_header="", sticky_header="X-Cocaine-Sticky",
                 forcegen_request_header=False,
                 default_tracing_chance=DEFAULT_TRACING_CHANCE,
                 configuration_service="unicorn",
                 tracing_conf_path="/zipkin_sampling",
                 ioloop=None, **config):
        """Set up state and start the background watchers.

        :param locators: iterable of locator endpoints, each parsed by
            ``parse_locators_endpoints``.
        :param cache: base number of instances kept per application; the
            actual limit (``spool_size``) is 1.5 times this value.
        :param request_id_header: header to take the request id from; when
            empty, ``generate_request_id`` is used instead.
        :param sticky_header: header whose value is used as a seed to pick
            an application instance.
        :param forcegen_request_header: passed as ``force`` to
            ``get_request_id``.
        :param default_tracing_chance: sampling chance for applications
            without an explicit value.
        :param configuration_service: name of the service that stores the
            sampling configuration.
        :param tracing_conf_path: path of the sampling configuration inside
            the configuration service.
        :param ioloop: IOLoop to use; defaults to the current one.
        :param config: extra options; ``refresh_timeout`` and ``timeouts``
            are read.
        """
        # stats
        self.requests_in_progress = 0
        self.requests_disconnections = 0
        self.requests_total = 0

        self.io_loop = ioloop or tornado.ioloop.IOLoop.current()
        self.service_cache_count = cache
        self.spool_size = int(self.service_cache_count * 1.5)
        self.refresh_period = config.get("refresh_timeout", DEFAULT_REFRESH_PERIOD)
        self.timeouts = config.get("timeouts", {})
        self.locator_endpoints = [parse_locators_endpoints(i) for i in locators]
        # it's initialized after start
        # to avoid an io_loop creation before fork
        self.locator = Locator(endpoints=self.locator_endpoints)
        # it's used to reply on `ping` method
        self.locator_status = False

        # active applications
        self.cache = collections.defaultdict(list)

        self.logger = logging.getLogger("cocaine.proxy.general")
        self.access_log = logging.getLogger("cocaine.proxy.access")
        self.access_log.propagate = False

        self.logger.info("locators %s", ','.join("%s:%d" % (h, p) for h, p in self.locator_endpoints))

        self.sticky_header = sticky_header

        self.logger.info("conf path in `%s` configuration service: %s",
                         configuration_service, tracing_conf_path)
        self.unicorn = Service(configuration_service, locator=self.locator)
        self.sampled_apps = {}
        self.default_tracing_chance = default_tracing_chance
        self.tracing_conf_path = tracing_conf_path
        self.io_loop.add_future(self.on_sampling_updates(),
                                lambda x: self.logger.error("the sample updater must not exit"))

        if request_id_header:
            self.get_request_id = functools.partial(get_request_id, request_id_header,
                                                    force=forcegen_request_header)
        else:
            self.get_request_id = generate_request_id

        # post the watcher for routing groups
        self.io_loop.add_future(self.on_routing_groups_update(),
                                lambda x: self.logger.error("the updater must not exit"))
        # run infinity check locator health status
        self.locator_health_check()

    @gen.coroutine
    def locator_health_check(self, period=5):
        """Poll the locator forever and keep ``self.locator_status`` up to date.

        :param period: seconds between successful checks; also used as the
            timeout of every step of a single check.
        """
        wait_timeout = datetime.timedelta(seconds=period)
        while True:
            try:
                self.logger.debug("check health status of locator via cluster method")
                channel = yield gen.with_timeout(wait_timeout, self.locator.cluster())
                cluster = yield gen.with_timeout(wait_timeout, channel.rx.get())
                self.locator_status = True
                self.logger.debug("dumped cluster %s", cluster)
                yield gen.sleep(period)
            except Exception as err:
                self.logger.error("health status check failed: %s", err)
                self.locator_status = False
                # after a failure re-check sooner than the regular period
                yield gen.sleep(1)

    @gen.coroutine
    def on_routing_groups_update(self):
        """Watch the locator for routing group updates forever.

        Every cached application of a refreshed group is moved out of the
        cache and scheduled for disposal. On errors the subscription is
        retried with an exponential backoff capped at 32 seconds.
        """
        uid = gen_uid()
        self.logger.info("generate new uniqque id %s", uid)
        maximum_timeout = 32  # sec
        timeout = 1  # sec
        while True:
            current = {}
            try:
                self.logger.info("subscribe to updates with id %s", uid)
                channel = yield self.locator.routing(uid, True)
                # subscribed successfully: start the backoff over
                timeout = 1
                while True:
                    new = yield channel.rx.get()
                    if isinstance(new, EmptyResponse):
                        # it means that the cocaine has been stopped
                        self.logger.error("locator sends close")
                        break
                    updates = scan_for_updates(current, new)
                    # replace current
                    current = new
                    if len(updates) == 0:
                        self.logger.info("locator sends an update message, "
                                         "but no updates have been found")
                        continue

                    self.logger.info("%d routing groups have been refreshed %s",
                                     len(updates), updates)
                    for group in updates:
                        # if we have not created an instance of
                        # the group it is absent in cache
                        if group not in self.cache:
                            self.logger.debug("nothing to update in group %s", group)
                            continue

                        # Iterate over a snapshot: migrate_from_cache_to_inactive
                        # drops the app from the cache, and removing items from
                        # the list being iterated would skip instances.
                        for app in list(self.cache[group]):
                            self.logger.debug("%s: move %s to the inactive queue to refresh"
                                              " routing group", app.id, app.name)
                            self.migrate_from_cache_to_inactive(app, group)
            except Exception as err:
                timeout = min(timeout << 1, maximum_timeout)
                self.logger.error("error occured while watching for group updates %s. Sleep %d",
                                  err, timeout)
                yield gen.sleep(timeout)

    @gen.coroutine
    def watch_app(self, name, path):
        """Follow the sampling value of one application until the watch ends.

        While the watch is alive ``self.sampled_apps[name]`` holds the last
        valid value (or the default chance); the entry is removed when the
        watch stops for any reason.

        :param name: application name.
        :param path: path of the application's value in the configuration
            service.
        """
        version = 0
        # stays None if the subscription itself fails, so that the cleanup
        # below does not touch an unbound name
        watch_channel = None
        self.sampled_apps[name] = self.default_tracing_chance
        try:
            self.logger.info("start watching for sampling updates of %s", name)
            watch_channel = yield self.unicorn.subscribe(path, version)
            while True:
                value, version = yield watch_channel.rx.get()
                self.logger.info("got sampling updates for %s: version %d value %.2f",
                                 name, version, value)
                try:
                    weight = float(value)
                    self.sampled_apps[name] = weight
                except ValueError as err:
                    self.logger.error("sample value %s for %s can NOT be converted: %s. Use %f",
                                      value, name, err, self.default_tracing_chance)
                    self.sampled_apps[name] = self.default_tracing_chance
        except ServiceError as err:
            # verify that the err is `zookeeper: no node [-101]``
            if err.code != -101:
                self.logger.error("watching of `%s` raised an unexpected service error (cat. %d): %s",
                                  name, err.category, err)
        except Exception as err:
            self.logger.error("watching of %s error: %s", name, err)
        finally:
            self.logger.info("stop watching for sampling updates of %s", name)
            self.sampled_apps.pop(name, None)
            if watch_channel is not None:
                try:
                    watch_channel.tx.close()
                except Exception as err:
                    # best effort: the channel may be already broken
                    self.logger.debug("closing the watch channel of %s failed: %s", name, err)

    @gen.coroutine
    def on_sampling_updates(self):
        """Watch the list of applications with sampling settings forever.

        A ``watch_app`` coroutine is started for every listed application
        that is not tracked in ``self.sampled_apps`` yet. On errors the
        subscription is retried with an exponential backoff capped at
        32 seconds.
        """
        maximum_timeout = 32  # sec
        timeout = 1  # sec
        listing_version = 0
        while True:
            try:
                listing_channel = yield self.unicorn.children_subscribe(self.tracing_conf_path,
                                                                        listing_version)
                # subscribed successfully: start the backoff over, the same
                # way on_routing_groups_update does
                timeout = 1
                while True:
                    listing_version, apps = yield listing_channel.rx.get()
                    self.logger.info("on_sampling_updates: version %d value %s",
                                     listing_version, apps)
                    # Keep the filter lazy: watch_app registers the name in
                    # self.sampled_apps right away, so a name repeated in
                    # `apps` is not watched twice.
                    for app in (i for i in apps if i not in self.sampled_apps):
                        self.watch_app(app, self.tracing_conf_path + "/" + app)
            except Exception as err:
                timeout = min(timeout << 1, maximum_timeout)
                listing_version = 0
                self.logger.error("error occured while subscribing for sampling updates %s. Sleep %d",
                                  err, timeout)
                yield gen.sleep(timeout)

    def get_timeout(self, name):
        """Return the timeout configured for application ``name`` or the default."""
        return self.timeouts.get(name, DEFAULT_TIMEOUT)

    def migrate_from_cache_to_inactive(self, app, name):
        """Drop ``app`` from the cache of ``name`` and schedule its disposal."""
        try:
            drop_app_from_cache(self.cache, app, name)
        except Exception as err:
            self.logger.error("app %s %s: drop cache error %s", app, name, err)

        # dispose service after 3 x timeouts
        # assume that all requests will be finished
        self.io_loop.call_later(self.get_timeout(name) * 3,
                                functools.partial(self.dispose, app, name))
        self.logger.info("app %s %s is scheduled to dispose", app, name)

    def move_to_inactive(self, app, name):
        """Build a callback that replaces ``app`` with a freshly connected instance.

        The returned coroutine connects a new ``Service`` first; only after
        that the new instance is cached and the old one is retired. If the
        connection fails the replacement is rescheduled and ``app`` stays
        in use.

        :returns: a coroutine function without arguments, suitable for
            ``IOLoop.call_later``.
        """
        @gen.coroutine
        def wrapper():
            active_apps = len(self.cache[name])
            self.logger.info("%s: preparing to moving %s %s to an inactive queue (active %d)",
                             app.id, app.name, "{0}:{1}".format(*app.address), active_apps)
            try:
                new_app = Service(name, locator=self.locator, timeout=RESOLVE_TIMEOUT)
                self.logger.info("%s: creating an instance of %s", new_app.id, name)
                yield new_app.connect()
                self.logger.info("%s: connect to an app %s endpoint %s ",
                                 new_app.id, new_app.name, "{0}:{1}".format(*new_app.address))
                timeout = (1 + random.random()) * self.refresh_period
                self.io_loop.call_later(timeout, self.move_to_inactive(new_app, name))
                # add to cache only after successfully connected
                self.cache[name].append(new_app)
            except Exception as err:
                self.logger.error("%s: unable to connect to `%s`: %s", new_app.id, name, err)
                # schedule later
                self.io_loop.call_later(self.get_timeout(name), self.move_to_inactive(app, name))
            else:
                self.logger.info("%s: move %s %s to an inactive queue",
                                 app.id, app.name, "{0}:{1}".format(*app.address))
                # current active app will be dropped here
                self.migrate_from_cache_to_inactive(app, name)
        return wrapper

    def dispose(self, app, name):
        """Disconnect a retired application instance."""
        self.logger.info("dispose service %s %s", name, app.id)
        app.disconnect()

    @context
    @gen.coroutine
    def __call__(self, request):
        """Handle one HTTP request.

        The target application and event are taken from the
        ``X-Cocaine-Service`` / ``X-Cocaine-Event`` headers when both are
        present, otherwise from the request URI. ``/ping`` reports the
        locator health status. Errors are reported to the client through
        ``fill_response_in``.
        """
        if "X-Cocaine-Service" in request.headers and "X-Cocaine-Event" in request.headers:
            request.logger.debug('dispatch by headers')
            name = request.headers['X-Cocaine-Service']
            event = request.headers['X-Cocaine-Event']
        else:
            request.logger.debug('dispatch by uri')
            match = URL_REGEX.match(request.uri)
            if match is None:
                if request.path == "/ping":
                    if self.locator_status:
                        fill_response_in(request, httplib.OK, "OK", "OK")
                    else:
                        fill_response_in(request, httplib.SERVICE_UNAVAILABLE,
                                         httplib.responses[httplib.SERVICE_UNAVAILABLE],
                                         "Failed", proxy_error_headers())
                else:
                    fill_response_in(request, httplib.NOT_FOUND,
                                     httplib.responses[httplib.NOT_FOUND],
                                     "Invalid url", proxy_error_headers())
                return

            name, event, other = match.groups()
            if name == '' or event == '':
                fill_response_in(request, httplib.BAD_REQUEST,
                                 httplib.responses[httplib.BAD_REQUEST],
                                 "Proxy invalid request", proxy_error_headers())
                return

            # Drop from query appname and event's name
            if not other.startswith('/'):
                other = "/" + other
            request.uri = other
            request.path, _, _ = other.partition("?")

        if getattr(request, "traceid", None) is not None:
            tracing_chance = self.sampled_apps.get(name, self.default_tracing_chance)
            rolled_dice = random.uniform(0, 100)
            request.logger.debug("tracing_chance %f, rolled dice %f", tracing_chance, rolled_dice)
            if tracing_chance < rolled_dice:
                request.logger.info('stop tracing the request')
                request.logger = NULLLOGGER
                request.traceid = None

        if self.sticky_header not in request.headers:
            app = yield self.get_service(name, request)
        else:
            seed = request.headers.get(self.sticky_header)
            request.logger.info('sticky_header has been found: %s', seed)
            app = yield self.get_service_with_seed(name, seed, request)

        if app is None:
            message = "current application %s is unavailable" % name
            fill_response_in(request, NO_SUCH_APP, "No Such Application",
                             message, proxy_error_headers())
            return

        try:
            yield self.process(request, name, app, event, pack_httprequest(request))
        except Exception as err:
            request.logger.error("error during processing request %s", err)
            fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                             httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                             "UID %s: %s" % (request.traceid, str(err)), proxy_error_headers())

        request.logger.info("exit from process")

    def info(self):
        """Return counters: cache sizes, request and error stats, sampling map."""
        return {'services': {'cache': dict(((k, len(v)) for k, v in self.cache.items()))},
                'requests': {'inprogress': self.requests_in_progress,
                             'total': self.requests_total},
                'errors': {'disconnections': self.requests_disconnections},
                'sampling': self.sampled_apps}

    @gen.coroutine
    def process(self, request, name, app, event, data):
        """Send ``data`` to ``event`` of ``app`` and fill the HTTP response.

        The request is tried up to two times: a disconnection triggers a
        reconnect, a restarted application or a full queue triggers a retry
        (for a full queue possibly on another cached instance). Every
        outcome is reported through ``fill_response_in``.

        :param request: the HTTP request being served.
        :param name: application name used for timeouts and messages.
        :param app: connected ``Service`` instance to start with.
        :param event: event name to enqueue.
        :param data: payload, packed with msgpack before sending.
        """
        request.logger.info("start processing event `%s` for an app `%s` (appid: %s) after %.3f ms",
                            event, app.name, app.id, request.request_time() * 1000)
        timeout = self.get_timeout(name)
        # allow to reconnect this amount of times.
        attempts = 2  # make it configurable

        parentid = 0

        if request.traceid is not None:
            traceid = int(request.traceid, 16)
            trace = Trace(traceid=traceid, spanid=traceid, parentid=parentid)
        else:
            trace = None

        while attempts > 0:
            headers = {}
            body_parts = []
            attempts -= 1
            try:
                request.logger.debug("%s: enqueue event (attempt %d)", app.id, attempts)
                channel = yield app.enqueue(event, trace=trace)
                request.logger.debug("%s: send event data (attempt %d)", app.id, attempts)
                yield channel.tx.write(msgpack.packb(data), trace=trace)
                yield channel.tx.close(trace=trace)
                request.logger.debug("%s: waiting for a code and headers (attempt %d)",
                                     app.id, attempts)
                code_and_headers = yield channel.rx.get(timeout=timeout)
                request.logger.debug("%s: code and headers have been received (attempt %d)",
                                     app.id, attempts)
                code, raw_headers = msgpack.unpackb(code_and_headers)
                headers = httputil.HTTPHeaders(raw_headers)

                # the protocol version decides how the end of a body is signalled
                cocaine_http_proto_version = headers.get(X_COCAINE_HTTP_PROTO_VERSION)
                if cocaine_http_proto_version is None or cocaine_http_proto_version == "1.0":
                    cocaine_http_proto_version = "1.0"

                    def stop_condition(body):
                        return isinstance(body, EmptyResponse)
                elif cocaine_http_proto_version == "1.1":
                    def stop_condition(body):
                        return isinstance(body, EmptyResponse) or len(body) == 0
                else:
                    raise Exception("unsupported X-Cocaine-HTTP-Proto-Version: %s" % cocaine_http_proto_version)

                while True:
                    body = yield channel.rx.get(timeout=timeout)
                    if stop_condition(body):
                        request.logger.info("%s: body finished (attempt %d)", app.id, attempts)
                        break

                    request.logger.debug("%s: received %d bytes as a body chunk (attempt %d)",
                                         app.id, len(body), attempts)
                    body_parts.append(body)
            except gen.TimeoutError as err:
                request.logger.error("%s %s: %s", app.id, name, err)
                message = "UID %s: application `%s` error: TimeoutError" % (request.traceid, name)
                fill_response_in(request, httplib.GATEWAY_TIMEOUT,
                                 httplib.responses[httplib.GATEWAY_TIMEOUT],
                                 message, proxy_error_headers())

            except (DisconnectionError, StreamClosedError) as err:
                self.requests_disconnections += 1
                # Probably it's dangerous to retry requests all the time.
                # I must find the way to determine whether it failed during writing
                # or reading a reply. And retry only writing fails.
                request.logger.error("%s: %s", app.id, err)
                if attempts <= 0:
                    request.logger.error("%s: no more attempts", app.id)
                    fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                     httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                                     "UID %s: Connection problem" % request.traceid,
                                     proxy_error_headers())
                    return

                # Seems on_close callback is not called in case of connecting through IPVS
                # We detect disconnection here to avoid unnecessary errors.
                # Try to reconnect here and give the request a go
                try:
                    start_time = time.time()
                    # spend on reconnecting only what is left of the request budget
                    reconn_timeout = timeout - request.request_time()
                    request.logger.info("%s: connecting with timeout %.fms",
                                        app.id, reconn_timeout * 1000)
                    yield gen.with_timeout(start_time + reconn_timeout,
                                           app.connect(request.traceid))
                    reconn_time = time.time() - start_time
                    request.logger.info("%s: connecting took %.3fms", app.id, reconn_time * 1000)
                except Exception as err:
                    if attempts <= 0:
                        # we have no attempts more, so quit here
                        request.logger.error("%s: %s (no attempts left)", app.id, err)
                        message = "UID %s: application `%s` error: %s" % (request.traceid, name, str(err))
                        fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                         httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                                         message, proxy_error_headers())
                        return

                    # the format string has three placeholders: app.id goes first
                    request.logger.error("%s: unable to reconnect: %s (%d attempts left)",
                                         app.id, err, attempts)
                # We have an attempt to process request again.
                # Jump to the begining of `while attempts > 0`, either we connected successfully
                # or we were failed to connect
                continue

            except ServiceError as err:
                # if the application has been restarted, we get broken pipe code
                # and system category
                if err.category in SYSTEMCATEGORY and err.code == EAPPSTOPPED:
                    request.logger.error("%s: the application has been restarted", app.id)
                    app.disconnect()
                    continue

                elif err.category in OVERSEERCATEGORY and err.code == EQUEUEISFULL:
                    request.logger.error("%s: queue is full. Pick another application instance", app.id)
                    cache_size = len(self.cache[app.name])
                    if cache_size < self.spool_size:
                        request.logger.info("spool is not full. Create a new application instance")
                        new_app = yield self.get_service(app.name, request)
                        # get_service yields None when it fails to connect;
                        # keep the current instance instead of retrying on None
                        if new_app is not None:
                            app = new_app
                    elif cache_size == 1:
                        # NOTE: if we have spool_size 1, the same app will be picked
                        # Probably we can create a new one and mark the old one inactive
                        request.logger.warning("spool size is limited by 1, cannot pick a new instance of th app. Use the old one")
                        # pass
                    else:
                        request.logger.info("pick a random instance of the application")
                        try:
                            index = self.cache[app.name].index(app)
                            request.logger.info("the app is located in cache at pos %d", index)
                            if cache_size == 2:  # shortcut
                                picked = (index + 1) % 2
                            else:
                                picked = index
                                while picked == index:
                                    picked = random.randint(0, cache_size - 1)

                            # report the position actually picked, not the old one
                            request.logger.info("an instance at pos %d has been picked", picked)
                            app = self.cache[app.name][picked]
                        except ValueError:
                            # the current app is not cached anymore: any instance will do
                            app = random.choice(self.cache[app.name])
                    continue

                request.logger.error("%s: service error: [%d, %d] %s",
                                     app.id, err.category, err.code, err.reason)
                message = "UID %s: application `%s` error: %s" % (request.traceid, name, str(err))
                fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                 httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                                 message, proxy_error_headers())

            except Exception as err:
                request.logger.error("%s: %s", app.id, err)
                message = "UID %s: unknown `%s` error: %s" % (request.traceid, name, str(err))
                fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                                 httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                                 message, proxy_error_headers())
            else:
                message = ''.join(body_parts)
                fill_response_in(request, code,
                                 httplib.responses.get(code, httplib.OK),
                                 message, headers)
            # to return from all errors except Disconnection
            # or receiving a good reply
            return

        # The loop is left this way only when a retry path (`continue`) used
        # up the last attempt; no branch has filled a response in that case.
        request.logger.error("%s: no more attempts", app.id)
        fill_response_in(request, httplib.INTERNAL_SERVER_ERROR,
                         httplib.responses[httplib.INTERNAL_SERVER_ERROR],
                         "UID %s: application `%s` error: no more attempts" % (request.traceid, name),
                         proxy_error_headers())

    @gen.coroutine
    def get_service(self, name, request):
        """Return a connected instance of application ``name``.

        While the cache of the application holds fewer than ``spool_size``
        instances a new one is created, connected and scheduled for a
        periodic refresh; otherwise a random cached instance is returned.

        :returns: a ``Service`` instance, or ``None`` when a new instance
            could not be connected.
        """
        # cache isn't full for the current application
        if len(self.cache[name]) < self.spool_size:
            logger = request.logger
            try:
                app = Service(name, locator=self.locator, timeout=RESOLVE_TIMEOUT)
                logger.info("%s: creating an instance of %s", app.id, name)
                self.cache[name].append(app)
                yield app.connect(request.traceid)
                logger.info("%s: connect to an app %s endpoint %s ",
                            app.id, app.name, "{0}:{1}".format(*app.address))

                # randomize the refresh moment within [1x, 2x) of the period
                timeout = (1 + random.random()) * self.refresh_period
                self.io_loop.call_later(timeout, self.move_to_inactive(app, name))
            except Exception as err:
                logger.error("%s: unable to connect to `%s`: %s", app.id, name, err)
                drop_app_from_cache(self.cache, app, name)
                raise gen.Return()
            else:
                raise gen.Return(app)

        # get an instance from cache
        chosen = random.choice(self.cache[name])
        raise gen.Return(chosen)

    @gen.coroutine
    def get_service_with_seed(self, name, seed, request):
        """Return a new connected instance of ``name`` created with ``seed``.

        The instance is not put into the cache.

        :returns: a ``Service`` instance, or ``None`` when it could not be
            connected.
        """
        logger = request.logger
        app = Service(name, seed=seed, locator=self.locator)
        try:
            logger.info("%s: creating an instance of %s, seed %s", app.id, name, seed)
            yield app.connect(request.traceid)
        except Exception as err:
            logger.error("%s: unable to connect to `%s`: %s", app.id, name, err)
            raise gen.Return()

        raise gen.Return(app)
def test_service_attribute_error():
    """Invoke a made-up method on a Locator and wait for its result.

    NOTE(review): judging by the test name the call is expected to end in
    an attribute error; that expectation is not expressed in this function
    itself - confirm where it is asserted.
    """
    loop = CocaineIO.instance()
    host, port = "localhost", 10053
    Locator(host, port, loop=loop).random_attribute().get()