def healthcheck():
    """Report this node's health and, on a server POST, record its heartbeat.

    When the request method is ``POST`` and ``g.source`` is a ``Server``, the
    JSON body's ``heartbeat`` is parsed with ``defaults.DATETIME_FORMAT`` and
    stored in the cluster manager under the body's ``me`` value.

    The response always contains ``version``, ``catalog_version``, an empty
    ``services`` list, ``server``, ``neighbours``, ``cluster`` and ``now``.
    When ``check_param_in_uri('human')`` is truthy, servers are reported by
    name and sorted; otherwise they are reported as id/name dictionaries and
    the cluster lists are returned as given by the cluster manager.

    Returns:
        dict with the health information described above.

    Raises:
        errors.InvalidDateFormat: the received heartbeat cannot be parsed.
    """
    if request.method == 'POST' and isinstance(g.source, Server):
        data = request.get_json()
        raw_heartbeat = data['heartbeat']
        try:
            heartbeat = dt.datetime.strptime(raw_heartbeat,
                                             defaults.DATETIME_FORMAT)
        except (TypeError, ValueError):
            # TypeError: value is not a string; ValueError: wrong format.
            # Anything else (e.g. KeyboardInterrupt) must not be swallowed.
            raise errors.InvalidDateFormat(raw_heartbeat,
                                           defaults.DATETIME_FORMAT)
        current_app.dm.cluster_manager.put(data['me'], heartbeat)

    catalog_ver = Catalog.max_catalog()
    data = {
        "version": dimensigon.__version__,
        "catalog_version": catalog_ver.strftime(defaults.DATEMARK_FORMAT) if catalog_ver else None,
        "services": [],
    }

    if not check_param_in_uri('human'):
        # machine readable answer: identifiers are kept
        server = {'id': str(g.server.id), 'name': g.server.name}
        neighbours = [{'id': str(s.id), 'name': s.name}
                      for s in Server.get_neighbours()]
        cluster = {
            'alive': current_app.dm.cluster_manager.get_alive(),
            'in_coma': current_app.dm.cluster_manager.get_zombies(),
        }
    else:
        # human readable answer: names only, falling back to the raw id when
        # the server cannot be found
        server = g.server.name
        neighbours = sorted([s.name for s in Server.get_neighbours()])
        cluster = {
            'alive': sorted([
                getattr(Server.query.get(i), 'name', i)
                for i in current_app.dm.cluster_manager.get_alive()
            ]),
            'in_coma': sorted([
                getattr(Server.query.get(i), 'name', i)
                for i in current_app.dm.cluster_manager.get_zombies()
            ]),
        }

    data.update(server=server,
                neighbours=neighbours,
                cluster=cluster,
                now=get_now().strftime(defaults.DATETIME_FORMAT))
    return data
def cluster_in(server_id):
    """Register ``server_id`` as an alive member of the cluster.

    Only the ``root`` user may call it. The JSON body must carry a
    ``keepalive`` formatted with ``defaults.DATEMARK_FORMAT`` and a ``routes``
    entry, which is handed to the route manager.

    Args:
        server_id: identifier of the server joining the cluster.

    Returns:
        A ``(payload, 200)`` tuple; the payload holds the current ``cluster``
        and the ids of this node's ``neighbours``.

    Raises:
        errors.InvalidDateFormat: ``keepalive`` is missing, is not a string or
            does not match ``defaults.DATEMARK_FORMAT``.
        errors.UserForbiddenError: the current user is not ``root``.
    """
    user = User.get_current()
    data = request.get_json()
    if not (user and user.name == 'root'):
        raise errors.UserForbiddenError

    raw_keepalive = data.get('keepalive')
    try:
        keepalive = dt.datetime.strptime(raw_keepalive,
                                         defaults.DATEMARK_FORMAT)
    except (TypeError, ValueError):
        # strptime raises TypeError (not ValueError) when the value is None or
        # not a string; report both as an invalid date.
        raise errors.InvalidDateFormat(raw_keepalive, defaults.DATEMARK_FORMAT)

    current_app.dm.cluster_manager.put(server_id, keepalive)
    _cluster_logger.debug(
        f"{getattr(Server.query.get(server_id), 'name', server_id) or server_id} is a new alive server"
    )
    current_app.dm.route_manager.new_node_in_cluster(server_id, data['routes'])
    return {
        'cluster': current_app.dm.cluster_manager.get_cluster(defaults.DATEMARK_FORMAT),
        'neighbours': [s.id for s in Server.get_neighbours()],
    }, 200
async def _async_set_current_neighbours(
        neighbours: t.List[Server] = None,
        changed_routes: t.Dict[Server, RouteContainer] = None) -> t.List[Server]:
    """Check the gates of every neighbour and store the resulting route.

    Args:
        neighbours: servers to check. Defaults to ``Server.get_neighbours()``.
        changed_routes: optional dict which is populated, per server, with the
            route that has been set.

    Returns:
        list of servers which are not neighbours anymore (their gate check
        returned ``None``).
    """
    lost = []
    if neighbours is None:
        neighbours = Server.get_neighbours()
    if not neighbours:
        return lost

    outcomes = await asyncio.gather(
        *[async_check_gates(server) for server in neighbours])

    for outcome, server in zip(outcomes, neighbours):
        if isinstance(outcome, RouteContainer):
            new_route = outcome
        elif outcome is None:
            # no gate answered: the server is no longer directly reachable
            lost.append(server)
            new_route = RouteContainer(None, None, None)
        else:
            # any other result leaves the current route untouched
            continue
        server.set_route(new_route)
        if changed_routes is not None:
            changed_routes[server] = new_route
    return lost
def _notify_cluster_out(self):
    """Tell every neighbour that this server is leaving the cluster.

    Posts a ``death`` timestamp to ``api_1_0.cluster_out`` on all neighbours
    in parallel. Failed responses are reported as warnings only when the
    logger level is ``DEBUG`` or lower.
    """
    with self.dm.flask_app.app_context():
        neighbours = Server.get_neighbours()
        if not neighbours:
            self.logger.debug("No server to send shutdown information")
            return

        names = ', '.join([s.name for s in neighbours])
        self.logger.debug(f"Sending shutdown to {names}")

        request_all = ntwrk.parallel_requests(
            neighbours,
            'post',
            view_or_url='api_1_0.cluster_out',
            view_data=dict(server_id=str(Server.get_current().id)),
            json={'death': get_now().strftime(defaults.DATEMARK_FORMAT)},
            timeout=2,
            auth=get_root_auth())
        responses = asyncio.run(request_all)

        if self.logger.level > logging.DEBUG:
            return
        for r in responses:
            if r.ok:
                continue
            self.logger.warning(f"Unable to send data to {r.server}: {r}")
def test_get_neighbours_no_route(self):
    """Without any route defined, the current server has no neighbours."""
    other = Server('n1', port=8000)
    current = Server('me', port=8000, me=True)
    db.session.add_all([other, current])

    found = current.get_neighbours()

    self.assertListEqual([], found)
def test_get_neighbours(self):
    """Neighbours are listed and ``exclude`` accepts a server, servers or ids."""
    n1 = Server('n1', port=8000)
    n2 = Server('n2', port=8000)
    n3 = Server('n3', port=8000)
    r1 = Server('r1', port=8000)

    # routes are created for n1 and n2 (and for r1 through n1); n3 gets none
    Route(destination=n1, cost=0)
    Route(destination=n2, proxy_server_or_gate=n2.gates[0])
    Route(destination=r1, proxy_server_or_gate=n1, cost=1)

    me = Server('me', port=8000, me=True)
    db.session.add_all([n1, n2, n3, r1, me])

    self.assertListEqual([n1, n2], me.get_neighbours())

    # the same result is expected for every supported form of ``exclude``
    for excluded in (n1, [n1, n3], [n1.id, n3.id]):
        self.assertListEqual([n2], me.get_neighbours(exclude=excluded))
async def _async_get_neighbour_healthcheck(
        self, cluster_heartbeat_id: str = None) -> t.Dict[Server, dict]:
    """Post a heartbeat to every neighbour and collect their healthchecks.

    Args:
        cluster_heartbeat_id: heartbeat value to send. Defaults to the current
            time formatted with ``defaults.DATETIME_FORMAT``.

    Returns:
        dict mapping each neighbour that answered properly to its response
        message. Servers that failed, or that answered with a different server
        id than expected, are logged as warnings and left out.
    """
    healthy = {}
    neighbours = Server.get_neighbours()
    self.logger.debug(
        f"Neighbour servers to check: {', '.join([s.name for s in neighbours])}"
    )
    auth = get_root_auth()
    if cluster_heartbeat_id is None:
        cluster_heartbeat_id = get_now().strftime(defaults.DATETIME_FORMAT)

    pending = [
        ntwrk.async_post(neighbour,
                         'root.healthcheck',
                         json={
                             'me': self.dm.server_id,
                             'heartbeat': cluster_heartbeat_id
                         },
                         auth=auth) for neighbour in neighbours
    ]
    answers = await asyncio.gather(*pending)

    for server, resp in zip(neighbours, answers):
        if not resp.ok:
            self.logger.warning(
                f"Unable to get Healthcheck from server {server.name}: {resp}"
            )
            continue

        reported_id = resp.msg.get('server', {}).get('id', '')
        if reported_id and str(server.id) != reported_id:
            # the node that answered is not the one we expected to reach
            mismatch = HealthCheckMismatch(
                expected={'id': str(server.id), 'name': server.name},
                actual=resp.msg.get('server', {}))
            self.logger.warning(str(mismatch))
            continue

        healthy[server] = resp.msg
    return healthy
def _notify_cluster_in(self):
    """Announce this server to its neighbours once routes are initiated.

    Waits up to 120 seconds for ``self._route_initiated`` and then sends a
    'Cluster IN' message (keepalive plus the local route table) to the
    neighbours. The server stored in the ``new_gates_server`` parameter, or
    otherwise the join server, is contacted first. Cluster data returned by a
    neighbour is stored through ``self.put_many`` and the neighbours it
    reports are skipped afterwards.
    """
    from dimensigon.domain.entities import Server
    import dimensigon.web.network as ntwrk
    from dimensigon.domain.entities import Parameter

    try:
        signaled = self._route_initiated.wait(timeout=120)
    except Exception:
        return
    if not signaled:
        self.logger.warning("Route Event not fired.")

    self.logger.debug("Notify Cluster")
    with self.dm.flask_app.app_context():
        # ids (as strings) of servers that do not need to be notified
        not_notify = set()
        me = Server.get_current()
        routes = [
            r.to_json() for r in Route.query.options(
                orm.lazyload(Route.destination), orm.lazyload(Route.gate),
                orm.lazyload(Route.proxy_server)).all()
        ]
        neighbours = Server.get_neighbours()
        if Parameter.get('join_server', None):
            join_server = Server.query.get(Parameter.get('join_server'))
        else:
            join_server = None
        now = get_now()
        msg = dict(keepalive=now.strftime(defaults.DATEMARK_FORMAT),
                   routes=routes)

        if neighbours:
            random.shuffle(neighbours)
            # looked up once instead of once per neighbour
            new_gates_server_id = Parameter.get('new_gates_server', None)
            first = [s for s in neighbours if s.id == new_gates_server_id]
            if first:
                neighbours.pop(neighbours.index(first[0]))
                neighbours = first + neighbours
            elif join_server in neighbours:
                neighbours.pop(neighbours.index(join_server))
                neighbours = [join_server] + neighbours

            for s in neighbours:
                # ids coming back in the JSON answer are compared as strings
                # so the check works whatever type ``s.id`` has
                if str(s.id) in not_notify:
                    self.logger.debug(
                        f"Skiping server {s} from sending 'Cluster IN' message"
                    )
                    continue

                self.logger.debug(f"Sending 'Cluster IN' message to {s}")
                resp = ntwrk.post(s,
                                  'api_1_0.cluster_in',
                                  view_data=dict(server_id=str(me.id)),
                                  json=msg,
                                  timeout=10,
                                  auth=get_root_auth())
                if not resp.ok:
                    self.logger.debug(
                        f"Unable to send 'Cluster IN' message to {s} . Response: {resp}"
                    )
                    continue

                converted = []
                for ident, str_keepalive, death in resp.msg['cluster']:
                    try:
                        keepalive = dt.datetime.strptime(
                            str_keepalive, defaults.DATEMARK_FORMAT)
                    except ValueError:
                        # entries with an unparsable keepalive are ignored
                        continue
                    converted.append((ident, keepalive, death))
                self.put_many(converted)
                not_notify.update(
                    str(ident) for ident in resp.msg.get('neighbours', []))
        else:
            self.logger.debug("No neighbour to send 'Cluster IN'")
    self.logger.debug("Notify Cluster ended")
def _send_data(self):
    """Send the buffered cluster changes to every neighbour.

    The buffer is drained under ``self._change_buffer_lock`` and posted to
    ``api_1_0.cluster`` on all neighbours. If sending raises, the drained
    entries are put back (entries that arrived meanwhile take precedence).
    When the buffer holds data after the attempt, a one second timer is
    scheduled to run this method again; otherwise ``self._timer`` is cleared.
    The session created for the run is always closed.
    """
    session = self.Session()

    def log_data(data):
        # human friendly dump of the entries, resolving server names
        debug_data = []
        for cr in data:
            server = dict(id=cr.id)
            name = getattr(session.query(Server).get(cr.id), 'name', cr.id)
            if name:
                server.update(name=name)
            debug_data.append({
                'server': server,
                'keepalive': cr.keepalive.strftime(defaults.DATEMARK_FORMAT),
                'death': cr.death
            })
        return debug_data

    try:
        # time to send data
        with self.dm.flask_app.app_context():
            neighbours = Server.get_neighbours(session=session)
            if not neighbours:
                self.logger.debug(
                    f"No neighbour servers to send cluster information")
                with self._change_buffer_lock:
                    self._timer = None
                return

            with self._change_buffer_lock:
                temp_buffer = dict(self._buffer)
                self._buffer.clear()

            self.logger.debug(
                f"Sending cluster information to the following nodes: {', '.join([s.name for s in neighbours])}"
            )
            # building the dump runs one query per entry: only do it when the
            # record is really going to be emitted
            if self.logger.isEnabledFor(1):
                self.logger.log(
                    1,
                    f"{json.dumps(log_data(temp_buffer.values()), indent=2)}")

            auth = get_root_auth()
            try:
                responses = asyncio.run(
                    ntwrk.parallel_requests(
                        neighbours,
                        'POST',
                        view_or_url='api_1_0.cluster',
                        json=[{
                            'id': e.id,
                            'keepalive': e.keepalive.strftime(defaults.DATEMARK_FORMAT),
                            'death': e.death
                        } for e in temp_buffer.values()],
                        auth=auth,
                        securizer=False), )
            except Exception as e:
                self.logger.error(
                    f"Unable to send cluster information to neighbours: {format_exception(e)}"
                )
                # restore data with new data arrived
                with self._change_buffer_lock:
                    # plain update: ``**`` unpacking would fail on keys that
                    # are not strings
                    temp_buffer.update(self._buffer)
                    self._buffer.clear()
                    self._buffer.update(temp_buffer)
            else:
                for r in responses:
                    if not r.ok:
                        self.logger.warning(
                            f"Unable to send data to {r.server}: {r}")

            # check if new data arrived during timer execution
            with self._change_buffer_lock:
                if self._buffer:
                    self._timer = threading.Timer(interval=1,
                                                  function=self._send_data)
                    self._timer.start()
                else:
                    self._timer = None
    finally:
        # never leak the session, even when something above raises
        session.close()
async def _send_routes(self, exclude=None):
    """Send the changed routes to every neighbour.

    Args:
        exclude: a server, a server id, or an iterable of them. Their ids are
            added, together with the ids of the notified neighbours, to the
            ``exclude`` list sent in the message.

    The excluded nodes are resolved independently of the logger level, so the
    message sent is the same whether or not debug logging is active.
    Failures are logged as warnings and ``self._changed_routes`` is cleared
    at the end.
    """
    servers = Server.get_neighbours(session=self.session)
    msg, debug_msg = self._format_routes_message(self._changed_routes)

    # resolve excluded nodes; when an id cannot be resolved keep the raw id so
    # it is still propagated in ``exclude``
    c_exclude = []
    if exclude:
        requested = exclude if is_iterable_not_string(exclude) else [exclude]
        for item in requested:
            if isinstance(item, Server):
                c_exclude.append(item)
            else:
                found = self.session.query(Server).get(item)
                c_exclude.append(found if found is not None else item)

    if self.logger.isEnabledFor(logging.DEBUG):
        if exclude:
            log_msg = f" (Excluded nodes: {', '.join([str(getattr(e, 'name', e)) for e in c_exclude])}):"
        else:
            log_msg = ''

        if servers:
            log_msg = f"Sending route information to the following nodes: {', '.join([s.name for s in servers])} " \
                      f"{log_msg}{json.dumps(debug_msg, indent=2)}"
        else:
            log_msg = f"No servers to send new routing information:{log_msg}{json.dumps(debug_msg, indent=2)}"

        if debug_msg:
            log_msg += '\n' + json.dumps(debug_msg, indent=2)

        if debug_msg and (servers or exclude):
            self.logger.debug(log_msg)

    exclude_ids = list(
        {s.id for s in servers}.union(
            getattr(e, 'id', e) for e in c_exclude))

    auth = get_root_auth()
    aw = [
        ntwrk.async_patch(s,
                          view_or_url='api_1_0.routes',
                          json={
                              'server_id': self.server.id,
                              'route_list': msg,
                              'exclude': exclude_ids
                          },
                          auth=auth) for s in servers
    ]

    rs = await asyncio.gather(*aw, return_exceptions=True)

    for r, s in zip(rs, servers):
        if isinstance(r, Exception):
            self.logger.warning(
                f"Error while trying to send route data to node {s}: "
                f"{format_exception(r)}")
        elif not r.ok:
            if r.exception:
                self.logger.warning(
                    f"Error while trying to send route data to node {s}: "
                    f"{format_exception(r.exception)}")
            else:
                self.logger.warning(
                    f"Error while trying to send route data to node {s}: {r}"
                )
    self._changed_routes.clear()
async def _async_refresh_route_table(
        self,
        discover_new_neighbours=False,
        check_current_neighbours=False,
        max_num_discovery=None) -> t.Dict[Server, RouteContainer]:
    """Gets route tables of all neighbours and updates its own table.

    Needs a Flask App Context to run.

    Parameters
    ----------
    discover_new_neighbours:
        tries to discover new neighbours
    check_current_neighbours:
        checks if current neighbours are still neighbours
    max_num_discovery:
        maximum number of possible nodes to check as neighbour

    Returns
    -------
    dict
        routes changed during the refresh, keyed by server
    """
    self.logger.debug('Refresh Route Table')
    neighbours = Server.get_neighbours(session=self.session)
    not_neighbours = Server.get_not_neighbours(session=self.session)

    changed_routes: t.Dict[Server, RouteContainer] = {}
    not_neighbours_anymore = []
    new_neighbours = []
    pending = []

    # -- schedule the check of current neighbours
    will_check = bool(check_current_neighbours and neighbours)
    if will_check:
        self.logger.debug("Checking current neighbours: " +
                          ', '.join([str(s) for s in neighbours]))
        pending.append(
            _async_set_current_neighbours(neighbours, changed_routes))
    elif check_current_neighbours:
        self.logger.debug("No neighbour to check")

    # -- schedule the discovery of new neighbours
    will_discover = bool(discover_new_neighbours
                         and not_neighbours[:max_num_discovery])
    if will_discover:
        shuffled = list(not_neighbours)
        random.shuffle(shuffled)
        target = sorted(shuffled[:max_num_discovery], key=lambda s: s.name)
        self.logger.debug(
            f"Checking new neighbours{f' (limited to {max_num_discovery})' if max_num_discovery else ''}: "
            + ', '.join([str(s) for s in target]))
        pending.append(_async_discover_new_neighbours(target, changed_routes))
    elif discover_new_neighbours:
        self.logger.debug("No new neighbours to check")

    # results come back in the same order the tasks were scheduled
    results = iter(await asyncio.gather(*pending, return_exceptions=False))

    if will_check:
        not_neighbours_anymore = next(results)
        if not_neighbours_anymore:
            self.logger.info(
                "Lost direct connection to the following nodes: " +
                ', '.join([str(s) for s in not_neighbours_anymore]))

    if will_discover:
        new_neighbours = next(results)
        if new_neighbours:
            self.logger.info('New neighbours found: ' +
                             ', '.join([str(s) for s in new_neighbours]))
        else:
            self.logger.debug("No new neighbours found")

    # remove routes whose proxy_server is a node that is not a neighbour
    unreachable = set(not_neighbours).union(set(not_neighbours_anymore))
    stale_routes = self.session.query(Route).filter(
        Route.proxy_server_id.in_([s.id for s in list(unreachable)]))
    empty_route = RouteContainer(None, None, None)
    for route in stale_routes.all():
        route.set_route(empty_route)
        changed_routes[route.destination] = empty_route
    self.session.commit()

    # update neighbour list
    neighbours = list(
        set(neighbours).union(set(new_neighbours)) -
        set(not_neighbours_anymore))
    if not neighbours:
        return changed_routes

    self.logger.debug(
        f"Getting routing tables from {', '.join([str(s) for s in neighbours])}"
    )
    responses = await asyncio.gather(*[
        ntwrk.async_get(server, 'api_1_0.routes', auth=get_root_auth())
        for server in neighbours
    ])
    merged = self._route_table_merge(dict(zip(neighbours, responses)))
    changed_routes.update(merged)
    return changed_routes
def _route_table_merge(self, data: t.Dict[Server, ntwrk.Response]):
    """Merge the route tables received from neighbours into the local table.

    For every response with code 200, each route that does not target this
    server, is not proxied by it and does not use one of its gates becomes a
    candidate route through the answering server (cost + 1), or a "no route"
    candidate when the neighbour reports no cost. For every destination that
    is not a neighbour the cheapest candidate is selected and stored if it
    differs from the current route.

    Args:
        data: responses of the ``routes`` request keyed by the queried server.

    Returns:
        dict with the routes that changed, keyed by destination server.
    """
    changed_routes: t.Dict[Server, RouteContainer] = {}
    temp_table_routes: t.Dict[uuid.UUID, t.List[RouteContainer]] = {}

    # invariant for the whole merge: computed once instead of once per route
    own_id = self.server.id
    own_gate_ids = {gate.id for gate in self.server.gates}

    for s, resp in data.items():
        if resp.code != 200:
            self.logger.error(
                f"Error while connecting with {s}. Error: {resp}")
            continue

        server_id = resp.msg.get('server_id', None) or resp.msg.get('server').get('id')
        likely_proxy_server_entity = self.session.query(Server).get(server_id)
        for route_json in resp.msg['route_list']:
            route_json = convert(route_json)
            if route_json.destination_id == own_id \
                    or route_json.proxy_server_id == own_id \
                    or route_json.gate_id in own_gate_ids:
                # routes pointing to, or going through, this server are useless
                continue

            candidates = temp_table_routes.setdefault(
                route_json.destination_id, [])
            if route_json.cost is not None:
                # reaching the destination through the neighbour costs one
                # extra jump
                candidates.append(
                    RouteContainer(likely_proxy_server_entity.id, None,
                                   route_json.cost + 1))
            else:
                # remove a routing if gateway cannot reach the destination
                candidates.append(
                    RouteContainer(route_json.proxy_server_id, None, None))

    # Select new routes based on neighbour routes
    neighbour_ids = {
        s.id for s in Server.get_neighbours(session=self.session)
    }
    for destination_id, candidates in temp_table_routes.items():
        if destination_id in neighbour_ids:
            continue

        route = self.session.query(Route).filter_by(
            destination_id=destination_id).one_or_none()
        if not route:
            server = self.session.query(Server).get(destination_id)
            if not server:
                # unknown destination: nothing to route to
                continue
            route = Route(destination=server)

        if not candidates:
            continue
        # first cheapest candidate (same pick as a stable sort would give)
        min_route = min(candidates, key=lambda x: x.cost or MAX_COST)
        proxy_server: Server = self.session.query(Server).get(
            min_route.proxy_server)
        cost = min_route.cost
        if route.proxy_server != proxy_server or route.cost != cost:
            rc = RouteContainer(proxy_server, None, cost)
            route.set_route(rc)
            changed_routes[route.destination] = rc
            self.session.add(route)

    return changed_routes
def bootstrap(self):
    """Bootstrap the application. Gunicorn is still not listening on sockets.

    Resets the lockers and compares the gates derived from the configured
    ``bind`` with the gates of the current server. New gates are sent to a
    neighbour (the join server first) for up to 900 seconds:

    * a 409 answer retries the same server, up to 3 times, before moving on;
    * a 500 answer moves on to the previous server of the list right away;
    * any other failure retries the same server.

    When there is no neighbour at all and this is the only known server, the
    gates are created locally.

    Raises:
        RuntimeError: no server is set as the current one.
    """
    with self.app_context():
        from dimensigon.domain.entities import Server, Parameter
        import dimensigon.web.network as ntwrk
        from dimensigon.domain.entities import Locker

        # reset scopes
        Locker.set_initial(unlock=True)

        # check gates
        me = Server.get_current()
        if me is None:
            raise RuntimeError("No server set as 'current'")

        input_gates = bind2gate(self.dm.config.http_conf.get('bind'))
        current_gates = [(gate.dns or str(gate.ip), gate.port)
                         for gate in me.gates]
        new_gates = set(input_gates).difference(set(current_gates))
        self.server_id_with_new_gates = None

        if new_gates:
            if Parameter.get('join_server'):
                join_server = Server.query.get(Parameter.get('join_server'))
            else:
                join_server = None

            servers = Server.get_neighbours()
            if join_server in servers:
                # the join server goes last: it is the first one to be tried
                servers.pop(servers.index(join_server))
                servers.append(join_server)
            else:
                self.logger.warning(
                    f'Join server {join_server} is not a neighbour')

            start = time.time()
            resp = None
            # target and retry counter must survive between iterations,
            # otherwise the retry limit and the server change never apply
            server = servers[-1] if servers else None
            server_retries = 0
            while server is not None and (time.time() - start) < 900:
                self.logger.debug(
                    f"Sending new gates {new_gates} to {server}...")
                resp = ntwrk.patch(
                    server,
                    'api_1_0.serverresource',
                    view_data=dict(server_id=str(Server.get_current().id)),
                    json={
                        'gates': [{
                            'dns_or_ip': ip,
                            'port': port
                        } for ip, port in new_gates]
                    },
                    timeout=60,
                    auth=get_root_auth())
                if resp.ok:
                    self.logger.debug("New gates created succesfully")
                    Parameter.set('new_gates_server', server.id)
                    break

                self.logger.debug(
                    f"Unable to send new gates to {server}. Reason: {resp}")
                self.logger.info(
                    f"Unable to create new gates. Trying to send again in 5 seconds..."
                )
                time.sleep(5)

                change_server = False
                if resp.code == 409:
                    # try with the same server
                    server_retries += 1
                    change_server = server_retries == 3
                elif resp.code == 500:
                    # try with another server
                    change_server = True

                if change_server:
                    i = servers.index(server) - 1
                    server = servers[i] if i >= 0 else None
                    server_retries = 0

            if not servers:
                if Server.query.count() == 1:
                    self.logger.info(
                        f"Creating new gates {new_gates} without performing a lock on catalog"
                    )
                    for gate in new_gates:
                        new_gate = me.add_new_gate(gate[0], gate[1])
                        db.session.add(new_gate)
            else:
                if resp and not resp.ok:
                    self.logger.warning(
                        f"Remote servers may not connect with {me}. ")
        db.session.commit()