def _notify_cluster_out(self):
    """Announce to all neighbour servers that this node is going down.

    Runs inside the Flask application context. A ``death`` timestamp is
    posted in parallel to the ``api_1_0.cluster_out`` view of every
    neighbour; failed deliveries are reported as warnings, but only while
    the logger level is DEBUG or lower.
    """
    with self.dm.flask_app.app_context():
        neighbours = Server.get_neighbours()
        if not neighbours:
            self.logger.debug("No server to send shutdown information")
            return

        names = ', '.join([s.name for s in neighbours])
        self.logger.debug(f"Sending shutdown to {names}")

        me = {'server_id': str(Server.get_current().id)}
        payload = {'death': get_now().strftime(defaults.DATEMARK_FORMAT)}
        request = ntwrk.parallel_requests(neighbours,
                                          'post',
                                          view_or_url='api_1_0.cluster_out',
                                          view_data=me,
                                          json=payload,
                                          timeout=2,
                                          auth=get_root_auth())
        responses = asyncio.run(request)

        # Delivery failures are only reported when debugging is enabled.
        if self.logger.level > logging.DEBUG:
            return
        for failed in (r for r in responses if not r.ok):
            self.logger.warning(
                f"Unable to send data to {failed.server}: {failed}")
def background_new_gate():
    """Ask the remote node to register hidden gates pointing at ``remote_addr``.

    One PATCH to ``api_1_0.serverresource`` is attempted per distinct port
    of ``server.gates`` until one of them answers OK. If none succeeds (or
    there is no port to try) an error is logged, including the last
    response when there is one.

    ``remote_addr``, ``server`` and ``logger`` are taken from the
    enclosing scope, which is not visible from this block.
    """
    # Build the port set once so the payload and the attempts agree.
    ports = {gate.port for gate in server.gates}
    gates = [
        dict(dns_or_ip=remote_addr, port=port, hidden=True) for port in ports
    ]

    resp = None
    for port in ports:
        resp = ntwrk.patch(f"{remote_addr}:{port}",
                           'api_1_0.serverresource',
                           view_data=dict(server_id=server.id),
                           json=dict(gates=gates),
                           auth=get_root_auth(),
                           timeout=30)
        if resp.ok:
            break
    else:
        # The conditional must only guard the "Reason" suffix. Applied to the
        # whole concatenated literal it produced an empty error message
        # whenever there was no (truthy) response.
        reason = f" Reason: {resp}" if resp is not None else ""
        logger.error(
            f"Unable to create external gate {server}->{remote_addr}.{reason}")
def _update_catalog_from_server(self, server):
    """Fetch the catalog delta from ``server`` and apply it locally.

    The request and the update both happen while holding the UPGRADE lock
    scope over ``self.server``.

    :param server: node whose ``api_1_0.catalog`` view is queried.
    :raises CatalogFetchError: when the response has no code or a code
        outside the 2xx range.
    """
    with lock_scope(Scope.UPGRADE, [self.server]):
        data_mark = self.catalog_ver.strftime(defaults.DATEMARK_FORMAT)
        resp = ntwrk.get(server,
                         'api_1_0.catalog',
                         view_data={'data_mark': data_mark},
                         auth=get_root_auth())

        succeeded = bool(resp.code) and 199 < resp.code < 300
        if not succeeded:
            raise CatalogFetchError(resp)

        self.db_update_catalog(resp.msg)
async def _async_get_neighbour_healthcheck(
        self,
        cluster_heartbeat_id: str = None) -> t.Dict[Server, dict]:
    """Collect the healthcheck answer of every neighbour server.

    All neighbours are queried concurrently through ``root.healthcheck``.

    :param cluster_heartbeat_id: heartbeat identifier sent to the peers;
        defaults to the current time formatted with
        ``defaults.DATETIME_FORMAT``.
    :return: mapping of server to its healthcheck message. Servers that
        failed to answer, or that reported an id different from the
        expected one, are left out (a warning is logged for each).
    """
    servers = Server.get_neighbours()
    self.logger.debug(
        f"Neighbour servers to check: {', '.join([s.name for s in servers])}"
    )
    auth = get_root_auth()
    if cluster_heartbeat_id is None:
        cluster_heartbeat_id = get_now().strftime(defaults.DATETIME_FORMAT)

    def probe(target):
        # A fresh payload per request, exactly one coroutine per neighbour.
        body = {'me': self.dm.server_id, 'heartbeat': cluster_heartbeat_id}
        return ntwrk.async_post(target, 'root.healthcheck', json=body,
                                auth=auth)

    responses = await asyncio.gather(*[probe(server) for server in servers])

    healthy = {}
    for server, resp in zip(servers, responses):
        if not resp.ok:
            self.logger.warning(
                f"Unable to get Healthcheck from server {server.name}: {resp}"
            )
            continue

        reported = resp.msg.get('server', {})
        reported_id = reported.get('id', '')
        if reported_id and str(server.id) != reported_id:
            mismatch = HealthCheckMismatch(expected={
                'id': str(server.id),
                'name': server.name
            },
                                           actual=reported)
            self.logger.warning(str(mismatch))
            continue

        healthy[server] = resp.msg
    return healthy
def catalog(dm: Dimensigon, ip, port, http=False):
    """Download the catalog from a remote node and load it into ``dm``.

    Creates the Flask instance and the catalog manager on ``dm``, requests
    ``api_1_0.catalog`` from ``ip:port`` with the local maximum catalog
    datamark, and hands the response message to
    ``dm.catalog_manager.catalog_update``.

    :param dm: Dimensigon instance to update.
    :param ip: address of the node to fetch the catalog from.
    :param port: port of the node to fetch the catalog from.
    :param http: use plain ``http`` instead of ``https`` when true.
    :raises SystemExit: with an explanatory message when the catalog cannot
        be fetched or applied.
    """
    # sys.exit is the programmatic API; the bare ``exit`` helper is injected
    # by the ``site`` module and is not guaranteed to exist (e.g. ``python -S``).
    import sys

    import dimensigon.dshell.network as dshell_ntwrk

    dm.create_flask_instance()
    dm.set_catalog_manager()
    with dm.flask_app.app_context():
        print("Updating catalog...")
        catalog_datamark = Catalog.max_catalog(str)
        url = dshell_ntwrk.generate_url(
            'api_1_0.catalog',
            view_data=dict(data_mark=catalog_datamark),
            ip=ip,
            port=port,
            scheme='http' if http else 'https')
        resp = dshell_ntwrk.request('get', url, auth=get_root_auth())

        if not resp.ok:
            sys.exit(f"Unable to get catalog from {resp.url}: {resp}")

        try:
            dm.catalog_manager.catalog_update(resp.msg)
        except Exception as e:
            sys.exit(f"Unable to upgrade data. Exception: {e}")
def _send_new_data(self):
    """Send newly tailed log data to the destination server of each log.

    For every tail in ``self._mapper`` that produced data and whose
    destination is currently alive, the data is zlib-compressed,
    base64-encoded and posted to ``api_1_0.logresource``. All posts run
    concurrently. On success the tail offset file is updated; on failure
    the retry counter kept in ``self._blacklist_log`` is increased and the
    entry is time-stamped as blacklisted once ``self.max_allowed_errors``
    is reached.
    """

    def topmost_dir(path):
        """Return the top-most ancestor of ``path`` (where dirname stops changing)."""
        parent = os.path.dirname(path)
        while parent != path:
            path, parent = parent, os.path.dirname(parent)
        return path

    async def gather_all(coros):
        # asyncio.run() only accepts a coroutine; asyncio.gather() returns a
        # Future, so it has to be awaited from inside one.
        return await asyncio.gather(*coros)

    self.update_mapper()
    tasks = OrderedDict()
    for log_id, pb in self._mapper.items():
        log = self.session.query(Log).get(log_id)
        for pytail in pb:
            data = pytail.fetch()
            if isinstance(data, str):
                data = data.encode()
            if not data or log.destination_server.id not in \
                    self.dm.cluster_manager.get_alive():
                continue

            # Work out the file name the destination should write to.
            if log.mode == Mode.MIRROR:
                file = pytail.file
            elif log.mode == Mode.REPO_ROOT:
                relative = os.path.relpath(pytail.file,
                                           os.path.dirname(log.target))
                file = os.path.join('{LOG_REPO}', relative)
            elif log.mode == Mode.FOLDER:
                relative = os.path.relpath(pytail.file,
                                           os.path.dirname(log.target))
                file = os.path.join(log.dest_folder, relative)
            else:
                relative = os.path.relpath(pytail.file,
                                           topmost_dir(pytail.file))
                file = os.path.join('{LOG_REPO}', relative)

            with self.dm.flask_app.app_context():
                auth = get_root_auth()

            task = ntwrk.async_post(
                log.destination_server,
                'api_1_0.logresource',
                view_data={'log_id': str(log_id)},
                json={
                    "file": file,
                    'data': base64.b64encode(
                        zlib.compress(data)).decode('ascii'),
                    "compress": True
                },
                auth=auth)
            tasks[task] = (pytail, log)
            _log_logger.debug(
                f"Task sending data from '{pytail.file}' to '{log.destination_server}' prepared"
            )

    if not tasks:
        return

    with self.dm.flask_app.app_context():
        responses = asyncio.run(gather_all(list(tasks)))

    for (pytail, log), resp in zip(tasks.values(), responses):
        if resp.ok:
            pytail.update_offset_file()
            _log_logger.debug(f"Updated offset from '{pytail.file}'")
            # NOTE(review): this condition consults self._blacklist while the
            # retry entries live in self._blacklist_log; kept as found,
            # confirm the intended container.
            if log.id not in self._blacklist:
                self._blacklist_log.pop(log.id, None)
        else:
            _log_logger.error(
                f"Unable to send log information from '{pytail.file}' to '{log.destination_server}'. Error: {resp}"
            )
            # Reuse the entry stored in the container that actually holds the
            # retry counters, so consecutive failures accumulate and a
            # missing entry cannot yield ``None``.
            bl = self._blacklist_log.get(log.id)
            if bl is None:
                bl = BlacklistEntry()
                self._blacklist_log[log.id] = bl
            bl.retries += 1
            if bl.retries >= self.max_allowed_errors:
                _log_logger.debug(
                    f"Adding server {log.destination_server.id} to the blacklist."
                )
                bl.blacklisted = time.time()
async def _send_file(self, file: File, servers: t.List[Id] = None):
    """Push the content of ``file`` to its destination servers.

    The file is read through ``self._executor``; the content is then posted
    concurrently to ``api_1_0.file_sync`` on every selected destination
    that the cluster manager reports as alive. Failures increase the retry
    counter of the ``(file.id, server.id)`` entry in ``self._blacklist``;
    successes clear that entry and copy ``file.l_mtime`` to the
    destination association. The session is committed at the end.

    :param file: file entity to synchronise.
    :param servers: optional list of server ids restricting the
        destinations; when empty or ``None`` all destinations are used.
    """
    try:
        content = await self._loop.run_in_executor(self._executor,
                                                   self._read_file,
                                                   file.target)
    except Exception:
        self.logger.exception(
            f"Unable to get content from file {file.target}.")
        return

    if servers:
        fsas = [
            fsa for fsa in file.destinations
            if fsa.destination_server.id in servers
        ]
    else:
        fsas = file.destinations

    with self.dm.flask_app.app_context():
        auth = get_root_auth()
        alive = self.dm.cluster_manager.get_alive()

        # Responses must be paired with exactly the destinations that were
        # contacted, so the alive subset is kept for the zip below.
        targets = [fsa for fsa in fsas if fsa.destination_server.id in alive]
        skipped = [
            fsa.destination_server.name for fsa in fsas
            if fsa.destination_server.id not in alive
        ]
        if skipped:
            self.logger.debug(
                f"Following servers are skipped because we do not see them alive: {', '.join(skipped)}"
            )
        if targets:
            self.logger.debug(
                f"Syncing file {file} with the following servers: {', '.join([fsa.destination_server.name for fsa in targets])}."
            )

        tasks = [
            ntwrk.async_post(fsa.destination_server,
                             view_or_url='api_1_0.file_sync',
                             view_data={'file_id': file.id},
                             json=dict(file=fsa.target,
                                       data=content,
                                       force=True),
                             auth=auth) for fsa in targets
        ]
        responses = await asyncio.gather(*tasks)

        for resp, fsa in zip(responses, targets):
            key = (file.id, fsa.destination_server.id)
            if resp.ok:
                self._blacklist.pop(key, None)
                fsa.l_mtime = file.l_mtime
                continue

            self.logger.warning(
                f"Unable to send file {file.target} to {fsa.destination_server}. Reason: {resp}"
            )
            bl = self._blacklist.get(key)
            if bl is None:
                bl = BlacklistEntry()
                self._blacklist[key] = bl
            bl.retries += 1
            if bl.retries >= self.max_allowed_errors:
                self.logger.debug(
                    f"Adding server {fsa.destination_server} to the blacklist."
                )
                bl.blacklisted = time.time()

        try:
            self.session.commit()
        except Exception:
            # Keep the best-effort behaviour but leave a trace of the failure.
            self.logger.exception(
                f"Unable to save sync state of file {file.target}.")
            self.session.rollback()
def _notify_cluster_in(self):
    """Tell the neighbours that this node has joined the cluster.

    Waits up to 120 seconds for ``self._route_initiated`` and then posts a
    keepalive plus the local routes to ``api_1_0.cluster_in`` on each
    neighbour, one at a time. The server stored in the
    ``new_gates_server`` parameter (or else the ``join_server``) is
    contacted first. Cluster data returned by a neighbour is stored with
    ``self.put_many``, and the neighbours it lists are not contacted again.
    """
    from dimensigon.domain.entities import Server
    import dimensigon.web.network as ntwrk
    from dimensigon.domain.entities import Parameter

    try:
        route_ready = self._route_initiated.wait(timeout=120)
    except Exception:
        return
    if not route_ready:
        self.logger.warning("Route Event not fired.")
    self.logger.debug("Notify Cluster")

    with self.dm.flask_app.app_context():
        already_informed = set()
        me = Server.get_current()
        route_query = Route.query.options(orm.lazyload(Route.destination),
                                          orm.lazyload(Route.gate),
                                          orm.lazyload(Route.proxy_server))
        routes = [r.to_json() for r in route_query.all()]
        neighbours = Server.get_neighbours()

        join_server = None
        if Parameter.get('join_server', None):
            join_server = Server.query.get(Parameter.get('join_server'))

        now = get_now()
        msg = {
            'keepalive': now.strftime(defaults.DATEMARK_FORMAT),
            'routes': routes
        }

        if not neighbours:
            self.logger.debug("No neighbour to send 'Cluster IN'")
        else:
            random.shuffle(neighbours)

            # Move the preferred first contact to the head of the list.
            preferred = [
                s for s in neighbours
                if s.id == Parameter.get('new_gates_server', None)
            ]
            if preferred:
                neighbours.remove(preferred[0])
                neighbours = preferred + neighbours
            elif join_server in neighbours:
                neighbours.remove(join_server)
                neighbours = [join_server] + neighbours

            for s in neighbours:
                if s.id in already_informed:
                    self.logger.debug(
                        f"Skiping server {s} from sending 'Cluster IN' message"
                    )
                    continue

                self.logger.debug(f"Sending 'Cluster IN' message to {s}")
                resp = ntwrk.post(s,
                                  'api_1_0.cluster_in',
                                  view_data={'server_id': str(me.id)},
                                  json=msg,
                                  timeout=10,
                                  auth=get_root_auth())
                if not resp.ok:
                    self.logger.debug(
                        f"Unable to send 'Cluster IN' message to {s} . Response: {resp}"
                    )
                    continue

                converted = []
                for ident, str_keepalive, death in resp.msg['cluster']:
                    try:
                        keepalive = dt.datetime.strptime(
                            str_keepalive, defaults.DATEMARK_FORMAT)
                    except ValueError:
                        # Entries with an unparsable keepalive are dropped.
                        continue
                    converted.append((ident, keepalive, death))
                self.put_many(converted)
                already_informed.update(resp.msg.get('neighbours', []))

    self.logger.debug("Notify Cluster ended")
def _send_data(self):
    """Flush the buffered cluster changes to every neighbour server.

    The content of ``self._buffer`` is taken under
    ``self._change_buffer_lock`` and posted in parallel to
    ``api_1_0.cluster``. If sending raises, the taken entries are merged
    back into the buffer (entries that arrived meanwhile win). When the
    buffer is not empty afterwards, a one-second timer is armed to run this
    method again; otherwise ``self._timer`` is reset to ``None``.

    The session created from ``self.Session`` is always closed, even when
    an unexpected error escapes.
    """
    session = self.Session()

    def log_data(data):
        """Build a readable dump of the cluster records, adding server names."""
        debug_data = []
        for cr in data:
            server = dict(id=cr.id)
            name = getattr(session.query(Server).get(cr.id), 'name', cr.id)
            if name:
                server.update(name=name)
            debug_data.append({
                'server': server,
                'keepalive': cr.keepalive.strftime(defaults.DATEMARK_FORMAT),
                'death': cr.death
            })
        return debug_data

    try:
        with self.dm.flask_app.app_context():
            neighbours = Server.get_neighbours(session=session)
            if not neighbours:
                self.logger.debug(
                    "No neighbour servers to send cluster information")
                with self._change_buffer_lock:
                    self._timer = None
                return

            with self._change_buffer_lock:
                temp_buffer = dict(self._buffer)
                self._buffer.clear()

            self.logger.debug(
                f"Sending cluster information to the following nodes: {', '.join([s.name for s in neighbours])}"
            )
            # Building the dump costs one query per record: only do it when
            # the trace level is really enabled.
            if self.logger.isEnabledFor(1):
                self.logger.log(
                    1,
                    f"{json.dumps(log_data(temp_buffer.values()), indent=2)}")

            auth = get_root_auth()
            payload = [{
                'id': e.id,
                'keepalive': e.keepalive.strftime(defaults.DATEMARK_FORMAT),
                'death': e.death
            } for e in temp_buffer.values()]
            try:
                responses = asyncio.run(
                    ntwrk.parallel_requests(neighbours,
                                            'POST',
                                            view_or_url='api_1_0.cluster',
                                            json=payload,
                                            auth=auth,
                                            securizer=False))
            except Exception as e:
                self.logger.error(
                    f"Unable to send cluster information to neighbours: {format_exception(e)}"
                )
                # Restore the unsent data, letting newly arrived entries win.
                # A positional update works for any hashable key, while
                # ``**`` unpacking only accepts string keys.
                with self._change_buffer_lock:
                    temp_buffer.update(self._buffer)
                    self._buffer.clear()
                    self._buffer.update(temp_buffer)
            else:
                for r in responses:
                    if not r.ok:
                        self.logger.warning(
                            f"Unable to send data to {r.server}: {r}")

            # Re-arm the timer if data arrived (or was restored) meanwhile.
            with self._change_buffer_lock:
                if self._buffer:
                    self._timer = threading.Timer(interval=1,
                                                  function=self._send_data)
                    self._timer.start()
                else:
                    self._timer = None
    finally:
        session.close()
async def _send_routes(self, exclude=None):
    """Send the changed routes to every neighbour server.

    The message built by ``self._format_routes_message`` from
    ``self._changed_routes`` is PATCHed concurrently to ``api_1_0.routes``
    on each neighbour. The request carries an ``exclude`` list made of the
    neighbour ids plus the ids derived from ``exclude``. Failures are
    logged as warnings and ``self._changed_routes`` is cleared at the end.

    :param exclude: a server / server id, or an iterable of them, to add
        to the ``exclude`` list sent to the peers.
    """
    servers = Server.get_neighbours(session=self.session)
    msg, debug_msg = self._format_routes_message(self._changed_routes)

    # Normalise ``exclude`` to a list regardless of the logging level: the
    # resulting ids are part of the request payload, not only of the log.
    if not exclude:
        raw_exclude = []
    elif is_iterable_not_string(exclude):
        raw_exclude = list(exclude)
    else:
        raw_exclude = [exclude]

    # Pair each entry with its Server object (None when the id is unknown).
    resolved = [
        (entry, entry if isinstance(entry, Server) else
         self.session.query(Server).get(entry)) for entry in raw_exclude
    ]

    if self.logger.isEnabledFor(logging.DEBUG) and debug_msg and (
            servers or exclude):
        dump = json.dumps(debug_msg, indent=2)
        if exclude:
            # Fall back to the raw entry when the server is unknown.
            names = [
                str(getattr(node, 'name', entry)) for entry, node in resolved
            ]
            excluded_txt = f" (Excluded nodes: {', '.join(names)}):"
        else:
            excluded_txt = ''
        if servers:
            log_msg = f"Sending route information to the following nodes: {', '.join([s.name for s in servers])} " \
                      f"{excluded_txt}{dump}"
        else:
            log_msg = f"No servers to send new routing information:{excluded_txt}{dump}"
        log_msg += '\n' + dump
        self.logger.debug(log_msg)

    exclude_ids = list(
        set([s.id for s in servers]).union(
            [getattr(node, 'id', entry) for entry, node in resolved]))

    auth = get_root_auth()
    aw = [
        ntwrk.async_patch(s,
                          view_or_url='api_1_0.routes',
                          json={
                              'server_id': self.server.id,
                              'route_list': msg,
                              'exclude': exclude_ids
                          },
                          auth=auth) for s in servers
    ]
    rs = await asyncio.gather(*aw, return_exceptions=True)

    for r, s in zip(rs, servers):
        if isinstance(r, Exception):
            self.logger.warning(
                f"Error while trying to send route data to node {s}: "
                f"{format_exception(r)}")
        elif not r.ok:
            if r.exception:
                self.logger.warning(
                    f"Error while trying to send route data to node {s}: "
                    f"{format_exception(r.exception)}")
            else:
                self.logger.warning(
                    f"Error while trying to send route data to node {s}: {r}"
                )
    self._changed_routes.clear()
def _update_route_table_from_data(
        self, new_routes: t.Dict,
        auth=None) -> t.Dict[Server, RouteContainer]:
    """Update the local route table from a route list sent by another node.

    ``new_routes`` is read with the keys ``server_id`` (the sender, called
    ``likely_proxy_server`` below) and ``route_list``. Each route entry is
    read with the keys ``destination_id``, ``proxy_server_id``, ``gate_id``
    and ``cost``.

    Parameters
    ----------
    new_routes:
        routing data received from the sender. Its ``route_list`` is
        sorted in place (highest cost first; a missing cost counts as
        ``MAX_COST``).
    auth:
        not used inside this method.

    Returns
    -------
    dict mapping each server whose route was changed to its new
    ``RouteContainer``. Empty when the sender is unknown.

    ``errors.InvalidRoute`` is caught and logged together with a dump of
    the received routes; the changes collected so far are still returned.
    """
    changed_routes = {}
    routes = new_routes.get('route_list', [])
    routes.sort(key=lambda x: x.get('cost') or MAX_COST, reverse=True)
    try:
        likely_proxy_server = self.session.query(Server).get(
            new_routes.get('server_id'))
        if not likely_proxy_server:
            self.logger.warning(
                f"Server id still '{new_routes.get('server_id')}' not created."
            )
            return changed_routes
        # human readable summary of the received routes, only used for logging
        debug_new_routes = []
        for new_route in routes:
            target_server = self.session.query(Server).get(
                new_route.get('destination_id'))
            proxy_server = self.session.query(Server).get(
                new_route.get('proxy_server_id'))
            gate = self.session.query(Gate).get(new_route.get('gate_id'))
            # fall back to the raw ids when the entities are unknown locally
            dest_name = getattr(target_server, 'name',
                                new_route.get('destination_id'))
            proxy_name = getattr(proxy_server, 'name',
                                 new_route.get('proxy_server_id'))
            gate_str = str(gate) if gate else new_route.get('gate_id')
            cost = new_route.get('cost')
            if gate_str and proxy_name:
                gate_str = gate_str + '*' + proxy_name
            debug_new_routes.append(
                f'{dest_name} -> {gate_str or proxy_name} / {cost}')
            if target_server is None:
                self.logger.warning(
                    f"Destination server unknown {new_route.get('destination_id')}"
                )
                continue
            if target_server.id == self.server.id:
                # check if server has detected me as a neighbour
                if new_route.get('cost') == 0:
                    # check if I do not have it as a neighbour yet
                    if likely_proxy_server.route and likely_proxy_server.route.cost != 0:
                        # check if I have a gate to contact with it
                        route = check_gates(likely_proxy_server)
                        if isinstance(route, RouteContainer):
                            changed_routes[likely_proxy_server] = route
                            likely_proxy_server.set_route(route)
            else:
                # server may be created without route (backward compatibility)
                if target_server.route is None:
                    target_server.set_route(
                        Route(destination=target_server))
                # process routes whose proxy_server is not me
                if self.server.id != new_route.get('proxy_server_id'):
                    # check if I reach the destination
                    if target_server.route.cost is not None:
                        if new_route.get('cost') is None:
                            # likely proxy does not reach but I reach it. It might be shutdown unexpectedly?
                            if target_server.route.cost == 0:
                                # check if I still have it as a neighbour
                                route = check_gates(target_server)
                            else:
                                if target_server.route.proxy_server == likely_proxy_server:
                                    # my own route goes through the sender, which no longer reaches it
                                    route = RouteContainer(
                                        None, None, None)
                                else:
                                    # check if I still have access through my proxy
                                    # NOTE: ``cost`` and ``time`` are rebound here as locals
                                    cost, time = ntwrk.ping(
                                        target_server,
                                        retries=1,
                                        timeout=20,
                                        session=self.session)
                                    if cost == target_server.route.cost:
                                        # still a valid route
                                        route = target_server.route
                                    elif cost is None:
                                        route = RouteContainer(
                                            None, None, None)
                                    else:
                                        route = RouteContainer(
                                            target_server.route.
                                            proxy_server, None, cost)
                            if isinstance(route, RouteContainer):
                                # gate changed
                                changed_routes[target_server] = route
                                target_server.set_route(route)
                            elif route is None:
                                # no route to host. I've lost contact too
                                changed_routes[
                                    target_server] = RouteContainer(
                                        None, None, None)
                                target_server.set_route(
                                    changed_routes[target_server])
                            else:
                                # still a valid route. Send route to likely_proxy_server to tell it I have access
                                resp = ntwrk.patch(
                                    likely_proxy_server,
                                    'api_1_0.routes',
                                    json=dict(
                                        server_id=str(self.server.id),
                                        route_list=[
                                            target_server.route.to_json()
                                        ]),
                                    auth=get_root_auth(),
                                    timeout=5)
                                if not resp.ok:
                                    self.logger.info(
                                        f'Unable to send route to {likely_proxy_server}: {resp}'
                                    )
                        elif target_server.route.proxy_server is not None and \
                                target_server.route.proxy_server == likely_proxy_server:
                            # my proxy is telling me the route has changed
                            rc = RouteContainer(likely_proxy_server, None,
                                                new_route.get('cost') + 1)
                            target_server.set_route(rc)
                            changed_routes.update({target_server: rc})
                        elif new_route.get(
                                'cost') + 1 < target_server.route.cost:
                            # if new route has less cost than actual route, take it as my new route
                            rc = RouteContainer(likely_proxy_server, None,
                                                new_route.get('cost') + 1)
                            target_server.set_route(rc)
                            changed_routes.update({target_server: rc})
                    else:
                        # I do not reach target_server
                        # if new route reaches the destination take it as a new one
                        if (new_route.get('cost') == 0
                                and new_route.get('gate_id') is not None
                            ) or (new_route.get('cost') is not None
                                  and proxy_server is not None):
                            rc = RouteContainer(likely_proxy_server, None,
                                                new_route.get('cost') + 1)
                        else:
                            # neither my route nor the new route has access to the destination
                            rc = RouteContainer(None, None, None)
                        # only record the route when it differs from the current one
                        if target_server.route and (
                                rc.proxy_server !=
                                target_server.route.proxy_server
                                or rc.gate != target_server.route.gate
                                or rc.cost != target_server.route.cost):
                            target_server.set_route(rc)
                            changed_routes.update({target_server: rc})
                else:
                    # target_server reached through me as a proxy from likely_proxy
                    pass
        # routes whose proxy is a server that just lost its route are reset as well
        query = self.session.query(Route).filter(
            Route.proxy_server_id.in_([
                s.id for s, r in changed_routes.items() if r.cost is None
            ]))
        rc = RouteContainer(None, None, None)
        for route in query.all():
            route.set_route(rc)
            changed_routes[route.destination] = rc
        self.logger.debug(
            f"New routes processed from {likely_proxy_server.name}: {json.dumps(debug_new_routes, indent=2)}"
        )
        # if changed_routes:
        #     Parameter.set('routing_last_refresh', get_now())
    except errors.InvalidRoute as e:
        # rebuild the readable summary of all received routes for the error log
        debug_new_routes = []
        routes.sort(key=lambda x: x.get('cost') or MAX_COST, reverse=True)
        for new_route in routes:
            target_server = self.session.query(Server).get(
                new_route.get('destination_id'))
            proxy_server = self.session.query(Server).get(
                new_route.get('proxy_server_id'))
            gate = self.session.query(Gate).get(new_route.get('gate_id'))
            dest_name = getattr(target_server, 'name',
                                new_route.get('destination_id'))
            proxy_name = getattr(proxy_server, 'name',
                                 new_route.get('proxy_server_id'))
            gate_str = str(gate) if gate else new_route.get('gate_id')
            cost = new_route.get('cost')
            if gate_str and proxy_name:
                gate_str = gate_str + '*' + proxy_name
            debug_new_routes.append(
                f'{dest_name} -> {gate_str or proxy_name} / {cost}')
        self.logger.exception(
            "Error setting routes from following data: " +
            json.dumps(debug_new_routes, indent=4))
    return changed_routes
async def _async_refresh_route_table(
        self,
        discover_new_neighbours=False,
        check_current_neighbours=False,
        max_num_discovery=None) -> t.Dict[Server, RouteContainer]:
    """Refresh the local route table from the neighbours' route tables.

    Optionally re-checks the current neighbours and/or probes servers that
    are not neighbours, resets the routes whose proxy is not a neighbour,
    commits the session and finally merges the route tables fetched from
    every neighbour.

    Needs a Flask App Context to run.

    Parameters
    ----------
    discover_new_neighbours:
        tries to discover new neighbours
    check_current_neighbours:
        checks if current neighbours are still neighbours
    max_num_discovery:
        maximum number of possible nodes to check as neighbour

    Returns
    -------
    dict mapping every server whose route changed to its new
    ``RouteContainer``.
    """
    self.logger.debug('Refresh Route Table')
    neighbours = Server.get_neighbours(session=self.session)
    not_neighbours = Server.get_not_neighbours(session=self.session)

    changed_routes: t.Dict[Server, RouteContainer] = {}
    lost = []
    found = []
    pending = []

    if check_current_neighbours:
        if not neighbours:
            self.logger.debug(f"No neighbour to check")
        else:
            self.logger.debug(f"Checking current neighbours: " +
                              ', '.join([str(s) for s in neighbours]))
            pending.append(
                _async_set_current_neighbours(neighbours, changed_routes))

    if discover_new_neighbours:
        if not not_neighbours[:max_num_discovery]:
            self.logger.debug("No new neighbours to check")
        else:
            # Random sample of candidates, sorted by name for stable logs.
            candidates = list(not_neighbours)
            random.shuffle(candidates)
            target = candidates[:max_num_discovery]
            target.sort(key=lambda s: s.name)
            self.logger.debug(
                f"Checking new neighbours{f' (limited to {max_num_discovery})' if max_num_discovery else ''}: "
                + ', '.join([str(s) for s in target]))
            pending.append(
                _async_discover_new_neighbours(target, changed_routes))

    # Results come back in the order the checks were queued.
    results = iter(await asyncio.gather(*pending, return_exceptions=False))

    if check_current_neighbours and neighbours:
        lost = next(results)
        if lost:
            self.logger.info(
                f"Lost direct connection to the following nodes: " +
                ', '.join([str(s) for s in lost]))

    if discover_new_neighbours and not_neighbours[:max_num_discovery]:
        found = next(results)
        if found:
            self.logger.info(f'New neighbours found: ' +
                             ', '.join([str(s) for s in found]))
        else:
            self.logger.debug("No new neighbours found")

    # remove routes whose proxy_server is a node that is not a neighbour
    unreachable_proxies = set(not_neighbours) | set(lost)
    stale_routes = self.session.query(Route).filter(
        Route.proxy_server_id.in_([s.id for s in unreachable_proxies]))
    no_route = RouteContainer(None, None, None)
    for route in stale_routes.all():
        route.set_route(no_route)
        changed_routes[route.destination] = no_route
    self.session.commit()

    # update neighbour list
    neighbours = list((set(neighbours) | set(found)) - set(lost))
    if neighbours:
        self.logger.debug(
            f"Getting routing tables from {', '.join([str(s) for s in neighbours])}"
        )
        requests = [
            ntwrk.async_get(server, 'api_1_0.routes', auth=get_root_auth())
            for server in neighbours
        ]
        responses = await asyncio.gather(*requests)
        merged = self._route_table_merge(dict(zip(neighbours, responses)))
        changed_routes.update(merged)
    return changed_routes