class Authenticator(object): _authenticators = {} @classmethod def instance(cls, public_keys_dir): '''Please avoid create multi instance''' if public_keys_dir in cls._authenticators: return cls._authenticators[public_keys_dir] new_instance = cls(public_keys_dir) cls._authenticators[public_keys_dir] = new_instance return new_instance def __init__(self, public_keys_dir): self._auth = ThreadAuthenticator(zmq.Context.instance()) self._auth.start() self._auth.allow('*') self._auth.configure_curve(domain='*', location=public_keys_dir) def set_server_key(self, zmq_socket, server_secret_key_path): '''must call before bind''' load_and_set_key(zmq_socket, server_secret_key_path) zmq_socket.curve_server = True def set_client_key(self, zmq_socket, client_secret_key_path, server_public_key_path): '''must call before bind''' load_and_set_key(zmq_socket, client_secret_key_path) server_public, _ = zmq.auth.load_certificate(server_public_key_path) zmq_socket.curve_serverkey = server_public def stop(self): self._auth.stop()
def run_mdp_broker(): args = docopt("""Usage: mdp-broker [options] <config> Options: -h --help show this help message and exit -s --secure generate (and print) client & broker keys for a secure server """) global log _setup_logging(args['<config>']) log = logging.getLogger(__name__) cp = ConfigParser() cp.read(args['<config>']) # Parse settings a bit raw = dict( (option, cp.get('mdp-broker', option)) for option in cp.options('mdp-broker')) s = SettingsSchema().to_python(raw) if args['--secure']: broker_key = Key.generate() client_key = Key.generate() s['key'] = dict( broker=broker_key, client=client_key) log.info('Auto-generated keys: %s_%s_%s', broker_key.public, client_key.public, client_key.secret) log.info(' broker.public: %s', broker_key.public) log.info(' client.public: %s', client_key.public) log.info(' client.secret: %s', client_key.secret) if s['key']: log.info('Starting secure mdp-broker on %s', s['uri']) auth = ThreadAuthenticator() auth.start() auth.thread.authenticator.certs['*'] = { s['key']['client'].public: 'OK'} broker = SecureMajorDomoBroker(s['key']['broker'], s['uri']) else: log.info('Starting mdp-broker on %s', s['uri']) broker = MajorDomoBroker(s['uri']) try: broker.serve_forever() except: auth.stop() raise
def __init__(self, zmq_rep_bind_address=None, zmq_sub_connect_addresses=None, recreate_sockets_on_timeout_of_sec=600, username=None, password=None): self.context = zmq.Context() self.auth = None self.last_received_message = None self.is_running = False self.thread = None self.zmq_rep_bind_address = zmq_rep_bind_address self.zmq_sub_connect_addresses = zmq_sub_connect_addresses self.poller = zmq.Poller() self.sub_sockets = [] self.rep_socket = None if username is not None and password is not None: # Start an authenticator for this context. # Does not work on PUB/SUB as far as I (probably because the more secure solutions # require two way communication as well) self.auth = ThreadAuthenticator(self.context) self.auth.start() # Instruct authenticator to handle PLAIN requests self.auth.configure_plain(domain='*', passwords={username: password}) if self.zmq_sub_connect_addresses: for address in self.zmq_sub_connect_addresses: self.sub_sockets.append(SubSocket(self.context, self.poller, address, recreate_sockets_on_timeout_of_sec)) if zmq_rep_bind_address: self.rep_socket = RepSocket(self.context, self.poller, zmq_rep_bind_address, self.auth)
def main(): auth = ThreadAuthenticator(zmq.Context.instance()) auth.start() auth.allow('127.0.0.1') # Tell the authenticator how to handle CURVE requests auth.configure_curve(domain='*', location=zmq.auth.CURVE_ALLOW_ANY) key = Key.load('example/broker.key_secret') broker = SecureMajorDomoBroker(key, sys.argv[1]) try: broker.serve_forever() except KeyboardInterrupt: auth.stop() raise
def _init_txzmq(self): """ Configure the txzmq components and connection. """ self._zmq_factory = txzmq.ZmqFactory() self._zmq_factory.registerForShutdown() self._zmq_connection = txzmq.ZmqREPConnection(self._zmq_factory) context = self._zmq_factory.context socket = self._zmq_connection.socket def _gotMessage(messageId, messageParts): self._zmq_connection.reply(messageId, "OK") self._process_request(messageParts) self._zmq_connection.gotMessage = _gotMessage if flags.ZMQ_HAS_CURVE: # Start an authenticator for this context. auth = ThreadAuthenticator(context) auth.start() # XXX do not hardcode this here. auth.allow('127.0.0.1') # Tell authenticator to use the certificate in a directory auth.configure_curve(domain='*', location=zmq.auth.CURVE_ALLOW_ANY) public, secret = get_backend_certificates() socket.curve_publickey = public socket.curve_secretkey = secret socket.curve_server = True # must come before bind proto, addr = self._server_address.split('://') # tcp/ipc, ip/socket socket.bind(self._server_address) if proto == 'ipc': os.chmod(addr, 0600)
def _run(self): """ Start a loop to process the ZMQ requests from the signaler client. """ logger.debug("Running SignalerQt loop") context = zmq.Context() socket = context.socket(zmq.REP) # Start an authenticator for this context. auth = ThreadAuthenticator(context) auth.start() auth.allow('127.0.0.1') # Tell authenticator to use the certificate in a directory auth.configure_curve(domain='*', location=zmq.auth.CURVE_ALLOW_ANY) public, secret = get_frontend_certificates() socket.curve_publickey = public socket.curve_secretkey = secret socket.curve_server = True # must come before bind socket.bind(self.BIND_ADDR) while self._do_work.is_set(): # Wait for next request from client try: request = socket.recv(zmq.NOBLOCK) logger.debug("Received request: '{0}'".format(request)) socket.send("OK") self._process_request(request) except zmq.ZMQError as e: if e.errno != zmq.EAGAIN: raise time.sleep(0.01) logger.debug("SignalerQt thread stopped.")
def __init__(self, port): self.port = str(port) threading.Thread.__init__ (self) self.shutdown_event = Event() self.context = zmq.Context().instance() self.authserver = ThreadAuthenticator(self.context) self.loadConfig() self.start()
class ZMQPull(ZMQ): classname = "ZMQPull" def __init__(self, name, options, inbound): super().__init__(name, options, inbound) self.socket_type = zmq.PULL def secure_setup(self): # Load certificates # TODO: handle errors self.auth = ThreadAuthenticator(self.context) self.auth.start() self.LOG.debug("Server keys in %s", self.secure_config["self"]) sock_pub, sock_priv = zmq.auth.load_certificate(self.secure_config["self"]) if self.secure_config.get("clients", None) is not None: self.LOG.debug("Client certificates in %s", self.secure_config["clients"]) self.auth.configure_curve(domain="*", location=self.secure_config["clients"]) else: self.LOG.debug("Every clients can connect") self.auth.configure_curve(domain="*", location=zmq.auth.CURVE_ALLOW_ANY) # Setup the socket self.sock.curve_publickey = sock_pub self.sock.curve_secretkey = sock_priv self.sock.curve_server = True
def _start_thread_auth(self, socket): """ Start the zmq curve thread authenticator. :param socket: The socket in which to configure the authenticator. :type socket: zmq.Socket """ authenticator = ThreadAuthenticator(self._factory.context) authenticator.start() # XXX do not hardcode this here. authenticator.allow('127.0.0.1') # tell authenticator to use the certificate in a directory public_keys_dir = os.path.join(self._config_prefix, PUBLIC_KEYS_PREFIX) authenticator.configure_curve(domain="*", location=public_keys_dir) socket.curve_server = True # must come before bind
def secure_setup(self): # Load certificates # TODO: handle errors self.auth = ThreadAuthenticator(self.context) self.auth.start() self.LOG.debug("Server keys in %s", self.secure_config["self"]) sock_pub, sock_priv = zmq.auth.load_certificate(self.secure_config["self"]) if self.secure_config.get("clients", None) is not None: self.LOG.debug("Client certificates in %s", self.secure_config["clients"]) self.auth.configure_curve(domain="*", location=self.secure_config["clients"]) else: self.LOG.debug("Every clients can connect") self.auth.configure_curve(domain="*", location=zmq.auth.CURVE_ALLOW_ANY) # Setup the socket self.sock.curve_publickey = sock_pub self.sock.curve_secretkey = sock_priv self.sock.curve_server = True
def setup_auth(): global _auth assert _options is not None auth = _options.get('auth',None) if auth is None: return base_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)),'..')) try: _auth = ThreadAuthenticator(_zctx) _auth.start() whitelist = auth.get('whitelist',None) if whitelist is not None: _auth.allow(whitelist) public_path = auth.get('public_key_dir','public_keys') _auth.configure_curve(domain='*',location=getExistsPath(base_dir,public_path)) private_dir = getExistsPath(base_dir,auth.get('private_key_dir','private_keys')) private_key = os.path.join(private_dir,auth.get('private_key_file','server.key_secret')) server_public,server_private = zmq.auth.load_certificate(private_key) _sock.curve_secretkey = server_private _sock.curve_publickey = server_public _sock.curve_server = True except: _auth.stop() _auth = None
def create_socket(self): self.context = zmq.Context.instance() auth = ThreadAuthenticator(self.context) auth.start() #auth.allow('127.0.0.1') # Tell authenticator to use the certificate in a directory auth.configure_curve(domain='*', location=self.public_keys_dir) self.socket = self.context.socket(zmq.REP) self.monitor = self.socket.get_monitor_socket() server_secret_file = os.path.join(self.secret_keys_dir, "server.key_secret") server_public, server_secret = zmq.auth.load_certificate(server_secret_file) self.socket.curve_secretkey = server_secret self.socket.curve_publickey = server_public self.socket.curve_server = True # must come before bind self.socket.set(zmq.LINGER, 1) self.socket.identity = b"gatekeeper" self.socket.bind("tcp://0.0.0.0:5141")
def run(self): ''' Run Ironhouse example ''' # These directories are generated by the generate_certificates script keys_dir = self.config['certs']['certs'] public_keys_dir = self.config['certs']['public'] secret_keys_dir = self.config['certs']['private'] if not (util.check_dir(keys_dir) and util.check_dir(public_keys_dir) and util.check_dir(secret_keys_dir)): logging.critical("Certificates are missing - run generate_certificates.py script first") sys.exit(1) logger.info("Keys: %s | Public: %s | Secret: %s", keys_dir, public_keys_dir, secret_keys_dir) ctx = zmq.Context.instance() # Start an authenticator for this context. auth = ThreadAuthenticator(ctx) auth.start() for ip in self.config['server']['auth']: auth.allow(ip) # Tell authenticator to use the certificate in a directory auth.configure_curve(domain='*', location=public_keys_dir) server = ctx.socket(zmq.REP) server_secret_file = os.path.join(secret_keys_dir, "server.key_secret") server_public, server_secret = zmq.auth.load_certificate(server_secret_file) server.curve_secretkey = server_secret server.curve_publickey = server_public server.curve_server = True # must come before bind bind_info = 'tcp://%s:%s' % (self.config['server']['listen'], self.config['server']['port']) server.bind(bind_info) logger.info("Server bound to: %s", bind_info) self.load_plugins() logger.info("Starting reciever.") while True: msg = server.recv() self.handle_msg(msg) server.send("ack") auth.stop()
def _run(self): """ Start a loop to process the ZMQ requests from the signaler client. """ logger.debug("Running SignalerQt loop") context = zmq.Context() socket = context.socket(zmq.REP) if flags.ZMQ_HAS_CURVE: # Start an authenticator for this context. auth = ThreadAuthenticator(context) auth.start() auth.allow('127.0.0.1') # Tell authenticator to use the certificate in a directory auth.configure_curve(domain='*', location=zmq.auth.CURVE_ALLOW_ANY) public, secret = get_frontend_certificates() socket.curve_publickey = public socket.curve_secretkey = secret socket.curve_server = True # must come before bind socket.bind(self.BIND_ADDR) if not flags.ZMQ_HAS_CURVE: os.chmod(self.SOCKET_FILE, 0600) while self._do_work.is_set(): # Wait for next request from client try: request = socket.recv(zmq.NOBLOCK) # logger.debug("Received request: '{0}'".format(request)) socket.send("OK") self._process_request(request) except zmq.ZMQError as e: if e.errno != zmq.EAGAIN: raise time.sleep(0.01) logger.debug("SignalerQt thread stopped.")
def run(): ''' Run Stonehouse example ''' # These directories are generated by the generate_certificates script keys_dir = os.path.dirname(__file__) ctx = zmq.Context.instance() # Start an authenticator for this context. auth = ThreadAuthenticator(ctx) auth.start() auth.allow('127.0.0.1') # Tell the authenticator how to handle CURVE requests auth.configure_curve(domain='*', location=zmq.auth.CURVE_ALLOW_ANY) client = ctx.socket(zmq.PULL) # We need two certificates, one for the client and one for # the server. The client must know the server's public key # to make a CURVE connection. client_secret_file = os.path.join(keys_dir, "client.key") client_public, client_secret = zmq.auth.load_certificate( client_secret_file) client.curve_secretkey = client_secret client.curve_publickey = client_public # The client must know the server's public key to make a CURVE connection. server_public_file = os.path.join(keys_dir, "server.key") server_public, _ = zmq.auth.load_certificate(server_public_file) client.curve_serverkey = server_public client.connect('tcp://127.0.0.1:9000') if client.poll(100000): msg = client.recv() if msg == b"Hello": logging.info("Stonehouse test OK") else: logging.error("Stonehouse test FAIL") # stop auth thread auth.stop()
def run_mdp_broker(): args = docopt("""Usage: mdp-broker [options] <config> Options: -h --help show this help message and exit -s --secure generate (and print) client & broker keys for a secure server """) global log _setup_logging(args['<config>']) log = logging.getLogger(__name__) cp = ConfigParser() cp.read(args['<config>']) # Parse settings a bit raw = dict((option, cp.get('mdp-broker', option)) for option in cp.options('mdp-broker')) s = SettingsSchema().to_python(raw) if args['--secure']: broker_key = Key.generate() client_key = Key.generate() s['key'] = dict(broker=broker_key, client=client_key) log.info('Auto-generated keys: %s_%s_%s', broker_key.public, client_key.public, client_key.secret) log.info(' broker.public: %s', broker_key.public) log.info(' client.public: %s', client_key.public) log.info(' client.secret: %s', client_key.secret) if s['key']: log.info('Starting secure mdp-broker on %s', s['uri']) auth = ThreadAuthenticator() auth.start() auth.thread.authenticator.certs['*'] = { s['key']['client'].public: 'OK' } broker = SecureMajorDomoBroker(s['key']['broker'], s['uri']) else: log.info('Starting mdp-broker on %s', s['uri']) broker = MajorDomoBroker(s['uri']) try: broker.serve_forever() except: auth.stop() raise
def _connectLogger(self, key, Host="localhost"): """ Open a zeromq queue with publisher service :param Host: which interface the zeromq service needs to bind too ("*" for all interfaces) :param key: privatekey file of the scheduler :return: """ #TODO: error handling on certificates missing and stuff #TODO: expose more security options such as white/blacklisting ips and domain filtering self.context = zmq.Context() self.auth = ThreadAuthenticator(self.context) self.auth.start() self.auth.configure_curve(domain='*', location=os.path.join("keys", "public")) self.Logger = self.context.socket(zmq.PUB) scheduler_public, scheduler_secret = zmq.auth.load_certificate(os.path.join("keys", "plexi1.key_secret")) self.Logger.curve_secretkey = scheduler_secret self.Logger.curve_publickey = scheduler_public self.Logger.curve_server = True self.Logger.bind("tcp://127.0.0.1:6000")
def _init_zmq(self): """ Configure the zmq components and connection. """ context = zmq.Context() socket = context.socket(zmq.REP) # Start an authenticator for this context. auth = ThreadAuthenticator(context) auth.start() # XXX do not hardcode this here. auth.allow('127.0.0.1') # Tell authenticator to use the certificate in a directory auth.configure_curve(domain='*', location=zmq.auth.CURVE_ALLOW_ANY) public, secret = get_backend_certificates() socket.curve_publickey = public socket.curve_secretkey = secret socket.curve_server = True # must come before bind socket.bind(self.BIND_ADDR) self._zmq_socket = socket
def _start_thread_auth(self, socket): """ Start the zmq curve thread authenticator. :param socket: The socket in which to configure the authenticator. :type socket: zmq.Socket """ authenticator = ThreadAuthenticator(self._factory.context) # Temporary fix until we understand what the problem is # See https://leap.se/code/issues/7536 time.sleep(0.5) authenticator.start() # XXX do not hardcode this here. authenticator.allow('127.0.0.1') # tell authenticator to use the certificate in a directory public_keys_dir = os.path.join(self._config_prefix, PUBLIC_KEYS_PREFIX) authenticator.configure_curve(domain="*", location=public_keys_dir) socket.curve_server = True # must come before bind
def run(): base_dir = os.path.dirname(__file__) server_dir = os.path.join(base_dir, 'server') key_dir = os.path.join(server_dir, 'certificates') authorized_keys = os.path.join(server_dir, 'authorized') make_clean_dirs([server_dir, authorized_keys]) generate_keys(server_dir) ctx = zmq.Context.instance() auth = ThreadAuthenticator(ctx, 'utf-8', logging.getLogger('utf')) auth.start() auth.allow('127.0.0.1') authenticator_refresher = ctx.socket(zmq.PULL) authenticator_refresher.bind("tcp://*:9010") auth.configure_curve(domain='*', location=authorized_keys) server = ctx.socket(zmq.REP) server_secret_file = os.path.join(key_dir, "id.key_secret") server_public, server_secret = zmq.auth.load_certificate( server_secret_file) server.curve_secretkey = server_secret server.curve_publickey = server_public server.curve_server = True server.bind('tcp://*:9000') authenticator_refresher.recv() auth.configure_curve(domain='*', location=authorized_keys) req = server.recv() print(req) if req == b"hi": server.send(b"hello") else: print("wrong request: " + req) auth.stop()
class ContextHandler(): def __init__(self, publicPath): self.__context = zmq.Context() self.publicPath = publicPath self.auth = ThreadAuthenticator(self.__context) self.auth.start() self.auth.configure_curve(domain='*', location=self.publicPath) self.auth.thread.setName("CurveAuth") def getContext(self): return self.__context def configureAuth(self): self.auth.configure_curve(domain='*', location=self.publicPath) def cleanup(self): self.__context.destroy()
def _init_zmq(self): """ Configure the zmq components and connection. """ context = zmq.Context() socket = context.socket(zmq.REP) # Start an authenticator for this context. auth = ThreadAuthenticator(context) auth.start() auth.allow('127.0.0.1') # Tell authenticator to use the certificate in a directory auth.configure_curve(domain='*', location=zmq.auth.CURVE_ALLOW_ANY) public, secret = get_backend_certificates() socket.curve_publickey = public socket.curve_secretkey = secret socket.curve_server = True # must come before bind socket.bind(self.BIND_ADDR) self._zmq_socket = socket
class CurveAuthenticator(object): def __init__(self, ctx, domain='*', location=zmq.auth.CURVE_ALLOW_ANY, callback=None): self._domain = domain self._location = location self._callback = callback self._ctx = ctx self._atx = ThreadAuthenticator(self.ctx) self._atx.start() if (self._callback is not None): logging.info('Callback: {0}'.format(self._callback)) self._atx.configure_curve_callback( '*', credentials_provider=self._callback) elif (self._location == zmq.auth.CURVE_ALLOW_ANY or self._location is None): self._atx.configure_curve(domain='*', location=zmq.auth.CURVE_ALLOW_ANY) else: self.load_certs() @property def atx(self): return self._atx @property def location(self): return self._location @property def domain(self): return self._domain @property def ctx(self): return self._ctx def load_certs(self): self.atx.configure_curve(domain=self._domain, location=self._location)
def test_encryption(tmpdir): # Create the tmp names conf_filename = str(tmpdir.join("conf.yaml")) pull_url = tmpdir.join("input.pull.socket") pull_cert_dir = tmpdir.mkdir("input.pull") pull_clients_cert_dir = pull_cert_dir.mkdir("clients") sub_url = tmpdir.join("input.sub.socket") sub_cert_dir = tmpdir.mkdir("input.sub") push_url = tmpdir.join("output.push.socket") inbound = tmpdir.join("inbound") outbound = tmpdir.join("outbound") stdout = tmpdir.join("stdout") stderr = tmpdir.join("stderr") # Create the certificates create_certificates(str(pull_cert_dir), "pull") create_certificates(str(pull_clients_cert_dir), "client1") create_certificates(str(pull_clients_cert_dir), "client2") create_certificates(str(sub_cert_dir), "sub") create_certificates(str(sub_cert_dir), "sub-server") with open(conf_filename, "w") as f: f.write("inputs:\n") f.write("- class: ZMQPull\n") f.write(" name: in-pull\n") f.write(" options:\n") f.write(" url: ipc://%s\n" % pull_url) f.write(" encryption:\n") f.write(" self: %s\n" % pull_cert_dir.join("pull.key_secret")) f.write(" clients: %s\n" % pull_clients_cert_dir) f.write("- class: ZMQSub\n") f.write(" name: in-sub\n") f.write(" options:\n") f.write(" url: ipc://%s\n" % sub_url) f.write(" encryption:\n") f.write(" self: %s\n" % sub_cert_dir.join("sub.key_secret")) f.write(" server: %s\n" % sub_cert_dir.join("sub-server.key")) f.write("core:\n") f.write(" inbound: ipc://%s\n" % inbound) f.write(" outbound: ipc://%s\n" % outbound) f.write("outputs:\n") f.write("- class: ZMQPush\n") f.write(" name: out-push\n") f.write(" options:\n") f.write(" url: ipc://%s\n" % push_url) args = [ "python3", "-m", "reactobus", "--conf", conf_filename, "--level", "DEBUG", "--log-file", "-", ] proc = subprocess.Popen(args, stdout=open(str(stdout), "w"), stderr=open(str(stderr), "w")) # Create the input sockets ctx = zmq.Context.instance() in_sock = ctx.socket(zmq.PUSH) (server_public, _) = load_certificate(str(pull_cert_dir.join("pull.key"))) in_sock.curve_serverkey = server_public (client_public, client_private) = load_certificate( str(pull_clients_cert_dir.join("client1.key_secret"))) in_sock.curve_publickey = client_public in_sock.curve_secretkey = client_private in_sock.connect("ipc://%s" % pull_url) out_sock = ctx.socket(zmq.PULL) out_sock.bind("ipc://%s" % push_url) pub_sock = ctx.socket(zmq.PUB) auth = ThreadAuthenticator(ctx) auth.start() auth.configure_curve(domain="*", location=str(sub_cert_dir)) (server_public, server_secret) = load_certificate( str(sub_cert_dir.join("sub-server.key_secret"))) pub_sock.curve_publickey = server_public pub_sock.curve_secretkey = server_secret pub_sock.curve_server = True pub_sock.bind("ipc://%s" % sub_url) # Allow the process sometime to setup and connect time.sleep(1) # Send some data data = [ b"org.videolan.git", b(str(uuid.uuid1())), b(datetime.datetime.utcnow().isoformat()), b("videolan-git"), b( json.dumps({ "url": "https://code.videolan.org/éêï", "username": "******" })), ] in_sock.send_multipart(data) msg = out_sock.recv_multipart() assert msg == data data = [ b"org.videolan.git", b(str(uuid.uuid1())), b(datetime.datetime.utcnow().isoformat()), b("videolan-git"), b( json.dumps({ "url": "https://code.videolan.org/éêï", "username": "******" })), ] pub_sock.send_multipart(data) msg = out_sock.recv_multipart() assert msg == data # End the process proc.terminate() proc.wait()
def run(self): self.set_status("Server Startup") self.set_status("Creating zmq Contexts",1) serverctx = zmq.Context() self.set_status("Starting zmq ThreadedAuthenticator",1) #serverauth = zmq.auth.ThreadedAuthenticator(serverctx) serverauth = ThreadAuthenticator(serverctx) serverauth.start() with taco.globals.settings_lock: bindip = taco.globals.settings["Application IP"] bindport = taco.globals.settings["Application Port"] localuuid = taco.globals.settings["Local UUID"] publicdir = os.path.normpath(os.path.abspath(taco.globals.settings["TacoNET Certificates Store"] + "/" + taco.globals.settings["Local UUID"] + "/public/")) privatedir = os.path.normpath(os.path.abspath(taco.globals.settings["TacoNET Certificates Store"] + "/" + taco.globals.settings["Local UUID"] + "/private/")) self.set_status("Configuring Curve to use publickey dir:" + publicdir) serverauth.configure_curve(domain='*', location=publicdir) #auth.configure_curve(domain='*', location=zmq.auth.CURVE_ALLOW_ANY) self.set_status("Creating Server Context",1) server = serverctx.socket(zmq.REP) server.setsockopt(zmq.LINGER, 0) self.set_status("Loading Server Certs",1) server_public, server_secret = zmq.auth.load_certificate(os.path.normpath(os.path.abspath(privatedir + "/" + taco.constants.KEY_GENERATION_PREFIX +"-server.key_secret"))) server.curve_secretkey = server_secret server.curve_publickey = server_public server.curve_server = True if bindip == "0.0.0.0": bindip ="*" self.set_status("Server is now listening for encrypted ZMQ connections @ "+ "tcp://" + bindip +":" + str(bindport)) server.bind("tcp://" + bindip +":" + str(bindport)) poller = zmq.Poller() poller.register(server, zmq.POLLIN|zmq.POLLOUT) while not self.stop.is_set(): socks = dict(poller.poll(200)) if server in socks and socks[server] == zmq.POLLIN: #self.set_status("Getting a request") data = server.recv() with taco.globals.download_limiter_lock: taco.globals.download_limiter.add(len(data)) (client_uuid,reply) = taco.commands.Proccess_Request(data) if client_uuid!="0": self.set_client_last_request(client_uuid) socks = dict(poller.poll(10)) if server in socks and socks[server] == zmq.POLLOUT: #self.set_status("Replying to a request") with taco.globals.upload_limiter_lock: taco.globals.upload_limiter.add(len(reply)) server.send(reply) self.set_status("Stopping zmq server with 0 second linger") server.close(0) self.set_status("Stopping zmq ThreadedAuthenticator") serverauth.stop() serverctx.term() self.set_status("Server Exit")
class ZmqListener: def __init__(self, settings): self.redis = RedisScraper(settings) self.id = settings.getKey("box_id") self.log = logging.getLogger('ZMQ') self.clientPath = settings.getKey("zmq.private_cert") self.serverPath = settings.getKey("zmq.server_cert") if not self.clientPath or not self.serverPath: self.log.fatal( "zmq certificates not configured in the settings file") os._exit(1) self.host = settings.getKey("zmq.acq_host") self.ctx = zmq.Context() self.auth = ThreadAuthenticator(self.ctx) self.auth.start() #self.auth.allow('127.0.0.1') self.auth.configure_curve(domain='*', location=zmq.auth.CURVE_ALLOW_ANY) self.client = self.ctx.socket(zmq.REP) try: client_public, client_secret = zmq.auth.load_certificate( self.clientPath) self.client.curve_secretkey = client_secret self.client.curve_publickey = client_public server_public, _ = zmq.auth.load_certificate(self.serverPath) self.client.curve_serverkey = server_public self.client.connect(self.host) except IOError: self.log.fatal("Could not load client certificate") os._exit(1) except ValueError: self.log.fatal("Could not load client certificate") os._exit(1) self.log.info("ZMQ connected to " + self.host + " using certs " + self.clientPath) self.running = False self.handlers = { opq_pb2.RequestDataMessage.PING: self.ping, opq_pb2.RequestDataMessage.READ: self.read } def ping(self, message): message.type = opq_pb2.RequestDataMessage.PONG message_buff = message.SerializeToString() self.log.info("Received a PING from server") self.client.send(message_buff) return True def read(self, message): self.log.debug("Received a data transfer request from server") try: if message.front == 0 or message.back == 0: message.type = opq_pb2.RequestDataMessage.ERROR message_buff = message.SerializeToString() self.log.info("Bad message from server") self.client.send(message_buff) return False cycles = self.redis.getRange(message.time - message.back, message.time + message.front) cycles.id = self.id cycles.mid = message.mid message_buff = cycles.SerializeToString() self.client.send(message_buff) except google.protobuf.message.DecodeError: self.log.fatal("Bad request from acquisition server.") return False def run(self): self.running = True try: while self.running: message_buff = self.client.recv() message = opq_pb2.RequestDataMessage() message.ParseFromString(message_buff) self.handlers[message.type](message) except google.protobuf.message.DecodeError: self.log.fatal("Bad request from acquisition server.")
class Command(LAVADaemonCommand): """ worker_host is the hostname of the worker this field is set by the admin and could therefore be empty in a misconfigured instance. """ logger = None help = "LAVA dispatcher master" default_logfile = "/var/log/lava-server/lava-master.log" def __init__(self, *args, **options): super(Command, self).__init__(*args, **options) self.auth = None self.controler = None self.event_socket = None self.poller = None self.pipe_r = None self.inotify_fd = None # List of logs # List of known dispatchers. At startup do not load this from the # database. This will help to know if the slave as restarted or not. self.dispatchers = {"lava-logs": SlaveDispatcher("lava-logs", online=False)} self.events = {"canceling": set()} def add_arguments(self, parser): super(Command, self).add_arguments(parser) # Important: ensure share/env.yaml is put into /etc/ by setup.py in packaging. config = parser.add_argument_group("dispatcher config") config.add_argument('--env', default="/etc/lava-server/env.yaml", help="Environment variables for the dispatcher processes. " "Default: /etc/lava-server/env.yaml") config.add_argument('--env-dut', default="/etc/lava-server/env.dut.yaml", help="Environment variables for device under test. " "Default: /etc/lava-server/env.dut.yaml") config.add_argument('--dispatchers-config', default="/etc/lava-server/dispatcher.d", help="Directory that might contain dispatcher specific configuration") net = parser.add_argument_group("network") net.add_argument('--master-socket', default='tcp://*:5556', help="Socket for master-slave communication. Default: tcp://*:5556") net.add_argument('--event-url', default="tcp://localhost:5500", help="URL of the publisher") net.add_argument('--ipv6', default=False, action='store_true', help="Enable IPv6 on the listening sockets") net.add_argument('--encrypt', default=False, action='store_true', help="Encrypt messages") net.add_argument('--master-cert', default='/etc/lava-dispatcher/certificates.d/master.key_secret', help="Certificate for the master socket") net.add_argument('--slaves-certs', default='/etc/lava-dispatcher/certificates.d', help="Directory for slaves certificates") def send_status(self, hostname): """ The master crashed, send a STATUS message to get the current state of jobs """ jobs = TestJob.objects.filter(actual_device__worker_host__hostname=hostname, state=TestJob.STATE_RUNNING) for job in jobs: self.logger.info("[%d] STATUS => %s (%s)", job.id, hostname, job.actual_device.hostname) send_multipart_u(self.controler, [hostname, 'STATUS', str(job.id)]) def dispatcher_alive(self, hostname): if hostname not in self.dispatchers: # The server crashed: send a STATUS message self.logger.warning("Unknown dispatcher <%s> (server crashed)", hostname) self.dispatchers[hostname] = SlaveDispatcher(hostname) self.send_status(hostname) # Mark the dispatcher as alive self.dispatchers[hostname].alive() def controler_socket(self): try: # We need here to use the zmq.NOBLOCK flag, otherwise we could block # the whole main loop where this function is called. msg = self.controler.recv_multipart(zmq.NOBLOCK) except zmq.error.Again: return False # This is way to verbose for production and should only be activated # by (and for) developers # self.logger.debug("[CC] Receiving: %s", msg) # 1: the hostname (see ZMQ documentation) hostname = u(msg[0]) # 2: the action action = u(msg[1]) # Check that lava-logs only send PINGs if hostname == "lava-logs" and action != "PING": self.logger.error("%s => %s Invalid action from log daemon", hostname, action) return True # Handle the actions if action == 'HELLO' or action == 'HELLO_RETRY': self._handle_hello(hostname, action, msg) elif action == 'PING': self._handle_ping(hostname, action, msg) elif action == 'END': self._handle_end(hostname, action, msg) elif action == 'START_OK': self._handle_start_ok(hostname, action, msg) else: self.logger.error("<%s> sent unknown action=%s, args=(%s)", hostname, action, msg[1:]) return True def read_event_socket(self): try: msg = self.event_socket.recv_multipart(zmq.NOBLOCK) except zmq.error.Again: return False try: (topic, _, dt, username, data) = (u(m) for m in msg) except ValueError: self.logger.error("Invalid event: %s", msg) return True if topic.endswith(".testjob"): try: data = simplejson.loads(data) if data["state"] == "Canceling": self.events["canceling"].add(int(data["job"])) except ValueError: self.logger.error("Invalid event data: %s", msg) return True def _handle_end(self, hostname, action, msg): # pylint: disable=unused-argument try: job_id = int(msg[2]) error_msg = msg[3] compressed_description = msg[4] except (IndexError, ValueError): self.logger.error("Invalid message from <%s> '%s'", hostname, msg) return try: job = TestJob.objects.get(id=job_id) except TestJob.DoesNotExist: self.logger.error("[%d] Unknown job", job_id) # ACK even if the job is unknown to let the dispatcher # forget about it send_multipart_u(self.controler, [hostname, 'END_OK', str(job_id)]) return filename = os.path.join(job.output_dir, 'description.yaml') # If description.yaml already exists: a END was already received if os.path.exists(filename): self.logger.info("[%d] %s => END (duplicated), skipping", job_id, hostname) else: if compressed_description: self.logger.info("[%d] %s => END", job_id, hostname) else: self.logger.info("[%d] %s => END (lava-run crashed, mark job as INCOMPLETE)", job_id, hostname) with transaction.atomic(): # TODO: find a way to lock actual_device job = TestJob.objects.select_for_update() \ .get(id=job_id) job.go_state_finished(TestJob.HEALTH_INCOMPLETE) if error_msg: self.logger.error("[%d] Error: %s", job_id, error_msg) job.failure_comment = error_msg job.save() # Create description.yaml even if it's empty # Allows to know when END messages are duplicated try: # Create the directory if it was not already created mkdir(os.path.dirname(filename)) # TODO: check that compressed_description is not "" description = lzma.decompress(compressed_description) with open(filename, 'w') as f_description: f_description.write(description.decode("utf-8")) if description: parse_job_description(job) except (IOError, lzma.LZMAError) as exc: self.logger.error("[%d] Unable to dump 'description.yaml'", job_id) self.logger.exception("[%d] %s", job_id, exc) # ACK the job and mark the dispatcher as alive send_multipart_u(self.controler, [hostname, 'END_OK', str(job_id)]) self.dispatcher_alive(hostname) def _handle_hello(self, hostname, action, msg): # Check the protocol version try: slave_version = int(msg[2]) except (IndexError, ValueError): self.logger.error("Invalid message from <%s> '%s'", hostname, msg) return self.logger.info("%s => %s", hostname, action) if slave_version != PROTOCOL_VERSION: self.logger.error("<%s> using protocol v%d while master is using v%d", hostname, slave_version, PROTOCOL_VERSION) return send_multipart_u(self.controler, [hostname, 'HELLO_OK']) # If the dispatcher is known and sent an HELLO, means that # the slave has restarted if hostname in self.dispatchers: if action == 'HELLO': self.logger.warning("Dispatcher <%s> has RESTARTED", hostname) else: # Assume the HELLO command was received, and the # action succeeded. self.logger.warning("Dispatcher <%s> was not confirmed", hostname) else: # No dispatcher, treat HELLO and HELLO_RETRY as a normal HELLO # message. self.logger.warning("New dispatcher <%s>", hostname) self.dispatchers[hostname] = SlaveDispatcher(hostname) # Mark the dispatcher as alive self.dispatcher_alive(hostname) def _handle_ping(self, hostname, action, msg): # pylint: disable=unused-argument self.logger.debug("%s => PING(%d)", hostname, PING_INTERVAL) # Send back a signal send_multipart_u(self.controler, [hostname, 'PONG', str(PING_INTERVAL)]) self.dispatcher_alive(hostname) def _handle_start_ok(self, hostname, action, msg): # pylint: disable=unused-argument try: job_id = int(msg[2]) except (IndexError, ValueError): self.logger.error("Invalid message from <%s> '%s'", hostname, msg) return self.logger.info("[%d] %s => START_OK", job_id, hostname) try: with transaction.atomic(): # TODO: find a way to lock actual_device job = TestJob.objects.select_for_update() \ .get(id=job_id) job.go_state_running() job.save() except TestJob.DoesNotExist: self.logger.error("[%d] Unknown job", job_id) else: self.dispatcher_alive(hostname) def export_definition(self, job): # pylint: disable=no-self-use job_def = yaml.load(job.definition) job_def['compatibility'] = job.pipeline_compatibility # no need for the dispatcher to retain comments return yaml.dump(job_def) def save_job_config(self, job, worker, device_cfg, options): output_dir = job.output_dir mkdir(output_dir) with open(os.path.join(output_dir, "job.yaml"), "w") as f_out: f_out.write(self.export_definition(job)) with contextlib.suppress(IOError): shutil.copy(options["env"], os.path.join(output_dir, "env.yaml")) with contextlib.suppress(IOError): shutil.copy(options["env_dut"], os.path.join(output_dir, "env.dut.yaml")) with contextlib.suppress(IOError): shutil.copy(os.path.join(options["dispatchers_config"], "%s.yaml" % worker.hostname), os.path.join(output_dir, "dispatcher.yaml")) with open(os.path.join(output_dir, "device.yaml"), "w") as f_out: yaml.dump(device_cfg, f_out) def start_job(self, job, options): # Load job definition to get the variables for template # rendering job_def = yaml.load(job.definition) job_ctx = job_def.get('context', {}) device = job.actual_device worker = device.worker_host # Load configurations env_str = load_optional_yaml_file(options['env']) env_dut_str = load_optional_yaml_file(options['env_dut']) device_cfg = device.load_configuration(job_ctx) dispatcher_cfg_file = os.path.join(options['dispatchers_config'], "%s.yaml" % worker.hostname) dispatcher_cfg = load_optional_yaml_file(dispatcher_cfg_file) self.save_job_config(job, worker, device_cfg, options) self.logger.info("[%d] START => %s (%s)", job.id, worker.hostname, device.hostname) send_multipart_u(self.controler, [worker.hostname, 'START', str(job.id), self.export_definition(job), yaml.dump(device_cfg), dispatcher_cfg, env_str, env_dut_str]) # For multinode jobs, start the dynamic connections parent = job for sub_job in job.sub_jobs_list: if sub_job == parent or not sub_job.dynamic_connection: continue # inherit only enough configuration for dynamic_connection operation self.logger.info("[%d] Trimming dynamic connection device configuration.", sub_job.id) min_device_cfg = parent.actual_device.minimise_configuration(device_cfg) self.save_job_config(sub_job, worker, min_device_cfg, options) self.logger.info("[%d] START => %s (connection)", sub_job.id, worker.hostname) send_multipart_u(self.controler, [worker.hostname, 'START', str(sub_job.id), self.export_definition(sub_job), yaml.dump(min_device_cfg), dispatcher_cfg, env_str, env_dut_str]) def start_jobs(self, options): """ Loop on all scheduled jobs and send the START message to the slave. """ # make the request atomic query = TestJob.objects.select_for_update() # Only select test job that are ready query = query.filter(state=TestJob.STATE_SCHEDULED) # Only start jobs on online workers query = query.filter(actual_device__worker_host__state=Worker.STATE_ONLINE) # exclude test job without a device: they are special test jobs like # dynamic connection. query = query.exclude(actual_device=None) # TODO: find a way to lock actual_device # Loop on all jobs for job in query: msg = None try: self.start_job(job, options) except jinja2.TemplateNotFound as exc: self.logger.error("[%d] Template not found: '%s'", job.id, exc.message) msg = "Template not found: '%s'" % exc.message except jinja2.TemplateSyntaxError as exc: self.logger.error("[%d] Template syntax error in '%s', line %d: %s", job.id, exc.name, exc.lineno, exc.message) msg = "Template syntax error in '%s', line %d: %s" % (exc.name, exc.lineno, exc.message) except IOError as exc: self.logger.error("[%d] Unable to read '%s': %s", job.id, exc.filename, exc.strerror) msg = "Cannot open '%s': %s" % (exc.filename, exc.strerror) except yaml.YAMLError as exc: self.logger.error("[%d] Unable to parse job definition: %s", job.id, exc) msg = "Cannot parse job definition: %s" % exc if msg: # Add the error as lava.job result metadata = {"case": "job", "definition": "lava", "error_type": "Infrastructure", "error_msg": msg, "result": "fail"} suite, _ = TestSuite.objects.get_or_create(name="lava", job=job) TestCase.objects.create(name="job", suite=suite, result=TestCase.RESULT_FAIL, metadata=yaml.dump(metadata)) job.go_state_finished(TestJob.HEALTH_INCOMPLETE, True) job.save() def cancel_jobs(self, partial=False): query = TestJob.objects.filter(state=TestJob.STATE_CANCELING) if partial: query = query.filter(id__in=list(self.events["canceling"])) for job in query: worker = job.lookup_worker if job.dynamic_connection else job.actual_device.worker_host self.logger.info("[%d] CANCEL => %s", job.id, worker.hostname) send_multipart_u(self.controler, [worker.hostname, 'CANCEL', str(job.id)]) def handle(self, *args, **options): # Initialize logging. self.setup_logging("lava-master", options["level"], options["log_file"], FORMAT) self.logger.info("[INIT] Dropping privileges") if not self.drop_privileges(options['user'], options['group']): self.logger.error("[INIT] Unable to drop privileges") return self.logger.info("[INIT] Marking all workers as offline") with transaction.atomic(): for worker in Worker.objects.select_for_update().all(): worker.go_state_offline() worker.save() # Create the sockets context = zmq.Context() self.controler = context.socket(zmq.ROUTER) self.event_socket = context.socket(zmq.SUB) if options['ipv6']: self.logger.info("[INIT] Enabling IPv6") self.controler.setsockopt(zmq.IPV6, 1) self.event_socket.setsockopt(zmq.IPV6, 1) if options['encrypt']: self.logger.info("[INIT] Starting encryption") try: self.auth = ThreadAuthenticator(context) self.auth.start() self.logger.debug("[INIT] Opening master certificate: %s", options['master_cert']) master_public, master_secret = zmq.auth.load_certificate(options['master_cert']) self.logger.debug("[INIT] Using slaves certificates from: %s", options['slaves_certs']) self.auth.configure_curve(domain='*', location=options['slaves_certs']) except IOError as err: self.logger.error(err) self.auth.stop() return self.controler.curve_publickey = master_public self.controler.curve_secretkey = master_secret self.controler.curve_server = True self.logger.debug("[INIT] Watching %s", options["slaves_certs"]) self.inotify_fd = watch_directory(options["slaves_certs"]) if self.inotify_fd is None: self.logger.error("[INIT] Unable to start inotify") self.controler.setsockopt(zmq.IDENTITY, b"master") # From http://api.zeromq.org/4-2:zmq-setsockopt#toc42 # "If two clients use the same identity when connecting to a ROUTER # [...] the ROUTER socket shall hand-over the connection to the new # client and disconnect the existing one." self.controler.setsockopt(zmq.ROUTER_HANDOVER, 1) self.controler.bind(options['master_socket']) self.event_socket.setsockopt(zmq.SUBSCRIBE, b(settings.EVENT_TOPIC)) self.event_socket.connect(options['event_url']) # Poll on the sockets. This allow to have a # nice timeout along with polling. self.poller = zmq.Poller() self.poller.register(self.controler, zmq.POLLIN) self.poller.register(self.event_socket, zmq.POLLIN) if self.inotify_fd is not None: self.poller.register(os.fdopen(self.inotify_fd), zmq.POLLIN) # Translate signals into zmq messages (self.pipe_r, _) = self.setup_zmq_signal_handler() self.poller.register(self.pipe_r, zmq.POLLIN) self.logger.info("[INIT] LAVA master has started.") self.logger.info("[INIT] Using protocol version %d", PROTOCOL_VERSION) try: self.main_loop(options) except BaseException as exc: self.logger.error("[CLOSE] Unknown exception raised, leaving!") self.logger.exception(exc) finally: # Drop controler socket: the protocol does handle lost messages self.logger.info("[CLOSE] Closing the controler socket and dropping messages") self.controler.close(linger=0) self.event_socket.close(linger=0) if options['encrypt']: self.auth.stop() context.term() def main_loop(self, options): last_schedule = last_dispatcher_check = time.time() while True: try: try: # Compute the timeout now = time.time() timeout = min(SCHEDULE_INTERVAL - (now - last_schedule), PING_INTERVAL - (now - last_dispatcher_check)) # If some actions are remaining, decrease the timeout if self.events["canceling"]: timeout = min(timeout, 1) # Wait at least for 1ms timeout = max(timeout * 1000, 1) # Wait for data or a timeout sockets = dict(self.poller.poll(timeout)) except zmq.error.ZMQError: continue if sockets.get(self.pipe_r) == zmq.POLLIN: self.logger.info("[POLL] Received a signal, leaving") break # Command socket if sockets.get(self.controler) == zmq.POLLIN: while self.controler_socket(): # Unqueue all pending messages pass # Events socket if sockets.get(self.event_socket) == zmq.POLLIN: while self.read_event_socket(): # Unqueue all pending messages pass # Wait for the next iteration to handle the event. # In fact, the code that generated the event (lava-logs or # lava-server-gunicorn) needs some time to commit the # database transaction. # If we are too fast, the database object won't be # available (or in the right state) yet. continue # Inotify socket if sockets.get(self.inotify_fd) == zmq.POLLIN: os.read(self.inotify_fd, 4096) self.logger.debug("[AUTH] Reloading certificates from %s", options['slaves_certs']) self.auth.configure_curve(domain='*', location=options['slaves_certs']) # Check dispatchers status now = time.time() if now - last_dispatcher_check > PING_INTERVAL: for hostname, dispatcher in self.dispatchers.items(): if dispatcher.online and now - dispatcher.last_msg > DISPATCHER_TIMEOUT: if hostname == "lava-logs": self.logger.error("[STATE] lava-logs goes OFFLINE") else: self.logger.error("[STATE] Dispatcher <%s> goes OFFLINE", hostname) self.dispatchers[hostname].go_offline() last_dispatcher_check = now # Limit accesses to the database. This will also limit the rate of # CANCEL and START messages if time.time() - last_schedule > SCHEDULE_INTERVAL: if self.dispatchers["lava-logs"].online: schedule(self.logger) # Dispatch scheduled jobs with transaction.atomic(): self.start_jobs(options) else: self.logger.warning("lava-logs is offline: can't schedule jobs") # Handle canceling jobs self.cancel_jobs() # Do not count the time taken to schedule jobs last_schedule = time.time() else: # Cancel the jobs and remove the jobs from the set if self.events["canceling"]: self.cancel_jobs(partial=True) self.events["canceling"] = set() except (OperationalError, InterfaceError): self.logger.info("[RESET] database connection reset.") # Closing the database connection will force Django to reopen # the connection connection.close() time.sleep(2)
class Command(LAVADaemonCommand): help = "LAVA log recorder" logger = None default_logfile = "/var/log/lava-server/lava-logs.log" def __init__(self, *args, **options): super().__init__(*args, **options) self.logger = logging.getLogger("lava-logs") self.log_socket = None self.auth = None self.controler = None self.inotify_fd = None self.pipe_r = None self.poller = None self.cert_dir_path = None # List of logs self.jobs = {} # Keep test cases in memory self.test_cases = [] # Master status self.last_ping = 0 self.ping_interval = TIMEOUT def add_arguments(self, parser): super().add_arguments(parser) net = parser.add_argument_group("network") net.add_argument('--socket', default='tcp://*:5555', help="Socket waiting for logs. Default: tcp://*:5555") net.add_argument('--master-socket', default='tcp://localhost:5556', help="Socket for master-slave communication. Default: tcp://localhost:5556") net.add_argument('--ipv6', default=False, action='store_true', help="Enable IPv6 on the listening sockets") net.add_argument('--encrypt', default=False, action='store_true', help="Encrypt messages") net.add_argument('--master-cert', default='/etc/lava-dispatcher/certificates.d/master.key_secret', help="Certificate for the master socket") net.add_argument('--slaves-certs', default='/etc/lava-dispatcher/certificates.d', help="Directory for slaves certificates") def handle(self, *args, **options): # Initialize logging. self.setup_logging("lava-logs", options["level"], options["log_file"], FORMAT) self.logger.info("[INIT] Dropping privileges") if not self.drop_privileges(options['user'], options['group']): self.logger.error("[INIT] Unable to drop privileges") return filename = os.path.join(settings.MEDIA_ROOT, 'lava-logs-config.yaml') self.logger.debug("[INIT] Dumping config to %s", filename) with open(filename, 'w') as output: yaml.dump(options, output) # Create the sockets context = zmq.Context() self.log_socket = context.socket(zmq.PULL) self.controler = context.socket(zmq.ROUTER) self.controler.setsockopt(zmq.IDENTITY, b"lava-logs") # Limit the number of messages in the queue self.controler.setsockopt(zmq.SNDHWM, 2) # From http://api.zeromq.org/4-2:zmq-setsockopt#toc5 # "Immediately readies that connection for data transfer with the master" self.controler.setsockopt(zmq.CONNECT_RID, b"master") if options['ipv6']: self.logger.info("[INIT] Enabling IPv6") self.log_socket.setsockopt(zmq.IPV6, 1) self.controler.setsockopt(zmq.IPV6, 1) if options['encrypt']: self.logger.info("[INIT] Starting encryption") try: self.auth = ThreadAuthenticator(context) self.auth.start() self.logger.debug("[INIT] Opening master certificate: %s", options['master_cert']) master_public, master_secret = zmq.auth.load_certificate(options['master_cert']) self.logger.debug("[INIT] Using slaves certificates from: %s", options['slaves_certs']) self.auth.configure_curve(domain='*', location=options['slaves_certs']) except OSError as err: self.logger.error("[INIT] %s", err) self.auth.stop() return self.log_socket.curve_publickey = master_public self.log_socket.curve_secretkey = master_secret self.log_socket.curve_server = True self.controler.curve_publickey = master_public self.controler.curve_secretkey = master_secret self.controler.curve_serverkey = master_public self.logger.debug("[INIT] Watching %s", options["slaves_certs"]) self.cert_dir_path = options["slaves_certs"] self.inotify_fd = watch_directory(options["slaves_certs"]) if self.inotify_fd is None: self.logger.error("[INIT] Unable to start inotify") self.log_socket.bind(options['socket']) self.controler.connect(options['master_socket']) # Poll on the sockets. This allow to have a # nice timeout along with polling. self.poller = zmq.Poller() self.poller.register(self.log_socket, zmq.POLLIN) self.poller.register(self.controler, zmq.POLLIN) if self.inotify_fd is not None: self.poller.register(os.fdopen(self.inotify_fd), zmq.POLLIN) # Translate signals into zmq messages (self.pipe_r, _) = self.setup_zmq_signal_handler() self.poller.register(self.pipe_r, zmq.POLLIN) self.logger.info("[INIT] listening for logs") # PING right now: the master is waiting for this message to start # scheduling. self.controler.send_multipart([b"master", b"PING"]) try: self.main_loop() except BaseException as exc: self.logger.error("[EXIT] Unknown exception raised, leaving!") self.logger.exception(exc) # Close the controler socket self.controler.close(linger=0) self.poller.unregister(self.controler) # Carefully close the logging socket as we don't want to lose messages self.logger.info("[EXIT] Disconnect logging socket and process messages") endpoint = u(self.log_socket.getsockopt(zmq.LAST_ENDPOINT)) self.logger.debug("[EXIT] unbinding from '%s'", endpoint) self.log_socket.unbind(endpoint) # Empty the queue try: while self.wait_for_messages(True): # Flush test cases cache for every iteration because we might # get killed soon. self.flush_test_cases() except BaseException as exc: self.logger.error("[EXIT] Unknown exception raised, leaving!") self.logger.exception(exc) finally: # Last flush self.flush_test_cases() self.logger.info("[EXIT] Closing the logging socket: the queue is empty") self.log_socket.close() if options['encrypt']: self.auth.stop() context.term() def flush_test_cases(self): if not self.test_cases: return # Try to save into the database try: TestCase.objects.bulk_create(self.test_cases) self.logger.info("Saving %d test cases", len(self.test_cases)) self.test_cases = [] except DatabaseError as exc: self.logger.error("Unable to flush the test cases") self.logger.exception(exc) self.logger.warning("Saving test cases one by one and dropping the faulty ones") saved = 0 for tc in self.test_cases: with contextlib.suppress(DatabaseError): tc.save() saved += 1 self.logger.info("%d test cases saved, %d dropped", saved, len(self.test_cases) - saved) self.test_cases = [] def main_loop(self): last_gc = time.time() last_bulk_create = time.time() # Wait for messages # TODO: fix timeout computation while self.wait_for_messages(False): now = time.time() # Dump TestCase into the database if now - last_bulk_create > BULK_CREATE_TIMEOUT: last_bulk_create = now self.flush_test_cases() # Close old file handlers if now - last_gc > FD_TIMEOUT: last_gc = now # Iterate while removing keys is not compatible with iterator for job_id in list(self.jobs.keys()): # pylint: disable=consider-iterating-dictionary if now - self.jobs[job_id].last_usage > FD_TIMEOUT: self.logger.info("[%s] closing log file", job_id) self.jobs[job_id].close() del self.jobs[job_id] # Ping the master if now - self.last_ping > self.ping_interval: self.logger.debug("PING => master") self.last_ping = now self.controler.send_multipart([b"master", b"PING"]) def wait_for_messages(self, leaving): try: try: sockets = dict(self.poller.poll(TIMEOUT * 1000)) except zmq.error.ZMQError as exc: self.logger.error("[POLL] zmq error: %s", str(exc)) return True # Messages if sockets.get(self.log_socket) == zmq.POLLIN: self.logging_socket() return True # Signals elif sockets.get(self.pipe_r) == zmq.POLLIN: # remove the message from the queue os.read(self.pipe_r, 1) if not leaving: self.logger.info("[POLL] received a signal, leaving") return False else: self.logger.warning("[POLL] signal already handled, please wait for the process to exit") return True # Pong received elif sockets.get(self.controler) == zmq.POLLIN: self.controler_socket() return True # Inotify socket if sockets.get(self.inotify_fd) == zmq.POLLIN: os.read(self.inotify_fd, 4096) self.logger.debug("[AUTH] Reloading certificates from %s", self.cert_dir_path) self.auth.configure_curve(domain='*', location=self.cert_dir_path) # Nothing received else: return not leaving except (OperationalError, InterfaceError): self.logger.info("[RESET] database connection reset") connection.close() return True def logging_socket(self): msg = self.log_socket.recv_multipart() try: (job_id, message) = (u(m) for m in msg) # pylint: disable=unbalanced-tuple-unpacking except ValueError: # do not let a bad message stop the master. self.logger.error("[POLL] failed to parse log message, skipping: %s", msg) return try: scanned = yaml.load(message, Loader=yaml.CLoader) except yaml.YAMLError: self.logger.error("[%s] data are not valid YAML, dropping", job_id) return # Look for "results" level try: message_lvl = scanned["lvl"] message_msg = scanned["msg"] except TypeError: self.logger.error("[%s] not a dictionary, dropping", job_id) return except KeyError: self.logger.error( "[%s] invalid log line, missing \"lvl\" or \"msg\" keys: %s", job_id, message) return # Find the handler (if available) if job_id not in self.jobs: # Query the database for the job try: job = TestJob.objects.get(id=job_id) except TestJob.DoesNotExist: self.logger.error("[%s] unknown job id", job_id) return self.logger.info("[%s] receiving logs from a new job", job_id) # Create the sub directories (if needed) mkdir(job.output_dir) self.jobs[job_id] = JobHandler(job) # For 'event', send an event and log as 'debug' if message_lvl == 'event': self.logger.debug("[%s] event: %s", job_id, message_msg) send_event(".event", "lavaserver", {"message": message_msg, "job": job_id}) message_lvl = "debug" # For 'marker', save in the database and log as 'debug' elif message_lvl == 'marker': # TODO: save on the file system in case of lava-logs restart m_type = message_msg.get("type") case = message_msg.get("case") if m_type is None or case is None: self.logger.error("[%s] invalid marker: %s", job_id, message_msg) return self.jobs[job_id].markers.setdefault(case, {})[m_type] = self.jobs[job_id].line_count() # This is in fact the previous line self.jobs[job_id].markers[case][m_type] -= 1 self.logger.debug("[%s] marker: %s line: %s", job_id, message_msg, self.jobs[job_id].markers[case][m_type]) return # Mark the file handler as used self.jobs[job_id].last_usage = time.time() # The format is a list of dictionaries self.jobs[job_id].write("- %s" % message) if message_lvl == "results": try: job = TestJob.objects.get(pk=job_id) except TestJob.DoesNotExist: self.logger.error("[%s] unknown job id", job_id) return meta_filename = create_metadata_store(message_msg, job) new_test_case = map_scanned_results(results=message_msg, job=job, markers=self.jobs[job_id].markers, meta_filename=meta_filename) if new_test_case is None: self.logger.warning( "[%s] unable to map scanned results: %s", job_id, message) else: self.test_cases.append(new_test_case) # Look for lava.job result if message_msg.get("definition") == "lava" and message_msg.get("case") == "job": # Flush cached test cases self.flush_test_cases() if message_msg.get("result") == "pass": health = TestJob.HEALTH_COMPLETE health_msg = "Complete" else: health = TestJob.HEALTH_INCOMPLETE health_msg = "Incomplete" self.logger.info("[%s] job status: %s", job_id, health_msg) infrastructure_error = (message_msg.get("error_type") in ["Bug", "Configuration", "Infrastructure"]) if infrastructure_error: self.logger.info("[%s] Infrastructure error", job_id) # Update status. with transaction.atomic(): # TODO: find a way to lock actual_device job = TestJob.objects.select_for_update() \ .get(id=job_id) job.go_state_finished(health, infrastructure_error) job.save() # n.b. logging here would produce a log entry for every message in every job. def controler_socket(self): msg = self.controler.recv_multipart() try: master_id = u(msg[0]) action = u(msg[1]) ping_interval = int(msg[2]) if master_id != "master": self.logger.error("Invalid master id '%s'. Should be 'master'", master_id) return if action != "PONG": self.logger.error("Invalid answer '%s'. Should be 'PONG'", action) return except (IndexError, ValueError): self.logger.error("Invalid message '%s'", msg) return if ping_interval < TIMEOUT: self.logger.error("invalid ping interval (%d) too small", ping_interval) return self.logger.debug("master => PONG(%d)", ping_interval) self.ping_interval = ping_interval
class RpcServer: """""" def __init__(self): """ Constructor """ # Save functions dict: key is fuction name, value is fuction object self.__functions: Dict[str, Any] = {} # Zmq port related self.__context: zmq.Context = zmq.Context() # Reply socket (Request–reply pattern) self.__socket_rep: zmq.Socket = self.__context.socket(zmq.REP) # Publish socket (Publish–subscribe pattern) self.__socket_pub: zmq.Socket = self.__context.socket(zmq.PUB) # Worker thread related self.__active: bool = False # RpcServer status self.__thread: threading.Thread = None # RpcServer thread self.__lock: threading.Lock = threading.Lock() # Authenticator used to ensure data security self.__authenticator: ThreadAuthenticator = None def is_active(self) -> bool: """""" return self.__active def start(self, rep_address: str, pub_address: str, server_secretkey_path: str = "", username: str = "", password: str = "") -> None: """ Start RpcServer """ if self.__active: return # Start authenticator if server_secretkey_path: self.__authenticator = ThreadAuthenticator(self.__context) self.__authenticator.start() self.__authenticator.configure_curve( domain="*", location=zmq.auth.CURVE_ALLOW_ANY) publickey, secretkey = zmq.auth.load_certificate( server_secretkey_path) self.__socket_pub.curve_secretkey = secretkey self.__socket_pub.curve_publickey = publickey self.__socket_pub.curve_server = True self.__socket_rep.curve_secretkey = secretkey self.__socket_rep.curve_publickey = publickey self.__socket_rep.curve_server = True elif username and password: self.__authenticator = ThreadAuthenticator(self.__context) self.__authenticator.start() self.__authenticator.configure_plain( domain="*", passwords={username: password}) self.__socket_pub.plain_server = True self.__socket_rep.plain_server = True # Bind socket address self.__socket_rep.bind(rep_address) self.__socket_pub.bind(pub_address) # Start RpcServer status self.__active = True # Start RpcServer thread self.__thread = threading.Thread(target=self.run) self.__thread.start() def stop(self) -> None: """ Stop RpcServer """ if not self.__active: return # Stop RpcServer status self.__active = False def join(self) -> None: # Wait for RpcServer thread to exit if self.__thread and self.__thread.is_alive(): self.__thread.join() self.__thread = None def run(self) -> None: """ Run RpcServer functions """ start = datetime.utcnow() while self.__active: # Use poll to wait event arrival, waiting time is 1 second (1000 milliseconds) cur = datetime.utcnow() delta = cur - start if delta >= KEEP_ALIVE_INTERVAL: self.publish(KEEP_ALIVE_TOPIC, cur) if not self.__socket_rep.poll(1000): continue # Receive request data from Reply socket req = self.__socket_rep.recv_pyobj() # Get function name and parameters name, args, kwargs = req # Try to get and execute callable function object; capture exception information if it fails try: func = self.__functions[name] r = func(*args, **kwargs) rep = [True, r] except Exception as e: # noqa rep = [False, traceback.format_exc()] # send callable response by Reply socket self.__socket_rep.send_pyobj(rep) # Unbind socket address self.__socket_pub.unbind(self.__socket_pub.LAST_ENDPOINT) self.__socket_rep.unbind(self.__socket_rep.LAST_ENDPOINT) def publish(self, topic: str, data: Any) -> None: """ Publish data """ with self.__lock: self.__socket_pub.send_pyobj([topic, data]) def register(self, func: Callable) -> None: """ Register function """ self.__functions[func.__name__] = func
def __init__(self, gModel, gModelName, dName, qName, sysArgv): ''' Constructor :param dName: device type name :type dName: str :param qName: qualified name of the device instance: 'actor.inst' :type qName: str ''' self.logger = logging.getLogger(__name__) self.inst_ = self self.appName = gModel["name"] self.modelName = gModelName aName,iName = qName.split('.') self.name = qName self.iName = iName self.dName = dName self.pid = os.getpid() self.uuid = None self.suffix = "" self.setupIfaces() # Assumption : pid is a 4 byte int self.actorID = ipaddress.IPv4Address(self.globalHost).packed + self.pid.to_bytes(4, 'big') if dName not in gModel["devices"]: raise BuildError('Device "%s" unknown' % dName) # In order to make the rest of the code work, we build an actor model for the device devModel = gModel["devices"][dName] self.model = {} # The made-up actor model formals = devModel["formals"] # Formals are the same as those of the device (component) self.model["formals"] = formals devInst = { "type": dName } # There is a single instance, containing the device component actuals = [] for arg in formals: name = arg["name"] actual = {} actual["name"] = name actual["param"] = name actuals.append(actual) devInst["actuals"] = actuals self.model["instances"] = { iName: devInst} # Single instance (under iName) aModel = gModel["actors"][aName] self.model["locals"] = aModel["locals"] # Locals self.model["internals"] = aModel["internals"] # Internals self.INT_RE = re.compile(r"^[-]?\d+$") self.parseParams(sysArgv) # Use czmq's context czmq_ctx = Zsys.init() self.context = zmq.Context.shadow(czmq_ctx.value) Zsys.handler_reset() # Reset previous signal # Context for app sockets self.appContext = zmq.Context() if Config.SECURITY: (self.public_key, self.private_key) = zmq.auth.load_certificate(const.appCertFile) _public = zmq.curve_public(self.private_key) if(self.public_key != _public): self.logger.error("bad security key(s)") raise BuildError("invalid security key(s)") hosts = ['127.0.0.1'] try: with open(const.appDescFile, 'r') as f: content = yaml.load(f, Loader=yaml.Loader) hosts += content.hosts except: self.logger.error("Error loading app descriptor:s", str(sys.exc_info()[1])) self.auth = ThreadAuthenticator(self.appContext) self.auth.start() self.auth.allow(*hosts) self.auth.configure_curve(domain='*', location=zmq.auth.CURVE_ALLOW_ANY) else: (self.public_key, self.private_key) = (None, None) self.auth = None self.appContext = self.context try: if os.path.isfile(const.logConfFile) and os.access(const.logConfFile, os.R_OK): spdlog_setup.from_file(const.logConfFile) except Exception as e: self.logger.error("error while configuring componentLogger: %s" % repr(e)) messages = gModel["messages"] # Global message types (global on the network) self.messageNames = [] for messageSpec in messages: self.messageNames.append(messageSpec["name"]) locals_ = self.model["locals"] # Local message types (local to the host) self.localNames = [] for messageSpec in locals_: self.localNames.append(messageSpec["type"]) internals = self.model["internals"] # Internal message types (internal to the actor process) self.internalNames = [] for messageSpec in internals: self.internalNames.append(messageSpec["type"]) groups = gModel["groups"] self.groupTypes = {} for group in groups: self.groupTypes[group["name"]] = { "kind": group["kind"], "message": group["message"], "timed": group["timed"] } self.components = {} instSpecs = self.model["instances"] _compSpecs = gModel["components"] devSpecs = gModel["devices"] for instName in instSpecs: # Create the component instances: the 'parts' instSpec = instSpecs[instName] instType = instSpec['type'] if instType in devSpecs: typeSpec = devSpecs[instType] else: raise BuildError('Device type "%s" for instance "%s" is undefined' % (instType, instName)) instFormals = typeSpec['formals'] instActuals = instSpec['actuals'] instArgs = self.buildInstArgs(instName, instFormals, instActuals) # Check whether the component is C++ component ccComponentFile = 'lib' + instType.lower() + '.so' ccComp = os.path.isfile(ccComponentFile) try: if ccComp: modObj = importlib.import_module('lib' + instType.lower()) self.components[instName] = modObj.create_component_py(self, self.model, typeSpec, instName, instType, instArgs, self.appName, self.name, groups) else: self.components[instName] = Part(self, typeSpec, instName, instType, instArgs) except Exception as e: traceback.print_exc() self.logger.error("Error while constructing part '%s.%s': %s" % (instType, instName, str(e)))
class Device(Actor): ''' The actor class implements all the management and control functions over its components ''' def __init__(self, gModel, gModelName, dName, qName, sysArgv): ''' Constructor :param dName: device type name :type dName: str :param qName: qualified name of the device instance: 'actor.inst' :type qName: str ''' self.logger = logging.getLogger(__name__) self.inst_ = self self.appName = gModel["name"] self.modelName = gModelName aName,iName = qName.split('.') self.name = qName self.iName = iName self.dName = dName self.pid = os.getpid() self.uuid = None self.suffix = "" self.setupIfaces() # Assumption : pid is a 4 byte int self.actorID = ipaddress.IPv4Address(self.globalHost).packed + self.pid.to_bytes(4, 'big') if dName not in gModel["devices"]: raise BuildError('Device "%s" unknown' % dName) # In order to make the rest of the code work, we build an actor model for the device devModel = gModel["devices"][dName] self.model = {} # The made-up actor model formals = devModel["formals"] # Formals are the same as those of the device (component) self.model["formals"] = formals devInst = { "type": dName } # There is a single instance, containing the device component actuals = [] for arg in formals: name = arg["name"] actual = {} actual["name"] = name actual["param"] = name actuals.append(actual) devInst["actuals"] = actuals self.model["instances"] = { iName: devInst} # Single instance (under iName) aModel = gModel["actors"][aName] self.model["locals"] = aModel["locals"] # Locals self.model["internals"] = aModel["internals"] # Internals self.INT_RE = re.compile(r"^[-]?\d+$") self.parseParams(sysArgv) # Use czmq's context czmq_ctx = Zsys.init() self.context = zmq.Context.shadow(czmq_ctx.value) Zsys.handler_reset() # Reset previous signal # Context for app sockets self.appContext = zmq.Context() if Config.SECURITY: (self.public_key, self.private_key) = zmq.auth.load_certificate(const.appCertFile) _public = zmq.curve_public(self.private_key) if(self.public_key != _public): self.logger.error("bad security key(s)") raise BuildError("invalid security key(s)") hosts = ['127.0.0.1'] try: with open(const.appDescFile, 'r') as f: content = yaml.load(f, Loader=yaml.Loader) hosts += content.hosts except: self.logger.error("Error loading app descriptor:s", str(sys.exc_info()[1])) self.auth = ThreadAuthenticator(self.appContext) self.auth.start() self.auth.allow(*hosts) self.auth.configure_curve(domain='*', location=zmq.auth.CURVE_ALLOW_ANY) else: (self.public_key, self.private_key) = (None, None) self.auth = None self.appContext = self.context try: if os.path.isfile(const.logConfFile) and os.access(const.logConfFile, os.R_OK): spdlog_setup.from_file(const.logConfFile) except Exception as e: self.logger.error("error while configuring componentLogger: %s" % repr(e)) messages = gModel["messages"] # Global message types (global on the network) self.messageNames = [] for messageSpec in messages: self.messageNames.append(messageSpec["name"]) locals_ = self.model["locals"] # Local message types (local to the host) self.localNames = [] for messageSpec in locals_: self.localNames.append(messageSpec["type"]) internals = self.model["internals"] # Internal message types (internal to the actor process) self.internalNames = [] for messageSpec in internals: self.internalNames.append(messageSpec["type"]) groups = gModel["groups"] self.groupTypes = {} for group in groups: self.groupTypes[group["name"]] = { "kind": group["kind"], "message": group["message"], "timed": group["timed"] } self.components = {} instSpecs = self.model["instances"] _compSpecs = gModel["components"] devSpecs = gModel["devices"] for instName in instSpecs: # Create the component instances: the 'parts' instSpec = instSpecs[instName] instType = instSpec['type'] if instType in devSpecs: typeSpec = devSpecs[instType] else: raise BuildError('Device type "%s" for instance "%s" is undefined' % (instType, instName)) instFormals = typeSpec['formals'] instActuals = instSpec['actuals'] instArgs = self.buildInstArgs(instName, instFormals, instActuals) # Check whether the component is C++ component ccComponentFile = 'lib' + instType.lower() + '.so' ccComp = os.path.isfile(ccComponentFile) try: if ccComp: modObj = importlib.import_module('lib' + instType.lower()) self.components[instName] = modObj.create_component_py(self, self.model, typeSpec, instName, instType, instArgs, self.appName, self.name, groups) else: self.components[instName] = Part(self, typeSpec, instName, instType, instArgs) except Exception as e: traceback.print_exc() self.logger.error("Error while constructing part '%s.%s': %s" % (instType, instName, str(e))) def getPortMessageTypes(self, ports, key, kinds, res): for _name, spec in ports[key].items(): for kind in kinds: typeName = spec[kind] res.append({"type": typeName}) def getMessageTypes(self, devModel): res = [] ports = devModel["ports"] self.getPortMessageTypes(ports, "pubs", ["type"], res) self.getPortMessageTypes(ports, "subs", ["type"], res) self.getPortMessageTypes(ports, "reqs", ["req_type", "rep_type"], res) self.getPortMessageTypes(ports, "reps", ["req_type", "rep_type"], res) self.getPortMessageTypes(ports, "clts", ["req_type", "rep_type"], res) self.getPortMessageTypes(ports, "srvs", ["req_type", "rep_type"], res) self.getPortMessageTypes(ports, "qrys", ["req_type", "rep_type"], res) self.getPortMessageTypes(ports, "anss", ["req_type", "rep_type"], res) return res def isDevice(self): return True def setup(self): ''' Perform a setup operation on the actor (after the initial construction but before the activation of parts) ''' self.logger.info("setup") # self.setupIfaces() self.suffix = self.macAddress self.disco = DiscoClient(self, self.suffix) self.disco.start() # Start the discovery service client self.disco.registerActor() # Register this actor with the discovery service self.logger.info("device registered with disco") self.deplc = DeplClient(self, self.suffix) self.deplc.start() ok = self.deplc.registerActor() self.logger.info("device %s registered with depl" % ("is" if ok else "is not")) self.controls = { } self.controlMap = { } for inst in self.components: comp = self.components[inst] control = self.context.socket(zmq.PAIR) control.bind('inproc://part_' + inst + '_control') self.controls[inst] = control self.controlMap[id(control)] = comp if isinstance(comp, Part): self.components[inst].setup(control) else: self.components[inst].setup() def terminate(self): self.logger.info("terminating") for component in self.components.values(): component.terminate() # self.devc.terminate() self.disco.terminate() # Clean up everything # self.context.destroy() time.sleep(1.0) self.logger.info("terminated") os._exit(0)
class StratusApp(StratusServerApp): def __init__(self, core: StratusCore, **kwargs): StratusServerApp.__init__(self, core, **kwargs) self.logger = StratusLogger.getLogger() self.active = True self.parms = self.getConfigParms('stratus') self.client_address = self.parms.get("client_address", "*") self.request_port = self.parms.get("request_port", 4556) self.response_port = self.parms.get("response_port", 4557) self.active_handlers = {} self.getCertDirs() def getCertDirs( self ): # These directories are generated by the generate_certificates script self.cert_dir = self.parms.get("certificate_path", os.path.expanduser("~/.stratus/zmq")) self.logger.info( f"Loading certificates and keys from directory {self.cert_dir}") self.keys_dir = os.path.join(self.cert_dir, 'certificates') self.public_keys_dir = os.path.join(self.cert_dir, 'public_keys') self.secret_keys_dir = os.path.join(self.cert_dir, 'private_keys') if not (os.path.exists(self.keys_dir) and os.path.exists(self.public_keys_dir) and os.path.exists(self.secret_keys_dir)): from stratus.handlers.zeromq.security.generate_certificates import generate_certificates generate_certificates(self.cert_dir) def initSocket(self): try: server_secret_file = os.path.join(self.secret_keys_dir, "server.key_secret") server_public, server_secret = zmq.auth.load_certificate( server_secret_file) # TODO: this is commented to avoid key checking #self.request_socket.curve_secretkey = server_secret #self.request_socket.curve_publickey = server_public #self.request_socket.curve_server = True self.request_socket.bind("tcp://{}:{}".format( self.client_address, self.request_port)) self.logger.info( "@@STRATUS-APP --> Bound authenticated request socket to client at {} on port: {}" .format(self.client_address, self.request_port)) except Exception as err: self.logger.error( "@@STRATUS-APP: Error initializing request socket on {}, port {}: {}" .format(self.client_address, self.request_port, err)) self.logger.error(traceback.format_exc()) def addHandler(self, clientId, jobId, handler): self.active_handlers[clientId + "-" + jobId] = handler return handler def removeHandler(self, clientId, jobId): handlerId = clientId + "-" + jobId try: del self.active_handlers[handlerId] except: self.logger.error("Error removing handler: " + handlerId + ", active handlers = " + str(list(self.active_handlers.keys()))) def setExeStatus(self, submissionId: str, status: Status): self.responder.setExeStatus(submissionId, status) def sendResponseMessage(self, msg: StratusResponse) -> str: request_args = [msg.id, msg.message] packaged_msg = "!".join(request_args) timeStamp = datetime.datetime.now().strftime("MM/dd HH:mm:ss") self.logger.info( "@@STRATUS-APP: Sending response {} on request_socket @({}): {}". format(msg.id, timeStamp, str(msg))) self.request_socket.send_string(packaged_msg) return packaged_msg def initInteractions(self): try: self.zmqContext: zmq.Context = zmq.Context() self.auth = ThreadAuthenticator(self.zmqContext) self.auth.start() self.auth.allow("192.168.0.22") self.auth.allow(self.client_address) self.auth.configure_curve( domain='*', location=zmq.auth.CURVE_ALLOW_ANY ) # self.public_keys_dir ) # Use 'location=zmq.auth.CURVE_ALLOW_ANY' for stonehouse security self.request_socket: zmq.Socket = self.zmqContext.socket(zmq.REP) self.responder = StratusZMQResponder( self.zmqContext, self.response_port, client_address=self.client_address, certificate_path=self.cert_dir) self.initSocket() self.logger.info( "@@STRATUS-APP:Listening for requests on port: {}".format( self.request_port)) except Exception as err: self.logger.error( "@@STRATUS-APP: ------------------------------- StratusApp Init error: {} ------------------------------- " .format(err)) def processResults(self): completed_workflows = self.responder.processWorkflows( self.getWorkflows()) for rid in completed_workflows: self.clearWorkflow(rid) def processRequests(self): while self.request_socket.poll(0) != 0: request_header = self.request_socket.recv_string().strip().strip( "'") parts = request_header.split("!") submissionId = str(parts[0]) rType = str(parts[1]) request: Dict = json.loads(parts[2]) if len(parts) > 2 else "" try: self.logger.info( "@@STRATUS-APP: ### Processing {} request: {}".format( rType, request)) if rType == "capabilities": response = self.core.getCapabilities(request["type"]) self.sendResponseMessage( StratusResponse(submissionId, response)) elif rType == "exe": if len(parts) <= 2: raise Exception("Missing parameters to exe request") request["rid"] = submissionId self.logger.info( "Processing zmq Request: '{}' '{}' '{}'".format( submissionId, rType, str(request))) self.submitWorkflow( request) # TODO: Send results when tasks complete. response = {"status": "Executing"} self.sendResponseMessage( StratusResponse(submissionId, response)) elif rType == "quit" or rType == "shutdown": response = {"status": "Terminating"} self.sendResponseMessage( StratusResponse(submissionId, response)) self.logger.info( "@@STRATUS-APP: Received Shutdown Message") exit(0) else: msg = "@@STRATUS-APP: Unknown request type: " + rType self.logger.info(msg) response = {"status": "error", "error": msg} self.sendResponseMessage( StratusResponse(submissionId, response)) except Exception as ex: self.processError(submissionId, ex) def processError(self, rid: str, ex: Exception): tb = traceback.format_exc() self.logger.error("@@STRATUS-APP: Execution error: " + str(ex)) self.logger.error(tb) response = {"status": "error", "error": str(ex), "traceback": tb} self.sendResponseMessage(StratusResponse(rid, response)) def updateInteractions(self): self.processRequests() self.processResults() def term(self, msg): self.logger.info("@@STRATUS-APP: !!EDAS Shutdown: " + msg) self.active = False self.auth.stop() self.logger.info("@@STRATUS-APP: QUIT PythonWorkerPortal") try: self.request_socket.close() except Exception: pass self.logger.info("@@STRATUS-APP: CLOSE request_socket") self.responder.close_connection() self.logger.info("@@STRATUS-APP: TERM responder") self.shutdown() self.logger.info("@@STRATUS-APP: shutdown complete")
class FrankFancyStreamingInterface(object): """ Abstraction layer to the graph streamer as well as the central logger Uses direct (non encrypted) socket connection to the streaming server It uses an (encrypted) zeromq connection to the logger """ ConvertStatus = { "Cells" : { 0 : 5, #removing 1 : 4, #allocating 2 : 6 #blacklisting } } #TODO: give every scheduler an unique topic to easily distinguish between them on the queue def __init__(self, name, privatekey, VisualizerHost, root_id, ZeromqHost = "*", empty=False): """ Calls internal methods to open the connections to both the Active Live visualizer and the logger :param VisualizerHost: The ip of the FrankFancyGraphStreamer :type VisualizerHost: str :param ZeromqHost: which interface the zeromq service needs to bind too ("*" for all interfaces) :type ZeromqHost: str :param KeyFolder: The folder with all the keys, as generated by generate_certificates.py :type KeyFolder: str :param root_id: the root of the network: LBR :type root_id: str :return: """ self.Active = None self.Logger = None self.EventId = 0 self.Name = name #used as topic on the queue if not empty: if privatekey is not None: self._connectLogger(privatekey, Host=ZeromqHost) if VisualizerHost is not None: self._connectVisualizer(VisualizerHost, root_id) self.g = DoDAG(root_id, root_id) self.root_id = root_id def _connectVisualizer(self, Host, root_id): """ Connect to the Active Live Visualizer :param Host: The ip of the FrankFancyGraphStreamer :param root_id: the ip6 address of the root node of the network :return: """ try: logg.debug("Connecting Streaming Interface to Active Viewer") self.Active = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.Active.connect((Host, 600)) logg.debug("Sending to Active Viewer:{}".format(root_id)) self.Active.sendall(root_id) except: logg.debug("Connection to Active Viewer failed!") self.Active = None def _connectLogger(self, key, Host="localhost"): """ Open a zeromq queue with publisher service :param Host: which interface the zeromq service needs to bind too ("*" for all interfaces) :param key: privatekey file of the scheduler :return: """ #TODO: error handling on certificates missing and stuff #TODO: expose more security options such as white/blacklisting ips and domain filtering self.context = zmq.Context() self.auth = ThreadAuthenticator(self.context) self.auth.start() self.auth.configure_curve(domain='*', location=os.path.join("keys", "public")) self.Logger = self.context.socket(zmq.PUB) scheduler_public, scheduler_secret = zmq.auth.load_certificate(os.path.join("keys", "plexi1.key_secret")) self.Logger.curve_secretkey = scheduler_secret self.Logger.curve_publickey = scheduler_public self.Logger.curve_server = True self.Logger.bind("tcp://127.0.0.1:6000") # raw_input("Press enter when the logger has opened subscription to us") def SendActiveJson(self,data): """ Sends an object as json encoded to the Active Live Viewer :param data: the object to be send :return: """ if self.Active is not None: logg.debug("Sending json data to Active: " + json.dumps(data)) self.Active.sendall(json.dumps(data)) def PublishLogging(self,LoggingName="zmq.auth", root_topic="zmq.auth"): """ Publishes the given python logger to the publishing service :param LoggingName: Name of the python logger service :type LoggingName: str :param root_topic: the topic given with message. is appended with .<LEVEL> :type root_topic: str :return: """ handler = PUBHandler(self.Logger) handler.root_topic = root_topic handler.formatters[logging.DEBUG] = logging.Formatter(fmt='%(asctime)s\t%(levelname)s: %(message)s', datefmt='%H:%M:%S') handler.formatters[logging.INFO] = logging.Formatter(fmt='%(asctime)s\t%(levelname)s: %(message)s', datefmt='%H:%M:%S') l = logging.getLogger(LoggingName) l.addHandler(handler) def ChangeCell(self, who, slotoffs, channeloffs, frame, ID, status): """ Notifies all active services about the changes to a cell in the schedule matrix :param who: The node in which the cell is changed :type who: :class: `node.NodeID` :param slotoffs: slot offset :param channeloffs: channel offset :param frame: frame name :param ID: local cell id :param status: new status of the cell :return: """ if self.Active is not None: logg.debug("Sending ChangeCell to active viewer") self.Active.sendall(json.dumps(["changecell",{"who": str(who), "channeloffs":channeloffs, "slotoffs":slotoffs, "frame":frame, "id":ID, "status":status}])) if self.Logger is not None: self.EventId += 1 logg.debug("Sending ChangeCell to logger, EventID:" + str(self.EventId)) # self.Logger.send_multipart([self.Name.encode(), pickle.dumps({ # "EventId" : self.EventId, # "SubjectId" : self.ConvertStatus["Cells"][status], # "InfoString" : json.dumps({"who": who, "channeloffs":channeloffs, "slotoffs":slotoffs, "frame":frame, "id":ID}) # })]) self.Logger.send_multipart([self.Name.encode(), pickle.dumps(Event(self.EventId, self.ConvertStatus["Cells"][status], time.time(), json.dumps({"node_id": str(who), "channeloffs":channeloffs, "slotoffs":slotoffs, "frame":frame, "id":ID})))]) def DumpDotData(self, labels={}): """ dumps an entire dot file to the active viewer. This is not used for the logger :return: """ # packet = "[\"" + str(self.root_id) + " at " + time.strftime("%Y-%m-%d %H:%M:%S") + "\"," + json.dumps(dotdata) + "]" if self.Active is not None: logg.debug("Sending dotdata") # self.Active.sendall(bytearray("[\"" + root_id + " at " + time.strftime("%Y-%m-%d %H:%M:%S") + "\"," + dotdata + "]")) dotdata = self.g.draw_graph(labels=labels) self.Active.sendall(bytearray(json.dumps(["\"" + self.root_id + " at " + time.strftime("%Y-%m-%d %H:%M:%S") + "\"", dotdata]))) time.sleep(.5) def AddNode(self, node_id, parent): """ Sends a notification of joining node to the logger :param node_id: ip6 of the node :type node_id: str :param parent: ip6 of the parent node :type parent: str :return: """ node_id = str(node_id) parent = str(parent) if self.Logger is not None: self.EventId += 1 logg.debug("Sending Addnode to logger, EventID:" + str(self.EventId)) # self.Logger.send_multipart([self.Name.encode(), pickle.dumps({ # "EventId" : self.EventId, # "SubjectId" : 0, # "InfoString": json.dumps({"node_id" : node_id, "parent" : parent}) # })]) self.Logger.send_multipart([self.Name.encode(), pickle.dumps(Event(self.EventId, 0, time.time(), json.dumps({"node_id" : str(node_id), "parent" : str(parent)})))]) if self.Active is not None: logg.debug("Sending Addnode to Active Visualizer, node:{}, parent:{}".format(node_id, parent)) if parent == "root": self.g.attach_node(node_id) else: self.g.attach_child(node_id, parent) self.DumpDotData() def RewireNode(self, node_id, old_parent, new_parent): """ Notifies the logger of a rewire that happened in the network :param node_id: ip6 of the node that has rewired :param old_parent: ip6 of the old parent :param new_parent: ip6 of the new parent :return: """ node_id = str(node_id) old_parent = str(old_parent) new_parent = str(new_parent) if self.Logger is not None: self.EventId += 1 logg.debug("Sending RewireNode to logger, EventID: " + str(self.EventId)) # self.Logger.send_multipart([self.Name.encode(), pickle.dumps({ # "EventId" : self.EventId, # "SubjectId" : 2, # "InfoString": json.dumps({"node_id" : node_id, "old_parent" : old_parent, "new_parent" : new_parent}) # })]) self.Logger.send_multipart([self.Name.encode(), pickle.dumps(Event(self.EventId, 2, time.time(), json.dumps({"node_id" : str(node_id), "old_parent" : str(old_parent), "new_parent" : str(new_parent)})))]) if self.Active is not None: logg.debug("Sending Rewire to the Active") self.g.attach_child(node_id, new_parent) self.DumpDotData() def RemoveNode(self, node_id): """ Notifies the logger of a disconnected node :param node_id: ip6 of the node that has disconnected :return: """ node_id = str(node_id) if self.Logger is not None: self.EventId += 1 logg.debug("Sending RemoveNode to logger, EventID: " + str(self.EventId)) # self.Logger.send_multipart([self.Name.encode(), pickle.dumps({ # "EventId" : self.EventId, # "SubjectId" : 1, # "InfoString": json.dumps({"node_id" : node_id}) # })]) self.Logger.send_multipart([self.Name.encode(), pickle.dumps(Event(self.EventId, 1, time.time(), json.dumps({"node_id" : str(node_id)})))]) if self.Active is not None: self.g.detach_node(node_id) self.DumpDotData() def RegisterFrame(self, num_cells, framename): """ Notifies the logger of a new frame that is defined in the scheduler algorithm :param num_cells: number of cells per channel :param framename: unique identifieng name :return: """ if self.Logger is not None: self.EventId += 1 logg.debug("Sending RegisterFrame to logger, EventID: " + str(self.EventId)) self.Logger.send_multipart([self.Name.encode(), pickle.dumps(Event(self.EventId, 7, time.time(), json.dumps({"cells" : num_cells, "name" : framename})))]) def RegisterFrames(self, frames): if self.Active is not None: logg.debug("Sending RegisterFrames to Active") self.Active.sendall(bytearray(json.dumps(frames)))
class TaskQueue: """Outgoing task queue from the executor to the Interchange""" def __init__( self, address: str, port: int = 55001, identity: str = str(uuid.uuid4()), zmq_context=None, set_hwm=False, RCVTIMEO=None, SNDTIMEO=None, linger=None, ironhouse: bool = False, keys_dir: str = os.path.abspath(".curve"), mode: str = "client", ): """ Parameters ---------- address: str address to connect port: int Port to use identity : str Applies only to clients, where the identity must match the endpoint uuid. This will be utf-8 encoded on the wire. A random uuid4 string is set by default. mode: string Either 'client' or 'server' keys_dir : string Directory from which keys will be loaded for curve. ironhouse: Bool Only valid for server mode. Setting this flag switches the server to require client keys to be available on the server in the keys_dir. """ if zmq_context: self.context = zmq_context else: self.context = zmq.Context() self.mode = mode self.port = port self.ironhouse = ironhouse self.keys_dir = keys_dir assert self.mode in [ "client", "server", ], "Only two modes are supported: client, server" if self.mode == "server": print("Configuring server") self.zmq_socket = self.context.socket(zmq.ROUTER) self.zmq_socket.set(zmq.ROUTER_MANDATORY, 1) self.zmq_socket.set(zmq.ROUTER_HANDOVER, 1) print("Setting up auth-server") self.setup_server_auth() elif self.mode == "client": self.zmq_socket = self.context.socket(zmq.DEALER) self.setup_client_auth() self.zmq_socket.setsockopt(zmq.IDENTITY, identity.encode("utf-8")) else: raise ValueError( "TaskQueue must be initialized with mode set to 'server' or 'client'" ) if set_hwm: self.zmq_socket.set_hwm(0) if RCVTIMEO is not None: self.zmq_socket.setsockopt(zmq.RCVTIMEO, RCVTIMEO) if SNDTIMEO is not None: self.zmq_socket.setsockopt(zmq.SNDTIMEO, SNDTIMEO) if linger is not None: self.zmq_socket.setsockopt(zmq.LINGER, linger) # all zmq setsockopt calls must be done before bind/connect is called if self.mode == "server": self.zmq_socket.bind(f"tcp://*:{port}") elif self.mode == "client": self.zmq_socket.connect(f"tcp://{address}:{port}") self.poller = zmq.Poller() self.poller.register(self.zmq_socket) os.makedirs(self.keys_dir, exist_ok=True) log.debug(f"Initializing Taskqueue:{self.mode} on port:{self.port}") def zmq_context(self): return self.context def add_client_key(self, endpoint_id, client_key): log.info("Adding client key") if self.ironhouse: # Use the ironhouse ZMQ pattern: http://hintjens.com/blog:49#toc6 with open(os.path.join(self.keys_dir, f"{endpoint_id}.key"), "w") as f: f.write(client_key) try: self.auth.configure_curve(domain="*", location=self.keys_dir) except Exception: log.exception("Failed to load keys from {self.keys_dir}") return def setup_server_auth(self): # Start an authenticator for this context. self.auth = ThreadAuthenticator(self.context) self.auth.start() self.auth.allow("127.0.0.1") # Tell the authenticator how to handle CURVE requests if not self.ironhouse: # Use the stonehouse ZMQ pattern: http://hintjens.com/blog:49#toc5 self.auth.configure_curve(domain="*", location=zmq.auth.CURVE_ALLOW_ANY) server_secret_file = os.path.join(self.keys_dir, "server.key_secret") server_public, server_secret = zmq.auth.load_certificate( server_secret_file) self.zmq_socket.curve_secretkey = server_secret self.zmq_socket.curve_publickey = server_public self.zmq_socket.curve_server = True # must come before bind def setup_client_auth(self): # We need two certificates, one for the client and one for # the server. The client must know the server's public key # to make a CURVE connection. client_secret_file = os.path.join(self.keys_dir, "endpoint.key_secret") client_public, client_secret = zmq.auth.load_certificate( client_secret_file) self.zmq_socket.curve_secretkey = client_secret self.zmq_socket.curve_publickey = client_public # The client must know the server's public key to make a CURVE connection. server_public_file = os.path.join(self.keys_dir, "server.key") server_public, _ = zmq.auth.load_certificate(server_public_file) self.zmq_socket.curve_serverkey = server_public def get(self, block=True, timeout=1000): """ Parameters ---------- block : Bool Blocks until there's a message, Default is True timeout : int Milliseconds to wait. """ # timeout is in milliseconds if block is True: return self.zmq_socket.recv_multipart() socks = dict(self.poller.poll(timeout=timeout)) if self.zmq_socket in socks and socks[self.zmq_socket] == zmq.POLLIN: message = self.zmq_socket.recv_multipart() return message else: raise zmq.Again def register_client(self, message): return self.zmq_socket.send_multipart([message]) def put(self, dest, message, max_timeout=1000): """This function needs to be fast at the same time aware of the possibility of ZMQ pipes overflowing. The timeout increases slowly if contention is detected on ZMQ pipes. We could set copy=False and get slightly better latency but this results in ZMQ sockets reaching a broken state once there are ~10k tasks in flight. This issue can be magnified if each the serialized buffer itself is larger. Parameters ---------- dest : zmq_identity of the destination endpoint, must be a byte string message : py object Python object to send max_timeout : int Max timeout in milliseconds that we will wait for before raising an exception Raises ------ zmq.EAGAIN if the send failed. zmq.error.ZMQError: Host unreachable (if client disconnects?) """ if self.mode == "client": return self.zmq_socket.send_multipart([message]) else: return self.zmq_socket.send_multipart([dest, message]) def close(self): self.zmq_socket.close() self.context.term()
class Command(LAVADaemonCommand): help = "LAVA log recorder" logger = None default_logfile = "/var/log/lava-server/lava-logs.log" def __init__(self, *args, **options): super(Command, self).__init__(*args, **options) self.logger = logging.getLogger("lava-logs") self.log_socket = None self.auth = None self.controler = None self.inotify_fd = None self.pipe_r = None self.poller = None self.cert_dir_path = None # List of logs self.jobs = {} # Keep test cases in memory self.test_cases = [] # Master status self.last_ping = 0 self.ping_interval = TIMEOUT def add_arguments(self, parser): super(Command, self).add_arguments(parser) net = parser.add_argument_group("network") net.add_argument('--socket', default='tcp://*:5555', help="Socket waiting for logs. Default: tcp://*:5555") net.add_argument('--master-socket', default='tcp://localhost:5556', help="Socket for master-slave communication. Default: tcp://localhost:5556") net.add_argument('--ipv6', default=False, action='store_true', help="Enable IPv6 on the listening sockets") net.add_argument('--encrypt', default=False, action='store_true', help="Encrypt messages") net.add_argument('--master-cert', default='/etc/lava-dispatcher/certificates.d/master.key_secret', help="Certificate for the master socket") net.add_argument('--slaves-certs', default='/etc/lava-dispatcher/certificates.d', help="Directory for slaves certificates") def handle(self, *args, **options): # Initialize logging. self.setup_logging("lava-logs", options["level"], options["log_file"], FORMAT) self.logger.info("[INIT] Dropping privileges") if not self.drop_privileges(options['user'], options['group']): self.logger.error("[INIT] Unable to drop privileges") return # Create the sockets context = zmq.Context() self.log_socket = context.socket(zmq.PULL) self.controler = context.socket(zmq.ROUTER) self.controler.setsockopt(zmq.IDENTITY, b"lava-logs") # Limit the number of messages in the queue self.controler.setsockopt(zmq.SNDHWM, 2) # From http://api.zeromq.org/4-2:zmq-setsockopt#toc5 # "Immediately readies that connection for data transfer with the master" self.controler.setsockopt(zmq.CONNECT_RID, b"master") if options['ipv6']: self.logger.info("[INIT] Enabling IPv6") self.log_socket.setsockopt(zmq.IPV6, 1) self.controler.setsockopt(zmq.IPV6, 1) if options['encrypt']: self.logger.info("[INIT] Starting encryption") try: self.auth = ThreadAuthenticator(context) self.auth.start() self.logger.debug("[INIT] Opening master certificate: %s", options['master_cert']) master_public, master_secret = zmq.auth.load_certificate(options['master_cert']) self.logger.debug("[INIT] Using slaves certificates from: %s", options['slaves_certs']) self.auth.configure_curve(domain='*', location=options['slaves_certs']) except IOError as err: self.logger.error("[INIT] %s", err) self.auth.stop() return self.log_socket.curve_publickey = master_public self.log_socket.curve_secretkey = master_secret self.log_socket.curve_server = True self.controler.curve_publickey = master_public self.controler.curve_secretkey = master_secret self.controler.curve_serverkey = master_public self.logger.debug("[INIT] Watching %s", options["slaves_certs"]) self.cert_dir_path = options["slaves_certs"] self.inotify_fd = watch_directory(options["slaves_certs"]) if self.inotify_fd is None: self.logger.error("[INIT] Unable to start inotify") self.log_socket.bind(options['socket']) self.controler.connect(options['master_socket']) # Poll on the sockets. This allow to have a # nice timeout along with polling. self.poller = zmq.Poller() self.poller.register(self.log_socket, zmq.POLLIN) self.poller.register(self.controler, zmq.POLLIN) if self.inotify_fd is not None: self.poller.register(os.fdopen(self.inotify_fd), zmq.POLLIN) # Translate signals into zmq messages (self.pipe_r, _) = self.setup_zmq_signal_handler() self.poller.register(self.pipe_r, zmq.POLLIN) self.logger.info("[INIT] listening for logs") # PING right now: the master is waiting for this message to start # scheduling. self.controler.send_multipart([b"master", b"PING"]) try: self.main_loop() except BaseException as exc: self.logger.error("[EXIT] Unknown exception raised, leaving!") self.logger.exception(exc) # Close the controler socket self.controler.close(linger=0) self.poller.unregister(self.controler) # Carefully close the logging socket as we don't want to lose messages self.logger.info("[EXIT] Disconnect logging socket and process messages") endpoint = u(self.log_socket.getsockopt(zmq.LAST_ENDPOINT)) self.logger.debug("[EXIT] unbinding from '%s'", endpoint) self.log_socket.unbind(endpoint) # Empty the queue try: while self.wait_for_messages(True): # Flush test cases cache for every iteration because we might # get killed soon. self.flush_test_cases() except BaseException as exc: self.logger.error("[EXIT] Unknown exception raised, leaving!") self.logger.exception(exc) finally: # Last flush self.flush_test_cases() self.logger.info("[EXIT] Closing the logging socket: the queue is empty") self.log_socket.close() if options['encrypt']: self.auth.stop() context.term() def flush_test_cases(self): if self.test_cases: self.logger.info("Saving %d test cases", len(self.test_cases)) TestCase.objects.bulk_create(self.test_cases) self.test_cases = [] def main_loop(self): last_gc = time.time() last_bulk_create = time.time() # Wait for messages # TODO: fix timeout computation while self.wait_for_messages(False): now = time.time() # Dump TestCase into the database if now - last_bulk_create > BULK_CREATE_TIMEOUT: last_bulk_create = now self.flush_test_cases() # Close old file handlers if now - last_gc > FD_TIMEOUT: last_gc = now # Iterate while removing keys is not compatible with iterator for job_id in list(self.jobs.keys()): # pylint: disable=consider-iterating-dictionary if now - self.jobs[job_id].last_usage > FD_TIMEOUT: self.logger.info("[%s] closing log file", job_id) self.jobs[job_id].close() del self.jobs[job_id] # Ping the master if now - self.last_ping > self.ping_interval: self.logger.debug("PING => master") self.last_ping = now self.controler.send_multipart([b"master", b"PING"]) def wait_for_messages(self, leaving): try: try: sockets = dict(self.poller.poll(TIMEOUT * 1000)) except zmq.error.ZMQError as exc: self.logger.error("[POLL] zmq error: %s", str(exc)) return True # Messages if sockets.get(self.log_socket) == zmq.POLLIN: self.logging_socket() return True # Signals elif sockets.get(self.pipe_r) == zmq.POLLIN: # remove the message from the queue os.read(self.pipe_r, 1) if not leaving: self.logger.info("[POLL] received a signal, leaving") return False else: self.logger.warning("[POLL] signal already handled, please wait for the process to exit") return True # Pong received elif sockets.get(self.controler) == zmq.POLLIN: self.controler_socket() return True # Inotify socket if sockets.get(self.inotify_fd) == zmq.POLLIN: os.read(self.inotify_fd, 4096) self.logger.debug("[AUTH] Reloading certificates from %s", self.cert_dir_path) self.auth.configure_curve(domain='*', location=self.cert_dir_path) # Nothing received else: return not leaving except (OperationalError, InterfaceError): self.logger.info("[RESET] database connection reset") connection.close() return True def logging_socket(self): msg = self.log_socket.recv_multipart() try: (job_id, message) = (u(m) for m in msg) # pylint: disable=unbalanced-tuple-unpacking except ValueError: # do not let a bad message stop the master. self.logger.error("[POLL] failed to parse log message, skipping: %s", msg) return try: scanned = yaml.load(message, Loader=yaml.CLoader) except yaml.YAMLError: self.logger.error("[%s] data are not valid YAML, dropping", job_id) return # Look for "results" level try: message_lvl = scanned["lvl"] message_msg = scanned["msg"] except TypeError: self.logger.error("[%s] not a dictionary, dropping", job_id) return except KeyError: self.logger.error( "[%s] invalid log line, missing \"lvl\" or \"msg\" keys: %s", job_id, message) return # Find the handler (if available) if job_id not in self.jobs: # Query the database for the job try: job = TestJob.objects.get(id=job_id) except TestJob.DoesNotExist: self.logger.error("[%s] unknown job id", job_id) return self.logger.info("[%s] receiving logs from a new job", job_id) # Create the sub directories (if needed) mkdir(job.output_dir) self.jobs[job_id] = JobHandler(job) if message_lvl == "results": try: job = TestJob.objects.get(pk=job_id) except TestJob.DoesNotExist: self.logger.error("[%s] unknown job id", job_id) return meta_filename = create_metadata_store(message_msg, job) new_test_case = map_scanned_results(results=message_msg, job=job, meta_filename=meta_filename) if new_test_case is None: self.logger.warning( "[%s] unable to map scanned results: %s", job_id, message) else: self.test_cases.append(new_test_case) # Look for lava.job result if message_msg.get("definition") == "lava" and message_msg.get("case") == "job": # Flush cached test cases self.flush_test_cases() if message_msg.get("result") == "pass": health = TestJob.HEALTH_COMPLETE health_msg = "Complete" else: health = TestJob.HEALTH_INCOMPLETE health_msg = "Incomplete" self.logger.info("[%s] job status: %s", job_id, health_msg) infrastructure_error = (message_msg.get("error_type") in ["Bug", "Configuration", "Infrastructure"]) if infrastructure_error: self.logger.info("[%s] Infrastructure error", job_id) # Update status. with transaction.atomic(): # TODO: find a way to lock actual_device job = TestJob.objects.select_for_update() \ .get(id=job_id) job.go_state_finished(health, infrastructure_error) job.save() # Mark the file handler as used self.jobs[job_id].last_usage = time.time() # n.b. logging here would produce a log entry for every message in every job. # The format is a list of dictionaries message = "- %s" % message # Write data self.jobs[job_id].write(message) def controler_socket(self): msg = self.controler.recv_multipart() try: master_id = u(msg[0]) action = u(msg[1]) ping_interval = int(msg[2]) if master_id != "master": self.logger.error("Invalid master id '%s'. Should be 'master'", master_id) return if action != "PONG": self.logger.error("Invalid answer '%s'. Should be 'PONG'", action) return except (IndexError, ValueError): self.logger.error("Invalid message '%s'", msg) return if ping_interval < TIMEOUT: self.logger.error("invalid ping interval (%d) too small", ping_interval) return self.logger.debug("master => PONG(%d)", ping_interval) self.ping_interval = ping_interval
class FeatureComputer(object): def __init__(self, bind_str="tcp://127.0.0.1:5560", parent_model=None, layer=None, logins=None, viewable_layers=None): self.context = zmq.Context.instance() self.auth = ThreadAuthenticator(self.context) self.auth.start() #auth.allow('127.0.0.1') self.auth.configure_plain(domain='*', passwords=logins) self.socket = self.context.socket(zmq.PAIR) self.socket.plain_server = True self.socket.bind(bind_str) self.parent_model = parent_model self.curr_model = parent_model self.viewable_layers = viewable_layers self.config = tf.ConfigProto() self.config.gpu_options.per_process_gpu_memory_fraction = 0.3 self.config.gpu_options.allow_growth = True if not layer: self.layer = 5 #len(self.parent_model.layers) - 1 else: self.layer = layer def change_layer(self, *args, **kwargs): print("Changing layer") print(args, kwargs) self.layer = kwargs.get('layer', self.layer) if not self.viewable_layers: self.curr_model = Model( input=[self.parent_model.layers[0].input], output=[self.parent_model.layers[self.layer].output]) else: self.curr_model = Model( input=[self.parent_model.layers[0].input], output=[self.viewable_layers[self.layer].output]) #set_session(tf.Session(config=self.config)) print(self.layer) self.socket.send_pyobj({ 'type': 'layer_changed', 'success': True }, zmq.NOBLOCK) def get_summary(self, *args, **kwargs): self.socket.send_pyobj({ 'type': 'summary', 'result': None }, zmq.NOBLOCK) def do_predict(self, *args, **kwargs): # TODO: Make this configurable input = kwargs.pop('input', np.zeros((1, 224, 224, 3))) resized = np.float64(cv2.resize(input, (224, 224))) preprocessed = preprocess_input(np.expand_dims(resized, axis=0)) result = self.curr_model.predict(preprocessed, verbose=0) self.socket.send_pyobj({ 'type': 'prediction', 'result': result }, zmq.NOBLOCK) def do_layerinfo(self, *args, **kwargs): self.socket.send_pyobj( { 'type': 'layer_info', 'shape': self.curr_model.compute_output_shape( (1, 224, 224, 3)), 'name': self.parent_model.layers[self.layer].name }, zmq.NOBLOCK) def do_summary(self, *args, **kwargs): if not self.viewable_layers: self.socket.send_pyobj( { 'type': 'summary', 'result': [layer.name for layer in self.parent_model.layers] }, zmq.NOBLOCK) else: self.socket.send_pyobj( { 'type': 'summary', 'result': [layer.name for layer in self.viewable_layers] }, zmq.NOBLOCK) def handle_message(self, message): if message['type'] == "change_layer": self.change_layer(**message) if message['type'] == 'predict': self.do_predict(**message) if message['type'] == 'layer_info': self.do_layerinfo(**message) if message['type'] == 'summary': self.do_summary(**message) def run(self): self.running = True while self.running: message = self.socket.recv_pyobj() self.handle_message(message)
def handle(self, *args, **options): # Initialize logging. self.setup_logging("lava-logs", options["level"], options["log_file"], FORMAT) self.logger.info("[INIT] Dropping privileges") if not self.drop_privileges(options['user'], options['group']): self.logger.error("[INIT] Unable to drop privileges") return filename = os.path.join(settings.MEDIA_ROOT, 'lava-logs-config.yaml') self.logger.debug("[INIT] Dumping config to %s", filename) with open(filename, 'w') as output: yaml.dump(options, output) # Create the sockets context = zmq.Context() self.log_socket = context.socket(zmq.PULL) self.controler = context.socket(zmq.ROUTER) self.controler.setsockopt(zmq.IDENTITY, b"lava-logs") # Limit the number of messages in the queue self.controler.setsockopt(zmq.SNDHWM, 2) # From http://api.zeromq.org/4-2:zmq-setsockopt#toc5 # "Immediately readies that connection for data transfer with the master" self.controler.setsockopt(zmq.CONNECT_RID, b"master") if options['ipv6']: self.logger.info("[INIT] Enabling IPv6") self.log_socket.setsockopt(zmq.IPV6, 1) self.controler.setsockopt(zmq.IPV6, 1) if options['encrypt']: self.logger.info("[INIT] Starting encryption") try: self.auth = ThreadAuthenticator(context) self.auth.start() self.logger.debug("[INIT] Opening master certificate: %s", options['master_cert']) master_public, master_secret = zmq.auth.load_certificate(options['master_cert']) self.logger.debug("[INIT] Using slaves certificates from: %s", options['slaves_certs']) self.auth.configure_curve(domain='*', location=options['slaves_certs']) except OSError as err: self.logger.error("[INIT] %s", err) self.auth.stop() return self.log_socket.curve_publickey = master_public self.log_socket.curve_secretkey = master_secret self.log_socket.curve_server = True self.controler.curve_publickey = master_public self.controler.curve_secretkey = master_secret self.controler.curve_serverkey = master_public self.logger.debug("[INIT] Watching %s", options["slaves_certs"]) self.cert_dir_path = options["slaves_certs"] self.inotify_fd = watch_directory(options["slaves_certs"]) if self.inotify_fd is None: self.logger.error("[INIT] Unable to start inotify") self.log_socket.bind(options['socket']) self.controler.connect(options['master_socket']) # Poll on the sockets. This allow to have a # nice timeout along with polling. self.poller = zmq.Poller() self.poller.register(self.log_socket, zmq.POLLIN) self.poller.register(self.controler, zmq.POLLIN) if self.inotify_fd is not None: self.poller.register(os.fdopen(self.inotify_fd), zmq.POLLIN) # Translate signals into zmq messages (self.pipe_r, _) = self.setup_zmq_signal_handler() self.poller.register(self.pipe_r, zmq.POLLIN) self.logger.info("[INIT] listening for logs") # PING right now: the master is waiting for this message to start # scheduling. self.controler.send_multipart([b"master", b"PING"]) try: self.main_loop() except BaseException as exc: self.logger.error("[EXIT] Unknown exception raised, leaving!") self.logger.exception(exc) # Close the controler socket self.controler.close(linger=0) self.poller.unregister(self.controler) # Carefully close the logging socket as we don't want to lose messages self.logger.info("[EXIT] Disconnect logging socket and process messages") endpoint = u(self.log_socket.getsockopt(zmq.LAST_ENDPOINT)) self.logger.debug("[EXIT] unbinding from '%s'", endpoint) self.log_socket.unbind(endpoint) # Empty the queue try: while self.wait_for_messages(True): # Flush test cases cache for every iteration because we might # get killed soon. self.flush_test_cases() except BaseException as exc: self.logger.error("[EXIT] Unknown exception raised, leaving!") self.logger.exception(exc) finally: # Last flush self.flush_test_cases() self.logger.info("[EXIT] Closing the logging socket: the queue is empty") self.log_socket.close() if options['encrypt']: self.auth.stop() context.term()
class dataTransfer(): def __init__ (self, connectionType, signalHost = None, useLog = False, context = None): if useLog: self.log = logging.getLogger("dataTransferAPI") elif useLog == None: self.log = noLoggingFunction() else: self.log = loggingFunction() # ZMQ applications always start by creating a context, # and then using that for creating sockets # (source: ZeroMQ, Messaging for Many Applications by Pieter Hintjens) if context: self.context = context self.extContext = True else: self.context = zmq.Context() self.extContext = False self.signalHost = signalHost self.signalPort = "50000" self.requestPort = "50001" self.dataHost = None self.dataPort = None self.signalSocket = None self.dataSocket = None self.requestSocket = None self.poller = zmq.Poller() self.auth = None self.targets = None self.supportedConnections = ["stream", "streamMetadata", "queryNext", "queryMetadata"] self.signalExchanged = None self.streamStarted = None self.queryNextStarted = None self.socketResponseTimeout = 1000 if connectionType in self.supportedConnections: self.connectionType = connectionType else: raise NotSupported("Chosen type of connection is not supported.") # targets: [host, port, prio] or [[host, port, prio], ...] def initiate (self, targets): if type(targets) != list: self.stop() raise FormatError("Argument 'targets' must be list.") if not self.context: self.context = zmq.Context() self.extContext = False signal = None # Signal exchange if self.connectionType == "stream": signalPort = self.signalPort signal = "START_STREAM" elif self.connectionType == "streamMetadata": signalPort = self.signalPort signal = "START_STREAM_METADATA" elif self.connectionType == "queryNext": signalPort = self.signalPort signal = "START_QUERY_NEXT" elif self.connectionType == "queryMetadata": signalPort = self.signalPort signal = "START_QUERY_METADATA" self.log.debug("Create socket for signal exchange...") if self.signalHost: self.__createSignalSocket(signalPort) else: self.stop() raise ConnectionFailed("No host to send signal to specified." ) self.__setTargets (targets) message = self.__sendSignal(signal) if message and message == "VERSION_CONFLICT": self.stop() raise VersionError("Versions are conflicting.") elif message and message == "NO_VALID_HOST": self.stop() raise AuthenticationFailed("Host is not allowed to connect.") elif message and message == "CONNECTION_ALREADY_OPEN": self.stop() raise CommunicationFailed("Connection is already open.") elif message and message == "NO_VALID_SIGNAL": self.stop() raise CommunicationFailed("Connection type is not supported for this kind of sender.") # if there was no response or the response was of the wrong format, the receiver should be shut down elif message and message.startswith(signal): self.log.info("Received confirmation ...") self.signalExchanged = signal else: raise CommunicationFailed("Sending start signal ...failed.") def __createSignalSocket (self, signalPort): # To send a notification that a Displayer is up and running, a communication socket is needed # create socket to exchange signals with Sender self.signalSocket = self.context.socket(zmq.REQ) # time to wait for the sender to give a confirmation of the signal # self.signalSocket.RCVTIMEO = self.socketResponseTimeout connectionStr = "tcp://" + str(self.signalHost) + ":" + str(signalPort) try: self.signalSocket.connect(connectionStr) self.log.info("signalSocket started (connect) for '" + connectionStr + "'") except: self.log.error("Failed to start signalSocket (connect): '" + connectionStr + "'") raise # using a Poller to implement the signalSocket timeout (in older ZMQ version there is no option RCVTIMEO) self.poller.register(self.signalSocket, zmq.POLLIN) def __setTargets (self, targets): self.targets = [] # [host, port, prio] if len(targets) == 3 and type(targets[0]) != list and type(targets[1]) != list and type(targets[2]) != list: host, port, prio = targets self.targets = [[host + ":" + port, prio, [""]]] # [host, port, prio, suffixes] elif len(targets) == 4 and type(targets[0]) != list and type(targets[1]) != list and type(targets[2]) != list and type(targets[3]) == list: host, port, prio, suffixes = targets self.targets = [[host + ":" + port, prio, suffixes]] # [[host, port, prio], ...] or [[host, port, prio, suffixes], ...] else: for t in targets: if type(t) == list and len(t) == 3: host, port, prio = t self.targets.append([host + ":" + port, prio, [""]]) elif type(t) == list and len(t) == 4 and type(t[3]): host, port, prio, suffixes = t self.targets.append([host + ":" + port, prio, suffixes]) else: self.stop() self.log.debug("targets=" + str(targets)) raise FormatError("Argument 'targets' is of wrong format.") def __sendSignal (self, signal): if not signal: return # Send the signal that the communication infrastructure should be established self.log.info("Sending Signal") sendMessage = [__version__, signal] trg = cPickle.dumps(self.targets) sendMessage.append(trg) # sendMessage = [__version__, signal, self.dataHost, self.dataPort] self.log.debug("Signal: " + str(sendMessage)) try: self.signalSocket.send_multipart(sendMessage) except: self.log.error("Could not send signal") raise message = None try: socks = dict(self.poller.poll(self.socketResponseTimeout)) except: self.log.error("Could not poll for new message") raise # if there was a response if self.signalSocket in socks and socks[self.signalSocket] == zmq.POLLIN: try: # Get the reply. message = self.signalSocket.recv() self.log.info("Received answer to signal: " + str(message) ) except: self.log.error("Could not receive answer to signal") raise return message def start (self, dataSocket = False, whitelist = None): # Receive data only from whitelisted nodes if whitelist: if type(whitelist) == list: self.auth = ThreadAuthenticator(self.context) self.auth.start() for host in whitelist: try: if host == "localhost": ip = [socket.gethostbyname(host)] else: hostname, tmp, ip = socket.gethostbyaddr(host) self.log.debug("Allowing host " + host + " (" + str(ip[0]) + ")") self.auth.allow(ip[0]) except: self.log.error("Error was: ", exc_info=True) raise AuthenticationFailed("Could not get IP of host " + host) else: raise FormatError("Whitelist has to be a list of IPs") socketIdToConnect = self.streamStarted or self.queryNextStarted if socketIdToConnect: self.log.info("Reopening already started connection.") else: ip = "0.0.0.0" #TODO use IP of hostname? host = "" port = "" if dataSocket: if type(dataSocket) == list: socketIdToConnect = dataSocket[0] + ":" + dataSocket[1] host = dataSocket[0] ip = socket.gethostbyaddr(host)[2][0] port = dataSocket[1] else: port = str(dataSocket) host = socket.gethostname() socketId = host + ":" + port ipFromHost = socket.gethostbyaddr(host)[2] if len(ipFromHost) == 1: ip = ipFromHost[0] elif len(self.targets) == 1: host, port = self.targets[0][0].split(":") ipFromHost = socket.gethostbyaddr(host)[2] if len(ipFromHost) == 1: ip = ipFromHost[0] else: raise FormatError("Multipe possible ports. Please choose which one to use.") socketId = host + ":" + port socketIdToConnect = ip + ":" + port # socketIdToConnect = "[" + ip + "]:" + port self.dataSocket = self.context.socket(zmq.PULL) # An additional socket is needed to establish the data retriving mechanism connectionStr = "tcp://" + socketIdToConnect if whitelist: self.dataSocket.zap_domain = b'global' try: # self.dataSocket.ipv6 = True self.dataSocket.bind(connectionStr) # self.dataSocket.bind("tcp://[2003:ce:5bc0:a600:fa16:54ff:fef4:9fc0]:50102") self.log.info("Data socket of type " + self.connectionType + " started (bind) for '" + connectionStr + "'") except: self.log.error("Failed to start Socket of type " + self.connectionType + " (bind): '" + connectionStr + "'", exc_info=True) raise self.poller.register(self.dataSocket, zmq.POLLIN) if self.connectionType in ["queryNext", "queryMetadata"]: self.requestSocket = self.context.socket(zmq.PUSH) # An additional socket is needed to establish the data retriving mechanism connectionStr = "tcp://" + self.signalHost + ":" + self.requestPort try: self.requestSocket.connect(connectionStr) self.log.info("Request socket started (connect) for '" + connectionStr + "'") except: self.log.error("Failed to start Socket of type " + self.connectionType + " (connect): '" + connectionStr + "'", exc_info=True) raise self.queryNextStarted = socketId else: self.streamStarted = socketId ## # # Receives or queries for new files depending on the connection initialized # # returns either # the newest file # (if connection type "queryNext" or "stream" was choosen) # the path of the newest file # (if connection type "queryMetadata" or "streamMetadata" was choosen) # ## def get (self, timeout=None): if not self.streamStarted and not self.queryNextStarted: self.log.info("Could not communicate, no connection was initialized.") return None, None if self.queryNextStarted : sendMessage = ["NEXT", self.queryNextStarted] try: self.requestSocket.send_multipart(sendMessage) except Exception as e: self.log.error("Could not send request to requestSocket", exc_info=True) return None, None while True: # receive data if timeout: try: socks = dict(self.poller.poll(timeout)) except: self.log.error("Could not poll for new message") raise else: try: socks = dict(self.poller.poll()) except: self.log.error("Could not poll for new message") raise # if there was a response if self.dataSocket in socks and socks[self.dataSocket] == zmq.POLLIN: try: multipartMessage = self.dataSocket.recv_multipart() except: self.log.error("Receiving data..failed.", exc_info=True) return [None, None] if multipartMessage[0] == b"ALIVE_TEST": continue elif len(multipartMessage) < 2: self.log.error("Received mutipart-message is too short. Either config or file content is missing.") self.log.debug("multipartMessage=" + str(mutipartMessage)[:100]) return [None, None] # extract multipart message try: metadata = cPickle.loads(multipartMessage[0]) except: self.log.error("Could not extract metadata from the multipart-message.", exc_info=True) metadata = None #TODO validate multipartMessage (like correct dict-values for metadata) try: payload = multipartMessage[1] except: self.log.warning("An empty file was received within the multipart-message", exc_info=True) payload = None return [metadata, payload] else: self.log.warning("Could not receive data in the given time.") if self.queryNextStarted : try: self.requestSocket.send_multipart(["CANCEL", self.queryNextStarted]) except Exception as e: self.log.error("Could not cancel the next query", exc_info=True) return [None, None] def store (self, targetBasePath, dataObject): if type(dataObject) is not list and len(dataObject) != 2: raise FormatError("Wrong input type for 'store'") payloadMetadata = dataObject[0] payload = dataObject[1] if type(payloadMetadata) is not dict: raise FormatError("payload: Wrong input format in 'store'") #save all chunks to file while True: #TODO check if payload != cPickle.dumps(None) ? if payloadMetadata and payload: #append to file try: self.log.debug("append to file based on multipart-message...") #TODO: save message to file using a thread (avoids blocking) #TODO: instead of open/close file for each chunk recyle the file-descriptor for all chunks opened self.__appendChunksToFile(targetBasePath, payloadMetadata, payload) self.log.debug("append to file based on multipart-message...success.") except KeyboardInterrupt: self.log.info("KeyboardInterrupt detected. Unable to append multipart-content to file.") break except Exception, e: self.log.error("Unable to append multipart-content to file.", exc_info=True) self.log.debug("Append to file based on multipart-message...failed.") if len(payload) < payloadMetadata["chunkSize"] : #indicated end of file. Leave loop filename = self.generateTargetFilepath(targetBasePath, payloadMetadata) fileModTime = payloadMetadata["fileModTime"] self.log.info("New file with modification time " + str(fileModTime) + " received and saved: " + str(filename)) break try: [payloadMetadata, payload] = self.get() except: self.log.error("Getting data failed.", exc_info=True) break
def handle(self, *args, **options): # Initialize logging. self.setup_logging("lava-master", options["level"], options["log_file"], FORMAT) self.logger.info("[INIT] Dropping privileges") if not self.drop_privileges(options['user'], options['group']): self.logger.error("[INIT] Unable to drop privileges") return self.logger.info("[INIT] Marking all workers as offline") with transaction.atomic(): for worker in Worker.objects.select_for_update().all(): worker.go_state_offline() worker.save() # Create the sockets context = zmq.Context() self.controler = context.socket(zmq.ROUTER) self.event_socket = context.socket(zmq.SUB) if options['ipv6']: self.logger.info("[INIT] Enabling IPv6") self.controler.setsockopt(zmq.IPV6, 1) self.event_socket.setsockopt(zmq.IPV6, 1) if options['encrypt']: self.logger.info("[INIT] Starting encryption") try: self.auth = ThreadAuthenticator(context) self.auth.start() self.logger.debug("[INIT] Opening master certificate: %s", options['master_cert']) master_public, master_secret = zmq.auth.load_certificate(options['master_cert']) self.logger.debug("[INIT] Using slaves certificates from: %s", options['slaves_certs']) self.auth.configure_curve(domain='*', location=options['slaves_certs']) except IOError as err: self.logger.error(err) self.auth.stop() return self.controler.curve_publickey = master_public self.controler.curve_secretkey = master_secret self.controler.curve_server = True self.logger.debug("[INIT] Watching %s", options["slaves_certs"]) self.inotify_fd = watch_directory(options["slaves_certs"]) if self.inotify_fd is None: self.logger.error("[INIT] Unable to start inotify") self.controler.setsockopt(zmq.IDENTITY, b"master") # From http://api.zeromq.org/4-2:zmq-setsockopt#toc42 # "If two clients use the same identity when connecting to a ROUTER # [...] the ROUTER socket shall hand-over the connection to the new # client and disconnect the existing one." self.controler.setsockopt(zmq.ROUTER_HANDOVER, 1) self.controler.bind(options['master_socket']) self.event_socket.setsockopt(zmq.SUBSCRIBE, b(settings.EVENT_TOPIC)) self.event_socket.connect(options['event_url']) # Poll on the sockets. This allow to have a # nice timeout along with polling. self.poller = zmq.Poller() self.poller.register(self.controler, zmq.POLLIN) self.poller.register(self.event_socket, zmq.POLLIN) if self.inotify_fd is not None: self.poller.register(os.fdopen(self.inotify_fd), zmq.POLLIN) # Translate signals into zmq messages (self.pipe_r, _) = self.setup_zmq_signal_handler() self.poller.register(self.pipe_r, zmq.POLLIN) self.logger.info("[INIT] LAVA master has started.") self.logger.info("[INIT] Using protocol version %d", PROTOCOL_VERSION) try: self.main_loop(options) except BaseException as exc: self.logger.error("[CLOSE] Unknown exception raised, leaving!") self.logger.exception(exc) finally: # Drop controler socket: the protocol does handle lost messages self.logger.info("[CLOSE] Closing the controler socket and dropping messages") self.controler.close(linger=0) self.event_socket.close(linger=0) if options['encrypt']: self.auth.stop() context.term()
def handle(self, *args, **options): # Initialize logging. self.setup_logging("lava-master", options["level"], options["log_file"], FORMAT) self.logger.info("[INIT] Dropping privileges") if not self.drop_privileges(options['user'], options['group']): self.logger.error("[INIT] Unable to drop privileges") return filename = os.path.join(settings.MEDIA_ROOT, 'lava-master-config.yaml') self.logger.debug("[INIT] Dumping config to %s", filename) with open(filename, 'w') as output: yaml.dump(options, output) self.logger.info("[INIT] Marking all workers as offline") with transaction.atomic(): for worker in Worker.objects.select_for_update().all(): worker.go_state_offline() worker.save() # Create the sockets context = zmq.Context() self.controler = context.socket(zmq.ROUTER) self.event_socket = context.socket(zmq.SUB) if options['ipv6']: self.logger.info("[INIT] Enabling IPv6") self.controler.setsockopt(zmq.IPV6, 1) self.event_socket.setsockopt(zmq.IPV6, 1) if options['encrypt']: self.logger.info("[INIT] Starting encryption") try: self.auth = ThreadAuthenticator(context) self.auth.start() self.logger.debug("[INIT] Opening master certificate: %s", options['master_cert']) master_public, master_secret = zmq.auth.load_certificate( options['master_cert']) self.logger.debug("[INIT] Using slaves certificates from: %s", options['slaves_certs']) self.auth.configure_curve(domain='*', location=options['slaves_certs']) except OSError as err: self.logger.error(err) self.auth.stop() return self.controler.curve_publickey = master_public self.controler.curve_secretkey = master_secret self.controler.curve_server = True self.logger.debug("[INIT] Watching %s", options["slaves_certs"]) self.inotify_fd = watch_directory(options["slaves_certs"]) if self.inotify_fd is None: self.logger.error("[INIT] Unable to start inotify") self.controler.setsockopt(zmq.IDENTITY, b"master") # From http://api.zeromq.org/4-2:zmq-setsockopt#toc42 # "If two clients use the same identity when connecting to a ROUTER # [...] the ROUTER socket shall hand-over the connection to the new # client and disconnect the existing one." self.controler.setsockopt(zmq.ROUTER_HANDOVER, 1) self.controler.bind(options['master_socket']) self.event_socket.setsockopt(zmq.SUBSCRIBE, b(settings.EVENT_TOPIC)) self.event_socket.connect(options['event_url']) # Poll on the sockets. This allow to have a # nice timeout along with polling. self.poller = zmq.Poller() self.poller.register(self.controler, zmq.POLLIN) self.poller.register(self.event_socket, zmq.POLLIN) if self.inotify_fd is not None: self.poller.register(os.fdopen(self.inotify_fd), zmq.POLLIN) # Translate signals into zmq messages (self.pipe_r, _) = self.setup_zmq_signal_handler() self.poller.register(self.pipe_r, zmq.POLLIN) self.logger.info("[INIT] LAVA master has started.") self.logger.info("[INIT] Using protocol version %d", PROTOCOL_VERSION) try: self.main_loop(options) except BaseException as exc: self.logger.error("[CLOSE] Unknown exception raised, leaving!") self.logger.exception(exc) finally: # Drop controler socket: the protocol does handle lost messages self.logger.info( "[CLOSE] Closing the controler socket and dropping messages") self.controler.close(linger=0) self.event_socket.close(linger=0) if options['encrypt']: self.auth.stop() context.term()
def handle(self, *args, **options): # Initialize logging. self.logging_support(options["log_file"]) # Set the logging level if options['level'] == 'ERROR': self.logger.setLevel(logging.ERROR) elif options['level'] == 'WARN': self.logger.setLevel(logging.WARN) elif options['level'] == 'INFO': self.logger.setLevel(logging.INFO) else: self.logger.setLevel(logging.DEBUG) self.logger.info("Dropping privileges") if not self.drop_privileges(options['user'], options['group']): self.logger.error("Unable to drop privileges") return auth = None # Create the sockets context = zmq.Context() self.pull_socket = context.socket(zmq.PULL) self.controler = context.socket(zmq.ROUTER) if options['encrypt']: self.logger.info("Starting encryption") try: auth = ThreadAuthenticator(context) auth.start() self.logger.debug("Opening master certificate: %s", options['master_cert']) master_public, master_secret = zmq.auth.load_certificate( options['master_cert']) self.logger.debug("Using slaves certificates from: %s", options['slaves_certs']) auth.configure_curve(domain='*', location=options['slaves_certs']) except IOError as err: self.logger.error(err) auth.stop() return self.controler.curve_publickey = master_public self.controler.curve_secretkey = master_secret self.controler.curve_server = True self.pull_socket.curve_publickey = master_public self.pull_socket.curve_secretkey = master_secret self.pull_socket.curve_server = True self.pull_socket.bind(options['log_socket']) self.controler.bind(options['master_socket']) # Last access to the database for new jobs and cancelations last_db_access = 0 # Poll on the sockets (only one for the moment). This allow to have a # nice timeout along with polling. poller = zmq.Poller() poller.register(self.pull_socket, zmq.POLLIN) poller.register(self.controler, zmq.POLLIN) # Mask signals and create a pipe that will receive a bit for each # signal received. Poll the pipe along with the zmq socket so that we # can only be interupted while reading data. (pipe_r, pipe_w) = os.pipe() flags = fcntl.fcntl(pipe_w, fcntl.F_GETFL, 0) fcntl.fcntl(pipe_w, fcntl.F_SETFL, flags | os.O_NONBLOCK) def signal_to_pipe(signumber, _): # Send the signal number on the pipe os.write(pipe_w, chr(signumber)) signal.signal(signal.SIGHUP, signal_to_pipe) signal.signal(signal.SIGINT, signal_to_pipe) signal.signal(signal.SIGTERM, signal_to_pipe) signal.signal(signal.SIGQUIT, signal_to_pipe) poller.register(pipe_r, zmq.POLLIN) if os.path.exists('/etc/lava-server/worker.conf'): self.logger.error( "[FAIL] lava-master must not be run on a remote worker!") self.controler.close(linger=0) self.pull_socket.close(linger=0) context.term() sys.exit(2) self.logger.info("[INIT] LAVA dispatcher-master has started.") self.logger.info("[INIT] Using protocol version %d", PROTOCOL_VERSION) while True: try: try: # TODO: Fix the timeout computation # Wait for data or a timeout sockets = dict(poller.poll(TIMEOUT * 1000)) except zmq.error.ZMQError: continue if sockets.get(pipe_r) == zmq.POLLIN: signum = ord(os.read(pipe_r, 1)) if signum == signal.SIGHUP: self.logger.info( "[POLL] SIGHUP received, restarting loggers") self.logging_support(options["log_file"]) else: self.logger.info("[POLL] Received a signal, leaving") break # Logging socket if sockets.get(self.pull_socket) == zmq.POLLIN: self.logging_socket(options) # Garbage collect file handlers now = time.time() for job_id in self.jobs.keys(): # pylint: disable=consider-iterating-dictionary if now - self.jobs[job_id].last_usage > FD_TIMEOUT: self.logger.info("[%s] Closing log file", job_id) self.jobs[job_id].close() del self.jobs[job_id] # Command socket if sockets.get(self.controler) == zmq.POLLIN: if not self.controler_socket(): continue # Check dispatchers status now = time.time() for hostname, dispatcher in self.dispatchers.iteritems(): if dispatcher.online and now - dispatcher.last_msg > DISPATCHER_TIMEOUT: self.logger.error( "[STATE] Dispatcher <%s> goes OFFLINE", hostname) self.dispatchers[hostname].online = False # TODO: DB: mark the dispatcher as offline and attached # devices # Limit accesses to the database. This will also limit the rate of # CANCEL and START messages if now - last_db_access > DB_LIMIT: last_db_access = now # TODO: make this atomic # Dispatch pipeline jobs with devices in Reserved state self.process_jobs(options) # Handle canceling jobs self.handle_canceling() except (OperationalError, InterfaceError): self.logger.info("[RESET] database connection reset.") continue # Closing sockets and droping messages. self.logger.info("[CLOSE] Closing the sockets and dropping messages") self.controler.close(linger=0) self.pull_socket.close(linger=0) if options['encrypt']: auth.stop() context.term()
class CombaZMQAdapter(threading.Thread, CombaBase): def __init__(self, port): self.port = str(port) threading.Thread.__init__ (self) self.shutdown_event = Event() self.context = zmq.Context().instance() self.authserver = ThreadAuthenticator(self.context) self.loadConfig() self.start() #------------------------------------------------------------------------------------------# def run(self): """ run runs on function start """ self.startAuthserver() self.data = '' self.socket = self.context.socket(zmq.REP) self.socket.plain_server = True self.socket.bind("tcp://*:"+self.port) self.shutdown_event.clear() self.controller = CombaController(self, self.lqs_socket, self.lqs_recorder_socket) self.controller.messenger.setMailAddresses(self.get('frommail'), self.get('adminmail')) self.can_send = False # Process tasks forever while not self.shutdown_event.is_set(): self.data = self.socket.recv() self.can_send = True data = self.data.split(' ') command = str(data.pop(0)) params = "()" if len(data) < 1 else "('" + "','".join(data) + "')" try: exec"a=self.controller." + command + params except SyntaxError: self.controller.message('Warning: Syntax Error') except AttributeError: print "Warning: Method " + command + " does not exist" self.controller.message('Warning: Method ' + command + ' does not exist') except TypeError: print "Warning: Wrong number of params" self.controller.message('Warning: Wrong number of params') except: print "Warning: Unknown Error" self.controller.message('Warning: Unknown Error') return #------------------------------------------------------------------------------------------# def halt(self): """ Stop the server """ if self.shutdown_event.is_set(): return try: del self.controller except: pass self.shutdown_event.set() result = 'failed' try: result = self.socket.unbind("tcp://*:"+self.port) except: pass #self.socket.close() #------------------------------------------------------------------------------------------# def reload(self): """ stop, reload config and startagaing """ if self.shutdown_event.is_set(): return self.loadConfig() self.halt() time.sleep(3) self.run() #------------------------------------------------------------------------------------------# def send(self,message): """ Send a message to the client :param message: string """ if self.can_send: self.socket.send(message) self.can_send = False #------------------------------------------------------------------------------------------# def startAuthserver(self): """ Start zmq authentification server """ # stop auth server if running if self.authserver.is_alive(): self.authserver.stop() if self.securitylevel > 0: # Authentifizierungsserver starten. self.authserver.start() # Bei security level 2 auch passwort und usernamen verlangen if self.securitylevel > 1: try: addresses = CombaWhitelist().getList() for address in addresses: self.authserver.allow(address) except: pass # Instruct authenticator to handle PLAIN requests self.authserver.configure_plain(domain='*', passwords=self.getAccounts()) #------------------------------------------------------------------------------------------# def getAccounts(self): """ Get accounts from redis db :return: llist - a list of accounts """ accounts = CombaUser().getLogins() db = redis.Redis() internaccount = db.get('internAccess') if not internaccount: user = ''.join(random.sample(string.lowercase,10)) password = ''.join(random.sample(string.lowercase+string.uppercase+string.digits,22)) db.set('internAccess', user + ':' + password) intern = [user, password] else: intern = internaccount.split(':') accounts[intern[0]] = intern[1] return accounts
class MispZmq: message_count = 0 publish_count = 0 monitor_thread = None auth = None socket = None pidfile = None r: redis.StrictRedis namespace: str def __init__(self): self._logger = logging.getLogger() self.tmp_location = Path(__file__).parent.parent / "tmp" self.pidfile = self.tmp_location / "mispzmq.pid" if self.pidfile.exists(): with open(self.pidfile.as_posix()) as f: pid = f.read() if check_pid(pid): raise Exception( "mispzmq already running on PID {}".format(pid)) else: # Cleanup self.pidfile.unlink() if (self.tmp_location / "mispzmq_settings.json").exists(): self._setup() else: raise Exception("The settings file is missing.") def _setup(self): with open((self.tmp_location / "mispzmq_settings.json").as_posix()) as settings_file: self.settings = json.load(settings_file) self.namespace = self.settings["redis_namespace"] self.r = redis.StrictRedis(host=self.settings["redis_host"], db=self.settings["redis_database"], password=self.settings["redis_password"], port=self.settings["redis_port"], decode_responses=True) self.timestamp_settings = time.time() self._logger.debug("Connected to Redis {}:{}/{}".format( self.settings["redis_host"], self.settings["redis_port"], self.settings["redis_database"])) def _setup_zmq(self): context = zmq.Context() if "username" in self.settings and self.settings["username"]: if "password" not in self.settings or not self.settings["password"]: raise Exception( "When username is set, password cannot be empty.") self.auth = ThreadAuthenticator(context) self.auth.start() self.auth.configure_plain(domain="*", passwords={ self.settings["username"]: self.settings["password"] }) else: if self.auth: self.auth.stop() self.auth = None self.socket = context.socket(zmq.PUB) if self.settings["username"]: self.socket.plain_server = True # must come before bind self.socket.bind("tcp://{}:{}".format(self.settings["host"], self.settings["port"])) self._logger.debug("ZMQ listening on tcp://{}:{}".format( self.settings["host"], self.settings["port"])) if self._logger.isEnabledFor(logging.DEBUG): monitor = self.socket.get_monitor_socket() self.monitor_thread = threading.Thread(target=event_monitor, args=(monitor, self._logger)) self.monitor_thread.start() else: if self.monitor_thread: self.socket.disable_monitor() self.monitor_thread = None def _handle_command(self, command): if command == "kill": self._logger.info("Kill command received, shutting down.") self.clean() sys.exit() elif command == "reload": self._logger.info( "Reload command received, reloading settings from file.") self._setup() self._setup_zmq() elif command == "status": self._logger.info( "Status command received, responding with latest stats.") self.r.delete("{}:status".format(self.namespace)) self.r.lpush( "{}:status".format(self.namespace), json.dumps({ "timestamp": time.time(), "timestampSettings": self.timestamp_settings, "publishCount": self.publish_count, "messageCount": self.message_count })) else: self._logger.warning( "Received invalid command '{}'.".format(command)) def _create_pid_file(self): with open(self.pidfile.as_posix(), "w") as f: f.write(str(os.getpid())) def _pub_message(self, topic, data): self.socket.send_string("{} {}".format(topic, data)) def clean(self): if self.monitor_thread: self.socket.disable_monitor() if self.auth: self.auth.stop() if self.socket: self.socket.close() if self.pidfile: self.pidfile.unlink() def main(self): self._create_pid_file() self._setup_zmq() time.sleep(1) status_array = [ "And when you're dead I will be still alive.", "And believe me I am still alive.", "I'm doing science and I'm still alive.", "I feel FANTASTIC and I'm still alive.", "While you're dying I'll be still alive.", ] topics = [ "misp_json", "misp_json_event", "misp_json_attribute", "misp_json_sighting", "misp_json_organisation", "misp_json_user", "misp_json_conversation", "misp_json_object", "misp_json_object_reference", "misp_json_audit", "misp_json_tag", "misp_json_warninglist" ] lists = ["{}:command".format(self.namespace)] for topic in topics: lists.append("{}:data:{}".format(self.namespace, topic)) while True: data = self.r.blpop(lists, timeout=10) if data is None: # redis timeout expired current_time = int(time.time()) time_delta = current_time - int(self.timestamp_settings) status_entry = int(time_delta / 10 % 5) status_message = { "status": status_array[status_entry], "uptime": current_time - int(self.timestamp_settings) } self._pub_message("misp_json_self", json.dumps(status_message)) self._logger.debug( "No message received for 10 seconds, sending ZMQ status message." ) else: key, value = data key = key.replace("{}:".format(self.namespace), "") if key == "command": self._handle_command(value) elif key.startswith("data:"): topic = key.split(":")[1] self._logger.debug( "Received data for topic '{}', sending to ZMQ.".format( topic)) self._pub_message(topic, value) self.message_count += 1 if topic == "misp_json": self.publish_count += 1 else: self._logger.warning( "Received invalid message '{}'.".format(key))
def start(self, dataSocket=False, whitelist=None): # Receive data only from whitelisted nodes if whitelist: if type(whitelist) == list: self.auth = ThreadAuthenticator(self.context) self.auth.start() for host in whitelist: try: if host == "localhost": ip = [socket.gethostbyname(host)] else: hostname, tmp, ip = socket.gethostbyaddr(host) self.log.debug("Allowing host " + host + " (" + str(ip[0]) + ")") self.auth.allow(ip[0]) except: self.log.error("Error was: ", exc_info=True) raise AuthenticationFailed( "Could not get IP of host " + host) else: raise FormatError("Whitelist has to be a list of IPs") socketIdToConnect = self.streamStarted or self.queryNextStarted if socketIdToConnect: self.log.info("Reopening already started connection.") else: ip = "0.0.0.0" #TODO use IP of hostname? host = "" port = "" if dataSocket: if type(dataSocket) == list: socketIdToConnect = dataSocket[0] + ":" + dataSocket[1] host = dataSocket[0] ip = socket.gethostbyaddr(host)[2][0] port = dataSocket[1] else: port = str(dataSocket) host = socket.gethostname() socketId = host + ":" + port ipFromHost = socket.gethostbyaddr(host)[2] if len(ipFromHost) == 1: ip = ipFromHost[0] elif len(self.targets) == 1: host, port = self.targets[0][0].split(":") ipFromHost = socket.gethostbyaddr(host)[2] if len(ipFromHost) == 1: ip = ipFromHost[0] else: raise FormatError( "Multipe possible ports. Please choose which one to use.") socketId = host + ":" + port socketIdToConnect = ip + ":" + port # socketIdToConnect = "[" + ip + "]:" + port self.dataSocket = self.context.socket(zmq.PULL) # An additional socket is needed to establish the data retriving mechanism connectionStr = "tcp://" + socketIdToConnect if whitelist: self.dataSocket.zap_domain = b'global' try: # self.dataSocket.ipv6 = True self.dataSocket.bind(connectionStr) # self.dataSocket.bind("tcp://[2003:ce:5bc0:a600:fa16:54ff:fef4:9fc0]:50102") self.log.info("Data socket of type " + self.connectionType + " started (bind) for '" + connectionStr + "'") except: self.log.error("Failed to start Socket of type " + self.connectionType + " (bind): '" + connectionStr + "'", exc_info=True) raise self.poller.register(self.dataSocket, zmq.POLLIN) if self.connectionType in ["queryNext", "queryMetadata"]: self.requestSocket = self.context.socket(zmq.PUSH) # An additional socket is needed to establish the data retriving mechanism connectionStr = "tcp://" + self.signalHost + ":" + self.requestPort try: self.requestSocket.connect(connectionStr) self.log.info("Request socket started (connect) for '" + connectionStr + "'") except: self.log.error("Failed to start Socket of type " + self.connectionType + " (connect): '" + connectionStr + "'", exc_info=True) raise self.queryNextStarted = socketId else: self.streamStarted = socketId
def __init__(self, public_keys_dir): self._auth = ThreadAuthenticator(zmq.Context.instance()) self._auth.start() self._auth.allow('*') self._auth.configure_curve(domain='*', location=public_keys_dir)
class dataTransfer(): def __init__(self, connectionType, signalHost=None, useLog=False, context=None): if useLog: self.log = logging.getLogger("dataTransferAPI") elif useLog == None: self.log = noLoggingFunction() else: self.log = loggingFunction() # ZMQ applications always start by creating a context, # and then using that for creating sockets # (source: ZeroMQ, Messaging for Many Applications by Pieter Hintjens) if context: self.context = context self.extContext = True else: self.context = zmq.Context() self.extContext = False self.signalHost = signalHost self.signalPort = "50000" self.requestPort = "50001" self.dataHost = None self.dataPort = None self.signalSocket = None self.dataSocket = None self.requestSocket = None self.poller = zmq.Poller() self.auth = None self.targets = None self.supportedConnections = [ "stream", "streamMetadata", "queryNext", "queryMetadata" ] self.signalExchanged = None self.streamStarted = None self.queryNextStarted = None self.socketResponseTimeout = 1000 if connectionType in self.supportedConnections: self.connectionType = connectionType else: raise NotSupported("Chosen type of connection is not supported.") # targets: [host, port, prio] or [[host, port, prio], ...] def initiate(self, targets): if type(targets) != list: self.stop() raise FormatError("Argument 'targets' must be list.") if not self.context: self.context = zmq.Context() self.extContext = False signal = None # Signal exchange if self.connectionType == "stream": signalPort = self.signalPort signal = "START_STREAM" elif self.connectionType == "streamMetadata": signalPort = self.signalPort signal = "START_STREAM_METADATA" elif self.connectionType == "queryNext": signalPort = self.signalPort signal = "START_QUERY_NEXT" elif self.connectionType == "queryMetadata": signalPort = self.signalPort signal = "START_QUERY_METADATA" self.log.debug("Create socket for signal exchange...") if self.signalHost: self.__createSignalSocket(signalPort) else: self.stop() raise ConnectionFailed("No host to send signal to specified.") self.__setTargets(targets) message = self.__sendSignal(signal) if message and message == "VERSION_CONFLICT": self.stop() raise VersionError("Versions are conflicting.") elif message and message == "NO_VALID_HOST": self.stop() raise AuthenticationFailed("Host is not allowed to connect.") elif message and message == "CONNECTION_ALREADY_OPEN": self.stop() raise CommunicationFailed("Connection is already open.") elif message and message == "NO_VALID_SIGNAL": self.stop() raise CommunicationFailed( "Connection type is not supported for this kind of sender.") # if there was no response or the response was of the wrong format, the receiver should be shut down elif message and message.startswith(signal): self.log.info("Received confirmation ...") self.signalExchanged = signal else: raise CommunicationFailed("Sending start signal ...failed.") def __createSignalSocket(self, signalPort): # To send a notification that a Displayer is up and running, a communication socket is needed # create socket to exchange signals with Sender self.signalSocket = self.context.socket(zmq.REQ) # time to wait for the sender to give a confirmation of the signal # self.signalSocket.RCVTIMEO = self.socketResponseTimeout connectionStr = "tcp://" + str(self.signalHost) + ":" + str(signalPort) try: self.signalSocket.connect(connectionStr) self.log.info("signalSocket started (connect) for '" + connectionStr + "'") except: self.log.error("Failed to start signalSocket (connect): '" + connectionStr + "'") raise # using a Poller to implement the signalSocket timeout (in older ZMQ version there is no option RCVTIMEO) self.poller.register(self.signalSocket, zmq.POLLIN) def __setTargets(self, targets): self.targets = [] # [host, port, prio] if len(targets) == 3 and type(targets[0]) != list and type( targets[1]) != list and type(targets[2]) != list: host, port, prio = targets self.targets = [[host + ":" + port, prio, [""]]] # [host, port, prio, suffixes] elif len(targets) == 4 and type(targets[0]) != list and type( targets[1]) != list and type(targets[2]) != list and type( targets[3]) == list: host, port, prio, suffixes = targets self.targets = [[host + ":" + port, prio, suffixes]] # [[host, port, prio], ...] or [[host, port, prio, suffixes], ...] else: for t in targets: if type(t) == list and len(t) == 3: host, port, prio = t self.targets.append([host + ":" + port, prio, [""]]) elif type(t) == list and len(t) == 4 and type(t[3]): host, port, prio, suffixes = t self.targets.append([host + ":" + port, prio, suffixes]) else: self.stop() self.log.debug("targets=" + str(targets)) raise FormatError("Argument 'targets' is of wrong format.") def __sendSignal(self, signal): if not signal: return # Send the signal that the communication infrastructure should be established self.log.info("Sending Signal") sendMessage = [__version__, signal] trg = cPickle.dumps(self.targets) sendMessage.append(trg) # sendMessage = [__version__, signal, self.dataHost, self.dataPort] self.log.debug("Signal: " + str(sendMessage)) try: self.signalSocket.send_multipart(sendMessage) except: self.log.error("Could not send signal") raise message = None try: socks = dict(self.poller.poll(self.socketResponseTimeout)) except: self.log.error("Could not poll for new message") raise # if there was a response if self.signalSocket in socks and socks[ self.signalSocket] == zmq.POLLIN: try: # Get the reply. message = self.signalSocket.recv() self.log.info("Received answer to signal: " + str(message)) except: self.log.error("Could not receive answer to signal") raise return message def start(self, dataSocket=False, whitelist=None): # Receive data only from whitelisted nodes if whitelist: if type(whitelist) == list: self.auth = ThreadAuthenticator(self.context) self.auth.start() for host in whitelist: try: if host == "localhost": ip = [socket.gethostbyname(host)] else: hostname, tmp, ip = socket.gethostbyaddr(host) self.log.debug("Allowing host " + host + " (" + str(ip[0]) + ")") self.auth.allow(ip[0]) except: self.log.error("Error was: ", exc_info=True) raise AuthenticationFailed( "Could not get IP of host " + host) else: raise FormatError("Whitelist has to be a list of IPs") socketIdToConnect = self.streamStarted or self.queryNextStarted if socketIdToConnect: self.log.info("Reopening already started connection.") else: ip = "0.0.0.0" #TODO use IP of hostname? host = "" port = "" if dataSocket: if type(dataSocket) == list: socketIdToConnect = dataSocket[0] + ":" + dataSocket[1] host = dataSocket[0] ip = socket.gethostbyaddr(host)[2][0] port = dataSocket[1] else: port = str(dataSocket) host = socket.gethostname() socketId = host + ":" + port ipFromHost = socket.gethostbyaddr(host)[2] if len(ipFromHost) == 1: ip = ipFromHost[0] elif len(self.targets) == 1: host, port = self.targets[0][0].split(":") ipFromHost = socket.gethostbyaddr(host)[2] if len(ipFromHost) == 1: ip = ipFromHost[0] else: raise FormatError( "Multipe possible ports. Please choose which one to use.") socketId = host + ":" + port socketIdToConnect = ip + ":" + port # socketIdToConnect = "[" + ip + "]:" + port self.dataSocket = self.context.socket(zmq.PULL) # An additional socket is needed to establish the data retriving mechanism connectionStr = "tcp://" + socketIdToConnect if whitelist: self.dataSocket.zap_domain = b'global' try: # self.dataSocket.ipv6 = True self.dataSocket.bind(connectionStr) # self.dataSocket.bind("tcp://[2003:ce:5bc0:a600:fa16:54ff:fef4:9fc0]:50102") self.log.info("Data socket of type " + self.connectionType + " started (bind) for '" + connectionStr + "'") except: self.log.error("Failed to start Socket of type " + self.connectionType + " (bind): '" + connectionStr + "'", exc_info=True) raise self.poller.register(self.dataSocket, zmq.POLLIN) if self.connectionType in ["queryNext", "queryMetadata"]: self.requestSocket = self.context.socket(zmq.PUSH) # An additional socket is needed to establish the data retriving mechanism connectionStr = "tcp://" + self.signalHost + ":" + self.requestPort try: self.requestSocket.connect(connectionStr) self.log.info("Request socket started (connect) for '" + connectionStr + "'") except: self.log.error("Failed to start Socket of type " + self.connectionType + " (connect): '" + connectionStr + "'", exc_info=True) raise self.queryNextStarted = socketId else: self.streamStarted = socketId ## # # Receives or queries for new files depending on the connection initialized # # returns either # the newest file # (if connection type "queryNext" or "stream" was choosen) # the path of the newest file # (if connection type "queryMetadata" or "streamMetadata" was choosen) # ## def get(self, timeout=None): if not self.streamStarted and not self.queryNextStarted: self.log.info( "Could not communicate, no connection was initialized.") return None, None if self.queryNextStarted: sendMessage = ["NEXT", self.queryNextStarted] try: self.requestSocket.send_multipart(sendMessage) except Exception as e: self.log.error("Could not send request to requestSocket", exc_info=True) return None, None while True: # receive data if timeout: try: socks = dict(self.poller.poll(timeout)) except: self.log.error("Could not poll for new message") raise else: try: socks = dict(self.poller.poll()) except: self.log.error("Could not poll for new message") raise # if there was a response if self.dataSocket in socks and socks[ self.dataSocket] == zmq.POLLIN: try: multipartMessage = self.dataSocket.recv_multipart() except: self.log.error("Receiving data..failed.", exc_info=True) return [None, None] if multipartMessage[0] == b"ALIVE_TEST": continue elif len(multipartMessage) < 2: self.log.error( "Received mutipart-message is too short. Either config or file content is missing." ) self.log.debug("multipartMessage=" + str(mutipartMessage)[:100]) return [None, None] # extract multipart message try: metadata = cPickle.loads(multipartMessage[0]) except: self.log.error( "Could not extract metadata from the multipart-message.", exc_info=True) metadata = None #TODO validate multipartMessage (like correct dict-values for metadata) try: payload = multipartMessage[1] except: self.log.warning( "An empty file was received within the multipart-message", exc_info=True) payload = None return [metadata, payload] else: self.log.warning("Could not receive data in the given time.") if self.queryNextStarted: try: self.requestSocket.send_multipart( ["CANCEL", self.queryNextStarted]) except Exception as e: self.log.error("Could not cancel the next query", exc_info=True) return [None, None] def store(self, targetBasePath, dataObject): if type(dataObject) is not list and len(dataObject) != 2: raise FormatError("Wrong input type for 'store'") payloadMetadata = dataObject[0] payload = dataObject[1] if type(payloadMetadata) is not dict: raise FormatError("payload: Wrong input format in 'store'") #save all chunks to file while True: #TODO check if payload != cPickle.dumps(None) ? if payloadMetadata and payload: #append to file try: self.log.debug( "append to file based on multipart-message...") #TODO: save message to file using a thread (avoids blocking) #TODO: instead of open/close file for each chunk recyle the file-descriptor for all chunks opened self.__appendChunksToFile(targetBasePath, payloadMetadata, payload) self.log.debug( "append to file based on multipart-message...success.") except KeyboardInterrupt: self.log.info( "KeyboardInterrupt detected. Unable to append multipart-content to file." ) break except Exception, e: self.log.error( "Unable to append multipart-content to file.", exc_info=True) self.log.debug( "Append to file based on multipart-message...failed.") if len(payload) < payloadMetadata["chunkSize"]: #indicated end of file. Leave loop filename = self.generateTargetFilepath( targetBasePath, payloadMetadata) fileModTime = payloadMetadata["fileModTime"] self.log.info("New file with modification time " + str(fileModTime) + " received and saved: " + str(filename)) break try: [payloadMetadata, payload] = self.get() except: self.log.error("Getting data failed.", exc_info=True) break
def main(): port = "5556" socket_ip = "*" # ip = socket.getfqdn() context = zmq.Context() auth = ThreadAuthenticator(context) auth.start() whitelist = [socket.getfqdn()] for host in whitelist: hostname, tmp, ip = socket.gethostbyaddr(host) auth.allow(ip[0]) zmq_socket = context.socket(zmq.PUSH) zmq_socket.zap_domain = b'global' zmq_socket.bind("tcp://" + socket_ip + ":%s" % port) try: for i in range(5): message = ["World"] print("Send: ", message) res = zmq_socket.send_multipart(message, copy=False, track=True) if res.done: print("res: done") else: print("res: waiting") res.wait() print("res: waiting...") print("sleeping...") if i == 1: auth.stop() zmq_socket.close(0) auth.start() # ip = socket.gethostbyaddr(socket.getfqdn())[2] # auth.allow(ip[0]) ip = socket.gethostbyaddr(socket.getfqdn())[2] auth.deny(ip[0]) zmq_socket = context.socket(zmq.PUSH) zmq_socket.zap_domain = b'global' zmq_socket.bind("tcp://" + socket_ip + ":%s" % port) time.sleep(1) print("sleeping...done") i += 1 finally: auth.stop()
def start (self, dataSocket = False, whitelist = None): # Receive data only from whitelisted nodes if whitelist: if type(whitelist) == list: self.auth = ThreadAuthenticator(self.context) self.auth.start() for host in whitelist: try: if host == "localhost": ip = [socket.gethostbyname(host)] else: hostname, tmp, ip = socket.gethostbyaddr(host) self.log.debug("Allowing host " + host + " (" + str(ip[0]) + ")") self.auth.allow(ip[0]) except: self.log.error("Error was: ", exc_info=True) raise AuthenticationFailed("Could not get IP of host " + host) else: raise FormatError("Whitelist has to be a list of IPs") socketIdToConnect = self.streamStarted or self.queryNextStarted if socketIdToConnect: self.log.info("Reopening already started connection.") else: ip = "0.0.0.0" #TODO use IP of hostname? host = "" port = "" if dataSocket: if type(dataSocket) == list: socketIdToConnect = dataSocket[0] + ":" + dataSocket[1] host = dataSocket[0] ip = socket.gethostbyaddr(host)[2][0] port = dataSocket[1] else: port = str(dataSocket) host = socket.gethostname() socketId = host + ":" + port ipFromHost = socket.gethostbyaddr(host)[2] if len(ipFromHost) == 1: ip = ipFromHost[0] elif len(self.targets) == 1: host, port = self.targets[0][0].split(":") ipFromHost = socket.gethostbyaddr(host)[2] if len(ipFromHost) == 1: ip = ipFromHost[0] else: raise FormatError("Multipe possible ports. Please choose which one to use.") socketId = host + ":" + port socketIdToConnect = ip + ":" + port # socketIdToConnect = "[" + ip + "]:" + port self.dataSocket = self.context.socket(zmq.PULL) # An additional socket is needed to establish the data retriving mechanism connectionStr = "tcp://" + socketIdToConnect if whitelist: self.dataSocket.zap_domain = b'global' try: # self.dataSocket.ipv6 = True self.dataSocket.bind(connectionStr) # self.dataSocket.bind("tcp://[2003:ce:5bc0:a600:fa16:54ff:fef4:9fc0]:50102") self.log.info("Data socket of type " + self.connectionType + " started (bind) for '" + connectionStr + "'") except: self.log.error("Failed to start Socket of type " + self.connectionType + " (bind): '" + connectionStr + "'", exc_info=True) raise self.poller.register(self.dataSocket, zmq.POLLIN) if self.connectionType in ["queryNext", "queryMetadata"]: self.requestSocket = self.context.socket(zmq.PUSH) # An additional socket is needed to establish the data retriving mechanism connectionStr = "tcp://" + self.signalHost + ":" + self.requestPort try: self.requestSocket.connect(connectionStr) self.log.info("Request socket started (connect) for '" + connectionStr + "'") except: self.log.error("Failed to start Socket of type " + self.connectionType + " (connect): '" + connectionStr + "'", exc_info=True) raise self.queryNextStarted = socketId else: self.streamStarted = socketId
def run(): ''' Run Ironhouse example ''' # These directories are generated by the generate_certificates script base_dir = os.path.dirname(__file__) keys_dir = os.path.join(base_dir, 'certificates') public_keys_dir = os.path.join(base_dir, 'public_keys') secret_keys_dir = os.path.join(base_dir, 'private_keys') if not ( os.path.exists(keys_dir) and os.path.exists(public_keys_dir) and os.path.exists(secret_keys_dir) ): logging.critical( "Certificates are missing - run generate_certificates.py script first" ) sys.exit(1) ctx = zmq.Context.instance() # Start an authenticator for this context. auth = ThreadAuthenticator(ctx) auth.start() auth.allow('127.0.0.1') # Tell authenticator to use the certificate in a directory auth.configure_curve(domain='*', location=public_keys_dir) server = ctx.socket(zmq.PUSH) server_secret_file = os.path.join(secret_keys_dir, "server.key_secret") server_public, server_secret = zmq.auth.load_certificate(server_secret_file) server.curve_secretkey = server_secret server.curve_publickey = server_public server.curve_server = True # must come before bind server.bind('tcp://*:9000') client = ctx.socket(zmq.PULL) # We need two certificates, one for the client and one for # the server. The client must know the server's public key # to make a CURVE connection. client_secret_file = os.path.join(secret_keys_dir, "client.key_secret") client_public, client_secret = zmq.auth.load_certificate(client_secret_file) client.curve_secretkey = client_secret client.curve_publickey = client_public server_public_file = os.path.join(public_keys_dir, "server.key") server_public, _ = zmq.auth.load_certificate(server_public_file) # The client must know the server's public key to make a CURVE connection. client.curve_serverkey = server_public client.connect('tcp://127.0.0.1:9000') server.send(b"Hello") if client.poll(1000): msg = client.recv() if msg == b"Hello": logging.info("Ironhouse test OK") else: logging.error("Ironhouse test FAIL") # stop auth thread auth.stop()
class RpcClient: """""" def __init__(self): """Constructor""" # zmq port related self.__context: zmq.Context = zmq.Context() # Request socket (Request–reply pattern) self.__socket_req: zmq.Socket = self.__context.socket(zmq.REQ) # Subscribe socket (Publish–subscribe pattern) self.__socket_sub: zmq.Socket = self.__context.socket(zmq.SUB) # Worker thread relate, used to process data pushed from server self.__active: bool = False # RpcClient status self.__thread: threading.Thread = None # RpcClient thread self.__lock: threading.Lock = threading.Lock() # Authenticator used to ensure data security self.__authenticator: ThreadAuthenticator = None self._last_received_ping: datetime = datetime.utcnow() @lru_cache(100) def __getattr__(self, name: str): """ Realize remote call function """ # Perform remote call task def dorpc(*args, **kwargs): # Get timeout value from kwargs, default value is 30 seconds if "timeout" in kwargs: timeout = kwargs.pop("timeout") else: timeout = 30000 # Generate request req = [name, args, kwargs] # Send request and wait for response with self.__lock: self.__socket_req.send_pyobj(req) # Timeout reached without any data n = self.__socket_req.poll(timeout) if not n: msg = f"Timeout of {timeout}ms reached for {req}" raise RemoteException(msg) rep = self.__socket_req.recv_pyobj() # Return response if successed; Trigger exception if failed if rep[0]: return rep[1] else: raise RemoteException(rep[1]) return dorpc def start(self, req_address: str, sub_address: str, client_secretkey_path: str = "", server_publickey_path: str = "", username: str = "", password: str = "") -> None: """ Start RpcClient """ if self.__active: return # Start authenticator if client_secretkey_path and server_publickey_path: self.__authenticator = ThreadAuthenticator(self.__context) self.__authenticator.start() self.__authenticator.configure_curve( domain="*", location=zmq.auth.CURVE_ALLOW_ANY) publickey, secretkey = zmq.auth.load_certificate( client_secretkey_path) serverkey, _ = zmq.auth.load_certificate(server_publickey_path) self.__socket_sub.curve_secretkey = secretkey self.__socket_sub.curve_publickey = publickey self.__socket_sub.curve_serverkey = serverkey self.__socket_req.curve_secretkey = secretkey self.__socket_req.curve_publickey = publickey self.__socket_req.curve_serverkey = serverkey elif username and password: self.__authenticator = ThreadAuthenticator(self.__context) self.__authenticator.start() self.__authenticator.configure_plain( domain="*", passwords={username: password}) self.__socket_sub.plain_username = username.encode() self.__socket_sub.plain_password = password.encode() self.__socket_req.plain_username = username.encode() self.__socket_req.plain_password = password.encode() # Connect zmq port self.__socket_req.connect(req_address) self.__socket_sub.connect(sub_address) # Start RpcClient status self.__active = True # Start RpcClient thread self.__thread = threading.Thread(target=self.run) self.__thread.start() self._last_received_ping = datetime.utcnow() def stop(self) -> None: """ Stop RpcClient """ if not self.__active: return # Stop RpcClient status self.__active = False def join(self) -> None: # Wait for RpcClient thread to exit if self.__thread and self.__thread.is_alive(): self.__thread.join() self.__thread = None def run(self) -> None: """ Run RpcClient function """ pull_tolerance = int(KEEP_ALIVE_TOLERANCE.total_seconds() * 1000) while self.__active: if not self.__socket_sub.poll(pull_tolerance): self.on_disconnected() continue # Receive data from subscribe socket topic, data = self.__socket_sub.recv_pyobj(flags=NOBLOCK) if topic == KEEP_ALIVE_TOPIC: self._last_received_ping = data else: # Process data by callable function self.callback(topic, data) # Close socket self.__socket_req.close() self.__socket_sub.close() def callback(self, topic: str, data: Any) -> None: """ Callable function """ raise NotImplementedError def subscribe_topic(self, topic: str) -> None: """ Subscribe data """ self.__socket_sub.setsockopt_string(zmq.SUBSCRIBE, topic) def on_disconnected(self): """ Callback when heartbeat is lost. """ print( "RpcServer has no response over {tolerance} seconds, please check you connection." .format(tolerance=KEEP_ALIVE_TOLERANCE.total_seconds()))
def run(self): self.set_status("Client Startup") self.set_status("Creating zmq Contexts",1) clientctx = zmq.Context() self.set_status("Starting zmq ThreadedAuthenticator",1) #clientauth = zmq.auth.ThreadedAuthenticator(clientctx) clientauth = ThreadAuthenticator(clientctx) clientauth.start() with taco.globals.settings_lock: publicdir = os.path.normpath(os.path.abspath(taco.globals.settings["TacoNET Certificates Store"] + "/" + taco.globals.settings["Local UUID"] + "/public/")) privatedir = os.path.normpath(os.path.abspath(taco.globals.settings["TacoNET Certificates Store"] + "/" + taco.globals.settings["Local UUID"] + "/private/")) self.set_status("Configuring Curve to use publickey dir:" + publicdir) clientauth.configure_curve(domain='*', location=publicdir) poller = zmq.Poller() while not self.stop.is_set(): #logging.debug("PRE") result = self.sleep.wait(0.1) #logging.debug(result) self.sleep.clear() if self.stop.is_set(): break if abs(time.time() - self.connect_block_time) > 1: with taco.globals.settings_lock: self.max_upload_rate = taco.globals.settings["Upload Limit"] * taco.constants.KB with taco.globals.settings_lock: self.max_download_rate = taco.globals.settings["Download Limit"] * taco.constants.KB self.chunk_request_rate = float(taco.constants.FILESYSTEM_CHUNK_SIZE) / float(self.max_download_rate) #logging.debug(str((self.max_download_rate,taco.constants.FILESYSTEM_CHUNK_SIZE,self.chunk_request_rate))) self.connect_block_time = time.time() with taco.globals.settings_lock: for peer_uuid in taco.globals.settings["Peers"].keys(): if taco.globals.settings["Peers"][peer_uuid]["enabled"]: #init some defaults if not peer_uuid in self.client_reconnect_mod: self.client_reconnect_mod[peer_uuid] = taco.constants.CLIENT_RECONNECT_MIN if not peer_uuid in self.client_connect_time: self.client_connect_time[peer_uuid] = time.time() + self.client_reconnect_mod[peer_uuid] if not peer_uuid in self.client_timeout: self.client_timeout[peer_uuid] = time.time() + taco.constants.ROLLCALL_TIMEOUT if time.time() >= self.client_connect_time[peer_uuid]: if peer_uuid not in self.clients.keys(): self.set_status("Starting Client for: " + peer_uuid) try: ip_of_client = socket.gethostbyname(taco.globals.settings["Peers"][peer_uuid]["hostname"]) except: self.set_status("Starting of client failed due to bad dns lookup:" + peer_uuid) continue self.clients[peer_uuid] = clientctx.socket(zmq.DEALER) self.clients[peer_uuid].setsockopt(zmq.LINGER, 0) client_public, client_secret = zmq.auth.load_certificate(os.path.normpath(os.path.abspath(privatedir + "/" + taco.constants.KEY_GENERATION_PREFIX +"-client.key_secret"))) self.clients[peer_uuid].curve_secretkey = client_secret self.clients[peer_uuid].curve_publickey = client_public self.clients[peer_uuid].curve_serverkey = str(taco.globals.settings["Peers"][peer_uuid]["serverkey"]) self.clients[peer_uuid].connect("tcp://" + ip_of_client + ":" + str(taco.globals.settings["Peers"][peer_uuid]["port"])) self.next_rollcall[peer_uuid] = time.time() with taco.globals.high_priority_output_queue_lock: taco.globals.high_priority_output_queue[peer_uuid] = Queue.Queue() with taco.globals.medium_priority_output_queue_lock: taco.globals.medium_priority_output_queue[peer_uuid] = Queue.Queue() with taco.globals.low_priority_output_queue_lock: taco.globals.low_priority_output_queue[peer_uuid] = Queue.Queue() with taco.globals.file_request_output_queue_lock: taco.globals.file_request_output_queue[peer_uuid] = Queue.Queue() poller.register(self.clients[peer_uuid],zmq.POLLIN) if len(self.clients.keys()) == 0: continue peer_keys = self.clients.keys() random.shuffle(peer_keys) for peer_uuid in peer_keys: #self.set_status("Socket Write Possible:" + peer_uuid) #high priority queue processing with taco.globals.high_priority_output_queue_lock: while not taco.globals.high_priority_output_queue[peer_uuid].empty(): self.set_status("high priority output q not empty:" + peer_uuid) data = taco.globals.high_priority_output_queue[peer_uuid].get() self.clients[peer_uuid].send_multipart(['',data]) self.sleep.set() with taco.globals.upload_limiter_lock: taco.globals.upload_limiter.add(len(data)) #medium priority queue processing with taco.globals.medium_priority_output_queue_lock: while not taco.globals.medium_priority_output_queue[peer_uuid].empty(): self.set_status("medium priority output q not empty:" + peer_uuid) data = taco.globals.medium_priority_output_queue[peer_uuid].get() self.clients[peer_uuid].send_multipart(['',data]) self.sleep.set() with taco.globals.upload_limiter_lock: taco.globals.upload_limiter.add(len(data)) #filereq q, aka the download throttle if time.time() >= self.file_request_time: self.file_request_time = time.time() with taco.globals.file_request_output_queue_lock: if not taco.globals.file_request_output_queue[peer_uuid].empty(): with taco.globals.download_limiter_lock: download_rate = taco.globals.download_limiter.get_rate() bw_percent = download_rate / self.max_download_rate wait_time = self.chunk_request_rate * bw_percent #self.set_status(str((download_rate,self.max_download_rate,self.chunk_request_rate,bw_percent,wait_time))) if wait_time > 0.01: self.file_request_time += wait_time if download_rate < self.max_download_rate: self.set_status("filereq output q not empty+free bw:" + peer_uuid) data = taco.globals.file_request_output_queue[peer_uuid].get() self.clients[peer_uuid].send_multipart(['',data]) self.sleep.set() with taco.globals.upload_limiter_lock: taco.globals.upload_limiter.add(len(data)) #low priority queue processing with taco.globals.low_priority_output_queue_lock: if not taco.globals.low_priority_output_queue[peer_uuid].empty(): with taco.globals.upload_limiter_lock: upload_rate = taco.globals.upload_limiter.get_rate() if upload_rate < self.max_upload_rate: self.set_status("low priority output q not empty+free bw:" + peer_uuid) data = taco.globals.low_priority_output_queue[peer_uuid].get() self.clients[peer_uuid].send_multipart(['',data]) self.sleep.set() with taco.globals.upload_limiter_lock: taco.globals.upload_limiter.add(len(data)) #rollcall special case if self.next_rollcall[peer_uuid] < time.time(): #self.set_status("Requesting Rollcall from: " + peer_uuid) data = taco.commands.Request_Rollcall() self.clients[peer_uuid].send_multipart(['',data]) with taco.globals.upload_limiter_lock: taco.globals.upload_limiter.add(len(data)) self.next_rollcall[peer_uuid] = time.time() + random.randint(taco.constants.ROLLCALL_MIN,taco.constants.ROLLCALL_MAX) self.sleep.set() #continue #RECEIVE BLOCK socks = dict(poller.poll(0)) while self.clients[peer_uuid] in socks and socks[self.clients[peer_uuid]] == zmq.POLLIN: #self.set_status("Socket Read Possible") sink,data = self.clients[peer_uuid].recv_multipart() with taco.globals.download_limiter_lock: taco.globals.download_limiter.add(len(data)) self.set_client_last_reply(peer_uuid) self.next_request = taco.commands.Process_Reply(peer_uuid,data) if self.next_request != "": with taco.globals.medium_priority_output_queue_lock: taco.globals.medium_priority_output_queue[peer_uuid].put(self.next_request) self.sleep.set() socks = dict(poller.poll(0)) #cleanup block self.error_msg = [] if self.clients[peer_uuid] in socks and socks[self.clients[peer_uuid]] == zmq.POLLERR: self.error_msg.append("got a socket error") if abs(self.client_timeout[peer_uuid] - time.time()) > taco.constants.ROLLCALL_TIMEOUT: self.error_msg.append("havn't seen communications") if len(self.error_msg) > 0: self.set_status("Stopping client: " + peer_uuid + " -- " + " and ".join(self.error_msg),2) poller.unregister(self.clients[peer_uuid]) self.clients[peer_uuid].close(0) del self.clients[peer_uuid] del self.client_timeout[peer_uuid] with taco.globals.high_priority_output_queue_lock: del taco.globals.high_priority_output_queue[peer_uuid] with taco.globals.medium_priority_output_queue_lock: del taco.globals.medium_priority_output_queue[peer_uuid] with taco.globals.low_priority_output_queue_lock: del taco.globals.low_priority_output_queue[peer_uuid] with taco.globals.file_request_output_queue_lock: del taco.globals.file_request_output_queue[peer_uuid] self.client_reconnect_mod[peer_uuid] = min(self.client_reconnect_mod[peer_uuid] + taco.constants.CLIENT_RECONNECT_MOD,taco.constants.CLIENT_RECONNECT_MAX) self.client_connect_time[peer_uuid] = time.time() + self.client_reconnect_mod[peer_uuid] self.set_status("Terminating Clients") for peer_uuid in self.clients.keys(): self.clients[peer_uuid].close(0) self.set_status("Stopping zmq ThreadedAuthenticator") clientauth.stop() clientctx.term() self.set_status("Clients Exit")
def handle(self, *args, **options): # Initialize logging. self.setup_logging("lava-logs", options["level"], options["log_file"], FORMAT) self.logger.info("[INIT] Dropping privileges") if not self.drop_privileges(options['user'], options['group']): self.logger.error("[INIT] Unable to drop privileges") return # Create the sockets context = zmq.Context() self.log_socket = context.socket(zmq.PULL) self.controler = context.socket(zmq.ROUTER) self.controler.setsockopt(zmq.IDENTITY, b"lava-logs") # Limit the number of messages in the queue self.controler.setsockopt(zmq.SNDHWM, 2) # From http://api.zeromq.org/4-2:zmq-setsockopt#toc5 # "Immediately readies that connection for data transfer with the master" self.controler.setsockopt(zmq.CONNECT_RID, b"master") if options['ipv6']: self.logger.info("[INIT] Enabling IPv6") self.log_socket.setsockopt(zmq.IPV6, 1) self.controler.setsockopt(zmq.IPV6, 1) if options['encrypt']: self.logger.info("[INIT] Starting encryption") try: self.auth = ThreadAuthenticator(context) self.auth.start() self.logger.debug("[INIT] Opening master certificate: %s", options['master_cert']) master_public, master_secret = zmq.auth.load_certificate(options['master_cert']) self.logger.debug("[INIT] Using slaves certificates from: %s", options['slaves_certs']) self.auth.configure_curve(domain='*', location=options['slaves_certs']) except IOError as err: self.logger.error("[INIT] %s", err) self.auth.stop() return self.log_socket.curve_publickey = master_public self.log_socket.curve_secretkey = master_secret self.log_socket.curve_server = True self.controler.curve_publickey = master_public self.controler.curve_secretkey = master_secret self.controler.curve_serverkey = master_public self.logger.debug("[INIT] Watching %s", options["slaves_certs"]) self.cert_dir_path = options["slaves_certs"] self.inotify_fd = watch_directory(options["slaves_certs"]) if self.inotify_fd is None: self.logger.error("[INIT] Unable to start inotify") self.log_socket.bind(options['socket']) self.controler.connect(options['master_socket']) # Poll on the sockets. This allow to have a # nice timeout along with polling. self.poller = zmq.Poller() self.poller.register(self.log_socket, zmq.POLLIN) self.poller.register(self.controler, zmq.POLLIN) if self.inotify_fd is not None: self.poller.register(os.fdopen(self.inotify_fd), zmq.POLLIN) # Translate signals into zmq messages (self.pipe_r, _) = self.setup_zmq_signal_handler() self.poller.register(self.pipe_r, zmq.POLLIN) self.logger.info("[INIT] listening for logs") # PING right now: the master is waiting for this message to start # scheduling. self.controler.send_multipart([b"master", b"PING"]) try: self.main_loop() except BaseException as exc: self.logger.error("[EXIT] Unknown exception raised, leaving!") self.logger.exception(exc) # Close the controler socket self.controler.close(linger=0) self.poller.unregister(self.controler) # Carefully close the logging socket as we don't want to lose messages self.logger.info("[EXIT] Disconnect logging socket and process messages") endpoint = u(self.log_socket.getsockopt(zmq.LAST_ENDPOINT)) self.logger.debug("[EXIT] unbinding from '%s'", endpoint) self.log_socket.unbind(endpoint) # Empty the queue try: while self.wait_for_messages(True): # Flush test cases cache for every iteration because we might # get killed soon. self.flush_test_cases() except BaseException as exc: self.logger.error("[EXIT] Unknown exception raised, leaving!") self.logger.exception(exc) finally: # Last flush self.flush_test_cases() self.logger.info("[EXIT] Closing the logging socket: the queue is empty") self.log_socket.close() if options['encrypt']: self.auth.stop() context.term()
class Command(LAVADaemonCommand): """ worker_host is the hostname of the worker this field is set by the admin and could therefore be empty in a misconfigured instance. """ logger = None help = "LAVA dispatcher master" default_logfile = "/var/log/lava-server/lava-master.log" def __init__(self, *args, **options): super().__init__(*args, **options) self.auth = None self.controler = None self.event_socket = None self.poller = None self.pipe_r = None self.inotify_fd = None # List of logs # List of known dispatchers. At startup do not load this from the # database. This will help to know if the slave as restarted or not. self.dispatchers = { "lava-logs": SlaveDispatcher("lava-logs", online=False) } self.events = {"canceling": set(), "available_dt": set()} def add_arguments(self, parser): super().add_arguments(parser) net = parser.add_argument_group("network") net.add_argument( '--master-socket', default='tcp://*:5556', help="Socket for master-slave communication. Default: tcp://*:5556" ) net.add_argument('--event-url', default="tcp://localhost:5500", help="URL of the publisher") net.add_argument('--ipv6', default=False, action='store_true', help="Enable IPv6 on the listening sockets") net.add_argument('--encrypt', default=False, action='store_true', help="Encrypt messages") net.add_argument( '--master-cert', default='/etc/lava-dispatcher/certificates.d/master.key_secret', help="Certificate for the master socket") net.add_argument('--slaves-certs', default='/etc/lava-dispatcher/certificates.d', help="Directory for slaves certificates") def send_status(self, hostname): """ The master crashed, send a STATUS message to get the current state of jobs """ jobs = TestJob.objects.filter( actual_device__worker_host__hostname=hostname, state=TestJob.STATE_RUNNING) for job in jobs: self.logger.info("[%d] STATUS => %s (%s)", job.id, hostname, job.actual_device.hostname) send_multipart_u(self.controler, [hostname, 'STATUS', str(job.id)]) def dispatcher_alive(self, hostname): if hostname not in self.dispatchers: # The server crashed: send a STATUS message self.logger.warning("Unknown dispatcher <%s> (server crashed)", hostname) self.dispatchers[hostname] = SlaveDispatcher(hostname) self.send_status(hostname) # Mark the dispatcher as alive self.dispatchers[hostname].alive() def controler_socket(self): try: # We need here to use the zmq.NOBLOCK flag, otherwise we could block # the whole main loop where this function is called. msg = self.controler.recv_multipart(zmq.NOBLOCK) except zmq.error.Again: return False # This is way to verbose for production and should only be activated # by (and for) developers # self.logger.debug("[CC] Receiving: %s", msg) # 1: the hostname (see ZMQ documentation) hostname = u(msg[0]) # 2: the action action = u(msg[1]) # Check that lava-logs only send PINGs if hostname == "lava-logs" and action != "PING": self.logger.error("%s => %s Invalid action from log daemon", hostname, action) return True # Handle the actions if action == 'HELLO' or action == 'HELLO_RETRY': self._handle_hello(hostname, action, msg) elif action == 'PING': self._handle_ping(hostname, action, msg) elif action == 'END': self._handle_end(hostname, action, msg) elif action == 'START_OK': self._handle_start_ok(hostname, action, msg) else: self.logger.error("<%s> sent unknown action=%s, args=(%s)", hostname, action, msg[1:]) return True def read_event_socket(self): try: msg = self.event_socket.recv_multipart(zmq.NOBLOCK) except zmq.error.Again: return False try: (topic, _, dt, username, data) = (u(m) for m in msg) data = simplejson.loads(data) except ValueError: self.logger.error("Invalid event: %s", msg) return True if topic.endswith(".testjob"): if data["state"] == "Canceling": self.events["canceling"].add(int(data["job"])) elif data["state"] == "Submitted": if "device_type" in data: self.events["available_dt"].add(data["device_type"]) elif topic.endswith(".device"): if data["state"] == "Idle" and data["health"] in [ "Good", "Unknown", "Looping" ]: self.events["available_dt"].add(data["device_type"]) return True def _handle_end(self, hostname, action, msg): # pylint: disable=unused-argument try: job_id = int(msg[2]) error_msg = msg[3] compressed_description = msg[4] except (IndexError, ValueError): self.logger.error("Invalid message from <%s> '%s'", hostname, msg) return try: job = TestJob.objects.get(id=job_id) except TestJob.DoesNotExist: self.logger.error("[%d] Unknown job", job_id) # ACK even if the job is unknown to let the dispatcher # forget about it send_multipart_u(self.controler, [hostname, 'END_OK', str(job_id)]) return filename = os.path.join(job.output_dir, 'description.yaml') # If description.yaml already exists: a END was already received if os.path.exists(filename): self.logger.info("[%d] %s => END (duplicated), skipping", job_id, hostname) else: if compressed_description: self.logger.info("[%d] %s => END", job_id, hostname) else: self.logger.info( "[%d] %s => END (lava-run crashed, mark job as INCOMPLETE)", job_id, hostname) with transaction.atomic(): # TODO: find a way to lock actual_device job = TestJob.objects.select_for_update() \ .get(id=job_id) job.go_state_finished(TestJob.HEALTH_INCOMPLETE) if error_msg: self.logger.error("[%d] Error: %s", job_id, error_msg) job.failure_comment = error_msg job.save() # Create description.yaml even if it's empty # Allows to know when END messages are duplicated try: # Create the directory if it was not already created mkdir(os.path.dirname(filename)) # TODO: check that compressed_description is not "" description = lzma.decompress(compressed_description) with open(filename, 'w') as f_description: f_description.write(description.decode("utf-8")) if description: parse_job_description(job) except (OSError, lzma.LZMAError) as exc: self.logger.error("[%d] Unable to dump 'description.yaml'", job_id) self.logger.exception("[%d] %s", job_id, exc) # ACK the job and mark the dispatcher as alive send_multipart_u(self.controler, [hostname, 'END_OK', str(job_id)]) self.dispatcher_alive(hostname) def _handle_hello(self, hostname, action, msg): # Check the protocol version try: slave_version = int(msg[2]) except (IndexError, ValueError): self.logger.error("Invalid message from <%s> '%s'", hostname, msg) return self.logger.info("%s => %s", hostname, action) if slave_version != PROTOCOL_VERSION: self.logger.error( "<%s> using protocol v%d while master is using v%d", hostname, slave_version, PROTOCOL_VERSION) return send_multipart_u(self.controler, [hostname, 'HELLO_OK']) # If the dispatcher is known and sent an HELLO, means that # the slave has restarted if hostname in self.dispatchers: if action == 'HELLO': self.logger.warning("Dispatcher <%s> has RESTARTED", hostname) else: # Assume the HELLO command was received, and the # action succeeded. self.logger.warning("Dispatcher <%s> was not confirmed", hostname) else: # No dispatcher, treat HELLO and HELLO_RETRY as a normal HELLO # message. self.logger.warning("New dispatcher <%s>", hostname) self.dispatchers[hostname] = SlaveDispatcher(hostname) # Mark the dispatcher as alive self.dispatcher_alive(hostname) def _handle_ping(self, hostname, action, msg): # pylint: disable=unused-argument self.logger.debug("%s => PING(%d)", hostname, PING_INTERVAL) # Send back a signal send_multipart_u( self.controler, [hostname, 'PONG', str(PING_INTERVAL)]) self.dispatcher_alive(hostname) def _handle_start_ok(self, hostname, action, msg): # pylint: disable=unused-argument try: job_id = int(msg[2]) except (IndexError, ValueError): self.logger.error("Invalid message from <%s> '%s'", hostname, msg) return self.logger.info("[%d] %s => START_OK", job_id, hostname) try: with transaction.atomic(): # TODO: find a way to lock actual_device job = TestJob.objects.select_for_update() \ .get(id=job_id) job.go_state_running() job.save() except TestJob.DoesNotExist: self.logger.error("[%d] Unknown job", job_id) else: self.dispatcher_alive(hostname) def export_definition(self, job): # pylint: disable=no-self-use job_def = yaml.safe_load(job.definition) job_def['compatibility'] = job.pipeline_compatibility # no need for the dispatcher to retain comments return yaml.dump(job_def) def save_job_config(self, job, device_cfg, env_str, env_dut_str, dispatcher_cfg): output_dir = job.output_dir mkdir(output_dir) with open(os.path.join(output_dir, "job.yaml"), "w") as f_out: f_out.write(self.export_definition(job)) with open(os.path.join(output_dir, "device.yaml"), "w") as f_out: yaml.dump(device_cfg, f_out) if env_str: with open(os.path.join(output_dir, "env.yaml"), "w") as f_out: f_out.write(env_str) if env_dut_str: with open(os.path.join(output_dir, "env.dut.yaml"), "w") as f_out: f_out.write(env_dut_str) if dispatcher_cfg: with open(os.path.join(output_dir, "dispatcher.yaml"), "w") as f_out: f_out.write(dispatcher_cfg) def start_job(self, job): # Load job definition to get the variables for template # rendering job_def = yaml.safe_load(job.definition) job_ctx = job_def.get('context', {}) device = job.actual_device worker = device.worker_host # TODO: check that device_cfg is not None! device_cfg = device.load_configuration(job_ctx) # Try to load the dispatcher specific files and then fallback to the # default configuration files. env_str = load_optional_yaml_file( os.path.join(DISPATCHERS_PATH, worker.hostname, "env.yaml"), ENV_PATH) env_dut_str = load_optional_yaml_file( os.path.join(DISPATCHERS_PATH, worker.hostname, "env.dut.yaml"), ENV_DUT_PATH) dispatcher_cfg = load_optional_yaml_file( os.path.join(DISPATCHERS_PATH, worker.hostname, "dispatcher.yaml"), os.path.join(DISPATCHERS_PATH, "%s.yaml" % worker.hostname)) self.save_job_config(job, device_cfg, env_str, env_dut_str, dispatcher_cfg) self.logger.info("[%d] START => %s (%s)", job.id, worker.hostname, device.hostname) send_multipart_u(self.controler, [ worker.hostname, 'START', str(job.id), self.export_definition(job), yaml.dump(device_cfg), dispatcher_cfg, env_str, env_dut_str ]) # For multinode jobs, start the dynamic connections parent = job for sub_job in job.sub_jobs_list: if sub_job == parent or not sub_job.dynamic_connection: continue # inherit only enough configuration for dynamic_connection operation self.logger.info( "[%d] Trimming dynamic connection device configuration.", sub_job.id) min_device_cfg = parent.actual_device.minimise_configuration( device_cfg) self.save_job_config(sub_job, min_device_cfg, env_str, env_dut_str, dispatcher_cfg) self.logger.info("[%d] START => %s (connection)", sub_job.id, worker.hostname) send_multipart_u(self.controler, [ worker.hostname, 'START', str(sub_job.id), self.export_definition(sub_job), yaml.dump(min_device_cfg), dispatcher_cfg, env_str, env_dut_str ]) def start_jobs(self, jobs=None): """ Loop on all scheduled jobs and send the START message to the slave. """ # make the request atomic query = TestJob.objects.select_for_update() # Only select test job that are ready query = query.filter(state=TestJob.STATE_SCHEDULED) # Only start jobs on online workers query = query.filter( actual_device__worker_host__state=Worker.STATE_ONLINE) # exclude test job without a device: they are special test jobs like # dynamic connection. query = query.exclude(actual_device=None) # Allow for partial scheduling if jobs is not None: query = query.filter(id__in=jobs) # Loop on all jobs for job in query: msg = None try: self.start_job(job) except jinja2.TemplateNotFound as exc: self.logger.error("[%d] Template not found: '%s'", job.id, exc.message) msg = "Template not found: '%s'" % exc.message except jinja2.TemplateSyntaxError as exc: self.logger.error( "[%d] Template syntax error in '%s', line %d: %s", job.id, exc.name, exc.lineno, exc.message) msg = "Template syntax error in '%s', line %d: %s" % ( exc.name, exc.lineno, exc.message) except OSError as exc: self.logger.error("[%d] Unable to read '%s': %s", job.id, exc.filename, exc.strerror) msg = "Cannot open '%s': %s" % (exc.filename, exc.strerror) except yaml.YAMLError as exc: self.logger.error("[%d] Unable to parse job definition: %s", job.id, exc) msg = "Cannot parse job definition: %s" % exc if msg: # Add the error as lava.job result metadata = { "case": "job", "definition": "lava", "error_type": "Infrastructure", "error_msg": msg, "result": "fail" } suite, _ = TestSuite.objects.get_or_create(name="lava", job=job) TestCase.objects.create(name="job", suite=suite, result=TestCase.RESULT_FAIL, metadata=yaml.dump(metadata)) job.go_state_finished(TestJob.HEALTH_INCOMPLETE, True) job.save() def cancel_jobs(self, partial=False): # make the request atomic query = TestJob.objects.select_for_update() # Only select the test job that are canceling query = query.filter(state=TestJob.STATE_CANCELING) # Only cancel jobs on online workers query = query.filter( actual_device__worker_host__state=Worker.STATE_ONLINE) # Allow for partial canceling if partial: query = query.filter(id__in=list(self.events["canceling"])) # Loop on all jobs for job in query: worker = job.lookup_worker if job.dynamic_connection else job.actual_device.worker_host self.logger.info("[%d] CANCEL => %s", job.id, worker.hostname) send_multipart_u(self.controler, [worker.hostname, 'CANCEL', str(job.id)]) def handle(self, *args, **options): # Initialize logging. self.setup_logging("lava-master", options["level"], options["log_file"], FORMAT) self.logger.info("[INIT] Dropping privileges") if not self.drop_privileges(options['user'], options['group']): self.logger.error("[INIT] Unable to drop privileges") return filename = os.path.join(settings.MEDIA_ROOT, 'lava-master-config.yaml') self.logger.debug("[INIT] Dumping config to %s", filename) with open(filename, 'w') as output: yaml.dump(options, output) self.logger.info("[INIT] Marking all workers as offline") with transaction.atomic(): for worker in Worker.objects.select_for_update().all(): worker.go_state_offline() worker.save() # Create the sockets context = zmq.Context() self.controler = context.socket(zmq.ROUTER) self.event_socket = context.socket(zmq.SUB) if options['ipv6']: self.logger.info("[INIT] Enabling IPv6") self.controler.setsockopt(zmq.IPV6, 1) self.event_socket.setsockopt(zmq.IPV6, 1) if options['encrypt']: self.logger.info("[INIT] Starting encryption") try: self.auth = ThreadAuthenticator(context) self.auth.start() self.logger.debug("[INIT] Opening master certificate: %s", options['master_cert']) master_public, master_secret = zmq.auth.load_certificate( options['master_cert']) self.logger.debug("[INIT] Using slaves certificates from: %s", options['slaves_certs']) self.auth.configure_curve(domain='*', location=options['slaves_certs']) except OSError as err: self.logger.error(err) self.auth.stop() return self.controler.curve_publickey = master_public self.controler.curve_secretkey = master_secret self.controler.curve_server = True self.logger.debug("[INIT] Watching %s", options["slaves_certs"]) self.inotify_fd = watch_directory(options["slaves_certs"]) if self.inotify_fd is None: self.logger.error("[INIT] Unable to start inotify") self.controler.setsockopt(zmq.IDENTITY, b"master") # From http://api.zeromq.org/4-2:zmq-setsockopt#toc42 # "If two clients use the same identity when connecting to a ROUTER # [...] the ROUTER socket shall hand-over the connection to the new # client and disconnect the existing one." self.controler.setsockopt(zmq.ROUTER_HANDOVER, 1) self.controler.bind(options['master_socket']) self.event_socket.setsockopt(zmq.SUBSCRIBE, b(settings.EVENT_TOPIC)) self.event_socket.connect(options['event_url']) # Poll on the sockets. This allow to have a # nice timeout along with polling. self.poller = zmq.Poller() self.poller.register(self.controler, zmq.POLLIN) self.poller.register(self.event_socket, zmq.POLLIN) if self.inotify_fd is not None: self.poller.register(os.fdopen(self.inotify_fd), zmq.POLLIN) # Translate signals into zmq messages (self.pipe_r, _) = self.setup_zmq_signal_handler() self.poller.register(self.pipe_r, zmq.POLLIN) self.logger.info("[INIT] LAVA master has started.") self.logger.info("[INIT] Using protocol version %d", PROTOCOL_VERSION) try: self.main_loop(options) except BaseException as exc: self.logger.error("[CLOSE] Unknown exception raised, leaving!") self.logger.exception(exc) finally: # Drop controler socket: the protocol does handle lost messages self.logger.info( "[CLOSE] Closing the controler socket and dropping messages") self.controler.close(linger=0) self.event_socket.close(linger=0) if options['encrypt']: self.auth.stop() context.term() def main_loop(self, options): last_schedule = last_dispatcher_check = time.time() while True: try: try: # Compute the timeout now = time.time() timeout = min( SCHEDULE_INTERVAL - (now - last_schedule), PING_INTERVAL - (now - last_dispatcher_check)) # If some actions are remaining, decrease the timeout if any([self.events[k] for k in self.events.keys()]): timeout = min(timeout, 2) # Wait at least for 1ms timeout = max(timeout * 1000, 1) # Wait for data or a timeout sockets = dict(self.poller.poll(timeout)) except zmq.error.ZMQError: continue if sockets.get(self.pipe_r) == zmq.POLLIN: self.logger.info("[POLL] Received a signal, leaving") break # Command socket if sockets.get(self.controler) == zmq.POLLIN: while self.controler_socket( ): # Unqueue all pending messages pass # Events socket if sockets.get(self.event_socket) == zmq.POLLIN: while self.read_event_socket( ): # Unqueue all pending messages pass # Wait for the next iteration to handle the event. # In fact, the code that generated the event (lava-logs or # lava-server-gunicorn) needs some time to commit the # database transaction. # If we are too fast, the database object won't be # available (or in the right state) yet. continue # Inotify socket if sockets.get(self.inotify_fd) == zmq.POLLIN: os.read(self.inotify_fd, 4096) self.logger.debug("[AUTH] Reloading certificates from %s", options['slaves_certs']) self.auth.configure_curve(domain='*', location=options['slaves_certs']) # Check dispatchers status now = time.time() if now - last_dispatcher_check > PING_INTERVAL: for hostname, dispatcher in self.dispatchers.items(): if dispatcher.online and now - dispatcher.last_msg > DISPATCHER_TIMEOUT: if hostname == "lava-logs": self.logger.error( "[STATE] lava-logs goes OFFLINE") else: self.logger.error( "[STATE] Dispatcher <%s> goes OFFLINE", hostname) self.dispatchers[hostname].go_offline() last_dispatcher_check = now # Limit accesses to the database. This will also limit the rate of # CANCEL and START messages if time.time() - last_schedule > SCHEDULE_INTERVAL: if self.dispatchers["lava-logs"].online: schedule(self.logger) # Dispatch scheduled jobs with transaction.atomic(): self.start_jobs() else: self.logger.warning( "lava-logs is offline: can't schedule jobs") # Handle canceling jobs with transaction.atomic(): self.cancel_jobs() # Do not count the time taken to schedule jobs last_schedule = time.time() else: # Cancel the jobs and remove the jobs from the set if self.events["canceling"]: with transaction.atomic(): self.cancel_jobs(partial=True) self.events["canceling"] = set() # Schedule for available device-types if self.events["available_dt"]: jobs = schedule(self.logger, self.events["available_dt"]) self.events["available_dt"] = set() # Dispatch scheduled jobs with transaction.atomic(): self.start_jobs(jobs) except (OperationalError, InterfaceError): self.logger.info("[RESET] database connection reset.") # Closing the database connection will force Django to reopen # the connection connection.close() time.sleep(2)
import zmq import zmq.auth from zmq.auth.thread import ThreadAuthenticator context = zmq.Context() server = context.socket(zmq.REP) auth = ThreadAuthenticator(context) auth.start() auth.allow('127.0.0.1') auth.configure_plain(domain='*', passwords={'admin': 'password'}) server.plain_server = True server.setsockopt(zmq.PLAIN_SERVER, 1) server.connect('tcp://127.0.0.1:5556') msg = server.recv_string() server.send(b'Authenticated') auth.stop()
def make_auth(self): return ThreadAuthenticator(self.context)
class ZmqReceiver(object): def __init__(self, zmq_rep_bind_address=None, zmq_sub_connect_addresses=None, recreate_sockets_on_timeout_of_sec=600, username=None, password=None): self.context = zmq.Context() self.auth = None self.last_received_message = None self.is_running = False self.thread = None self.zmq_rep_bind_address = zmq_rep_bind_address self.zmq_sub_connect_addresses = zmq_sub_connect_addresses self.poller = zmq.Poller() self.sub_sockets = [] self.rep_socket = None if username is not None and password is not None: # Start an authenticator for this context. # Does not work on PUB/SUB as far as I (probably because the more secure solutions # require two way communication as well) self.auth = ThreadAuthenticator(self.context) self.auth.start() # Instruct authenticator to handle PLAIN requests self.auth.configure_plain(domain='*', passwords={username: password}) if self.zmq_sub_connect_addresses: for address in self.zmq_sub_connect_addresses: self.sub_sockets.append(SubSocket(self.context, self.poller, address, recreate_sockets_on_timeout_of_sec)) if zmq_rep_bind_address: self.rep_socket = RepSocket(self.context, self.poller, zmq_rep_bind_address, self.auth) # May take up to 60 seconds to actually stop since poller has timeout of 60 seconds def stop(self): self.is_running = False logger.info("Closing pub and sub sockets...") if self.auth is not None: self.auth.stop() def run(self): self.is_running = True while self.is_running: socks = dict(self.poller.poll(1000)) logger.debug("Poll cycle over. checking sockets") if self.rep_socket: incoming_message = self.rep_socket.recv_string(socks) if incoming_message is not None: self.last_received_message = incoming_message try: logger.debug("Got info from REP socket") response_message = self.handle_incoming_message(incoming_message) self.rep_socket.send(response_message) except Exception as e: logger.error(e) for sub_socket in self.sub_sockets: incoming_message = sub_socket.recv_string(socks) if incoming_message is not None: if incoming_message != "zmq_sub_heartbeat": self.last_received_message = incoming_message logger.debug("Got info from SUB socket") try: self.handle_incoming_message(incoming_message) except Exception as e: logger.error(e) if self.rep_socket: self.rep_socket.destroy() for sub_socket in self.sub_sockets: sub_socket.destroy() def create_response_message(self, status_code, status_message, response_message): if response_message is not None: return json.dumps({"status_code": status_code, "status_message": status_message, "response_message": response_message}) else: return json.dumps({"status_code": status_code, "status_message": status_message}) def handle_incoming_message(self, message): if message != "zmq_sub_heartbeat": return self.create_response_message(200, "OK", None)