class Broker(object):
    """Class that routes jobs to agents.

    Options:

    - **frontend**: the ZMQ socket to receive jobs.
    - **backend**: the ZMQ socket to communicate with agents.
    - **heartbeat**: the ZMQ socket to receive heartbeat requests.
      When None, no heartbeat responder is created.
    - **register** : the ZMQ socket to register agents.
    - **io_threads**: number of I/O threads given to the ZMQ context.
    - **agent_timeout**: forwarded as-is to the broker controller.
    - **receiver**: the ZMQ socket that receives data from agents.
    - **publisher**: the ZMQ socket to publish agents data
    - **db**: forwarded as-is to the broker controller.
    - **dboptions**: forwarded as-is to the broker controller.
    """

    def __init__(self, frontend=DEFAULT_FRONTEND, backend=DEFAULT_BACKEND,
                 heartbeat=None, register=DEFAULT_REG,
                 io_threads=DEFAULT_IOTHREADS,
                 agent_timeout=DEFAULT_TIMEOUT_MOVF,
                 receiver=DEFAULT_BROKER_RECEIVER,
                 publisher=DEFAULT_PUBLISHER,
                 db='python', dboptions=None):
        """Bind every socket and wire the stream callbacks.

        Raises DuplicateBrokerError when verify_broker() reports a pid
        for the frontend endpoint.
        """
        # before doing anything, we verify if a broker is already up and
        # running
        logger.debug('Verifying if there is a running broker')
        pid = verify_broker(frontend)
        if pid is not None:
            # oops. can't do this !
            logger.debug('Ooops, we have a running broker on that socket')
            raise DuplicateBrokerError(pid)

        self.endpoints = {'frontend': frontend,
                          'backend': backend,
                          'register': register,
                          'receiver': receiver,
                          'publisher': publisher}

        if heartbeat is not None:
            self.endpoints['heartbeat'] = heartbeat

        logger.debug('Initializing the broker.')

        for endpoint in self.endpoints.values():
            if endpoint.startswith('ipc'):
                register_ipc_file(endpoint)

        self.context = zmq.Context(io_threads=io_threads)

        # setting up the sockets
        self._frontend = self.context.socket(zmq.ROUTER)
        self._frontend.identity = 'broker-' + frontend
        self._frontend.bind(frontend)
        self._backend = self.context.socket(zmq.ROUTER)
        self._backend.bind(backend)
        self._registration = self.context.socket(zmq.PULL)
        self._registration.bind(register)
        self._receiver = self.context.socket(zmq.PULL)
        self._receiver.bind(receiver)
        self._publisher = self.context.socket(zmq.PUB)
        self._publisher.bind(publisher)

        # setting up the streams
        self.loop = ioloop.IOLoop()
        self._frontstream = zmqstream.ZMQStream(self._frontend, self.loop)
        self._frontstream.on_recv(self._handle_recv_front)
        self._backstream = zmqstream.ZMQStream(self._backend, self.loop)
        self._backstream.on_recv(self._handle_recv_back)
        self._regstream = zmqstream.ZMQStream(self._registration, self.loop)
        self._regstream.on_recv(self._handle_reg)
        self._rcvstream = zmqstream.ZMQStream(self._receiver, self.loop)
        self._rcvstream.on_recv(self._handle_recv)

        # heartbeat
        if heartbeat is not None:
            self.pong = Heartbeat(heartbeat, io_loop=self.loop,
                                  ctx=self.context,
                                  onregister=self._deregister)
        else:
            self.pong = None

        # status
        self.started = False
        self.poll_timeout = None

        # controller
        self.ctrl = BrokerController(self, self.loop, db=db,
                                     dboptions=dboptions,
                                     agent_timeout=agent_timeout)

    def _handle_recv(self, msg):
        """Publish the first frame received from an agent, then store it."""
        # publishing all the data received from agents
        self._publisher.send(msg[0])

        # saving the data locally
        data = json.loads(msg[0])
        agent_id = str(data.get('agent_id'))
        self.ctrl.save_data(agent_id, data)

    def _deregister(self):
        """Unregister all agents; passed to Heartbeat as `onregister`."""
        logger.debug('Unregistering all agents')
        self.ctrl.unregister_agents()

    def _handle_reg(self, msg):
        """Handle a registration message: msg[0] is the verb, msg[1] the
        value handed to the controller. Other verbs are ignored."""
        if msg[0] == 'REGISTER':
            self.ctrl.register_agent(msg[1])
        elif msg[0] == 'UNREGISTER':
            self.ctrl.unregister_agent(msg[1])

    def _send_json(self, target, data):
        """Send `data` as JSON on the front stream, after the `target`
        frames.

        Serialization failures are logged and re-raised.
        """
        try:
            self._frontstream.send_multipart(target + [json.dumps(data)])
        except (TypeError, ValueError):
            # json.dumps raises TypeError for unserializable objects and
            # ValueError for e.g. circular references; log both.
            logger.error('Could not dump %s' % str(data))
            raise

    def _handle_recv_front(self, msg, tentative=0):
        """Handle a message received on the frontend (front => back).

        Broker-level commands are answered directly through _send_json();
        anything else is forwarded to the agent named by `agent_id`.
        """
        # NOTE(review): the JSON payload is read from msg[2] while the
        # reply envelope is msg[:-1], which presumably means messages
        # always have three frames -- confirm against the client.
        data = json.loads(msg[2])
        target = msg[:-1]

        cmd = data['command']

        # 'PING' is used as a health check
        if cmd == 'PING':
            res = {'result': {'pid': os.getpid(),
                              'endpoints': self.endpoints,
                              'agents': self.ctrl.agents}}
            self._send_json(target, res)
            return
        elif cmd == 'LISTRUNS':
            logger.debug('Asked for LISTRUNS')
            res = {'result': self.ctrl.list_runs()}
            logger.debug('Got %s' % str(res))
            self._send_json(target, res)
            return
        elif cmd == 'STOPRUN':
            run_id = data['run_id']
            stopped_agents = self.ctrl.stop_run(run_id, msg)

            # we give back the list of agents we stopped
            res = {'result': stopped_agents}
            self._send_json(target, res)
            return
        elif cmd == 'GET_DATA':
            # we send back the data we have in the db
            # XXX stream ?
            db_data = self.ctrl.get_data(data['run_id'],
                                         data_type=data.get('data_type'),
                                         groupby=data.get('groupby', False))
            self._send_json(target, {'result': db_data})
            return
        elif cmd == 'GET_COUNTS':
            counts = self.ctrl.get_counts(data['run_id'])
            self._send_json(target, {'result': counts})
            return
        elif cmd == 'GET_METADATA':
            metadata = self.ctrl.get_metadata(data['run_id'])
            self._send_json(target, {'result': metadata})
            return

        # other commands below this point are for agents
        if tentative == 3:
            logger.debug('No agents')
            self._send_json(target, {'error': 'No agent'})
            return

        # `data` and `cmd` parsed above are still valid here: nothing on
        # the paths that reach this point has modified msg or data.

        # broker protocol
        if cmd == 'LIST':
            # we return a list of agent ids and their status
            self._send_json(target, {'result': self.ctrl.agents})
            return
        elif cmd == 'RUN':
            # create a unique id for this run
            run_id = str(uuid4())

            # get some agents
            try:
                agents = self.ctrl.reserve_agents(data['agents'], run_id)
            except NotEnoughWorkersError:
                self._send_json(target, {'error': 'Not enough agents'})
                return

            # send to every agent with the run_id and the receiver endpoint
            data['run_id'] = run_id
            data['args']['zmq_receiver'] = self.endpoints['receiver']
            # the payload for the agents is serialized *before* 'started'
            # is added below, so 'started' only ends up in the metadata
            msg[2] = json.dumps(data)

            # notice when the test was started
            data['args']['started'] = time.time()

            # save the tests metadata in the db
            self.ctrl.save_metadata(run_id, data['args'])
            self.ctrl.flush_db()

            for agent_id in agents:
                self.ctrl.send_to_agent(agent_id, msg)

            # tell the client which agents were selected.
            res = {'result': {'agents': agents, 'run_id': run_id}}
            self._send_json(target, res)
            return

        if 'agent_id' not in data:
            raise NotImplementedError('DEAD CODE?')
        else:
            agent_id = str(data['agent_id'])
            self.ctrl.send_to_agent(agent_id, msg)

    def _handle_recv_back(self, msg):
        """Handle a message received on the backend (back => front).

        `_STATUS` results update the controller and are not forwarded;
        everything else is passed through to the front stream.
        """
        # let's remove the agent id
        agent_id = msg[0]
        msg = msg[1:]

        # grabbing the data to update the agents statuses if needed
        data = json.loads(msg[-1])

        if 'error' in data:
            result = data['error']
            logger.error(result.get('exception'))
        else:
            result = data['result']

        if result.get('command') == '_STATUS':
            statuses = result['status'].values()
            run_id = self.ctrl.update_status(agent_id, statuses)
            if run_id is not None:
                # if the tests are finished, publish this on the pubsub.
                self._publisher.send(json.dumps({'data_type': 'run-finished',
                                                 'run_id': run_id}))
            return

        # other things are pass-through
        try:
            self._frontstream.send_multipart(msg)
        except Exception as e:
            logger.error('Could not send to front')
            logger.error(msg)
            # we don't want to die on error. we just log it
            exc = traceback.format_tb(sys.exc_info()[2])
            exc.insert(0, str(e))
            logger.error('\n'.join(exc))