def main():
    """Run the shepherd's main event loop (never returns).

    Pulls test requests from the `public` socket, validates them against the
    database (Submission -> Assignment -> TestHarness), and hands them to the
    FlockManager; simultaneously services messages from sheep on the `sheep`
    router socket (distress/bleet/environment/result) and reaps dead sheep.

    NOTE(review): relies on module-level globals — `public`, `sheep` (zmq
    sockets), `config`, `logger`, `match_found`, and the ORM/model classes.
    Presumably these are set up elsewhere in this module; confirm before
    running in isolation.

    Fixes applied:
      * ``TestHarness.DoesNotExit`` -> ``TestHarness.DoesNotExist`` — the old
        name does not exist on the model, so the except clause itself raised
        ``AttributeError`` instead of catching the lookup failure.
      * The result-handler's failure log now formats the raw message body id
        rather than ``submission_id``, which is unbound when ``ObjectId()``
        itself raised ``InvalidId`` (previously a ``NameError``).
    """
    flock = FlockManager(
        match_found, config["BLEET_TIMEOUT"], config["SERVICE_TIMEOUT"]
    )

    logger.info("Shepherd starting.")

    while True:
        # Wait until either the public or sheep socket has messages waiting
        zmq.select([public, sheep], [], [], timeout = 5)

        # Will grab all of the outstanding messages from the outside and place
        # them in the request queue
        while public.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
            request = public.recv_json()
            logger.debug("Raw test request: %s", str(request))

            request = TestRequest.from_dict(request)

            # Resolve the submission the request refers to; a bad or unknown
            # id is a client error, so log and drop the request.
            try:
                submission = \
                    Submission.objects.get(id = ObjectId(request.submission_id))
            except Submission.DoesNotExist as e:
                logger.warning(
                    "Received test request for non-existant submission [%s].",
                    str(request.submission_id)
                )
                continue
            except bson.errors.InvalidId as e:
                logger.warning("Received malformed test request. %s", str(e))
                continue

            # The submission must reference a real assignment...
            try:
                assignment = Assignment.objects.get(id = submission.assignment)
            except Assignment.DoesNotExist as e:
                logger.error(
                    "Received test request for a submission [%s] referencing "
                    "an invalid assignment [%s].",
                    str(submission.id),
                    str(submission.assignment)
                )
                continue

            # ...and that assignment must actually have a test harness.
            if not assignment.test_harness:
                logger.warning(
                    "Received test request for a submission [%s] referencing "
                    "an assignment [%s] that does not have a test harness "
                    "associated with it.",
                    str(submission.id),
                    str(submission.assignment)
                )
                continue

            try:
                test_harness = \
                    TestHarness.objects.get(id = assignment.test_harness)
            except TestHarness.DoesNotExist as e:
                # BUG FIX: was `TestHarness.DoesNotExit`, which raised
                # AttributeError instead of catching the missing-harness case.
                logger.error(
                    "Received test request for a submission [%s] referencing "
                    "an assignment [%s] that references a non-existant test "
                    "harness [%s].",
                    str(submission.id),
                    str(submission.assignment),
                    str(assignment.test_harness)
                )
                continue

            # Gather all the necessary information from the test request
            # received from the outside.
            processed_request = InternalTestRequest(
                submission.id,
                test_harness.config.get(
                    "galah/timeout", config["BLEET_TIMEOUT"].seconds
                ),
                test_harness.config.get("galah/environment", {})
            )

            logger.info("Received test request.")

            flock.received_request(processed_request)

        # Will grab all of the outstanding messages from the sheep and process
        # them
        while sheep.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
            try:
                sheep_identity, sheep_message = router_recv_json(sheep)
                sheep_message = FlockMessage.from_dict(sheep_message)

                logger.debug(
                    "Received message from sheep: %s", str(sheep_message)
                )
            except ValueError as e:
                logger.error("Could not decode sheep's message: %s", str(e))
                logger.debug(
                    "Exception thrown while decoding sheep's message...",
                    exc_info = sys.exc_info()
                )
                continue

            if sheep_message.type == "distress":
                logger.warn("Received distress message. Sending bloot.")
                router_send_json(
                    sheep, sheep_identity, FlockMessage("bloot", "").to_dict()
                )
            elif sheep_message.type == "bleet":
                logger.debug(
                    "Sheep [%s] bleeted. Sending bloot.", repr(sheep_identity)
                )

                result = flock.sheep_bleeted(sheep_identity)

                # Under certain circumstances we want to completely ignore a
                # bleet (see FlockManager.sheep_bleeted() for more details)
                if result is FlockManager.IGNORE:
                    logger.debug("Ignoring bleet.")
                    continue

                # An unrecognized sheep is asked to identify itself before it
                # gets any bloot.
                if not result:
                    router_send_json(
                        sheep,
                        sheep_identity,
                        FlockMessage("identify", "").to_dict()
                    )

                    logger.info(
                        "Unrecognized sheep [%s] connected, identify sent.",
                        repr(sheep_identity)
                    )

                    continue

                router_send_json(
                    sheep, sheep_identity, FlockMessage("bloot", "").to_dict()
                )
            elif sheep_message.type == "environment":
                if not flock.manage_sheep(sheep_identity, sheep_message.body):
                    logger.warn(
                        "Received environment from an already-recognized sheep."
                    )
            elif sheep_message.type == "result":
                logger.info("Received test result from sheep.")
                logger.debug(
                    "Received test result from sheep: %s",
                    str(sheep_message.body)
                )

                try:
                    submission_id = ObjectId(sheep_message.body["id"])
                    submission = Submission.objects.get(id = submission_id)

                    test_result = TestResult.from_dict(sheep_message.body)

                    # An oversized result cannot be stored; fall back to a
                    # bare "failed" marker so the submission isn't left
                    # dangling without any result at all.
                    try:
                        test_result.save()
                    except InvalidDocument:
                        logger.warn(
                            "Test result is too large for the database.",
                            exc_info = True
                        )

                        test_result = TestResult(failed = True)
                        test_result.save()

                    submission.test_results = test_result.id
                    submission.save()
                except (InvalidId, Submission.DoesNotExist) as e:
                    # BUG FIX: use the raw message body here — submission_id
                    # is unbound if ObjectId() was what raised.
                    logger.warn(
                        "Could not retrieve submission [%s] for test result "
                        "received from sheep [%s].",
                        str(sheep_message.body["id"]),
                        repr(sheep_identity)
                    )
                    continue

                # Acknowledge the result so the sheep can stop resending it.
                router_send_json(
                    sheep,
                    sheep_identity,
                    FlockMessage(
                        "bloot", sheep_message.body["id"]
                    ).to_dict()
                )

                if not flock.sheep_finished(sheep_identity):
                    logger.info(
                        "Got result from sheep [%s] who was not processing "
                        "a test request.",
                        repr(sheep_identity)
                    )

        # Let the flock manager get rid of any dead or killed sheep.
        lost_sheep, killed_sheep = flock.cleanup()
        if lost_sheep:
            logger.warn(
                "%d sheep lost due to bleet timeout: %s",
                len(lost_sheep),
                str([repr(i) for i in lost_sheep])
            )
        if killed_sheep:
            logger.warn(
                "%d sheep lost due to request timeout: %s",
                len(killed_sheep),
                str([repr(i) for i in killed_sheep])
            )
def run(znconsumers):
    """Maintainer loop for a sheep process.

    Keeps the producer thread and `znconsumers` consumer threads alive until
    ``universal.exiting`` is set, and drains ``universal.orphaned_results``
    back to the shepherd ("distress mode") whenever any are present. Raises
    ``universal.Exiting`` when the loop finally terminates.

    NOTE(review): this function logs through both the local ``log`` and a
    name ``logger`` that is not defined here — presumably a module-level
    logger; confirm it exists, otherwise those calls are a NameError.
    """
    log = logging.getLogger("galah.sheep.maintainer")

    log.info("Maintainer starting")

    producer = start_producer()
    consumers = []

    # Continually make sure that all of the threads are up until it's time to
    # exit
    while not universal.exiting:
        if not universal.orphaned_results.empty():
            logger.warning(
                "Orphaned results detected, going into distress mode."
            )

            # Keep retrying until every orphaned result has been confirmed
            # by a shepherd.
            while not universal.orphaned_results.empty():
                try:
                    # We want to create a whole new socket everytime so we don't
                    # stack messages up in the queue. We also don't want to just
                    # send it once and let ZMQ take care of it because it might
                    # be eaten by a defunct shepherd and then we'd be stuck forever.
                    shepherd = universal.context.socket(zmq.DEALER)
                    shepherd.linger = 0
                    shepherd.connect(config["shepherd/SHEEP_SOCKET"])

                    shepherd.send_json(FlockMessage("distress", "").to_dict())

                    logger.info(
                        "Sent distress message to shepherd, waiting for response."
                    )

                    # Wait up to a minute for the shepherd to acknowledge the
                    # distress call with an empty "bloot".
                    message = exithelpers.recv_json(shepherd, timeout = 1000 * 60)
                    message = FlockMessage.from_dict(message)

                    if message.type == "bloot" and message.body == "":
                        while not universal.orphaned_results.empty():
                            result = universal.orphaned_results.get()

                            try:
                                shepherd.send_json(
                                    FlockMessage("result", result).to_dict()
                                )

                                # Each result must be individually confirmed
                                # within 5 seconds.
                                confirmation = exithelpers.recv_json(
                                    shepherd, timeout = 1000 * 5
                                )
                                confirmation = FlockMessage.from_dict(confirmation)

                                if confirmation.type == "bloot" and \
                                        confirmation.body == "":
                                    continue
                            except:
                                # Deliberately broad: whatever went wrong, put
                                # the result back so it is never lost, then
                                # re-raise for the outer handlers.
                                universal.orphaned_results.put(result)
                                raise
                except universal.Exiting:
                    logger.warning(
                        "Orphaned results have not been sent back to the "
                        "shepherd. I WILL NOT ABANDON THEM, YOU WILL HAVE TO "
                        "KILL ME WITH FIRE! (SIGKILL is fire in this analogy)."
                    )

                    # Nah man.
                    # Intentionally refuses the shutdown request so the
                    # orphaned results are not dropped.
                    universal.exiting = False

                    continue
                except exithelpers.Timeout:
                    # Shepherd never answered; loop around and try again.
                    continue

        # Remove any dead consumers from the list
        dead_consumers = 0
        for c in consumers[:]:
            # NOTE(review): Thread.isAlive() was removed in Python 3.9 —
            # is_alive() is needed on modern Python; this code looks
            # Python 2 era, confirm the target runtime.
            if not c.isAlive():
                dead_consumers += 1
                consumers.remove(c)

        if dead_consumers > 0:
            logger.warning(
                "Found %d dead consumers, restarting them.", dead_consumers
            )

        # Start up consumers until we have the desired amount
        while len(consumers) < znconsumers:
            consumers.append(start_consumer())

        # If the producer died, start it again
        if not producer.isAlive():
            log.warning("Found dead producer, restarting it.")

            producer = start_producer()

        # Sleep for awhile
        time.sleep(poll_timeout)

    raise universal.Exiting()
def run(znconsumers):
    """Maintainer loop for a sheep process.

    Keeps the producer thread and `znconsumers` consumer threads alive until
    ``universal.exiting`` is set, and drains ``universal.orphaned_results``
    back to the shepherd ("distress mode") whenever any are present. Raises
    ``universal.Exiting`` when the loop finally terminates.

    NOTE(review): this appears to be a duplicate of the ``run`` definition
    earlier in this file (only formatting differs); if both really live in
    one module, the second definition shadows the first — confirm whether
    this is a concatenation artifact and remove one copy.

    NOTE(review): logs through both the local ``log`` and a name ``logger``
    that is not defined here — presumably a module-level logger; confirm.
    """
    log = logging.getLogger("galah.sheep.maintainer")

    log.info("Maintainer starting")

    producer = start_producer()
    consumers = []

    # Continually make sure that all of the threads are up until it's time to
    # exit
    while not universal.exiting:
        if not universal.orphaned_results.empty():
            logger.warning(
                "Orphaned results detected, going into distress mode.")

            # Keep retrying until every orphaned result has been confirmed
            # by a shepherd.
            while not universal.orphaned_results.empty():
                try:
                    # We want to create a whole new socket everytime so we don't
                    # stack messages up in the queue. We also don't want to just
                    # send it once and let ZMQ take care of it because it might
                    # be eaten by a defunct shepherd and then we'd be stuck forever.
                    shepherd = universal.context.socket(zmq.DEALER)
                    shepherd.linger = 0
                    shepherd.connect(config["shepherd/SHEEP_SOCKET"])

                    shepherd.send_json(FlockMessage("distress", "").to_dict())

                    logger.info(
                        "Sent distress message to shepherd, waiting for response.")

                    # Wait up to a minute for the shepherd to acknowledge the
                    # distress call with an empty "bloot".
                    message = exithelpers.recv_json(shepherd, timeout=1000 * 60)
                    message = FlockMessage.from_dict(message)

                    if message.type == "bloot" and message.body == "":
                        while not universal.orphaned_results.empty():
                            result = universal.orphaned_results.get()

                            try:
                                shepherd.send_json(
                                    FlockMessage("result", result).to_dict())

                                # Each result must be individually confirmed
                                # within 5 seconds.
                                confirmation = exithelpers.recv_json(shepherd,
                                    timeout=1000 * 5)
                                confirmation = FlockMessage.from_dict(confirmation)

                                if confirmation.type == "bloot" and \
                                        confirmation.body == "":
                                    continue
                            except:
                                # Deliberately broad: whatever went wrong, put
                                # the result back so it is never lost, then
                                # re-raise for the outer handlers.
                                universal.orphaned_results.put(result)
                                raise
                except universal.Exiting:
                    logger.warning(
                        "Orphaned results have not been sent back to the "
                        "shepherd. I WILL NOT ABANDON THEM, YOU WILL HAVE TO "
                        "KILL ME WITH FIRE! (SIGKILL is fire in this analogy).")

                    # Nah man.
                    # Intentionally refuses the shutdown request so the
                    # orphaned results are not dropped.
                    universal.exiting = False

                    continue
                except exithelpers.Timeout:
                    # Shepherd never answered; loop around and try again.
                    continue

        # Remove any dead consumers from the list
        dead_consumers = 0
        for c in consumers[:]:
            # NOTE(review): Thread.isAlive() was removed in Python 3.9 —
            # is_alive() is needed on modern Python; confirm target runtime.
            if not c.isAlive():
                dead_consumers += 1
                consumers.remove(c)

        if dead_consumers > 0:
            logger.warning("Found %d dead consumers, restarting them.",
                dead_consumers)

        # Start up consumers until we have the desired amount
        while len(consumers) < znconsumers:
            consumers.append(start_consumer())

        # If the producer died, start it again
        if not producer.isAlive():
            log.warning("Found dead producer, restarting it.")

            producer = start_producer()

        # Sleep for awhile
        time.sleep(poll_timeout)

    raise universal.Exiting()