def match_found(flock_manager, sheep_identity, request):
    logger.info(
        "Sending test request for submission [%s] to sheep [%s].",
        request.submission_id,
        repr(sheep_identity)
    )

    # Get submission and test harness to send to sheep
    submission = Submission.objects(id = request.submission_id).exclude(
        "most_recent", "uploaded_filenames"
    ).first()
    assignment = Assignment.objects.get(id = submission.assignment)
    test_harness = TestHarness.objects.get(id = assignment.test_harness)

    # Apply any personal deadlines to the assignment object.
    user = User.objects.get(email = submission.user)
    assignment.apply_personal_deadlines(user)

    data = {
        "assignment": assignment.to_dict(),
        "submission": submission.to_dict(),
        "test_harness": test_harness.to_dict()
    }

    router_send_json(
        sheep,
        sheep_identity,
        FlockMessage("request", data).to_dict()
    )

    return True
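# --- Illustrative sketch (not part of the original source): a minimal
# FlockMessage that satisfies the way it is used throughout this section --
# constructed positionally or with type=/body= keywords, round-tripped through
# to_dict()/from_dict(), and printable in debug logs. The real class may carry
# additional fields or validation.
class FlockMessage(object):
    def __init__(self, type, body):
        self.type = type
        self.body = body

    def to_dict(self):
        return {"type": self.type, "body": self.body}

    @classmethod
    def from_dict(cls, d):
        return cls(d["type"], d["body"])

    def __repr__(self):
        return "FlockMessage(type = %r, body = %r)" % (self.type, self.body)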
def run(znconsumers):
    logger = logging.getLogger("galah.sheep.maintainer")
    logger.info("Maintainer starting")

    producer = start_producer()
    consumers = []

    # Continually make sure that all of the threads are up until it's time to
    # exit
    while not universal.exiting:
        if not universal.orphaned_results.empty():
            logger.warning(
                "Orphaned results detected, going into distress mode."
            )

            while not universal.orphaned_results.empty():
                try:
                    # We want to create a whole new socket every time so we
                    # don't stack messages up in the queue. We also don't want
                    # to just send it once and let ZMQ take care of it because
                    # it might be eaten by a defunct shepherd and then we'd be
                    # stuck forever.
                    shepherd = universal.context.socket(zmq.DEALER)
                    shepherd.linger = 0
                    shepherd.connect(config["shepherd/SHEEP_SOCKET"])

                    shepherd.send_json(FlockMessage("distress", "").to_dict())

                    logger.info(
                        "Sent distress message to shepherd, waiting for "
                        "response."
                    )

                    message = exithelpers.recv_json(
                        shepherd, timeout = 1000 * 60
                    )
                    message = FlockMessage.from_dict(message)

                    if message.type == "bloot" and message.body == "":
                        while not universal.orphaned_results.empty():
                            result = universal.orphaned_results.get()

                            try:
                                shepherd.send_json(
                                    FlockMessage("result", result).to_dict()
                                )

                                confirmation = exithelpers.recv_json(
                                    shepherd, timeout = 1000 * 5
                                )
                                confirmation = \
                                    FlockMessage.from_dict(confirmation)

                                if confirmation.type == "bloot" and \
                                        confirmation.body == "":
                                    continue
                            except:
                                # Put the result back before propagating the
                                # error so it is not lost.
                                universal.orphaned_results.put(result)
                                raise
                except universal.Exiting:
                    logger.warning(
                        "Orphaned results have not been sent back to the "
                        "shepherd. I WILL NOT ABANDON THEM, YOU WILL HAVE TO "
                        "KILL ME WITH FIRE! (SIGKILL is fire in this analogy)."
                    )

                    # Nah man.
                    universal.exiting = False

                    continue
                except exithelpers.Timeout:
                    continue

        # Remove any dead consumers from the list
        dead_consumers = 0
        for c in consumers[:]:
            if not c.isAlive():
                dead_consumers += 1
                consumers.remove(c)

        if dead_consumers > 0:
            logger.warning(
                "Found %d dead consumers, restarting them.", dead_consumers
            )

        # Start up consumers until we have the desired amount
        while len(consumers) < znconsumers:
            consumers.append(start_consumer())

        # If the producer died, start it again
        if not producer.isAlive():
            logger.warning("Found dead producer, restarting it.")
            producer = start_producer()

        # Sleep for a while
        time.sleep(poll_timeout)

    raise universal.Exiting()
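# --- Illustrative sketch (assumption, not the original module): the shared
# state in the sheep's `universal` module that the maintainer and consumers
# lean on. The attribute names come from the calls above; the initial values
# here are only a guess at reasonable defaults.
try:
    import Queue as queue  # Python 2
except ImportError:
    import queue  # Python 3

import zmq

class Exiting(Exception):
    """Raised to unwind worker threads once shutdown has begun."""

class ShepherdLost(Exception):
    """Raised when the shepherd stops responding to bleets."""
    def __init__(self, result = None):
        self.result = result

exiting = False                    # Flipped to True when the sheep shuts down.
orphaned_results = queue.Queue()   # Results whose delivery was interrupted.
context = zmq.Context()            # ZMQ context shared by every thread.
environment = {}                   # Capabilities reported in "environment"
                                   # messages.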
def _run():
    logger = logging.getLogger(
        "galah.sheep.%s" % threading.currentThread().name
    )
    logger.info("Consumer starting.")

    # Initialize the correct consumer suite.
    virtual_suite = get_virtual_suite(config["VIRTUAL_SUITE"])
    consumer = virtual_suite.Consumer(logger)

    # Set up the socket to send/receive messages to/from the shepherd
    shepherd = universal.context.socket(zmq.DEALER)
    shepherd.linger = 0
    shepherd.connect(config["shepherd/SHEEP_SOCKET"])

    # Loop until the program is shutting down
    while not universal.exiting:
        # Prepare a VM and make sure we're completely prepared to handle a
        # test request before asking the shepherd for one.
        logger.info("Waiting for virtual machine to become available...")
        machine_id = consumer.prepare_machine()

        def bleet():
            shepherd.send_json(FlockMessage("bleet", "").to_dict())

            # Figure out when we should send the next bleet
            return (
                datetime.datetime.now() + config["shepherd/BLEET_TIMEOUT"] / 2
            )

        # Send the initial bleet so that the shepherd knows we're available
        logger.info("Ready for test request. Sending initial bleet.")
        next_bleet_time = bleet()

        # Set to True whenever the shepherd bloots. Set to False every time we
        # bleet. If this variable is still False by the time it's time to
        # bleet again we know we've lost the shepherd.
        shepherd_blooted = False

        # Process traffic from shepherd.
        while True:
            try:
                message = exithelpers.recv_json(
                    shepherd,
                    timeout = max(
                        1, # 1 millisecond (0 would imply infinite timeout)
                        (next_bleet_time - datetime.datetime.now()).seconds *
                            1000
                    )
                )

                message = FlockMessage(message["type"], message["body"])
            except exithelpers.Timeout:
                if not shepherd_blooted:
                    raise universal.ShepherdLost()

                logger.debug("Sending bleet.")
                next_bleet_time = bleet()
                shepherd_blooted = False

                continue

            if message.type == "bloot":
                logger.debug("Got bloot.")
                shepherd_blooted = True
            elif message.type == "identify":
                logger.info(
                    "Received request to identify. Sending environment."
                )

                # identify is a valid response to a bleet.
                shepherd_blooted = True

                identification = FlockMessage(
                    type = "environment",
                    body = universal.environment
                )
                shepherd.send_json(identification.to_dict())
            elif message.type == "request":
                # Received test request from the shepherd
                logger.info("Test request received, running tests.")
                logger.debug("Test request: %s", str(message))

                result = consumer.run_test(machine_id, message.body)

                # Check to see if the test harness crashed or testing somehow
                # could not be completed.
                if result is None:
                    result = {"failed": True}

                # Add in the submission id to the result that we send back
                result["id"] = str(message.body["submission"]["id"])

                logger.info("Testing completed, sending results to shepherd.")
                logger.debug("Raw test results: %s", str(result))

                shepherd.send_json(FlockMessage("result", result).to_dict())

                # Wait for the shepherd to acknowledge the result. Ignore any
                # messages that we get from the shepherd besides an
                # acknowledgement.
                deadline = \
                    datetime.datetime.now() + datetime.timedelta(seconds = 30)
                while True:
                    try:
                        confirmation = exithelpers.recv_json(
                            shepherd,
                            timeout = max(
                                1, # 1 ms (0 would imply infinite timeout)
                                (deadline - datetime.datetime.now()).seconds *
                                    1000
                            )
                        )

                        confirmation = FlockMessage(
                            confirmation["type"], confirmation["body"]
                        )
                    except exithelpers.Timeout:
                        raise universal.ShepherdLost(result = result)

                    logger.debug("Received message: %s", str(confirmation))

                    if confirmation.type == "bloot" and \
                            confirmation.body == result["id"]:
                        shepherd_blooted = True
                        break

                break
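# --- Illustrative sketch (assumption, not the original exithelpers module):
# a recv_json that honours both a millisecond timeout (raising Timeout) and
# the shared shutdown flag (raising universal.Exiting), which is the behaviour
# the loops above rely on. The import path and polling interval are guesses.
import zmq

import galah.sheep.universal as universal  # assumed module path

class Timeout(Exception):
    """Raised when no message arrives within the requested window."""

def recv_json(socket, timeout):
    poller = zmq.Poller()
    poller.register(socket, zmq.POLLIN)

    waited = 0
    while waited < timeout:
        # Bail out promptly if the sheep is shutting down.
        if universal.exiting:
            raise universal.Exiting()

        # Poll in short slices so the shutdown flag is checked often.
        slice_ms = min(100, timeout - waited)
        if poller.poll(slice_ms):
            return socket.recv_json()
        waited += slice_ms

    raise Timeout()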
def main():
    flock = FlockManager(
        match_found, config["BLEET_TIMEOUT"], config["SERVICE_TIMEOUT"]
    )

    logger.info("Shepherd starting.")

    while True:
        # Wait until either the public or sheep socket has messages waiting
        zmq.select([public, sheep], [], [], timeout = 5)

        # Grab all of the outstanding messages from the outside and place them
        # in the request queue
        while public.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
            request = public.recv_json()
            logger.debug("Raw test request: %s", str(request))

            request = TestRequest.from_dict(request)

            try:
                submission = \
                    Submission.objects.get(id = ObjectId(request.submission_id))
            except Submission.DoesNotExist as e:
                logger.warning(
                    "Received test request for non-existent submission [%s].",
                    str(request.submission_id)
                )
                continue
            except bson.errors.InvalidId as e:
                logger.warning("Received malformed test request. %s", str(e))
                continue

            try:
                assignment = Assignment.objects.get(id = submission.assignment)
            except Assignment.DoesNotExist as e:
                logger.error(
                    "Received test request for a submission [%s] referencing "
                    "an invalid assignment [%s].",
                    str(submission.id),
                    str(submission.assignment)
                )
                continue

            if not assignment.test_harness:
                logger.warning(
                    "Received test request for a submission [%s] referencing "
                    "an assignment [%s] that does not have a test harness "
                    "associated with it.",
                    str(submission.id),
                    str(submission.assignment)
                )
                continue

            try:
                test_harness = \
                    TestHarness.objects.get(id = assignment.test_harness)
            except TestHarness.DoesNotExist as e:
                logger.error(
                    "Received test request for a submission [%s] referencing "
                    "an assignment [%s] that references a non-existent test "
                    "harness [%s].",
                    str(submission.id),
                    str(submission.assignment),
                    str(assignment.test_harness)
                )
                continue

            # Gather all the necessary information from the test request
            # received from the outside.
            processed_request = InternalTestRequest(
                submission.id,
                test_harness.config.get(
                    "galah/timeout", config["BLEET_TIMEOUT"].seconds
                ),
                test_harness.config.get("galah/environment", {})
            )

            logger.info("Received test request.")

            flock.received_request(processed_request)

        # Grab all of the outstanding messages from the sheep and process them
        while sheep.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
            try:
                sheep_identity, sheep_message = router_recv_json(sheep)
                sheep_message = FlockMessage.from_dict(sheep_message)

                logger.debug(
                    "Received message from sheep: %s", str(sheep_message)
                )
            except ValueError as e:
                logger.error("Could not decode sheep's message: %s", str(e))
                logger.debug(
                    "Exception thrown while decoding sheep's message...",
                    exc_info = sys.exc_info()
                )
                continue

            if sheep_message.type == "distress":
                logger.warning("Received distress message. Sending bloot.")
                router_send_json(
                    sheep, sheep_identity, FlockMessage("bloot", "").to_dict()
                )
            elif sheep_message.type == "bleet":
                logger.debug(
                    "Sheep [%s] bleeted. Sending bloot.", repr(sheep_identity)
                )

                result = flock.sheep_bleeted(sheep_identity)

                # Under certain circumstances we want to completely ignore a
                # bleet (see FlockManager.sheep_bleeted() for more details)
                if result is FlockManager.IGNORE:
                    logger.debug("Ignoring bleet.")
                    continue

                if not result:
                    router_send_json(
                        sheep,
                        sheep_identity,
                        FlockMessage("identify", "").to_dict()
                    )

                    logger.info(
                        "Unrecognized sheep [%s] connected, identify sent.",
                        repr(sheep_identity)
                    )
                    continue

                router_send_json(
                    sheep, sheep_identity, FlockMessage("bloot", "").to_dict()
                )
            elif sheep_message.type == "environment":
                if not flock.manage_sheep(sheep_identity, sheep_message.body):
                    logger.warning(
                        "Received environment from an already-recognized "
                        "sheep."
                    )
            elif sheep_message.type == "result":
                logger.info("Received test result from sheep.")
                logger.debug(
                    "Received test result from sheep: %s",
                    str(sheep_message.body)
                )

                try:
                    submission_id = ObjectId(sheep_message.body["id"])
                    submission = Submission.objects.get(id = submission_id)

                    test_result = TestResult.from_dict(sheep_message.body)
                    try:
                        test_result.save()
                    except InvalidDocument:
                        logger.warning(
                            "Test result is too large for the database.",
                            exc_info = True
                        )

                        test_result = TestResult(failed = True)
                        test_result.save()

                    submission.test_results = test_result.id
                    submission.save()
                except (InvalidId, Submission.DoesNotExist) as e:
                    logger.warning(
                        "Could not retrieve submission [%s] for test result "
                        "received from sheep [%s].",
                        str(sheep_message.body["id"]),
                        repr(sheep_identity)
                    )
                    continue

                router_send_json(
                    sheep,
                    sheep_identity,
                    FlockMessage("bloot", sheep_message.body["id"]).to_dict()
                )

                if not flock.sheep_finished(sheep_identity):
                    logger.info(
                        "Got result from sheep [%s] who was not processing "
                        "a test request.",
                        repr(sheep_identity)
                    )

        # Let the flock manager get rid of any dead or killed sheep.
        lost_sheep, killed_sheep = flock.cleanup()
        if lost_sheep:
            logger.warning(
                "%d sheep lost due to bleet timeout: %s",
                len(lost_sheep),
                str([repr(i) for i in lost_sheep])
            )
        if killed_sheep:
            logger.warning(
                "%d sheep lost due to request timeout: %s",
                len(killed_sheep),
                str([repr(i) for i in killed_sheep])
            )
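# --- Illustrative sketch (assumption, not the original helpers): one way
# router_send_json/router_recv_json could wrap the shepherd's ROUTER socket.
# A ROUTER socket prefixes every message exchanged with a DEALER peer with
# that peer's identity frame, which is exactly the (identity, message) pair
# the loop above works with.
import json

def router_send_json(socket, identity, payload):
    # Route the JSON-encoded payload to the peer named by the identity frame.
    socket.send_multipart([identity, json.dumps(payload).encode("utf-8")])

def router_recv_json(socket):
    # Returns (identity, decoded_message). json.loads raises ValueError on
    # malformed input, which the caller above catches and logs.
    identity, raw = socket.recv_multipart()
    return identity, json.loads(raw)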