Пример #1
0
def match_found(flock_manager, sheep_identity, request):
    """Send a matched test request to the sheep selected for it.

    Looks up the submission named by ``request.submission_id`` along with its
    assignment, its test harness and the submitting user, applies that user's
    personal deadlines to the assignment, and sends the three serialized
    objects to ``sheep_identity`` as a "request" flock message.

    ``flock_manager`` is accepted but not used by this function. The message
    goes out through ``router_send_json`` on the ``sheep`` object, which is
    defined outside this function.

    Always returns True.
    """
    logger.info(
        "Sending test request for submission [%s] to sheep [%s].",
        request.submission_id,
        repr(sheep_identity)
    )

    # Fetch the submission without the two fields listed below.
    skipped_fields = ("most_recent", "uploaded_filenames")
    candidates = Submission.objects(id=request.submission_id)
    target_submission = candidates.exclude(*skipped_fields).first()

    # Resolve the assignment and harness the submission points at.
    target_assignment = Assignment.objects.get(id=target_submission.assignment)
    harness = TestHarness.objects.get(id=target_assignment.test_harness)

    # Personal deadlines must be applied before the assignment is serialized.
    submitter = User.objects.get(email=target_submission.user)
    target_assignment.apply_personal_deadlines(submitter)

    payload = {}
    payload["assignment"] = target_assignment.to_dict()
    payload["submission"] = target_submission.to_dict()
    payload["test_harness"] = harness.to_dict()

    outgoing = FlockMessage("request", payload)
    router_send_json(sheep, sheep_identity, outgoing.to_dict())

    return True
Пример #2
0
def run(znconsumers):
    """Keep the producer and the consumers running until shutdown.

    Each pass of the main loop:

    1. Drains ``universal.orphaned_results``: a "distress" message is sent to
       the shepherd on a fresh socket and, once the shepherd answers with an
       empty "bloot", every orphaned result is forwarded to it.
    2. Drops consumers that are no longer alive and starts new ones until
       ``znconsumers`` of them exist.
    3. Restarts the producer if it died.
    4. Sleeps for ``poll_timeout``.

    :param znconsumers: number of consumers to keep running.
    :raises universal.Exiting: once ``universal.exiting`` is set and the main
        loop has ended.
    """
    log = logging.getLogger("galah.sheep.maintainer")

    log.info("Maintainer starting")

    producer = start_producer()
    consumers = []

    # Continually make sure that all of the threads are up until it's time to
    # exit
    while not universal.exiting:
        if not universal.orphaned_results.empty():
            log.warning(
                "Orphaned results detected, going into distress mode."
            )

        while not universal.orphaned_results.empty():
            shepherd = None
            try:
                # We want to create a whole new socket everytime so we don't
                # stack messages up in the queue. We also don't want to just
                # send it once and let ZMQ take care of it because it might
                # be eaten by a defunct shepherd and then we'd be stuck forever.
                shepherd = universal.context.socket(zmq.DEALER)
                shepherd.linger = 0
                shepherd.connect(config["shepherd/SHEEP_SOCKET"])

                shepherd.send_json(FlockMessage("distress", "").to_dict())

                log.info(
                    "Sent distress message to shepherd, waiting for response."
                )

                message = exithelpers.recv_json(shepherd, timeout = 1000 * 60)
                message = FlockMessage.from_dict(message)

                if message.type == "bloot" and message.body == "":
                    while not universal.orphaned_results.empty():
                        result = universal.orphaned_results.get()

                        try:
                            shepherd.send_json(
                                FlockMessage("result", result).to_dict()
                            )

                            confirmation = exithelpers.recv_json(
                                shepherd, timeout = 1000 * 5
                            )
                            confirmation = FlockMessage.from_dict(confirmation)

                            # NOTE(review): any reply is treated as delivery;
                            # this check does not alter control flow.
                            if confirmation.type == "bloot" and \
                                    confirmation.body == "":
                                continue
                        except:
                            # Deliberately catches everything: the result was
                            # already taken off the queue, so put it back
                            # before letting the error propagate.
                            universal.orphaned_results.put(result)
                            raise
            except universal.Exiting:
                log.warning(
                    "Orphaned results have not been sent back to the "
                    "shepherd. I WILL NOT ABANDON THEM, YOU WILL HAVE TO "
                    "KILL ME WITH FIRE! (SIGKILL is fire in this analogy)."
                )

                # Nah man.
                universal.exiting = False

                continue
            except exithelpers.Timeout:
                continue
            finally:
                # Every attempt opens a new socket, so release it here;
                # otherwise one socket leaks per retry.
                if shepherd is not None:
                    shepherd.close()

        # Remove any dead consumers from the list
        dead_consumers = 0
        for c in consumers[:]:
            if not c.isAlive():
                dead_consumers += 1
                consumers.remove(c)

        if dead_consumers > 0:
            log.warning(
                "Found %d dead consumers, restarting them.", dead_consumers
            )

        # Start up consumers until we have the desired amount
        while len(consumers) < znconsumers:
            consumers.append(start_consumer())

        # If the producer died, start it again
        if not producer.isAlive():
            log.warning("Found dead producer, restarting it.")

            producer = start_producer()

        # Sleep for awhile
        time.sleep(poll_timeout)

    raise universal.Exiting()
Пример #3
0
def _run():
    """Run a consumer: ask the shepherd for work and test submissions.

    Until ``universal.exiting`` is set, the consumer repeatedly prepares a
    virtual machine, announces itself to the shepherd with a "bleet" and then
    handles incoming messages:

    * "bloot" marks the shepherd as alive.
    * "identify" is answered with an "environment" message.
    * "request" runs the tests, sends the result back and waits up to 30
      seconds for a "bloot" whose body equals the result's id.

    :raises universal.ShepherdLost: if the shepherd did not answer between
        two bleets, or (carrying the result) if it did not acknowledge a
        result before the deadline.
    """
    logger = logging.getLogger("galah.sheep.%s" % threading.currentThread().name)
    logger.info("Consumer starting.")

    # Initialize the correct consumer suite.
    virtual_suite = get_virtual_suite(config["VIRTUAL_SUITE"])
    consumer = virtual_suite.Consumer(logger)

    # Set up the socket to send/receive messages to/from the shepherd
    shepherd = universal.context.socket(zmq.DEALER)
    shepherd.linger = 0
    shepherd.connect(config["shepherd/SHEEP_SOCKET"])

    def milliseconds_until(moment):
        """Return the milliseconds left until ``moment``, never less than 1."""
        delta = moment - datetime.datetime.now()

        # timedelta.seconds alone is never negative: a delta of -1 second is
        # stored as days=-1, seconds=86399. Fold the days in so that a moment
        # in the past yields a negative number instead of roughly a day.
        millis = (
            (delta.days * 86400 + delta.seconds) * 1000
            + delta.microseconds // 1000
        )

        # 1 millisecond minimum (0 would imply infinite timeout)
        return max(1, millis)

    def bleet():
        """Send a bleet and return the time the next one is due."""
        shepherd.send_json(FlockMessage("bleet", "").to_dict())

        # Figure out when we should send the next bleet
        return (
            datetime.datetime.now() + config["shepherd/BLEET_TIMEOUT"] / 2
        )

    # Loop until the program is shutting down
    while not universal.exiting:
        # Prepare a VM and make sure we're completely prepared to handle a test
        # request before asking the shepherd for one.
        logger.info("Waiting for virtual machine to become available...")
        machine_id = consumer.prepare_machine()

        # Send the initial bleet so that the shepherd knows we're available
        logger.info("Ready for test request. Sending initial bleet.")
        next_bleet_time = bleet()

        # Set to True whenever the shepherd bloots. Set to False everytime we
        # bleet. If this variable is still False by the time it's time to bleet
        # again we know we've lost the shepherd.
        shepherd_blooted = False

        # Process traffic from shepherd.
        while True:
            try:
                message = exithelpers.recv_json(
                    shepherd,
                    timeout = milliseconds_until(next_bleet_time)
                )

                message = FlockMessage(message["type"], message["body"])
            except exithelpers.Timeout:
                if not shepherd_blooted:
                    raise universal.ShepherdLost()

                logger.debug("Sending bleet.")
                next_bleet_time = bleet()
                shepherd_blooted = False

                continue

            if message.type == "bloot":
                logger.debug("Got bloot.")
                shepherd_blooted = True

            elif message.type == "identify":
                logger.info(
                    "Received request to identify. Sending environment."
                )

                # identify is a valid response to a bleet.
                shepherd_blooted = True

                identification = FlockMessage(
                    type = "environment",
                    body = universal.environment
                )

                shepherd.send_json(identification.to_dict())

            elif message.type == "request":
                # Received test request from the shepherd
                logger.info("Test request received, running tests.")
                logger.debug("Test request: %s", str(message))
                result = consumer.run_test(machine_id, message.body)

                # Check to see if the test harness crashed/somehow testing was
                # unable to be done.
                if result is None:
                    result = {
                        "failed": True
                    }

                # Add in the submission id to the result that we send back
                result["id"] = str(message.body["submission"]["id"])

                logger.info("Testing completed, sending results to shepherd.")
                logger.debug("Raw test results: %s", str(result))
                shepherd.send_json(FlockMessage("result", result).to_dict())

                # Wait for the shepherd to acknowledge the result. Ignore any
                # messages that we get from the shepherd besides an acknowledge.
                deadline = \
                    datetime.datetime.now() + datetime.timedelta(seconds = 30)
                while True:
                    try:
                        confirmation = exithelpers.recv_json(
                            shepherd,
                            timeout = milliseconds_until(deadline)
                        )

                        confirmation = FlockMessage(
                            confirmation["type"], confirmation["body"]
                        )
                    except exithelpers.Timeout:
                        raise universal.ShepherdLost(result = result)

                    logger.debug("Received message: %s", str(confirmation))

                    if confirmation.type == "bloot" and \
                            confirmation.body == result["id"]:
                        shepherd_blooted = True
                        break

                # This machine has been used; go prepare a fresh one.
                break
Пример #4
0
def main():
    """Run the shepherd's main loop; this function never returns.

    Each pass waits for traffic on the ``public`` and ``sheep`` sockets
    (both defined outside this function) and then:

    * validates every pending outside test request (submission, assignment
      and test harness must all exist) and hands it to the flock manager;
    * handles every pending sheep message ("distress", "bleet",
      "environment" and "result");
    * lets the flock manager clean up and logs any sheep it dropped.
    """
    flock = FlockManager(
        match_found,
        config["BLEET_TIMEOUT"],
        config["SERVICE_TIMEOUT"]
    )

    logger.info("Shepherd starting.")

    while True:
        # Wait until either the public or sheep socket has messages waiting
        zmq.select([public, sheep], [], [], timeout = 5)

        # Will grab all of the outstanding messages from the outside and place them
        # in the request queue
        while public.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
            request = public.recv_json()
            logger.debug("Raw test request: %s", str(request))

            request = TestRequest.from_dict(request)
            try:
                submission = \
                    Submission.objects.get(id = ObjectId(request.submission_id))
            except Submission.DoesNotExist:
                logger.warning(
                    "Received test request for non-existant submission [%s].",
                    str(request.submission_id)
                )
                continue
            except bson.errors.InvalidId as e:
                logger.warning("Received malformed test request. %s", str(e))
                continue

            try:
                assignment = Assignment.objects.get(id = submission.assignment)
            except Assignment.DoesNotExist:
                logger.error(
                    "Received test request for a submission [%s] referencing "
                    "an invalid assignment [%s].",
                    str(submission.id),
                    str(submission.assignment)
                )
                continue

            if not assignment.test_harness:
                logger.warning(
                    "Received test request for a submission [%s] referencing "
                    "an assignment [%s] that does not have a test harness "
                    "associated with it.",
                    str(submission.id),
                    str(submission.assignment)
                )
                continue

            try:
                test_harness = \
                    TestHarness.objects.get(id = assignment.test_harness)
            except TestHarness.DoesNotExist:
                logger.error(
                    "Received test request for a submission [%s] referencing "
                    "an assignment [%s] that references a non-existant test "
                    "harness [%s].",
                    str(submission.id),
                    str(submission.assignment),
                    str(assignment.test_harness)
                )
                continue

            # Gather all the necessary information from the test request
            # received from the outside.
            processed_request = InternalTestRequest(
                submission.id,
                test_harness.config.get("galah/timeout",
                    config["BLEET_TIMEOUT"].seconds),
                test_harness.config.get("galah/environment", {})
            )

            logger.info("Received test request.")

            flock.received_request(processed_request)


        # Will grab all of the outstanding messages from the sheep and process them
        while sheep.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
            try:
                sheep_identity, sheep_message = router_recv_json(sheep)
                sheep_message = FlockMessage.from_dict(sheep_message)
                logger.debug(
                    "Received message from sheep: %s",
                    str(sheep_message)
                )
            except ValueError as e:
                logger.error("Could not decode sheep's message: %s", str(e))
                logger.debug(
                    "Exception thrown while decoding sheep's message...",
                    exc_info = sys.exc_info()
                )
                continue

            if sheep_message.type == "distress":
                logger.warning("Received distress message. Sending bloot.")
                router_send_json(
                    sheep, sheep_identity, FlockMessage("bloot", "").to_dict()
                )

            elif sheep_message.type == "bleet":
                logger.debug(
                    "Sheep [%s] bleeted. Sending bloot.",
                    repr(sheep_identity)
                )

                result = flock.sheep_bleeted(sheep_identity)

                # Under certain circumstances we want to completely ignore a
                # bleet (see FlockManager.sheep_bleeted() for more details)
                if result is FlockManager.IGNORE:
                    logger.debug("Ignoring bleet.")
                    continue

                if not result:
                    router_send_json(
                        sheep,
                        sheep_identity,
                        FlockMessage("identify", "").to_dict()
                    )

                    logger.info(
                        "Unrecognized sheep [%s] connected, identify sent.",
                        repr(sheep_identity)
                    )

                    continue

                router_send_json(
                    sheep,
                    sheep_identity,
                    FlockMessage("bloot", "").to_dict()
                )
            elif sheep_message.type == "environment":
                if not flock.manage_sheep(sheep_identity, sheep_message.body):
                    logger.warning(
                        "Received environment from an already-recognized sheep."
                    )
            elif sheep_message.type == "result":
                logger.info("Received test result from sheep.")
                logger.debug(
                    "Received test result from sheep: %s",
                    str(sheep_message.body)
                )

                # Keep the id exactly as the sheep sent it so the failure log
                # below has something to print even when it cannot be turned
                # into an ObjectId.
                raw_result_id = sheep_message.body["id"]

                try:
                    submission_id = ObjectId(raw_result_id)

                    submission = Submission.objects.get(id = submission_id)

                    test_result = TestResult.from_dict(sheep_message.body)
                    try:
                        test_result.save()
                    except InvalidDocument:
                        logger.warning(
                            "Test result is too large for the database.",
                            exc_info = True
                        )
                        # Store a bare failed result in its place.
                        test_result = TestResult(failed = True)
                        test_result.save()

                    submission.test_results = test_result.id
                    submission.save()
                except (InvalidId, Submission.DoesNotExist):
                    logger.warning(
                        "Could not retrieve submission [%s] for test result "
                        "received from sheep [%s].",
                        str(raw_result_id),
                        repr(sheep_identity)
                    )

                    continue

                # Acknowledge the result by echoing its id back in a bloot.
                router_send_json(
                    sheep,
                    sheep_identity,
                    FlockMessage(
                        "bloot", raw_result_id
                    ).to_dict()
                )

                if not flock.sheep_finished(sheep_identity):
                    logger.info(
                        "Got result from sheep [%s] who was not processing "
                        "a test request.",
                        repr(sheep_identity)
                    )

        # Let the flock manager get rid of any dead or killed sheep.
        lost_sheep, killed_sheep = flock.cleanup()

        if lost_sheep:
            logger.warning(
                "%d sheep lost due to bleet timeout: %s",
                len(lost_sheep),
                str([repr(i) for i in lost_sheep])
            )

        if killed_sheep:
            logger.warning(
                "%d sheep lost due to request timeout: %s",
                len(killed_sheep),
                str([repr(i) for i in killed_sheep])
            )
Пример #5
0
        def bleet():
            """Send a "bleet" to the shepherd; return when the next is due."""
            heartbeat = FlockMessage("bleet", "")
            shepherd.send_json(heartbeat.to_dict())

            # The next bleet is due half a BLEET_TIMEOUT after this one.
            sent_at = datetime.datetime.now()
            return sent_at + config["shepherd/BLEET_TIMEOUT"] / 2
Пример #6
0
def _run():
    """Run a consumer: ask the shepherd for work and test submissions.

    Until ``universal.exiting`` is set, the consumer repeatedly prepares a
    virtual machine, announces itself to the shepherd with a "bleet" and then
    handles incoming messages:

    * "bloot" marks the shepherd as alive.
    * "identify" is answered with an "environment" message.
    * "request" runs the tests, sends the result back and waits up to 30
      seconds for a "bloot" whose body equals the result's id.

    :raises universal.ShepherdLost: if the shepherd did not answer between
        two bleets, or (carrying the result) if it did not acknowledge a
        result before the deadline.
    """
    logger = logging.getLogger("galah.sheep.%s" %
                               threading.currentThread().name)
    logger.info("Consumer starting.")

    # Initialize the correct consumer suite.
    virtual_suite = get_virtual_suite(config["VIRTUAL_SUITE"])
    consumer = virtual_suite.Consumer(logger)

    # Set up the socket to send/receive messages to/from the shepherd
    shepherd = universal.context.socket(zmq.DEALER)
    shepherd.linger = 0
    shepherd.connect(config["shepherd/SHEEP_SOCKET"])

    def milliseconds_until(moment):
        """Return the milliseconds left until ``moment``, never less than 1."""
        delta = moment - datetime.datetime.now()

        # timedelta.seconds alone is never negative: a delta of -1 second is
        # stored as days=-1, seconds=86399. Fold the days in so that a moment
        # in the past yields a negative number instead of roughly a day.
        millis = ((delta.days * 86400 + delta.seconds) * 1000 +
                  delta.microseconds // 1000)

        # 1 millisecond minimum (0 would imply infinite timeout)
        return max(1, millis)

    def bleet():
        """Send a bleet and return the time the next one is due."""
        shepherd.send_json(FlockMessage("bleet", "").to_dict())

        # Figure out when we should send the next bleet
        return (datetime.datetime.now() +
                config["shepherd/BLEET_TIMEOUT"] / 2)

    # Loop until the program is shutting down
    while not universal.exiting:
        # Prepare a VM and make sure we're completely prepared to handle a test
        # request before asking the shepherd for one.
        logger.info("Waiting for virtual machine to become available...")
        machine_id = consumer.prepare_machine()

        # Send the initial bleet so that the shepherd knows we're available
        logger.info("Ready for test request. Sending initial bleet.")
        next_bleet_time = bleet()

        # Set to True whenever the shepherd bloots. Set to False everytime we
        # bleet. If this variable is still False by the time it's time to bleet
        # again we know we've lost the shepherd.
        shepherd_blooted = False

        # Process traffic from shepherd.
        while True:
            try:
                message = exithelpers.recv_json(
                    shepherd,
                    timeout=milliseconds_until(next_bleet_time))

                message = FlockMessage(message["type"], message["body"])
            except exithelpers.Timeout:
                if not shepherd_blooted:
                    raise universal.ShepherdLost()

                logger.debug("Sending bleet.")
                next_bleet_time = bleet()
                shepherd_blooted = False

                continue

            if message.type == "bloot":
                logger.debug("Got bloot.")
                shepherd_blooted = True

            elif message.type == "identify":
                logger.info(
                    "Received request to identify. Sending environment.")

                # identify is a valid response to a bleet.
                shepherd_blooted = True

                identification = FlockMessage(type="environment",
                                              body=universal.environment)

                shepherd.send_json(identification.to_dict())

            elif message.type == "request":
                # Received test request from the shepherd
                logger.info("Test request received, running tests.")
                logger.debug("Test request: %s", str(message))
                result = consumer.run_test(machine_id, message.body)

                # Check to see if the test harness crashed/somehow testing was
                # unable to be done.
                if result is None:
                    result = {"failed": True}

                # Add in the submission id to the result that we send back
                result["id"] = str(message.body["submission"]["id"])

                logger.info("Testing completed, sending results to shepherd.")
                logger.debug("Raw test results: %s", str(result))
                shepherd.send_json(FlockMessage("result", result).to_dict())

                # Wait for the shepherd to acknowledge the result. Ignore any
                # messages that we get from the shepherd besides an acknowledge.
                deadline = \
                    datetime.datetime.now() + datetime.timedelta(seconds=30)
                while True:
                    try:
                        confirmation = exithelpers.recv_json(
                            shepherd,
                            timeout=milliseconds_until(deadline))

                        confirmation = FlockMessage(confirmation["type"],
                                                    confirmation["body"])
                    except exithelpers.Timeout:
                        raise universal.ShepherdLost(result=result)

                    logger.debug("Received message: %s", str(confirmation))

                    if confirmation.type == "bloot" and \
                            confirmation.body == result["id"]:
                        shepherd_blooted = True
                        break

                # This machine has been used; go prepare a fresh one.
                break
Пример #7
0
def run(znconsumers):
    """Keep the producer and the consumers running until shutdown.

    Each pass of the main loop:

    1. Drains ``universal.orphaned_results``: a "distress" message is sent to
       the shepherd on a fresh socket and, once the shepherd answers with an
       empty "bloot", every orphaned result is forwarded to it.
    2. Drops consumers that are no longer alive and starts new ones until
       ``znconsumers`` of them exist.
    3. Restarts the producer if it died.
    4. Sleeps for ``poll_timeout``.

    :param znconsumers: number of consumers to keep running.
    :raises universal.Exiting: once ``universal.exiting`` is set and the main
        loop has ended.
    """
    log = logging.getLogger("galah.sheep.maintainer")

    log.info("Maintainer starting")

    producer = start_producer()
    consumers = []

    # Continually make sure that all of the threads are up until it's time to
    # exit
    while not universal.exiting:
        if not universal.orphaned_results.empty():
            log.warning(
                "Orphaned results detected, going into distress mode.")

        while not universal.orphaned_results.empty():
            shepherd = None
            try:
                # We want to create a whole new socket everytime so we don't
                # stack messages up in the queue. We also don't want to just
                # send it once and let ZMQ take care of it because it might
                # be eaten by a defunct shepherd and then we'd be stuck forever.
                shepherd = universal.context.socket(zmq.DEALER)
                shepherd.linger = 0
                shepherd.connect(config["shepherd/SHEEP_SOCKET"])

                shepherd.send_json(FlockMessage("distress", "").to_dict())

                log.info(
                    "Sent distress message to shepherd, waiting for response.")

                message = exithelpers.recv_json(shepherd, timeout=1000 * 60)
                message = FlockMessage.from_dict(message)

                if message.type == "bloot" and message.body == "":
                    while not universal.orphaned_results.empty():
                        result = universal.orphaned_results.get()

                        try:
                            shepherd.send_json(
                                FlockMessage("result", result).to_dict())

                            confirmation = exithelpers.recv_json(
                                shepherd, timeout=1000 * 5)
                            confirmation = FlockMessage.from_dict(confirmation)

                            # NOTE(review): any reply is treated as delivery;
                            # this check does not alter control flow.
                            if confirmation.type == "bloot" and \
                                    confirmation.body == "":
                                continue
                        except:
                            # Deliberately catches everything: the result was
                            # already taken off the queue, so put it back
                            # before letting the error propagate.
                            universal.orphaned_results.put(result)
                            raise
            except universal.Exiting:
                log.warning(
                    "Orphaned results have not been sent back to the "
                    "shepherd. I WILL NOT ABANDON THEM, YOU WILL HAVE TO "
                    "KILL ME WITH FIRE! (SIGKILL is fire in this analogy).")

                # Nah man.
                universal.exiting = False

                continue
            except exithelpers.Timeout:
                continue
            finally:
                # Every attempt opens a new socket, so release it here;
                # otherwise one socket leaks per retry.
                if shepherd is not None:
                    shepherd.close()

        # Remove any dead consumers from the list
        dead_consumers = 0
        for c in consumers[:]:
            if not c.isAlive():
                dead_consumers += 1
                consumers.remove(c)

        if dead_consumers > 0:
            log.warning("Found %d dead consumers, restarting them.",
                        dead_consumers)

        # Start up consumers until we have the desired amount
        while len(consumers) < znconsumers:
            consumers.append(start_consumer())

        # If the producer died, start it again
        if not producer.isAlive():
            log.warning("Found dead producer, restarting it.")

            producer = start_producer()

        # Sleep for awhile
        time.sleep(poll_timeout)

    raise universal.Exiting()