예제 #1
0
def main():
    """Run the shepherd's event loop forever.

    Each pass waits (bounded by the select timeout) for traffic on the
    ``public`` or ``sheep`` sockets and then:

    1. drains ``public``: every test request is checked against the
       Submission, Assignment and TestHarness documents and, if valid, handed
       to the flock manager;
    2. drains ``sheep``: each message is answered according to its type
       ("distress", "bleet", "environment" or "result");
    3. asks the flock manager to clean up and logs any sheep it reports as
       lost or killed.

    Invalid requests and undecodable sheep messages are logged and skipped.
    This function does not return normally.
    """
    flock = FlockManager(
        match_found,
        config["BLEET_TIMEOUT"],
        config["SERVICE_TIMEOUT"]
    )

    logger.info("Shepherd starting.")

    while True:
        # Wait until either the public or sheep socket has messages waiting.
        # The timeout guarantees flock.cleanup() below still runs periodically
        # when both sockets are idle.
        zmq.select([public, sheep], [], [], timeout=5)

        # Grab all of the outstanding messages from the outside and place
        # them in the request queue.
        while public.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
            request = public.recv_json()
            logger.debug("Raw test request: %s", str(request))

            # NOTE(review): from_dict() is not guarded; whatever it raises on
            # a malformed request propagates out of main() - confirm intended.
            request = TestRequest.from_dict(request)
            try:
                submission = Submission.objects.get(
                    id=ObjectId(request.submission_id)
                )
            except Submission.DoesNotExist:
                logger.warning(
                    "Received test request for non-existant submission [%s].",
                    str(request.submission_id)
                )
                continue
            except bson.errors.InvalidId as e:
                logger.warning("Received malformed test request. %s", str(e))
                continue

            try:
                assignment = Assignment.objects.get(id=submission.assignment)
            except Assignment.DoesNotExist:
                logger.error(
                    "Received test request for a submission [%s] referencing "
                    "an invalid assignment [%s].",
                    str(submission.id),
                    str(submission.assignment)
                )
                continue

            if not assignment.test_harness:
                logger.warning(
                    "Received test request for a submission [%s] referencing "
                    "an assignment [%s] that does not have a test harness "
                    "associated with it.",
                    str(submission.id),
                    str(submission.assignment)
                )
                continue

            try:
                test_harness = TestHarness.objects.get(
                    id=assignment.test_harness
                )
            except TestHarness.DoesNotExist:
                # Was misspelled "DoesNotExit", which raised AttributeError
                # (and killed the loop) whenever the lookup actually failed.
                logger.error(
                    "Received test request for a submission [%s] referencing "
                    "an assignment [%s] that references a non-existant test "
                    "harness [%s].",
                    str(submission.id),
                    str(submission.assignment),
                    str(assignment.test_harness)
                )
                continue

            # Gather all the necessary information from the test request
            # received from the outside. The harness config may override the
            # timeout; otherwise the bleet timeout (in seconds) is used.
            processed_request = InternalTestRequest(
                submission.id,
                test_harness.config.get(
                    "galah/timeout", config["BLEET_TIMEOUT"].seconds
                ),
                test_harness.config.get("galah/environment", {})
            )

            logger.info("Received test request.")

            flock.received_request(processed_request)

        # Grab all of the outstanding messages from the sheep and process
        # them.
        while sheep.getsockopt(zmq.EVENTS) & zmq.POLLIN != 0:
            try:
                sheep_identity, sheep_message = router_recv_json(sheep)
                sheep_message = FlockMessage.from_dict(sheep_message)
                logger.debug(
                    "Received message from sheep: %s",
                    str(sheep_message)
                )
            except ValueError as e:
                logger.error("Could not decode sheep's message: %s", str(e))
                logger.debug(
                    "Exception thrown while decoding sheep's message...",
                    exc_info=sys.exc_info()
                )
                continue

            if sheep_message.type == "distress":
                logger.warning("Received distress message. Sending bloot.")
                router_send_json(
                    sheep, sheep_identity, FlockMessage("bloot", "").to_dict()
                )

            elif sheep_message.type == "bleet":
                logger.debug(
                    "Sheep [%s] bleeted. Sending bloot.",
                    repr(sheep_identity)
                )

                result = flock.sheep_bleeted(sheep_identity)

                # Under certain circumstances we want to completely ignore a
                # bleet (see FlockManager.sheep_bleeted() for more details)
                if result is FlockManager.IGNORE:
                    logger.debug("Ignoring bleet.")
                    continue

                # A falsy result means the flock manager does not know this
                # sheep yet, so ask it to identify itself instead of blooting.
                if not result:
                    router_send_json(
                        sheep,
                        sheep_identity,
                        FlockMessage("identify", "").to_dict()
                    )

                    logger.info(
                        "Unrecognized sheep [%s] connected, identify sent.",
                        repr(sheep_identity)
                    )

                    continue

                router_send_json(
                    sheep,
                    sheep_identity,
                    FlockMessage("bloot", "").to_dict()
                )

            elif sheep_message.type == "environment":
                if not flock.manage_sheep(sheep_identity, sheep_message.body):
                    logger.warning(
                        "Received environment from an already-recognized sheep."
                    )

            elif sheep_message.type == "result":
                logger.info("Received test result from sheep.")
                logger.debug(
                    "Received test result from sheep: %s",
                    str(sheep_message.body)
                )

                # Read the raw id before the try block so the failure log
                # below can always report it, even when ObjectId() is what
                # rejected it (previously that path hit an unbound name).
                raw_submission_id = sheep_message.body["id"]

                try:
                    submission = Submission.objects.get(
                        id=ObjectId(raw_submission_id)
                    )

                    test_result = TestResult.from_dict(sheep_message.body)
                    try:
                        test_result.save()
                    except InvalidDocument:
                        logger.warning(
                            "Test result is too large for the database.",
                            exc_info=True
                        )
                        # Store a bare "failed" result in place of the one
                        # that could not be saved.
                        test_result = TestResult(failed=True)
                        test_result.save()

                    submission.test_results = test_result.id
                    submission.save()
                except (InvalidId, Submission.DoesNotExist):
                    logger.warning(
                        "Could not retrieve submission [%s] for test result "
                        "received from sheep [%s].",
                        str(raw_submission_id),
                        repr(sheep_identity)
                    )

                    continue

                # Acknowledge the result, echoing the submission id back.
                router_send_json(
                    sheep,
                    sheep_identity,
                    FlockMessage("bloot", raw_submission_id).to_dict()
                )

                if not flock.sheep_finished(sheep_identity):
                    logger.info(
                        "Got result from sheep [%s] who was not processing "
                        "a test request.",
                        repr(sheep_identity)
                    )

        # Let the flock manager get rid of any dead or killed sheep.
        lost_sheep, killed_sheep = flock.cleanup()

        if lost_sheep:
            logger.warning(
                "%d sheep lost due to bleet timeout: %s",
                len(lost_sheep),
                str([repr(i) for i in lost_sheep])
            )

        if killed_sheep:
            logger.warning(
                "%d sheep lost due to request timeout: %s",
                len(killed_sheep),
                str([repr(i) for i in killed_sheep])
            )
예제 #2
0
def run(znconsumers):
    """Supervise the producer and consumer threads until told to exit.

    While ``universal.exiting`` is false, each pass of the loop:

    1. if ``universal.orphaned_results`` is non-empty, repeatedly sends a
       "distress" message to the shepherd and, once it answers with an empty
       "bloot", forwards every orphaned result to it;
    2. drops consumers that are no longer alive and starts new ones until
       ``znconsumers`` are running;
    3. restarts the producer if it has died;
    4. sleeps for ``poll_timeout``.

    Args:
        znconsumers: Desired number of live consumers.

    Raises:
        universal.Exiting: Always, once ``universal.exiting`` becomes true.
    """
    log = logging.getLogger("galah.sheep.maintainer")

    log.info("Maintainer starting")

    producer = start_producer()
    consumers = []

    # Continually make sure that all of the threads are up until it's time to
    # exit
    while not universal.exiting:
        if not universal.orphaned_results.empty():
            log.warning(
                "Orphaned results detected, going into distress mode."
            )

        while not universal.orphaned_results.empty():
            shepherd = None
            try:
                # We want to create a whole new socket everytime so we don't
                # stack messages up in the queue. We also don't want to just
                # send it once and let ZMQ take care of it because it might
                # be eaten by a defunct shepherd and then we'd be stuck forever.
                shepherd = universal.context.socket(zmq.DEALER)
                shepherd.linger = 0
                shepherd.connect(config["shepherd/SHEEP_SOCKET"])

                shepherd.send_json(FlockMessage("distress", "").to_dict())

                log.info(
                    "Sent distress message to shepherd, waiting for response."
                )

                message = exithelpers.recv_json(shepherd, timeout=1000 * 60)
                message = FlockMessage.from_dict(message)

                if message.type == "bloot" and message.body == "":
                    while not universal.orphaned_results.empty():
                        result = universal.orphaned_results.get()

                        try:
                            shepherd.send_json(
                                FlockMessage("result", result).to_dict()
                            )

                            confirmation = exithelpers.recv_json(
                                shepherd, timeout=1000 * 5
                            )
                            # NOTE(review): the reply is only parsed; its
                            # type/body were never acted upon (the original
                            # check was a no-op) - confirm whether a bad
                            # confirmation should re-queue the result.
                            FlockMessage.from_dict(confirmation)
                        except:
                            # Put the result back before propagating so it is
                            # retried on the next distress attempt.
                            universal.orphaned_results.put(result)
                            raise
            except universal.Exiting:
                log.warning(
                    "Orphaned results have not been sent back to the "
                    "shepherd. I WILL NOT ABANDON THEM, YOU WILL HAVE TO "
                    "KILL ME WITH FIRE! (SIGKILL is fire in this analogy)."
                )

                # Nah man.
                universal.exiting = False

                continue
            except exithelpers.Timeout:
                # No answer in time; retry with a fresh socket.
                continue
            finally:
                # Release this attempt's socket deterministically instead of
                # relying on garbage collection. linger is 0, so closing
                # never blocks on unsent messages.
                if shepherd is not None:
                    shepherd.close()

        # Remove any dead consumers from the list
        alive_consumers = [c for c in consumers if c.is_alive()]
        dead_consumers = len(consumers) - len(alive_consumers)
        consumers = alive_consumers

        if dead_consumers > 0:
            log.warning(
                "Found %d dead consumers, restarting them.", dead_consumers
            )

        # Start up consumers until we have the desired amount
        while len(consumers) < znconsumers:
            consumers.append(start_consumer())

        # If the producer died, start it again
        if not producer.is_alive():
            log.warning("Found dead producer, restarting it.")

            producer = start_producer()

        # Sleep for awhile
        time.sleep(poll_timeout)

    raise universal.Exiting()
예제 #3
0
def run(znconsumers):
    """Supervise the producer and consumer threads until told to exit.

    While ``universal.exiting`` is false, each pass of the loop:

    1. if ``universal.orphaned_results`` is non-empty, repeatedly sends a
       "distress" message to the shepherd and, once it answers with an empty
       "bloot", forwards every orphaned result to it;
    2. drops consumers that are no longer alive and starts new ones until
       ``znconsumers`` are running;
    3. restarts the producer if it has died;
    4. sleeps for ``poll_timeout``.

    Args:
        znconsumers: Desired number of live consumers.

    Raises:
        universal.Exiting: Always, once ``universal.exiting`` becomes true.
    """
    log = logging.getLogger("galah.sheep.maintainer")

    log.info("Maintainer starting")

    producer = start_producer()
    consumers = []

    # Continually make sure that all of the threads are up until it's time to
    # exit
    while not universal.exiting:
        if not universal.orphaned_results.empty():
            log.warning(
                "Orphaned results detected, going into distress mode.")

        while not universal.orphaned_results.empty():
            shepherd = None
            try:
                # We want to create a whole new socket everytime so we don't
                # stack messages up in the queue. We also don't want to just
                # send it once and let ZMQ take care of it because it might
                # be eaten by a defunct shepherd and then we'd be stuck forever.
                shepherd = universal.context.socket(zmq.DEALER)
                shepherd.linger = 0
                shepherd.connect(config["shepherd/SHEEP_SOCKET"])

                shepherd.send_json(FlockMessage("distress", "").to_dict())

                log.info(
                    "Sent distress message to shepherd, waiting for response.")

                message = exithelpers.recv_json(shepherd, timeout=1000 * 60)
                message = FlockMessage.from_dict(message)

                if message.type == "bloot" and message.body == "":
                    while not universal.orphaned_results.empty():
                        result = universal.orphaned_results.get()

                        try:
                            shepherd.send_json(
                                FlockMessage("result", result).to_dict())

                            confirmation = exithelpers.recv_json(
                                shepherd, timeout=1000 * 5)
                            # NOTE(review): the reply is only parsed; its
                            # type/body were never acted upon (the original
                            # check was a no-op) - confirm whether a bad
                            # confirmation should re-queue the result.
                            FlockMessage.from_dict(confirmation)
                        except:
                            # Put the result back before propagating so it is
                            # retried on the next distress attempt.
                            universal.orphaned_results.put(result)
                            raise
            except universal.Exiting:
                log.warning(
                    "Orphaned results have not been sent back to the "
                    "shepherd. I WILL NOT ABANDON THEM, YOU WILL HAVE TO "
                    "KILL ME WITH FIRE! (SIGKILL is fire in this analogy).")

                # Nah man.
                universal.exiting = False

                continue
            except exithelpers.Timeout:
                # No answer in time; retry with a fresh socket.
                continue
            finally:
                # Release this attempt's socket deterministically instead of
                # relying on garbage collection. linger is 0, so closing
                # never blocks on unsent messages.
                if shepherd is not None:
                    shepherd.close()

        # Remove any dead consumers from the list
        alive_consumers = [c for c in consumers if c.is_alive()]
        dead_consumers = len(consumers) - len(alive_consumers)
        consumers = alive_consumers

        if dead_consumers > 0:
            log.warning("Found %d dead consumers, restarting them.",
                        dead_consumers)

        # Start up consumers until we have the desired amount
        while len(consumers) < znconsumers:
            consumers.append(start_consumer())

        # If the producer died, start it again
        if not producer.is_alive():
            log.warning("Found dead producer, restarting it.")

            producer = start_producer()

        # Sleep for awhile
        time.sleep(poll_timeout)

    raise universal.Exiting()