示例#1
0
    def test_paused_node(self):
        """Checks that a paused node still accepts next tasks of running job executions but rejects new ones"""

        paused_offers = NodeOffers(self.paused_node)
        # Register both offers against the paused node's agent, one after the other
        for offer_id, amounts in (('offer_1', dict(cpus=24.0, mem=1024.0, disk=1024.0)),
                                  ('offer_2', dict(cpus=50.0, mem=2048.0, disk=2048.0))):
            paused_offers.add_offer(ResourceOffer(offer_id, self.node_agent_paused, NodeResources(**amounts)))

        # Nothing has been accepted yet
        self.assertFalse(paused_offers.has_accepted_job_exes())
        self.assertListEqual(paused_offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(paused_offers.get_accepted_new_job_exes(), [])

        # Next tasks for job executions that are already running must still be accepted
        running_1 = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(paused_offers.consider_next_task(running_1), NodeOffers.ACCEPTED)

        running_2 = RunningJobExecution(self.running_job_exe_2)
        self.assertEqual(paused_offers.consider_next_task(running_2), NodeOffers.ACCEPTED)

        # A newly queued job execution must be turned away while the node is paused
        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(paused_offers.consider_new_job_exe(queued_exe), NodeOffers.NODE_NOT_READY)

        # Only the two running job executions ended up accepted
        self.assertTrue(paused_offers.has_accepted_job_exes())
        self.assertEqual(len(paused_offers.get_accepted_running_job_exes()), 2)
        self.assertSetEqual(set(paused_offers.get_accepted_running_job_exes()), {running_1, running_2})
        self.assertListEqual(paused_offers.get_accepted_new_job_exes(), [])

        # Resources left over once the accepted tasks have been accounted for
        self.assertEqual(paused_offers._available_cpus, 68.0)
        self.assertEqual(paused_offers._available_mem, 1536.0)
        self.assertEqual(paused_offers._available_disk, 2222.0)
示例#2
0
    def test_timed_out_system_job_task(self):
        """Runs a job execution whose system job task times out and checks the failure that is recorded"""

        # Ingest job type restricted to a single try
        job_type = Ingest.objects.get_ingest_job_type()
        job_type.max_tries = 1
        job_type.save()
        job_model = job_test_utils.create_job(job_type=job_type, num_exes=1)
        exe_model = job_test_utils.create_job_exe(job=job_model)
        exe = RunningJobExecution(exe_model)

        # Timeline: launched, task starts running, task times out - each step one second after the previous one
        launched = now() + timedelta(seconds=1)
        task_started = launched + timedelta(seconds=1)
        timed_out = task_started + timedelta(seconds=1)

        # Start the job task, report it as running, then time it out
        task = exe.start_next_task()
        self.task_mgr.launch_tasks([task], launched)
        running_update = job_test_utils.create_task_status_update(task.id, 'agent', TaskStatusUpdate.RUNNING,
                                                                  task_started)
        self.task_mgr.handle_task_update(running_update)
        exe.task_update(running_update)
        exe.execution_timed_out(task, timed_out)
        self.assertTrue(exe.is_finished())
        self.assertFalse(exe.is_next_task_ready())

        # Re-read the model from the database and verify what was stored
        stored = JobExecution.objects.get(id=exe_model.id)
        self.assertEqual('FAILED', stored.status)
        self.assertEqual('ingest-timeout', stored.error.name)
        self.assertEqual(timed_out, stored.ended)
示例#3
0
    def init_with_database(self):
        """Seeds the finished job execution metrics with the recent execution history held in the database
        """

        metrics_over_time = self._finished_metrics_over_time
        # The start of the first time block is handed to the manager query as its time argument
        window_start = metrics_over_time.time_blocks[0].start
        empty_config = ExecutionConfiguration()
        recent_ends = JobExecutionEnd.objects.get_recent_job_exe_end_metrics(window_start)
        for end_model in recent_ends:
            # Wrap each end model in a running job execution (blank agent ID, blank configuration, priority 0) and
            # stamp it with its recorded final status so it can be fed to both metrics objects
            finished_exe = RunningJobExecution('', end_model.job_exe, end_model.job_type, empty_config, 0)
            finished_exe._set_final_status(end_model.status, end_model.ended, end_model.error)
            self._finished_metrics.add_job_execution(finished_exe)
            metrics_over_time.add_job_execution(finished_exe)
示例#4
0
    def test_job_exe_clean_task(self, mock_get_slaves):
        """Verifies that the NodeManager hands back a cleanup task for a job execution that needs cleaning up"""

        mock_get_slaves.return_value = self.slave_infos

        when = now()
        node_mgr = NodeManager()
        node_mgr.register_agent_ids([self.node_agent_1, self.node_agent_2])
        node_mgr.sync_with_database('master_host', 5050)
        cleanup_mgr = CleanupManager()
        cleanup_mgr.update_nodes(node_mgr.get_nodes())
        initial_tasks = node_mgr.get_next_tasks(when)

        task_mgr = TaskManager()
        # Launch every initial task and immediately report it as finished
        for initial_task in initial_tasks:
            task_mgr.launch_tasks([initial_task], now())
            finished_update = job_test_utils.create_task_status_update(initial_task.id, initial_task.agent_id,
                                                                       TaskStatusUpdate.FINISHED, now())
            task_mgr.handle_task_update(finished_update)
            node_mgr.handle_task_update(finished_update)

        # Flag the image pull as done on every node so that no image tasks are returned
        for node in node_mgr.get_nodes():
            node._image_pull_completed()
            node._update_state()

        exe_model = job_test_utils.create_job_exe(node=self.node_1)
        # Register a job execution for cleanup, then ask for the next tasks again
        cleanup_mgr.add_job_execution(RunningJobExecution(exe_model))
        next_tasks = node_mgr.get_next_tasks(when)
        self.assertEqual(len(next_tasks), 1)
        cleanup_task = next_tasks[0]
        self.assertEqual(cleanup_task.agent_id, self.node_agent_1)
        self.assertFalse(cleanup_task.is_initial_cleanup)
        self.assertEqual(len(cleanup_task.job_exes), 1)
示例#5
0
    def setUp(self):
        """Creates two running job executions, each on its own node, along with a fresh JobExecutionManager"""
        django.setup()

        # Empty the cached built-in errors so the test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        # First node and its running job execution
        node_1 = node_test_utils.create_node()
        exe_model_1 = job_test_utils.create_job_exe(status='RUNNING', node=node_1)
        self.node_model_1 = node_1
        self.job_exe_model_1 = exe_model_1
        self.job_exe_1 = RunningJobExecution(exe_model_1)

        # Second node and its running job execution
        node_2 = node_test_utils.create_node()
        exe_model_2 = job_test_utils.create_job_exe(status='RUNNING', node=node_2)
        self.node_model_2 = node_2
        self.job_exe_model_2 = exe_model_2
        self.job_exe_2 = RunningJobExecution(exe_model_2)

        self.job_exe_mgr = JobExecutionManager()
示例#6
0
    def test_handle_regular_cleanup_task(self):
        """Checks that a regular cleanup task is offered for a job execution and goes away once it has finished"""

        when = now()
        node = Node(self.node_agent, self.node)
        node._last_heath_task = when
        node._initial_cleanup_completed()
        node._image_pull_completed()
        node._update_state()

        # With no job executions to clean, the node offers no tasks
        self.assertListEqual([], node.get_next_tasks(when))

        # Once a job execution is added, the node offers a non-initial cleanup task for it
        running_exe = RunningJobExecution(self.job_exe)
        node.add_job_execution(running_exe)
        cleanup_task = node.get_next_tasks(when)[0]
        self.assertTrue(cleanup_task.id.startswith(CLEANUP_TASK_ID_PREFIX))
        self.assertFalse(cleanup_task.is_initial_cleanup)
        self.assertListEqual(cleanup_task.job_exes, [running_exe])

        # Launch the task and drive it through RUNNING and then FINISHED
        self.task_mgr.launch_tasks([cleanup_task], now())
        for status in (TaskStatusUpdate.RUNNING, TaskStatusUpdate.FINISHED):
            status_update = job_test_utils.create_task_status_update(cleanup_task.id, cleanup_task.agent_id, status,
                                                                     now())
            self.task_mgr.handle_task_update(status_update)
            node.handle_task_update(status_update)

        # Every job execution has been cleaned, so again no tasks are offered
        self.assertListEqual([], node.get_next_tasks(when))
示例#7
0
    def test_lost_node(self):
        """Accepts a queued and a running job execution, then loses the node and expects no offers to be popped"""

        small_offer = ResourceOffer('offer_1', self.node_agent, NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        large_offer = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        mgr = OfferManager()
        mgr.add_new_offers([small_offer, large_offer])
        mgr.update_nodes([self.node, self.paused_node])
        mgr.ready_new_offers()

        # Both a new job execution and a next task are accepted while the node is still present
        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(mgr.consider_new_job_exe(queued_exe), OfferManager.ACCEPTED)

        running_exe = RunningJobExecution(self.running_job_exe_2)
        self.assertEqual(mgr.consider_next_task(running_exe), OfferManager.ACCEPTED)

        # After the node is lost, there is nothing left to pop
        mgr.lost_node(self.node_agent)
        popped = mgr.pop_offers_with_accepted_job_exes()
        self.assertEqual(len(popped), 0)
示例#8
0
    def test_lost_node(self):
        """Checks that losing the node drops the accepted job executions and zeroes the available resources"""

        offers = NodeOffers(self.node)
        # Register both offers against the node's agent, one after the other
        for offer_id, amounts in (('offer_1', dict(cpus=24.0, mem=1024.0, disk=1024.0)),
                                  ('offer_2', dict(cpus=50.0, mem=2048.0, disk=2048.0))):
            offers.add_offer(ResourceOffer(offer_id, self.node_agent, NodeResources(**amounts)))
        self.assertFalse(offers.has_accepted_job_exes())
        self.assertListEqual(offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers.get_accepted_new_job_exes(), [])

        # One running and one queued job execution are accepted
        running_exe = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(offers.consider_next_task(running_exe), NodeOffers.ACCEPTED)

        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(offers.consider_new_job_exe(queued_exe), NodeOffers.ACCEPTED)

        # Some of every resource is still available after accepting them
        self.assertTrue(offers.has_accepted_job_exes())
        for attr_name in ('_available_cpus', '_available_mem', '_available_disk'):
            self.assertGreater(getattr(offers, attr_name), 0.0)

        # Losing the node clears the accepted job executions and all available resources
        offers.lost_node()
        self.assertFalse(offers.has_accepted_job_exes())
        for attr_name in ('_available_cpus', '_available_mem', '_available_disk'):
            self.assertEqual(getattr(offers, attr_name), 0.0)
示例#9
0
    def init_with_database(self):
        """Loads the finished job executions from the database into the job execution metrics
        """

        cutoff = self._finished_metrics_over_time.time_blocks[0].start
        # TODO: this should be in the manager, but the JobExecution model is going to be completely re-worked anyway
        # Select COMPLETED/FAILED executions that ended at or after the start of the first time block
        finished_models = JobExecution.objects.select_related('error').filter(status__in=['COMPLETED', 'FAILED'],
                                                                               ended__gte=cutoff)
        for model in finished_models:
            finished_exe = RunningJobExecution(model)
            finished_exe._set_finished_status(model.status, model.ended, model.error)
            # Feed the execution to the overall metrics first, then to the metrics over time
            for metrics in (self._finished_metrics, self._finished_metrics_over_time):
                metrics.add_job_execution(finished_exe)
示例#10
0
文件: utils.py 项目: SteveAIS/scale
def create_running_job_exe(agent_id='agent_1', job_type=None, job=None, node=None, timeout=None, input_file_size=10.0,
                           queued=None, started=None, resources=None, priority=None, num_exes=1):
    """Builds and saves a job execution model and wraps it in a running job execution for unit testing

    Any of job, node, timeout, resources, queued, started and priority that is falsy is replaced by a default: a newly
    created job or node, the job's own timeout, resources and priority, the current time for queued, and one second
    after the current time for started. The job type is always taken from the job; the job_type argument is only used
    when a job has to be created here.

    :returns: The running job execution
    :rtype: :class:`job.execution.job_exe.RunningJobExecution`
    """

    current_time = timezone.now()
    job = job or create_job(job_type=job_type, status='RUNNING', input_file_size=input_file_size, num_exes=num_exes)
    job_type = job.job_type

    # Configuration that occurs at queue time
    file_ids = job.get_job_data().get_input_file_ids()
    files_by_id = {}
    if file_ids:
        files_by_id = {scale_file.id: scale_file
                       for scale_file in ScaleFile.objects.get_files_for_queued_jobs(file_ids)}
    queued_config = QueuedExecutionConfigurator(files_by_id).configure_queued_job(job)

    # Populate and persist the job execution model
    exe_model = JobExecution()
    exe_model.set_cluster_id('1234', job.id, job.num_exes)
    exe_model.job = job
    exe_model.job_type = job_type
    exe_model.exe_num = job.num_exes
    exe_model.node = node or node_utils.create_node()
    exe_model.timeout = timeout or job.timeout
    exe_model.input_file_size = input_file_size
    exe_model.resources = (resources or job.get_resources()).get_json().get_dict()
    exe_model.configuration = queued_config.get_dict()
    exe_model.queued = queued or current_time
    exe_model.started = started or current_time + datetime.timedelta(seconds=1)
    exe_model.save()

    priority = priority or job.priority

    # Configuration that occurs at schedule time
    workspaces_by_name = {workspace.name: workspace for workspace in Workspace.objects.all()}
    configurator = ScheduledExecutionConfigurator(workspaces_by_name)
    secret_config = configurator.configure_scheduled_job(exe_model, job_type, job_type.get_job_interface(), 'INFO')
    return RunningJobExecution(agent_id, exe_model, job_type, secret_config, priority)
示例#11
0
    def test_consider_next_task(self):
        """Exercises consider_next_task() and get_accepted_running_job_exes()"""

        offers = NodeOffers(self.node)
        # Register both offers against the node's agent, one after the other
        for offer_id, amounts in (('offer_1', dict(cpus=24.0, mem=1024.0, disk=1024.0)),
                                  ('offer_2', dict(cpus=50.0, mem=2048.0, disk=2048.0))):
            offers.add_offer(ResourceOffer(offer_id, self.node_agent, NodeResources(**amounts)))
        self.assertFalse(offers.has_accepted_job_exes())
        self.assertListEqual(offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers.get_accepted_new_job_exes(), [])

        # The first job execution is accepted; considering the very same one again should have no effect
        accepted_1 = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(offers.consider_next_task(accepted_1), NodeOffers.ACCEPTED)
        self.assertEqual(offers.consider_next_task(accepted_1), NodeOffers.ACCEPTED)

        # Job executions asking for too much of one resource are each rejected with the matching result
        rejections = ((self.running_job_exe_high_cpus, NodeOffers.NOT_ENOUGH_CPUS),
                      (self.running_job_exe_high_mem, NodeOffers.NOT_ENOUGH_MEM),
                      (self.running_job_exe_high_disk, NodeOffers.NOT_ENOUGH_DISK))
        for exe_model, expected_result in rejections:
            rejected_exe = RunningJobExecution(exe_model)
            self.assertEqual(offers.consider_next_task(rejected_exe), expected_result)

        accepted_2 = RunningJobExecution(self.running_job_exe_2)
        self.assertEqual(offers.consider_next_task(accepted_2), NodeOffers.ACCEPTED)

        # Exactly the two accepted running job executions are reported, and no new ones
        self.assertTrue(offers.has_accepted_job_exes())
        self.assertEqual(len(offers.get_accepted_running_job_exes()), 2)
        self.assertSetEqual(set(offers.get_accepted_running_job_exes()), {accepted_1, accepted_2})
        self.assertListEqual(offers.get_accepted_new_job_exes(), [])

        # Resources left over once the accepted tasks have been accounted for
        self.assertEqual(offers._available_cpus, 68.0)
        self.assertEqual(offers._available_mem, 1536.0)
        self.assertEqual(offers._available_disk, 2222.0)
示例#12
0
    def test_job_exe_canceled(self):
        """Checks that a job execution canceled before being considered is rejected with TASK_INVALID"""

        offers = NodeOffers(self.node)
        # Register both offers against the node's agent, one after the other
        for offer_id, amounts in (('offer_1', dict(cpus=24.0, mem=1024.0, disk=1024.0)),
                                  ('offer_2', dict(cpus=50.0, mem=2048.0, disk=2048.0))):
            offers.add_offer(ResourceOffer(offer_id, self.node_agent, NodeResources(**amounts)))
        self.assertFalse(offers.has_accepted_job_exes())
        self.assertListEqual(offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers.get_accepted_new_job_exes(), [])

        # Cancel the job execution first, then offer its next task for consideration
        canceled_exe = RunningJobExecution(self.running_job_exe_1)
        canceled_exe.execution_canceled()
        self.assertEqual(offers.consider_next_task(canceled_exe), NodeOffers.TASK_INVALID)

        # Nothing was accepted as a result
        self.assertFalse(offers.has_accepted_job_exes())
        self.assertListEqual(offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers.get_accepted_new_job_exes(), [])
示例#13
0
    def test_no_offers(self):
        """Checks that both kinds of job execution are rejected with NO_OFFERS when no offer was added"""

        offers = NodeOffers(self.node)
        self.assertFalse(offers.has_accepted_job_exes())
        self.assertListEqual(offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers.get_accepted_new_job_exes(), [])

        # A next task for a running job execution is refused
        running_exe = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(offers.consider_next_task(running_exe), NodeOffers.NO_OFFERS)

        # ... and so is a newly queued job execution
        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(offers.consider_new_job_exe(queued_exe), NodeOffers.NO_OFFERS)

        # The accepted state is unchanged
        self.assertFalse(offers.has_accepted_job_exes())
        self.assertListEqual(offers.get_accepted_running_job_exes(), [])
        self.assertListEqual(offers.get_accepted_new_job_exes(), [])
示例#14
0
    def test_no_ready_offers(self):
        """Checks how job executions are rejected when offers were added but never made ready"""

        paused_offer = ResourceOffer('offer_1', self.node_agent_paused,
                                     NodeResources(cpus=2.0, mem=1024.0, disk=1024.0))
        regular_offer = ResourceOffer('offer_2', self.node_agent, NodeResources(cpus=25.0, mem=2048.0, disk=2048.0))

        # The offers are only added; update_nodes() and ready_new_offers() are deliberately not called
        mgr = OfferManager()
        mgr.add_new_offers([paused_offer, regular_offer])

        queued_exe = QueuedJobExecution(self.queue_1)
        self.assertEqual(mgr.consider_new_job_exe(queued_exe), OfferManager.NO_NODES_AVAILABLE)

        running_exe = RunningJobExecution(self.running_job_exe_1)
        self.assertEqual(mgr.consider_next_task(running_exe), OfferManager.NODE_NOT_READY)
示例#15
0
    def test_job_type_limit(self, mock_taskinfo):
        """Runs one scheduling pass for a job type whose max_scheduled limit is already partly used up"""
        mock_taskinfo.return_value = MagicMock()

        Queue.objects.all().delete()
        limited_type = job_test_utils.create_job_type()
        limited_type.max_scheduled = 4
        limited_type.save()
        running_model = job_test_utils.create_job_exe(job_type=limited_type, status='RUNNING')
        # Queue six jobs of the limited job type
        for _ in range(6):
            queue_test_utils.create_queue(job_type=limited_type)
        job_type_mgr.sync_with_database()
        # One job of this type is already running
        job_exe_mgr.schedule_job_exes([RunningJobExecution(running_model)])

        first_offer = ResourceOffer('offer_1', self.node_agent_1,
                                    NodeResources(cpus=200.0, mem=102400.0, disk=102400.0))
        second_offer = ResourceOffer('offer_2', self.node_agent_2,
                                     NodeResources(cpus=200.0, mem=204800.0, disk=204800.0))
        offer_mgr.add_new_offers([first_offer, second_offer])

        # Ignore Docker pull tasks
        for node in node_mgr.get_nodes():
            node._is_image_pulled = True

        # Ignore cleanup tasks
        for node in node_mgr.get_nodes():
            node._initial_cleanup_completed()
            node._update_state()

        # One is already running, so only 3 more should be scheduled
        self.assertEqual(self._scheduling_thread._perform_scheduling(), 3)
示例#16
0
    def schedule_job_executions(self, framework_id, job_executions, workspaces):
        """Schedules the given queued job executions on their provided nodes and resources and deletes the queue models
        of the executions that were scheduled. The job execution rows are locked with select_for_update().
        NOTE(review): the transaction this relies on is presumably opened outside this method - confirm.

        Executions whose model is no longer in the QUEUED status are silently skipped.

        :param framework_id: The scheduling framework ID
        :type framework_id: string
        :param job_executions: A list of queued job executions that have been given nodes and resources on which to run
        :type job_executions: list[:class:`queue.job_exe.QueuedJobExecution`]
        :param workspaces: A dict of all workspaces stored by name
        :type workspaces: {string: :class:`storage.models.Workspace`}
        :returns: The scheduled job executions
        :rtype: list[:class:`job.execution.job_exe.RunningJobExecution`]
        :raises Exception: If the resources provided to an execution are less than what its queue model requires
        """

        if not job_executions:
            return []

        # Lock the corresponding job execution models (in ID order) and index them by ID
        requested_ids = [queued_exe.id for queued_exe in job_executions]
        locked_query = JobExecution.objects.select_for_update().filter(id__in=requested_ids).order_by('id')
        locked_models = {model.id: model for model in locked_query}

        # (provided attribute, required attribute, error message) for every resource, in the order they are checked
        requirement_checks = (
            ('cpus', 'cpus_required', 'Job execution requires %s CPUs and only %s were provided'),
            ('mem', 'mem_required', 'Job execution requires %s MiB of memory and only %s MiB were provided'),
            ('disk_in', 'disk_in_required',
             'Job execution requires %s MiB of input disk space and only %s MiB were provided'),
            ('disk_out', 'disk_out_required',
             'Job execution requires %s MiB of output disk space and only %s MiB were provided'),
            ('disk_total', 'disk_total_required',
             'Job execution requires %s MiB of total disk space and only %s MiB were provided'),
        )

        # Collect the (model, node ID, resources) triples that are going to be scheduled
        executions_to_schedule = []
        for queued_exe in job_executions:
            queue = queued_exe.queue
            node_id = queued_exe.provided_node_id
            resources = queued_exe.provided_resources
            model = locked_models[queued_exe.id]

            # Executions may have changed since the queue model was last queried; skip those no longer queued
            if model.status != 'QUEUED':
                continue

            # Fail on the first resource that was under-provided
            for provided_attr, required_attr, msg in requirement_checks:
                if getattr(resources, provided_attr) < getattr(queue, required_attr):
                    raise Exception(msg % (str(getattr(queue, required_attr)),
                                           str(getattr(resources, provided_attr))))

            executions_to_schedule.append((model, node_id, resources))

        # Schedule the job executions and wrap each resulting model
        running_exes = []
        scheduled_ids = []
        scheduled_models = JobExecution.objects.schedule_job_executions(framework_id, executions_to_schedule,
                                                                        workspaces)
        for scheduled_model in scheduled_models:
            running_exes.append(RunningJobExecution(scheduled_model))
            scheduled_ids.append(scheduled_model.id)

        # Remove the queue models of everything that was scheduled
        Queue.objects.filter(job_exe_id__in=scheduled_ids).delete()

        return running_exes
示例#17
0
    def test_running_executions(self):
        """Tests the metrics with running executions that complete"""

        node_model_1 = node_test_utils.create_node()
        node_model_2 = node_test_utils.create_node()
        job_type_1 = job_test_utils.create_job_type()
        job_type_2 = job_test_utils.create_job_type()
        job_exe_model_1 = job_test_utils.create_job_exe(job_type=job_type_1, status='RUNNING', node=node_model_1)
        job_exe_model_2 = job_test_utils.create_job_exe(job_type=job_type_1, status='RUNNING', node=node_model_1)
        job_exe_model_3 = job_test_utils.create_job_exe(job_type=job_type_1, status='RUNNING', node=node_model_1)
        job_exe_model_4 = job_test_utils.create_job_exe(job_type=job_type_2, status='RUNNING', node=node_model_1)
        job_exe_model_5 = job_test_utils.create_job_exe(job_type=job_type_1, status='RUNNING', node=node_model_2)
        job_exe_model_6 = job_test_utils.create_job_exe(job_type=job_type_1, status='RUNNING', node=node_model_2)
        job_exe_model_7 = job_test_utils.create_job_exe(job_type=job_type_2, status='RUNNING', node=node_model_2)
        job_exe_model_8 = job_test_utils.create_job_exe(job_type=job_type_2, status='RUNNING', node=node_model_2)
        job_exe_model_9 = job_test_utils.create_job_exe(job_type=job_type_2, status='RUNNING', node=node_model_2)
        job_exe_model_10 = job_test_utils.create_job_exe(job_type=job_type_2, status='RUNNING', node=node_model_2)
        job_exe_model_11 = job_test_utils.create_job_exe(job_type=job_type_2, status='RUNNING', node=node_model_2)
        job_exe_1 = RunningJobExecution(job_exe_model_1)
        job_exe_2 = RunningJobExecution(job_exe_model_2)
        job_exe_3 = RunningJobExecution(job_exe_model_3)
        job_exe_4 = RunningJobExecution(job_exe_model_4)
        job_exe_5 = RunningJobExecution(job_exe_model_5)
        job_exe_6 = RunningJobExecution(job_exe_model_6)
        job_exe_7 = RunningJobExecution(job_exe_model_7)
        job_exe_8 = RunningJobExecution(job_exe_model_8)
        job_exe_9 = RunningJobExecution(job_exe_model_9)
        job_exe_10 = RunningJobExecution(job_exe_model_10)
        job_exe_11 = RunningJobExecution(job_exe_model_11)

        # NOTE: This unit test is about to get CRAZY. I apologize for the complexity, but this is needed for a
        # thorough testing
        self.metrics.add_running_job_exes([job_exe_1, job_exe_2, job_exe_3, job_exe_4, job_exe_5, job_exe_6, job_exe_7,
                                           job_exe_8, job_exe_9, job_exe_10, job_exe_11])
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, now())

        # Check expected totals
        self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 4)
        for job_type_dict in node_list_dict[0]['job_executions']['running']['by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 3)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 1)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 7)
        for job_type_dict in node_list_dict[1]['job_executions']['running']['by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 2)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 5)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)

        # Finish some job executions
        end_time_1 = now()
        job_exe_1._set_finished_status('COMPLETED', end_time_1)
        job_exe_2._set_finished_status('FAILED', end_time_1, error=self.data_error)
        job_exe_4._set_finished_status('FAILED', end_time_1, error=self.alg_error)
        self.metrics.job_exe_finished(job_exe_1)
        self.metrics.job_exe_finished(job_exe_2)
        self.metrics.job_exe_finished(job_exe_4)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, end_time_1 + datetime.timedelta(seconds=1))

        # Check expected totals
        self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['running']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['job_type_id'], job_type_1.id)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['completed']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['by_job_type'][0]['job_type_id'],
                         job_type_1.id)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 2)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type'][0]['job_type_id'],
                         job_type_2.id)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['failed']['data']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['by_job_type'][0]['job_type_id'],
                         job_type_1.id)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 7)
        for job_type_dict in node_list_dict[1]['job_executions']['running']['by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 2)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 5)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)

        # Finish some job executions (all executions still on node 2)
        end_time_2 = end_time_1 + FinishedJobExeMetricsOverTime.BLOCK_LENGTH
        job_exe_5._set_finished_status('COMPLETED', end_time_2)
        job_exe_6._set_finished_status('COMPLETED', end_time_2)
        job_exe_7._set_finished_status('COMPLETED', end_time_2)
        job_exe_8._set_finished_status('COMPLETED', end_time_2)
        job_exe_9._set_finished_status('COMPLETED', end_time_2)
        job_exe_10._set_finished_status('COMPLETED', end_time_2)
        job_exe_11._set_finished_status('COMPLETED', end_time_2)
        self.metrics.job_exe_finished(job_exe_5)
        self.metrics.job_exe_finished(job_exe_6)
        self.metrics.job_exe_finished(job_exe_7)
        self.metrics.job_exe_finished(job_exe_8)
        self.metrics.job_exe_finished(job_exe_9)
        self.metrics.job_exe_finished(job_exe_10)
        self.metrics.job_exe_finished(job_exe_11)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, end_time_2)

        # Check expected totals
        self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['running']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['job_type_id'], job_type_1.id)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['completed']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['by_job_type'][0]['job_type_id'],
                         job_type_1.id)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 2)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['by_job_type'][0]['job_type_id'],
                         job_type_2.id)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['failed']['data']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['by_job_type'][0]['job_type_id'],
                         job_type_1.id)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 7)
        for job_type_dict in node_list_dict[1]['job_executions']['completed']['by_job_type']:
            if job_type_dict['job_type_id'] == job_type_1.id:
                self.assertEqual(job_type_dict['count'], 2)
            elif job_type_dict['job_type_id'] == job_type_2.id:
                self.assertEqual(job_type_dict['count'], 5)
            else:
                self.fail('Unexpected job type ID')
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)

        # Let all finished job executions roll off by time, only running remaining
        end_time_3 = end_time_2 + FinishedJobExeMetricsOverTime.TOTAL_TIME_PERIOD
        end_time_3 += FinishedJobExeMetricsOverTime.BLOCK_LENGTH + datetime.timedelta(seconds=1)
        node_list_dict = [{'id': node_model_1.id}, {'id': node_model_2.id}]
        self.metrics.generate_status_json(node_list_dict, end_time_3)

        # Check expected totals
        self.assertEqual(node_list_dict[0]['job_executions']['running']['total'], 1)
        self.assertEqual(len(node_list_dict[0]['job_executions']['running']['by_job_type']), 1)
        self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['count'], 1)
        self.assertEqual(node_list_dict[0]['job_executions']['running']['by_job_type'][0]['job_type_id'], job_type_1.id)
        self.assertEqual(node_list_dict[0]['job_executions']['completed']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[0]['job_executions']['failed']['system']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['running']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['completed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['algorithm']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['data']['total'], 0)
        self.assertEqual(node_list_dict[1]['job_executions']['failed']['system']['total'], 0)
示例#18
0
文件: manager.py 项目: kaydoh/scale
    def _process_scheduled_job_executions(self, framework_id,
                                          queued_job_executions, job_types,
                                          workspaces):
        """Creates the job execution models for the given scheduled queued job executions and builds the resulting
        running job executions, which are returned. Every database update is made within one atomic transaction.

        :param framework_id: The scheduling framework ID
        :type framework_id: string
        :param queued_job_executions: A list of queued job executions that have been scheduled
        :type queued_job_executions: list
        :param job_types: A dict of all job types stored by ID
        :type job_types: dict
        :param workspaces: A dict of all workspaces stored by name
        :type workspaces: dict
        :returns: The running job executions stored in lists by node ID
        :rtype: dict
        """

        when_started = now()
        exes_by_node = {}
        exe_configurator = ScheduledExecutionConfigurator(workspaces)

        with transaction.atomic():
            # First pass: build every job_exe model so that all of them are inserted by a single bulk query
            models_to_create = []
            scheduled = {}  # {queue ID: (job_exe model, config)}
            canceled = {}  # {queue ID: job_exe model}
            for queued in queued_job_executions:
                model = queued.create_job_exe_model(framework_id, when_started)
                models_to_create.append(model)
                if queued.is_canceled:
                    canceled[queued.id] = model
                    continue
                # The configuration kept on the job_exe model is censored, so saving it in the database is safe.
                # The configuration returned here may hold secrets and is only handed to the running job_exe.
                exe_config = exe_configurator.configure_scheduled_job(
                    model, job_types[model.job_type_id], queued.interface,
                    scheduler_mgr.config.system_logging_level)
                scheduled[queued.id] = (model, exe_config)
            JobExecution.objects.bulk_create(models_to_create)

            # Second pass: with the models bulk created, build the running and canceled job executions
            ids_to_delete = []
            canceled_end_models = []
            for queued in queued_job_executions:
                queue_id = queued.id
                ids_to_delete.append(queue_id)
                if queued.is_canceled:
                    end_model = canceled[queue_id].create_canceled_job_exe_end_model(when_started)
                    canceled_end_models.append(end_model)
                    continue
                model, exe_config = scheduled[queue_id]  # exe_config may contain secrets!
                running = RunningJobExecution(
                    queued.scheduled_agent_id, model, job_types[model.job_type_id],
                    exe_config, queued.priority)
                # Group the running job executions into one list per node ID
                exes_by_node.setdefault(running.node_id, []).append(running)

            # Pass canceled job execution end models to the manager to be sent to the messaging backend
            if canceled_end_models:
                job_exe_mgr.add_canceled_job_exes(canceled_end_models)

            # Remove the queue models that were just processed
            Queue.objects.filter(id__in=ids_to_delete).delete()

        # Log at warning level when the queries were slower than the threshold, otherwise at debug level
        elapsed = now() - when_started
        log_msg = 'Queries to process scheduled jobs took %.3f seconds'
        log_func = logger.warning if elapsed > SCHEDULE_QUERY_WARN_THRESHOLD else logger.debug
        log_func(log_msg, elapsed.total_seconds())

        return exes_by_node
示例#19
0
class TestJobExecutionManager(TransactionTestCase):
    """Test cases that exercise the JobExecutionManager class"""

    fixtures = ['basic_errors.json', 'basic_job_errors.json']

    def setUp(self):
        django.setup()

        # Empty the built-in error cache so the test works correctly
        CACHED_BUILTIN_ERRORS.clear()

        # First node with one running job execution
        self.node_model_1 = node_test_utils.create_node()
        self.job_exe_model_1 = job_test_utils.create_job_exe(status='RUNNING', node=self.node_model_1)
        self.job_exe_1 = RunningJobExecution(self.job_exe_model_1)

        # Second node with one running job execution
        self.node_model_2 = node_test_utils.create_node()
        self.job_exe_model_2 = job_test_utils.create_job_exe(status='RUNNING', node=self.node_model_2)
        self.job_exe_2 = RunningJobExecution(self.job_exe_model_2)

        self.job_exe_mgr = JobExecutionManager()

    def test_generate_status_json(self):
        """Checks that generate_status_json() runs successfully"""

        self.job_exe_mgr.schedule_job_exes([self.job_exe_1, self.job_exe_2])
        nodes_json = [{'id': node_model.id} for node_model in (self.node_model_1, self.node_model_2)]
        self.job_exe_mgr.generate_status_json(nodes_json, now())

        # Each node should report exactly one running job execution
        for node_json in nodes_json:
            running_total = node_json['job_executions']['running']['total']
            self.assertEqual(running_total, 1)

    def test_schedule_job_exes(self):
        """Checks that schedule_job_exes() runs successfully"""

        manager = self.job_exe_mgr
        manager.schedule_job_exes([self.job_exe_1, self.job_exe_2])

        # The manager should hold both executions and report both as ready
        self.assertEqual(len(manager.get_running_job_exes()), 2)
        self.assertEqual(len(manager.get_ready_job_exes()), 2)
        for job_exe in (self.job_exe_1, self.job_exe_2):
            self.assertIsNotNone(manager.get_running_job_exe(job_exe.id))

    def test_handle_task_timeout(self):
        """Checks that handle_task_timeout() runs successfully"""

        manager = self.job_exe_mgr
        manager.schedule_job_exes([self.job_exe_1, self.job_exe_2])

        timed_out_task = self.job_exe_1.start_next_task()
        manager.handle_task_timeout(timed_out_task, now())

        self.assertEqual(self.job_exe_1.status, 'FAILED')

    def test_handle_task_update(self):
        """Checks that handle_task_update() runs successfully"""

        manager = self.job_exe_mgr
        manager.schedule_job_exes([self.job_exe_1, self.job_exe_2])

        # Report the first task as running
        task = self.job_exe_1.start_next_task()
        when_running = now() - timedelta(minutes=5)
        running_update = job_test_utils.create_task_status_update(
            task.id, 'agent', TaskStatusUpdate.RUNNING, when_running)

        # The job execution has not finished yet, so None is expected
        self.assertIsNone(manager.handle_task_update(running_update))

        # Report the same task as failed one second later
        when_failed = when_running + timedelta(seconds=1)
        failed_update = job_test_utils.create_task_status_update(
            task.id, 'agent', TaskStatusUpdate.FAILED, when_failed, exit_code=1)

        # The job execution is finished now, so it is expected back
        finished_job_exe = manager.handle_task_update(failed_update)
        self.assertEqual(self.job_exe_1.id, finished_job_exe.id)

    def test_init_with_database(self):
        """Checks that init_with_database() runs successfully"""

        self.job_exe_mgr.init_with_database()

    def test_lost_node(self):
        """Checks that lost_node() runs successfully"""

        manager = self.job_exe_mgr
        manager.schedule_job_exes([self.job_exe_1, self.job_exe_2])

        # Get the first task of job_exe_1 running before its node is lost
        task = self.job_exe_1.start_next_task()
        when_running = now() - timedelta(minutes=5)
        running_update = job_test_utils.create_task_status_update(
            task.id, 'agent', TaskStatusUpdate.RUNNING, when_running)
        manager.handle_task_update(running_update)

        lost_job_exes = manager.lost_node(self.node_model_1.id, now())
        lost_job_exe = lost_job_exes[0]
        self.assertEqual(lost_job_exe.id, self.job_exe_1.id)
        self.assertEqual(lost_job_exe.status, 'FAILED')
        self.assertEqual(lost_job_exe._error.name, 'node-lost')

    def test_sync_with_database(self):
        """Checks that sync_with_database() runs successfully"""

        manager = self.job_exe_mgr
        manager.schedule_job_exes([self.job_exe_1, self.job_exe_2])

        # Get the first task of job_exe_1 running
        task = self.job_exe_1.start_next_task()
        when_running = now() - timedelta(minutes=5)
        running_update = job_test_utils.create_task_status_update(
            task.id, 'agent', TaskStatusUpdate.RUNNING, when_running)
        manager.handle_task_update(running_update)

        # Cancel job_exe_1 in the database, then let the manager sync with the database
        JobExecution.objects.update_status([self.job_exe_model_1], 'CANCELED', now())
        tasks_to_kill = manager.sync_with_database()

        self.assertEqual(self.job_exe_1.status, 'CANCELED')
        self.assertEqual(len(tasks_to_kill), 1)
        self.assertEqual(tasks_to_kill[0].id, task.id)