def test_retrying_lost_processor_jobs(self, mock_describe_jobs, mock_list_jobs, mock_send_job):
    """Lost jobs (unknown to Batch, or with a missing batch id) are requeued once each."""
    mock_send_job.side_effect = fake_send_job
    mock_list_jobs.return_value = EMPTY_LIST_JOBS_QUEUE_RESPONSE
    mock_describe_jobs.return_value = EMPTY_DESCRIBE_JOBS_QUEUE_RESPONSE

    lost_job = create_processor_job()
    lost_job.save()

    missing_job = create_processor_job()
    missing_job.batch_job_id = "MISSING"
    missing_job.save()

    processor_job_manager.retry_lost_processor_jobs()

    # Both lost jobs should have been re-dispatched.
    self.assertEqual(2, len(mock_send_job.mock_calls))

    first, second, retry_first, retry_second = ProcessorJob.objects.order_by("id")[:4]

    self.assertTrue(first.retried)
    self.assertEqual(0, first.num_retries)
    self.assertFalse(first.success)

    self.assertTrue(second.retried)
    self.assertEqual(0, second.num_retries)
    self.assertFalse(second.success)

    self.assertEqual(1, retry_first.num_retries)
    self.assertEqual(1, retry_second.num_retries)
def test_repeated_processor_failures(self, mock_list_jobs, mock_send_job):
    """Jobs will be repeatedly retried."""
    mock_send_job.side_effect = fake_send_job
    mock_list_jobs.return_value = EMPTY_LIST_JOBS_QUEUE_RESPONSE

    job = create_processor_job()

    for attempt in range(utils.MAX_NUM_RETRIES):
        processor_job_manager.handle_processor_jobs([job])
        self.assertEqual(attempt + 1, len(mock_send_job.mock_calls))

        newest_first = ProcessorJob.objects.all().order_by("-id")

        # The job we just handled should be marked retried...
        retired_job = newest_first[1]
        self.assertTrue(retired_job.retried)
        self.assertEqual(retired_job.num_retries, attempt)
        self.assertFalse(retired_job.success)

        # ...and its replacement carries an incremented retry count.
        job = newest_first[0]
        self.assertFalse(job.retried)
        self.assertEqual(job.num_retries, attempt + 1)

    # Once MAX_NUM_RETRIES has been hit handle_repeated_failure
    # should be called.
    processor_job_manager.handle_processor_jobs([job])

    final_job = ProcessorJob.objects.all().order_by("-id")[0]
    self.assertTrue(final_job.retried)
    self.assertEqual(final_job.num_retries, utils.MAX_NUM_RETRIES)
    self.assertFalse(final_job.success)
def test_not_retrying_lost_processor_jobs(self, mock_describe_jobs, mock_list_jobs, mock_send_job):
    """Make sure that we don't retry processor jobs we shouldn't."""
    mock_send_job.side_effect = fake_send_job
    mock_list_jobs.return_value = EMPTY_LIST_JOBS_QUEUE_RESPONSE
    # Batch still knows about this job (it's RUNNABLE), so it isn't lost.
    mock_describe_jobs.return_value = {"jobs": [{"jobId": "FINDME", "status": "RUNNABLE"}]}

    tracked_job = create_processor_job()
    tracked_job.batch_job_id = "FINDME"
    tracked_job.save()

    processor_job_manager.retry_lost_processor_jobs()

    self.assertEqual(0, len(mock_send_job.mock_calls))

    remaining_jobs = ProcessorJob.objects.order_by("id")
    untouched_job = remaining_jobs[0]
    self.assertFalse(untouched_job.retried)
    self.assertEqual(untouched_job.num_retries, 0)
    self.assertEqual(untouched_job.success, None)

    # Make sure no additional job was created.
    self.assertEqual(remaining_jobs.count(), 1)
def test_retrying_lost_smasher_jobs(self, mock_describe_jobs, mock_list_jobs, mock_send_job):
    """Make sure that the smasher jobs will get retried even though
    they don't have a volume_index.

    I'm not entirely sure this test is still necessary but we'll
    need a separate smasher compute environment so this could test
    that once it's done.
    """
    mock_send_job.side_effect = fake_send_job
    mock_list_jobs.return_value = EMPTY_LIST_JOBS_QUEUE_RESPONSE
    mock_describe_jobs.return_value = EMPTY_DESCRIBE_JOBS_QUEUE_RESPONSE

    smasher_job = create_processor_job(pipeline="SMASHER")
    smasher_job.volume_index = None  # Smasher jobs won't have a volume_index.
    smasher_job.save()

    processor_job_manager.retry_lost_processor_jobs()

    self.assertEqual(1, len(mock_send_job.mock_calls))

    lost_job, requeued_job = ProcessorJob.objects.order_by("id")[:2]

    self.assertTrue(lost_job.retried)
    self.assertEqual(0, lost_job.num_retries)
    self.assertFalse(lost_job.success)

    self.assertEqual(1, requeued_job.num_retries)
def test_not_retrying_hung_processor_jobs(self, mock_describe_jobs, mock_list_jobs, mock_send_job):
    """Tests that we don't restart processor jobs that are still running."""
    mock_send_job.side_effect = fake_send_job
    mock_list_jobs.return_value = EMPTY_LIST_JOBS_QUEUE_RESPONSE
    # Batch reports the job as actively RUNNING, so it is not hung.
    mock_describe_jobs.return_value = {"jobs": [{"jobId": "FINDME", "status": "RUNNING"}]}

    running_job = create_processor_job()
    running_job.start_time = timezone.now()
    running_job.batch_job_id = "FINDME"
    running_job.save()

    processor_job_manager.retry_hung_processor_jobs()

    self.assertEqual(0, len(mock_send_job.mock_calls))

    queryset = ProcessorJob.objects.order_by("id")
    only_job = queryset[0]
    self.assertFalse(only_job.retried)
    self.assertEqual(only_job.num_retries, 0)
    self.assertEqual(only_job.success, None)

    self.assertEqual(queryset.count(), 1)
def test_requeuing_compendia_job_no_batch_job_queue(self, mock_send_job):
    """A compendia job with no batch_job_queue gets one assigned on requeue.

    The requeue runs as if in the cloud (env var + Django setting) because
    queue assignment only happens for cloud deployments.
    """
    mock_send_job.side_effect = fake_send_job

    job = create_processor_job()
    job.batch_job_queue = None
    job.pipeline_applied = "CREATE_COMPENDIA"
    job.save()

    self.env = EnvironmentVarGuard()
    # Fix: the variable name was misspelled "RUNING_IN_CLOUD", so the
    # intended env var was never actually set. Also enter the guard as a
    # context manager so os.environ is restored after the test instead of
    # leaking the change into other tests.
    self.env.set("RUNNING_IN_CLOUD", "True")
    with self.env, self.settings(RUNNING_IN_CLOUD=True):
        job_requeuing.requeue_processor_job(job)

    self.assertEqual(len(mock_send_job.mock_calls), 1)

    jobs = ProcessorJob.objects.order_by("id")
    original_job = jobs[0]
    self.assertTrue(original_job.retried)
    self.assertEqual(original_job.num_retries, 0)
    self.assertFalse(original_job.success)

    retried_job = jobs[1]
    self.assertEqual(retried_job.num_retries, 1)
    # The requeued job must have been assigned a real queue.
    self.assertIsNotNone(retried_job.batch_job_queue)
def test_retrying_hung_processor_jobs(self, mock_describe_jobs, mock_list_jobs, mock_send_job):
    """Started jobs whose Batch counterpart failed or vanished get requeued."""
    mock_send_job.side_effect = fake_send_job
    mock_list_jobs.return_value = EMPTY_LIST_JOBS_QUEUE_RESPONSE
    # One job's Batch counterpart has FAILED; the other's cannot be found.
    mock_describe_jobs.return_value = {"jobs": [{"jobId": "FINDME", "status": "FAILED"}]}

    failed_job = create_processor_job()
    failed_job.start_time = timezone.now()
    failed_job.batch_job_id = "FINDME"
    failed_job.save()

    vanished_job = create_processor_job()
    vanished_job.start_time = timezone.now()
    vanished_job.batch_job_id = "MISSING"
    vanished_job.save()

    processor_job_manager.retry_hung_processor_jobs()

    self.assertEqual(2, len(mock_send_job.mock_calls))

    hung_one, hung_two, requeued_one, requeued_two = ProcessorJob.objects.order_by("id")[:4]

    self.assertTrue(hung_one.retried)
    self.assertEqual(0, hung_one.num_retries)
    self.assertFalse(hung_one.success)

    self.assertTrue(hung_two.retried)
    self.assertEqual(0, hung_two.num_retries)
    self.assertFalse(hung_two.success)

    self.assertEqual(1, requeued_one.num_retries)
    self.assertEqual(1, requeued_two.num_retries)
def test_not_retrying_janitor_jobs(self, mock_describe_jobs, mock_list_jobs, mock_send_job):
    """Janitor pipeline jobs are excluded from lost-job retries."""
    mock_send_job.side_effect = fake_send_job
    mock_list_jobs.return_value = EMPTY_LIST_JOBS_QUEUE_RESPONSE
    mock_describe_jobs.return_value = EMPTY_DESCRIBE_JOBS_QUEUE_RESPONSE

    janitor_job = create_processor_job(pipeline="JANITOR")
    janitor_job.save()

    processor_job_manager.retry_lost_processor_jobs()

    # Nothing dispatched and no retry job created.
    self.assertEqual(0, len(mock_send_job.mock_calls))
    remaining_jobs = ProcessorJob.objects.order_by("id")
    self.assertEqual(1, len(remaining_jobs))
def test_not_retrying_old_processor_jobs(self, mock_describe_jobs, mock_list_jobs, mock_send_job):
    """Makes sure temporary logic to limit the Foreman's scope works."""
    mock_send_job.side_effect = fake_send_job
    mock_list_jobs.return_value = EMPTY_LIST_JOBS_QUEUE_RESPONSE
    mock_describe_jobs.return_value = EMPTY_DESCRIBE_JOBS_QUEUE_RESPONSE

    # A job created before the cutoff should be ignored entirely.
    stale_job = create_processor_job()
    stale_job.created_at = DAY_BEFORE_JOB_CUTOFF
    stale_job.save()

    processor_job_manager.retry_lost_processor_jobs()

    self.assertEqual(0, len(mock_send_job.mock_calls))
    self.assertEqual(1, ProcessorJob.objects.all().count())
def test_requeuing_processor_job(self, mock_send_job):
    """Requeuing marks the old job retried and creates a copy with num_retries bumped."""
    mock_send_job.side_effect = fake_send_job

    job = create_processor_job()
    job_requeuing.requeue_processor_job(job)

    self.assertEqual(1, len(mock_send_job.mock_calls))

    old_job, new_job = ProcessorJob.objects.order_by("id")[:2]

    self.assertTrue(old_job.retried)
    self.assertEqual(0, old_job.num_retries)
    self.assertFalse(old_job.success)

    self.assertEqual(1, new_job.num_retries)
def test_requeuing_processor_job_w_more_ram(self, mock_send_job):
    """A started SALMON job is requeued with double the original RAM."""
    mock_send_job.side_effect = fake_send_job

    started_job = create_processor_job(
        pipeline="SALMON", ram_amount=16384, start_time=timezone.now()
    )
    job_requeuing.requeue_processor_job(started_job)

    self.assertEqual(1, len(mock_send_job.mock_calls))

    old_job, new_job = ProcessorJob.objects.order_by("id")[:2]

    self.assertTrue(old_job.retried)
    self.assertEqual(0, old_job.num_retries)
    self.assertFalse(old_job.success)

    self.assertEqual(1, new_job.num_retries)

    # RAM should have been escalated for the retry.
    self.assertEqual(16384, old_job.ram_amount)
    self.assertEqual(32768, new_job.ram_amount)
def test_retrying_failed_processor_jobs(self, mock_list_jobs, mock_send_job):
    """A job marked success=False is picked up and requeued once."""
    mock_send_job.side_effect = fake_send_job
    mock_list_jobs.return_value = EMPTY_LIST_JOBS_QUEUE_RESPONSE

    failed_job = create_processor_job()
    failed_job.success = False
    failed_job.save()

    processor_job_manager.retry_failed_processor_jobs()

    self.assertEqual(1, len(mock_send_job.mock_calls))

    old_job, new_job = ProcessorJob.objects.order_by("id")[:2]

    self.assertTrue(old_job.retried)
    self.assertEqual(0, old_job.num_retries)
    self.assertFalse(old_job.success)

    self.assertEqual(1, new_job.num_retries)