def test_get_instance_id_local(self):
    """Verify that get_instance_id() returns "local" outside the cloud.

    Also verifies that the value is cached in the module-level global:
    once resolved, flipping RUNNING_IN_CLOUD no longer changes it.
    """
    # Reset the module-level cache in case the order the tests are
    # run in ever changes.
    utils.INSTANCE_ID = None

    with self.settings(RUNNING_IN_CLOUD=False):
        self.assertEqual("local", utils.get_instance_id())

    # The cached global value should now win even though settings
    # would tell it otherwise.
    with self.settings(RUNNING_IN_CLOUD=True):
        self.assertEqual("local", utils.get_instance_id())
def test_get_instance_id_cloud(self, mock_get):
    """Verify a request is made for the instance id and the result cached."""
    # Reset the module-level cache in case the order the tests are
    # run in ever changes.
    utils.INSTANCE_ID = None

    fake_response = Mock(ok=True)
    fake_response.text = "instance_id"
    mock_get.return_value = fake_response

    with self.settings(RUNNING_IN_CLOUD=True):
        self.assertEqual("instance_id", utils.get_instance_id())

    # A second call must be served from the now-set global value:
    # after resetting the mock, no new HTTP request should be made.
    mock_get.reset_mock()
    utils.get_instance_id()
    mock_get.assert_not_called()
def start_job(job_id: int) -> DownloaderJob:
    """Record in the database that this job is being started.

    Retrieves the job from the database and returns it after marking
    it as started.
    """
    logger.debug("Starting Downloader Job.", downloader_job=job_id)

    try:
        job = DownloaderJob.objects.get(id=job_id)
    except DownloaderJob.DoesNotExist:
        logger.error("Cannot find downloader job record.", downloader_job=job_id)
        raise

    worker_id = get_instance_id()

    # A non-null start_time means some worker already claimed this job;
    # refuse to start it a second time.
    if job.start_time is not None:
        logger.error("This downloader job has already been started!!!", downloader_job=job.id)
        raise Exception("downloaders.start_job called on a job that has already been started!")

    # Set up the SIGTERM handler so we can appropriately handle being interrupted.
    # (`docker stop` uses SIGTERM, not SIGINT, but better to catch both.)
    for interrupt_signal in (signal.SIGTERM, signal.SIGINT):
        signal.signal(interrupt_signal, signal_handler)

    job.worker_id = worker_id
    job.worker_version = SYSTEM_VERSION
    job.start_time = timezone.now()
    job.save()

    if not any(original_file.needs_downloading() for original_file in job.original_files.all()):
        logger.error(
            "No files associated with this job need to be downloaded! Aborting!",
            job_id=job.id,
        )
        job.start_time = timezone.now()
        job.failure_reason = "Was told to redownload file(s) that are already downloaded!"
        job.success = False
        job.no_retry = True
        job.end_time = timezone.now()
        job.save()
        sys.exit(0)

    global CURRENT_JOB
    CURRENT_JOB = job

    return job
def test_dharma(self):
    """start_job should harakiri (sys.exit) only when the node is at capacity.

    Two already-started downloader jobs exist for this worker, so a cap
    of 2 must trigger the death window while a cap of 15 must not.
    """
    # Two jobs already running on this worker.
    dlj1 = DownloaderJob()
    dlj1.accession_code = 'D1'
    dlj1.worker_id = get_instance_id()
    dlj1.start_time = datetime.datetime.now()
    dlj1.save()

    dlj2 = DownloaderJob()
    dlj2.accession_code = 'D2'
    dlj2.worker_id = get_instance_id()
    dlj2.start_time = datetime.datetime.now()
    dlj2.save()

    # The job we will actually try to start (no start_time yet).
    dlj3 = DownloaderJob()
    dlj3.accession_code = 'D3'
    dlj3.worker_id = get_instance_id()
    dlj3.save()

    original_file = OriginalFile()
    original_file.source_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.1.zip"
    original_file.source_filename = "Waldhof_020604_R30_01-2753_U133A.CEL"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj3
    assoc.save()

    sample = Sample()
    sample.accession_code = 'Blahblahblah'
    sample.technology = "MICROARRAY"
    sample.manufacturer = "AFFYMETRIX"
    sample.has_raw = True
    sample.platform_accession_code = "hgu133a"
    sample.save()

    OriginalFileSampleAssociation.objects.get_or_create(
        sample=sample, original_file=original_file)

    # At a cap of 2 the node is full, so starting the job must exit.
    with self.assertRaises(SystemExit):
        utils.start_job(dlj3.id, max_downloader_jobs_per_node=2, force_harakiri=True)

    # At a cap of 15 there is room, so no harakiri may occur.
    try:
        utils.start_job(dlj3.id, max_downloader_jobs_per_node=15, force_harakiri=True)
    except SystemExit:
        self.fail("start_job exited even though the node was under capacity!")
    except Exception:
        # Non-exit failures were tolerated by this test historically;
        # we only care that no harakiri occurred.
        pass
def start_job(job_id: int,
              max_downloader_jobs_per_node=MAX_DOWNLOADER_JOBS_PER_NODE,
              force_harakiri=False) -> DownloaderJob:
    """Record in the database that this job is being started.

    Retrieves the job from the database and returns it after marking it
    as started.

    Parameters:
        job_id: primary key of the DownloaderJob to start.
        max_downloader_jobs_per_node: per-node capacity used by the
            (currently disabled) harakiri check below.
        force_harakiri: when True, apply the harakiri check even when
            not running in the cloud (used by tests).
    """
    logger.debug("Starting Downloader Job.", downloader_job=job_id)

    try:
        job = DownloaderJob.objects.get(id=job_id)
    except DownloaderJob.DoesNotExist:
        logger.error("Cannot find downloader job record.", downloader_job=job_id)
        raise

    worker_id = get_instance_id()

    # NOTE(review): this count feeds only the disabled harakiri block
    # below; it is kept so that re-enabling the block is a pure uncomment.
    num_downloader_jobs_currently_running = DownloaderJob.objects.filter(
        worker_id=worker_id,
        start_time__isnull=False,
        end_time__isnull=True,
        success__isnull=True,
        retried=False
    ).count()

    # Death and rebirth.
    # if settings.RUNNING_IN_CLOUD or force_harakiri:
    #     if num_downloader_jobs_currently_running >= int(max_downloader_jobs_per_node):
    #         # Wait for the death window
    #         while True:
    #             seconds = datetime.datetime.now().second
    #             # Mass harakiri happens every 15 seconds.
    #             if seconds % 15 == 0:
    #                 job.start_time = None
    #                 job.num_retries = job.num_retries - 1
    #                 job.failure_reason = "Killed by harakiri"
    #                 job.success = False
    #                 job.save()
    #                 # What is dead may never die!
    #                 sys.exit(0)

    # A non-null start_time means some worker already claimed this job.
    if job.start_time is not None:
        logger.error("This downloader job has already been started!!!", downloader_job=job.id)
        raise Exception("downloaders.start_job called on a job that has already been started!")

    # Set up the SIGTERM handler so we can appropriately handle being interrupted.
    # (`docker stop` uses SIGTERM, not SIGINT.)
    # (however, Nomad sends an SIGINT so catch both.)
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    job.worker_id = worker_id
    job.worker_version = SYSTEM_VERSION
    job.start_time = timezone.now()
    job.save()

    # Short-circuiting any() replaces the old flag-setting loop that
    # scanned every file even after finding one needing download.
    needs_downloading = any(
        original_file.needs_downloading() for original_file in job.original_files.all()
    )

    if not needs_downloading:
        logger.error(
            "No files associated with this job need to be downloaded! Aborting!",
            job_id=job.id,
        )
        job.start_time = timezone.now()
        job.failure_reason = "Was told to redownload file(s) that are already downloaded!"
        job.success = False
        job.no_retry = True
        job.end_time = timezone.now()
        job.save()
        sys.exit(0)

    global CURRENT_JOB
    CURRENT_JOB = job

    return job
import logging
import sys

import daiquiri
from django.conf import settings

from data_refinery_common.utils import get_env_variable_gracefully, get_instance_id

# Most of the formatting in this string is consumed by the logging
# system itself; the only substitution done here is interpolating the
# worker id in place of "{0}".
FORMAT_STRING = (
    "%(asctime)s {0} %(name)s %(color)s%(levelname)s%(extras)s"
    ": %(message)s%(color_stop)s"
).format(get_instance_id())

# Lazily-resolved log level; stays None until configured elsewhere.
LOG_LEVEL = None


def unconfigure_root_logger():
    """Prevent the root logger from duplicating our messages.

    The root logger comes preconfigured with a handler, which causes
    every record to be logged twice: once through our handler with full
    context and once through a bare one without any. Strip all handlers
    from the root logger so only ours remains.
    """
    root_logger = logging.getLogger(None)
    # Pop handlers one at a time; removeHandler mutates the handler
    # list, so don't iterate it directly.
    while root_logger.handlers:
        root_logger.removeHandler(root_logger.handlers[0])
def start_job(job_context: Dict):
    """A processor function to start jobs.

    Records in the database that this job is being started, retrieves
    the job's batches from the database, and adds them to the dictionary
    passed in with the key 'batches'.
    """
    job = job_context["job"]

    def _abort_with(reason):
        # Common tail for every early-abort path: clear the file lists,
        # flag the context so downstream processors skip, and record why.
        job_context["original_files"] = []
        job_context["computed_files"] = []
        job_context["abort"] = True
        # Will be saved by end_job.
        job_context["job"].failure_reason = reason
        return job_context

    # A non-null start_time means this job was already claimed once.
    if job.start_time is not None and settings.RUNNING_IN_CLOUD:
        if job.success:
            failure_reason = "ProcessorJob has already completed succesfully - why are we here again? Bad Nomad!"
            logger.error(failure_reason, job_id=job.id)
            return _abort_with(failure_reason)

        if job.success == False:
            failure_reason = "ProcessorJob has already completed with a fail - why are we here again? Bad Nomad!"
            logger.error(failure_reason, job_id=job.id)
            return _abort_with(failure_reason)

        logger.error("This processor job has already been started!!!", processor_job=job.id)
        raise Exception("processors.start_job called on job %s that has already been started!" % str(job.id))

    original_file = job.original_files.first()
    if original_file and not original_file.needs_processing(job_context["job_id"]):
        failure_reason = ("Sample has a good computed file, it must have been processed, "
                          "so it doesn't need to be downloaded! Aborting!")
        logger.error(failure_reason, job_id=job.id, original_file=original_file)
        return _abort_with(failure_reason)

    # Set up the SIGTERM handler so we can appropriately handle being interrupted.
    # (`docker stop` uses SIGTERM, not SIGINT.)
    # (however, Nomad sends an SIGINT so catch both.)
    for interrupt_signal in (signal.SIGTERM, signal.SIGINT):
        signal.signal(interrupt_signal, signal_handler)

    job.worker_id = get_instance_id()
    job.worker_version = SYSTEM_VERSION
    job.start_time = timezone.now()
    job.save()

    global CURRENT_JOB
    CURRENT_JOB = job

    logger.debug("Starting processor Job.", processor_job=job.id, pipeline=job.pipeline_applied)

    # Janitor jobs don't operate on file objects.
    # Tximport jobs don't need to download the original file, they
    # just need it to know what experiment to process.
    if job.pipeline_applied in ["JANITOR", "TXIMPORT"]:
        # Just in case
        job_context["original_files"] = []
        job_context["computed_files"] = []
        return job_context

    # Some jobs take OriginalFiles, other take Datasets.
    if job.pipeline_applied in ["SMASHER", "QN_REFERENCE", "COMPENDIA"]:
        job_context = prepare_dataset(job_context)
    else:
        job_context = prepare_original_files(job_context)

    return job_context
def start_job(job_context: Dict):
    """A processor function to start jobs.

    Records in the database that this job is being started, retrieves
    the job's batches from the database, and adds them to the dictionary
    passed in with the key 'batches'.
    """
    job = job_context["job"]

    original_file = job.original_files.first()
    is_tximport = job.pipeline_applied == ProcessorPipeline.TXIMPORT.value
    if (
        not is_tximport
        and original_file
        and not original_file.needs_processing(job_context["job_id"])
    ):
        failure_reason = (
            "Sample has a good computed file, it must have been processed, "
            "so it doesn't need to be downloaded! Aborting!"
        )
        logger.error(failure_reason, job_id=job.id, original_file=original_file)
        job_context["original_files"] = []
        job_context["computed_files"] = []
        job_context["abort"] = True
        # Will be saved by end_job.
        job_context["job"].failure_reason = failure_reason
        return job_context

    # Set up the SIGTERM handler so we can appropriately handle being interrupted.
    # (`docker stop` uses SIGTERM, not SIGINT.)
    # (however, Nomad sends an SIGINT so catch both.)
    for interrupt_signal in (signal.SIGTERM, signal.SIGINT):
        signal.signal(interrupt_signal, signal_handler)

    # This job should not have been started, but for some reason Nomad
    # restarts some of our jobs:
    # https://github.com/AlexsLemonade/refinebio/issues/1487
    if job.start_time is not None and settings.RUNNING_IN_CLOUD:
        # Just log the event and let the job run instead of failing,
        # resetting end_time and failure_reason since those fields
        # might have been set by the earlier run.
        logger.warn(
            "ProcessorJob was restarted by Nomad. We do not know why this happened",
            processor_job=job.id,
            success=job.success,
            failure_reason=job.failure_reason,
            start_time=job.start_time,
            end_time=job.end_time,
        )
        job.end_time = None
        job.failure_reason = None

    job.worker_id = get_instance_id()
    job.worker_version = SYSTEM_VERSION
    job.start_time = timezone.now()
    job.save()

    global CURRENT_JOB
    CURRENT_JOB = job

    logger.debug("Starting processor Job.", processor_job=job.id, pipeline=job.pipeline_applied)

    # Janitor jobs don't operate on file objects.
    # Tximport jobs don't need to download the original file, they
    # just need it to know what experiment to process.
    if job.pipeline_applied in [
        ProcessorPipeline.JANITOR.value,
        ProcessorPipeline.TXIMPORT.value,
    ]:
        # Just in case
        job_context["original_files"] = []
        job_context["computed_files"] = []
        return job_context

    # Some jobs take OriginalFiles, other take Datasets.
    if ProcessorPipeline[job.pipeline_applied] in SMASHER_JOB_TYPES:
        job_context = prepare_dataset(job_context)
    else:
        job_context = prepare_original_files(job_context)

    return job_context
def start_job(job_context: Dict):
    """A processor function to start jobs.

    Records in the database that this job is being started, retrieves
    the job's batches from the database, and adds them to the dictionary
    passed in with the key 'batches'.
    """
    job = job_context["job"]

    def _abort():
        # Common tail for the early-abort paths: clear the file lists
        # and flag the context so downstream processors skip.
        job_context["original_files"] = []
        job_context["computed_files"] = []
        job_context['abort'] = True
        return job_context

    # A non-null start_time means this job was already claimed once.
    if job.start_time is not None and settings.RUNNING_IN_CLOUD:
        if job.success:
            logger.error(
                "ProcessorJob has already completed succesfully - why are we here again? Bad Nomad!",
                job_id=job.id)
            return _abort()

        if job.success == False:
            logger.error(
                "ProcessorJob has already completed with a fail - why are we here again? Bad Nomad!",
                job_id=job.id)
            return _abort()

        logger.error("This processor job has already been started!!!", processor_job=job.id)
        raise Exception(
            "processors.start_job called on job %s that has already been started!" % str(job.id))

    # Set up the SIGTERM handler so we can appropriately handle being interrupted.
    # (`docker stop` uses SIGTERM, not SIGINT.)
    # (however, Nomad sends an SIGINT so catch both.)
    for interrupt_signal in (signal.SIGTERM, signal.SIGINT):
        signal.signal(interrupt_signal, signal_handler)

    job.worker_id = get_instance_id()
    job.worker_version = SYSTEM_VERSION
    job.start_time = timezone.now()
    job.save()

    global CURRENT_JOB
    CURRENT_JOB = job

    logger.debug("Starting processor Job.", processor_job=job.id, pipeline=job.pipeline_applied)

    # Janitors have no requirement
    if job.pipeline_applied in ["JANITOR"]:
        # Just in case
        job_context["original_files"] = []
        job_context["computed_files"] = []
        return job_context

    # Some jobs take OriginalFiles, other take Datasets
    if job.pipeline_applied in ["SMASHER", "QN_REFERENCE", "COMPENDIA"]:
        job_context = prepare_dataset(job_context)
    else:
        job_context = prepare_original_files(job_context)

    return job_context