def _download_file_ftp(download_url: str, downloader_job: DownloaderJob, target_file_path: str) -> bool:
    """ Download a file to a location using FTP via urllib. """
    try:
        logger.debug("Downloading file from %s to %s via FTP.",
                     download_url,
                     target_file_path,
                     downloader_job=downloader_job.id)

        # Ancient unresolved bug. WTF python: https://bugs.python.org/issue27973
        urllib.request.urlcleanup()

        with closing(urllib.request.urlopen(download_url)) as request:
            with open(target_file_path, "wb") as target_file:
                shutil.copyfileobj(request, target_file, CHUNK_SIZE)

        urllib.request.urlcleanup()
    except Exception:
        logger.exception("Exception caught while downloading file from the URL via FTP: %s",
                         download_url,
                         downloader_job=downloader_job.id)
        downloader_job.failure_reason = ("Exception caught while downloading "
                                         "file from the URL via FTP: {}").format(download_url)
        return False

    return True

def _extract_files(file_path: str, accession_code: str, job: DownloaderJob) -> List[str]:
    """Extract zip and return a list of the raw files."""
    logger.debug("Extracting %s!", file_path, file_path=file_path, downloader_job=job.id)

    abs_with_code_raw = LOCAL_ROOT_DIR + "/" + accession_code + "/raw/"

    try:
        # This is technically an unsafe operation.
        # However, we're trusting AE as a data source.
        with zipfile.ZipFile(file_path, "r") as zip_ref:
            zip_ref.extractall(abs_with_code_raw)
            # Other zips for this same accession will go into this
            # directory too, so look at what's in the zip file rather than
            # what's in the directory it's being extracted to.
            files_in_zip = zip_ref.namelist()

        return [{
            "absolute_path": abs_with_code_raw + f,
            "filename": f
        } for f in files_in_zip]
    except Exception as e:
        reason = "Exception {} caught while extracting {}".format(str(e), str(file_path))
        logger.exception(reason, downloader_job=job.id)
        job.failure_reason = reason
        raise

def _download_file(download_url: str, file_path: str, job: DownloaderJob) -> DownloaderJob:
    """Download the file via FTP.

    I spoke to Erin from Ensembl about ways to improve this. They're
    looking into it, but have decided against adding an Aspera endpoint.

    She suggested using `rsync`, we could try shelling out to that.
    """
    try:
        logger.debug("Downloading file from %s to %s.",
                     download_url,
                     file_path,
                     downloader_job=job.id)

        urllib.request.urlcleanup()
        target_file = open(file_path, "wb")
        with closing(urllib.request.urlopen(download_url)) as request:
            shutil.copyfileobj(request, target_file, CHUNK_SIZE)

        # Ancient unresolved bug. WTF python: https://bugs.python.org/issue27973
        urllib.request.urlcleanup()
    except Exception:
        failure_template = "Exception caught while downloading file from: %s"
        logger.exception(failure_template, download_url, downloader_job=job.id)
        job.failure_reason = failure_template % download_url
        job.success = False
        return job
    finally:
        target_file.close()

    job.success = True
    return job

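# A minimal sketch (not part of the original module) of what shelling out to `rsync`
# could look like, as suggested in the docstring above. The flags and the idea of an
# rsync-reachable URL are assumptions, not a confirmed Ensembl endpoint or the
# project's actual approach.
import subprocess

def _download_file_rsync_sketch(download_url: str, file_path: str) -> bool:
    """Hypothetical rsync-based download; returns True on success."""
    # --partial lets rsync resume interrupted transfers; --times preserves mtimes.
    command = ["rsync", "--partial", "--times", download_url, file_path]
    completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return completed.returncode == 0
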
def _extract_files(file_path: str, accession_code: str, job: DownloaderJob) -> List[str]:
    """Extract zip and return a list of the raw files."""
    logger.debug("Extracting %s!", file_path, file_path=file_path, downloader_job=job.id)

    try:
        # This is technically an unsafe operation.
        # However, we're trusting AE as a data source.
        zip_ref = zipfile.ZipFile(file_path, "r")
        abs_with_code_raw = LOCAL_ROOT_DIR + '/' + accession_code + '/raw/'
        zip_ref.extractall(abs_with_code_raw)
        # Other zips for this same accession will go into this
        # directory too, so look at what's in the zip file rather than
        # what's in the directory it's being extracted to.
        files_in_zip = zip_ref.namelist()
        zip_ref.close()

        # os.path.abspath doesn't do what I thought it does, hence this monstrosity.
        files = [{'absolute_path': abs_with_code_raw + f, 'filename': f} for f in files_in_zip]
    except Exception as e:
        reason = "Exception {} caught while extracting {}".format(str(e), str(file_path))
        logger.exception(reason, downloader_job=job.id)
        job.failure_reason = reason
        raise

    os.remove(file_path)

    return files

def _download_file(download_url: str, file_path: str, job: DownloaderJob, force_ftp=False) -> bool:
    """ Download a file from GEO via Aspera unless `force_ftp` is True """

    # Ensure directory exists
    os.makedirs(file_path.rsplit("/", 1)[0], exist_ok=True)

    if not force_ftp:
        return _download_file_aspera(
            download_url=download_url, downloader_job=job, target_file_path=file_path
        )
    else:
        try:
            logger.debug(
                "Downloading file from %s to %s.", download_url, file_path, downloader_job=job.id
            )

            # Ancient unresolved bug. WTF python: https://bugs.python.org/issue27973
            urllib.request.urlcleanup()

            target_file = open(file_path, "wb")
            with closing(urllib.request.urlopen(download_url)) as request:
                shutil.copyfileobj(request, target_file, CHUNK_SIZE)

            urllib.request.urlcleanup()
        except Exception:
            logger.exception("Exception caught while downloading file.", downloader_job=job.id)
            job.failure_reason = "Exception caught while downloading file"
            raise
        finally:
            target_file.close()

        return True

def queue_downloader_job_for_original_files(
        self,
        original_files: List[OriginalFile],
        experiment_accession_code: str = None,
        is_transcriptome: bool = False):
    """Creates a single DownloaderJob with multiple files to download."""
    source_urls = [original_file.source_url for original_file in original_files]

    # Check whether there is already a downloader job associated with these files.
    old_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
        original_file__source_url__in=source_urls)
    if len(old_assocs) > 0:
        logger.debug("We found an existing DownloaderJob for these urls.",
                     source_urls=source_urls)
        return False

    # Transcriptome is a special case because there's no sample_object.
    if is_transcriptome:
        downloader_task = job_lookup.Downloaders.TRANSCRIPTOME_INDEX
    else:
        sample_object = original_files[0].samples.first()
        downloader_task = job_lookup.determine_downloader_task(sample_object)

    if downloader_task == job_lookup.Downloaders.NONE:
        logger.info("No valid downloader task found for sample.",
                    sample=sample_object.id,
                    original_file=original_files[0].id)
    else:
        downloader_job = DownloaderJob()
        downloader_job.downloader_task = downloader_task.value
        downloader_job.accession_code = experiment_accession_code
        downloader_job.save()

        downloaded_urls = []
        for original_file in original_files:
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=downloader_job, original_file=original_file)
            downloaded_urls.append(original_file.source_url)

        try:
            logger.info("Queuing downloader job.",
                        survey_job=self.survey_job.id,
                        downloader_job=downloader_job.id,
                        downloaded_urls=downloaded_urls)
            message_queue.send_job(downloader_task, downloader_job)
        except Exception as e:
            # If the task doesn't get sent we don't want the
            # downloader_job to be left floating
            logger.exception("Failed to enqueue downloader job.",
                             survey_job=self.survey_job.id,
                             downloader_job=downloader_job.id,
                             error=str(e))
            downloader_job.success = False
            downloader_job.failure_reason = str(e)
            downloader_job.save()

def _verify_batch_grouping(files: List[File], job: DownloaderJob) -> None:
    """All batches in the same job should have the same downloader url."""
    for file in files:
        if file.download_url != files[0].download_url:
            failure_message = ("A Batch's file doesn't have the same download "
                               "URL as the other batches' files.")
            logger.error(failure_message, downloader_job=job.id)
            job.failure_reason = failure_message
            raise ValueError(failure_message)

def _verify_files(file1: File, file2: File, job: DownloaderJob) -> None:
    """Verifies that the two files are the same.

    This is useful for this downloader because each job has two
    batches which should each have the same two files.
    """
    if file1.download_url != file2.download_url:
        failure_message = ("A Batch's file doesn't have the same download "
                           "URL as the other batch's file.")
        logger.error(failure_message, downloader_job=job.id)
        job.failure_reason = failure_message
        raise ValueError(failure_message)

def _upload_files(job_dir: str, files: List[File], job: DownloaderJob) -> None:
    try:
        for file in files:
            file.size_in_bytes = os.path.getsize(file.get_temp_pre_path(job_dir))
            file.save()
            file.upload_raw_file(job_dir)
    except Exception:
        logger.exception("Exception caught while uploading file.",
                         downloader_job=job.id,
                         batch=file.batch.id)
        job.failure_reason = "Exception caught while uploading file."
        raise
    finally:
        file.remove_temp_directory(job_dir)

def _extract_file(files: List[File], job: DownloaderJob) -> None:
    """Extract zip from temp directory and move to raw directory.

    Additionally this function sets the size_in_bytes field of each
    Batch in batches. To save database calls it does not save the
    batch itself since it will be saved soon when its status changes
    in utils.end_job.
    """
    # zip_path and local_dir should be common to all batches in the group
    job_dir = utils.JOB_DIR_PREFIX + str(job.id)
    zip_path = files[0].get_temp_download_path(job_dir)
    local_dir = files[0].get_temp_dir(job_dir)
    dirs_to_clean = set()

    logger.debug("Extracting %s", zip_path, downloader_job=job.id)

    try:
        zip_ref = zipfile.ZipFile(zip_path, "r")
        zip_ref.extractall(local_dir)

        for file in files:
            batch_directory = file.get_temp_dir(job_dir)
            raw_file_location = file.get_temp_pre_path(job_dir)

            # The platform is part of the batch's location so if the
            # batches in this job have different platforms then some
            # of them need to be moved to the directory corresponding
            # to their platform.
            if local_dir != batch_directory:
                os.makedirs(batch_directory, exist_ok=True)
                dirs_to_clean.add(batch_directory)
                incorrect_location = os.path.join(local_dir, file.name)
                os.rename(incorrect_location, raw_file_location)

            file.size_in_bytes = os.path.getsize(raw_file_location)
            file.save()
            file.upload_raw_file(job_dir)
    except Exception:
        logger.exception("Exception caught while extracting %s", zip_path, downloader_job=job.id)
        job.failure_reason = "Exception caught while extracting " + zip_path
        raise
    finally:
        zip_ref.close()
        file.remove_temp_directory(job_dir)
        for directory in dirs_to_clean:
            shutil.rmtree(directory)

def _download_file(download_url: str, file_path: str, job: DownloaderJob) -> None:
    try:
        logger.debug("Downloading file from %s to %s.",
                     download_url,
                     file_path,
                     downloader_job=job.id)
        target_file = open(file_path, "wb")
        with closing(urllib.request.urlopen(download_url)) as request:
            shutil.copyfileobj(request, target_file, CHUNK_SIZE)
    except Exception:
        logger.exception("Exception caught while downloading batch.", downloader_job=job.id)
        job.failure_reason = "Exception caught while downloading batch"
        raise
    finally:
        target_file.close()

def _download_file(download_url: str, file_path: str, job: DownloaderJob) -> None:
    """ Download a file from ArrayExpress via FTP.

    There is no Aspera endpoint which I can find.
    """
    try:
        logger.debug("Downloading file from %s to %s.",
                     download_url,
                     file_path,
                     downloader_job=job.id)
        target_file = open(file_path, "wb")
        with closing(urllib.request.urlopen(download_url, timeout=60)) as request:
            shutil.copyfileobj(request, target_file, CHUNK_SIZE)
    except Exception:
        logger.exception("Exception caught while downloading file.", downloader_job=job.id)
        job.failure_reason = "Exception caught while downloading file"
        raise
    finally:
        target_file.close()

def _download_file_http(download_url: str,
                        downloader_job: DownloaderJob,
                        target_file_path: str) -> bool:
    try:
        logger.debug(
            "Downloading file from %s to %s using HTTP.",
            download_url,
            target_file_path,
            downloader_job=downloader_job.id,
        )

        # This function will try to recover if the download fails
        download_file(download_url, target_file_path)
    except Exception as e:
        logger.exception("Exception caught while downloading file.",
                         downloader_job=downloader_job.id)
        downloader_job.failure_reason = "Exception caught while downloading file\\n " + str(
            e).replace("\n", "\\n")
        return False

    return True

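# The `download_file` helper used above is expected to recover from transient failures.
# Below is a minimal sketch (an assumption for illustration, not the project's actual
# helper) of what retry-with-backoff logic for a plain HTTP download could look like:
import time
import urllib.request

def download_file_with_retries(download_url: str, target_file_path: str,
                               max_attempts: int = 3) -> None:
    """Hypothetical helper: retry a simple HTTP download a few times before giving up."""
    for attempt in range(1, max_attempts + 1):
        try:
            urllib.request.urlretrieve(download_url, target_file_path)
            return
        except Exception:
            if attempt == max_attempts:
                raise
            # Back off a little longer after each failed attempt.
            time.sleep(5 * attempt)
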
def _download_file(download_url: str, file_path: str, job: DownloaderJob) -> None:
    failure_template = "Exception caught while downloading file from: %s"
    try:
        logger.debug("Downloading file from %s to %s.",
                     download_url,
                     file_path,
                     downloader_job=job.id)

        urllib.request.urlcleanup()
        target_file = open(file_path, "wb")
        with closing(urllib.request.urlopen(download_url)) as request:
            shutil.copyfileobj(request, target_file, CHUNK_SIZE)

        # Ancient unresolved bug. WTF python: https://bugs.python.org/issue27973
        urllib.request.urlcleanup()
    except Exception:
        logger.exception(failure_template, download_url, downloader_job=job.id)
        job.failure_reason = failure_template % download_url
        raise
    finally:
        target_file.close()

def _download_file_aspera(file: File, downloader_job: DownloaderJob, target_file_path: str) -> bool:
    """ Download a file to a location using Aspera by shelling out to the `ascp` client. """
    try:
        logger.debug("Downloading file from %s to %s via Aspera.",
                     file.download_url,
                     target_file_path,
                     downloader_job=downloader_job.id)

        # aspera.sra.ebi.ac.uk uses port 33001 for SSH communication.
        # We are also NOT using encryption (-T) to avoid slowdown,
        # and we are not using any kind of rate limiting.
        command_str = (
            ".aspera/cli/bin/ascp -P33001 -i .aspera/cli/etc/asperaweb_id_dsa.openssh {src} {dest}"
        )
        formatted_command = command_str.format(src=file.download_url, dest=target_file_path)
        completed_command = subprocess.run(formatted_command.split(),
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)

        # Something went wrong! Else, just fall through to returning True.
        if completed_command.returncode != 0:
            stderr = str(completed_command.stderr)
            logger.error("Shell call to ascp failed with error message: %s\nCommand was: %s",
                         stderr,
                         formatted_command)
            return False
    except Exception:
        logger.exception("Exception caught while downloading batch from the URL via Aspera: %s",
                         file.download_url,
                         downloader_job=downloader_job.id)
        downloader_job.failure_reason = ("Exception caught while downloading "
                                         "batch from the URL via Aspera: {}").format(file.download_url)
        return False

    return True

def _download_file_http(download_url: str,
                        downloader_job: DownloaderJob,
                        target_file_path: str) -> bool:
    try:
        target_file = open(target_file_path, "wb")
        logger.debug(
            "Downloading file from %s to %s using HTTP.",
            download_url,
            target_file_path,
            downloader_job=downloader_job.id,
        )

        with closing(urllib.request.urlopen(download_url, timeout=60)) as request:
            shutil.copyfileobj(request, target_file, CHUNK_SIZE)
    except Exception as e:
        logger.exception("Exception caught while downloading file.",
                         downloader_job=downloader_job.id)
        downloader_job.failure_reason = "Exception caught while downloading file\\n " + str(
            e).replace("\n", "\\n")
        return False
    finally:
        target_file.close()

    return True

def _download_file(original_file: OriginalFile,
                   downloader_job: DownloaderJob,
                   target_file_path: str) -> bool:
    """ Download file dispatcher. Dispatches to the HTTP or Aspera downloader """
    download_url = original_file.source_url

    # SRA files have Aspera downloads.
    if "ftp.sra.ebi.ac.uk" in download_url:
        # From: ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz
        # To: era-fasp@fasp.sra.ebi.ac.uk:/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz
        download_url = download_url.replace("ftp", "era-fasp@fasp")
        download_url = download_url.replace(".uk/", ".uk:/")
        original_file.source_url = download_url

        return _download_file_aspera(download_url, downloader_job, target_file_path, 0,
                                     original_file, source="ENA")
    elif "ncbi.nlm.nih.gov" in download_url:
        # Try to convert old-style endpoints into new-style endpoints if possible
        try:
            if "anonftp" in download_url or "dbtest" in download_url:
                accession = download_url.split("/")[-1].split(".sra")[0]
                new_url = get_https_sra_download(accession)
                if new_url:
                    download_url = new_url
        except Exception:
            pass

        return _download_file_http(download_url, downloader_job, target_file_path)
    else:
        downloader_job.failure_reason = ("Unrecognized URL pattern: {}").format(download_url)
        return False

    return True

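# A quick illustrative check (not part of the original module) of the FTP-to-Aspera URL
# rewrite performed by the dispatcher above, using the example accession from its comments:
ftp_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz"
aspera_url = ftp_url.replace("ftp", "era-fasp@fasp").replace(".uk/", ".uk:/")
assert aspera_url == (
    "era-fasp@fasp.sra.ebi.ac.uk:/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz"
)
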
def queue_downloader_jobs(self, experiment: Experiment, samples: List[Sample]):
    """This enqueues DownloaderJobs on a per-file basis.

    There is a complementary function below for enqueueing multi-file
    DownloaderJobs.
    """
    files_to_download = []
    for sample in samples:
        files_for_sample = OriginalFile.objects.filter(sample=sample, is_downloaded=False)
        for og_file in files_for_sample:
            files_to_download.append(og_file)

    download_urls_with_jobs = {}
    for original_file in files_to_download:
        # We don't need to create multiple downloaders for the same file.
        # However, we do want to associate original_files with the
        # DownloaderJobs that will download them.
        if original_file.source_url in download_urls_with_jobs.keys():
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=download_urls_with_jobs[original_file.source_url],
                original_file=original_file)
            continue

        # Check whether there is already a downloader job associated with this file.
        old_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
            original_file__source_url=original_file.source_url)
        if len(old_assocs) > 0:
            logger.debug("We found an existing DownloaderJob for this file/url.",
                         original_file_id=original_file.id)
            continue

        sample_object = original_file.samples.first()
        downloader_task = job_lookup.determine_downloader_task(sample_object)
        if downloader_task == job_lookup.Downloaders.NONE:
            logger.info("No valid downloader task found for sample.",
                        sample=sample_object.id,
                        original_file=original_file.id)
        else:
            downloader_job = DownloaderJob()
            downloader_job.downloader_task = downloader_task.value
            downloader_job.accession_code = experiment.accession_code
            downloader_job.save()

            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=downloader_job, original_file=original_file)
            download_urls_with_jobs[original_file.source_url] = downloader_job

            try:
                logger.info("Queuing downloader job for URL: " + original_file.source_url,
                            survey_job=self.survey_job.id,
                            downloader_job=downloader_job.id)
                message_queue.send_job(downloader_task, downloader_job)
            except Exception as e:
                # If the task doesn't get sent we don't want the
                # downloader_job to be left floating
                logger.exception("Failed to enqueue downloader job for URL: "
                                 + original_file.source_url,
                                 survey_job=self.survey_job.id,
                                 downloader_job=downloader_job.id)
                downloader_job.success = False
                downloader_job.failure_reason = str(e)
                downloader_job.save()

def requeue_downloader_job(last_job: DownloaderJob) -> bool:
    """Queues a new downloader job.

    The new downloader job will have num_retries one greater than
    last_job.num_retries.

    Returns True upon successful dispatching, False otherwise.
    """
    num_retries = last_job.num_retries + 1

    ram_amount = last_job.ram_amount
    # If there's no start time then it's likely that the instance got
    # cycled which means we didn't get OOM-killed, so we don't need to
    # increase the RAM amount.
    if last_job.start_time and last_job.failure_reason is None:
        if ram_amount == 1024:
            ram_amount = 4096
        elif ram_amount == 4096:
            ram_amount = 16384

    original_file = last_job.original_files.first()

    if not original_file:
        last_job.no_retry = True
        last_job.success = False
        last_job.failure_reason = (
            "Foreman told to requeue a DownloaderJob without an OriginalFile - why?!"
        )
        last_job.save()
        logger.info(
            "Foreman told to requeue a DownloaderJob without an OriginalFile - why?!",
            last_job=str(last_job),
        )
        return False

    if not original_file.needs_processing():
        last_job.no_retry = True
        last_job.success = False
        last_job.failure_reason = "Foreman told to redownload job with prior successful processing."
        last_job.save()
        logger.info(
            "Foreman told to redownload job with prior successful processing.",
            last_job=str(last_job),
        )
        return False

    first_sample = original_file.samples.first()

    # This is a magic string that all the dbGaP studies appear to have
    if first_sample and ("in the dbGaP study" in first_sample.title):
        last_job.no_retry = True
        last_job.success = False
        last_job.failure_reason = "Sample is dbGaP access controlled."
        last_job.save()
        logger.info(
            "Avoiding requeuing for DownloaderJob for dbGaP run accession: "
            + str(first_sample.accession_code)
        )
        return False

    new_job = DownloaderJob(
        num_retries=num_retries,
        downloader_task=last_job.downloader_task,
        ram_amount=ram_amount,
        accession_code=last_job.accession_code,
        was_recreated=last_job.was_recreated,
    )
    new_job.save()

    for original_file in last_job.original_files.all():
        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=new_job, original_file=original_file)

    logger.debug(
        "Requeuing Downloader Job which had ID %d with a new Downloader Job with ID %d.",
        last_job.id,
        new_job.id,
    )

    try:
        if send_job(Downloaders[last_job.downloader_task], job=new_job, is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with Batch just now, leave the job for a later loop.
            new_job.delete()
            return False
    except Exception:
        logger.error(
            "Failed to requeue DownloaderJob which had ID %d with a new DownloaderJob with ID %d.",
            last_job.id,
            new_job.id,
        )
        # Can't communicate with Batch just now, leave the job for a later loop.
        new_job.delete()
        return False

    return True

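# A standalone restatement (hypothetical helper, not part of the original module) of the
# RAM escalation rule used above: a requeued job only gets more RAM when it had actually
# started and recorded no failure reason, i.e. when an OOM kill is the likely cause.
def next_ram_amount(ram_amount: int, had_started: bool, failure_reason) -> int:
    """Walk the fixed ladder 1024 -> 4096 -> 16384 MB, otherwise keep the current amount."""
    if had_started and failure_reason is None:
        if ram_amount == 1024:
            return 4096
        if ram_amount == 4096:
            return 16384
    return ram_amount
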
def _download_file_aspera(download_url: str,
                          downloader_job: DownloaderJob,
                          target_file_path: str,
                          attempt: int = 0,
                          source="NCBI") -> bool:
    """ Download a file to a location using Aspera by shelling out to the `ascp` client. """
    try:
        logger.debug("Downloading file from %s to %s via Aspera.",
                     download_url,
                     target_file_path,
                     downloader_job=downloader_job.id)

        if source == "ENA":
            # aspera.sra.ebi.ac.uk uses port 33001 for SSH communication.
            # We are also NOT using encryption (-T) to avoid slowdown,
            # and we are not using any kind of rate limiting.
            command_str = ".aspera/cli/bin/ascp -P33001 -i .aspera/cli/etc/asperaweb_id_dsa.openssh {src} {dest}"
            formatted_command = command_str.format(src=download_url, dest=target_file_path)
            completed_command = subprocess.run(formatted_command.split(),
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.PIPE)
        else:
            # NCBI requires encryption and recommends -k1 resume.
            command_str = ".aspera/cli/bin/ascp -T -k1 -i .aspera/cli/etc/asperaweb_id_dsa.openssh {src} {dest}"
            formatted_command = command_str.format(src=download_url, dest=target_file_path)
            completed_command = subprocess.run(formatted_command.split(),
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.PIPE)

        # Something went wrong! Else, just fall through to returning True.
        if completed_command.returncode != 0:
            stdout = completed_command.stdout.decode().strip()
            stderr = completed_command.stderr.decode().strip()
            logger.debug("Shell call of `%s` to ascp failed with error message: %s",
                         formatted_command,
                         stderr,
                         downloader_job=downloader_job.id)

            # Sometimes, Aspera fails mysteriously.
            # Wait a few minutes and try again.
            if attempt > 5:
                logger.info("Final shell call of `%s` to ascp failed with error message: %s",
                            formatted_command,
                            stderr + "\nSTDOUT: " + stdout,
                            downloader_job=downloader_job.id)
                downloader_job.failure_reason = "stderr:\n " + stderr + "\nstdout:\n " + stdout
                return False
            else:
                time.sleep(5)
                return _download_file_aspera(download_url,
                                             downloader_job,
                                             target_file_path,
                                             attempt + 1,
                                             source)
    except Exception:
        logger.exception("Exception caught while downloading file from the URL via Aspera: %s",
                         download_url,
                         downloader_job=downloader_job.id)
        downloader_job.failure_reason = ("Exception caught while downloading "
                                         "file from the URL via Aspera: {}").format(download_url)
        return False

    # If Aspera has given a zero-byte file for some reason, let's back off and retry.
    if (not os.path.exists(target_file_path)) or (os.path.getsize(target_file_path) < 1):
        if os.path.exists(target_file_path):
            os.remove(target_file_path)

        if attempt > 5:
            downloader_job.failure_reason = "Got zero byte file from aspera after 5 attempts."
            return False

        logger.error("Got zero byte ascp download for target, retrying.",
                     target_url=download_url,
                     downloader_job=downloader_job.id)
        time.sleep(10)
        return _download_file_aspera(download_url,
                                     downloader_job,
                                     target_file_path,
                                     attempt + 1,
                                     source)

    return True

def _download_file_aspera(
    download_url: str, downloader_job: DownloaderJob, target_file_path: str, attempt=0
) -> bool:
    """ Download a file to a location using Aspera by shelling out to the `ascp` client. """
    try:
        logger.debug(
            "Downloading file from %s to %s via Aspera.",
            download_url,
            target_file_path,
            downloader_job=downloader_job.id,
        )

        ascp = ".aspera/cli/bin/ascp"
        key = ".aspera/cli/etc/asperaweb_id_dsa.openssh"
        url = download_url
        user = "******"
        ftp = "ftp-trace.ncbi.nlm.nih.gov"
        if url.startswith("ftp://"):
            url = url.replace("ftp://", "")
        url = url.replace(ftp, "").replace("ftp.ncbi.nlm.nih.gov", "")

        # Resume level 1 (-k1), disable in-transit encryption (-T), unlimited speed
        formatted_command = "{} -i {} -k1 -T {}@{}:{} {}".format(
            ascp, key, user, ftp, url, target_file_path
        )
        completed_command = subprocess.run(
            formatted_command.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )

        # Something went wrong! Else, just fall through to returning True.
        if completed_command.returncode != 0:
            stderr = completed_command.stderr.decode().strip()
            logger.debug(
                "Shell call of `%s` to ascp failed with error message: %s",
                formatted_command,
                stderr,
                downloader_job=downloader_job.id,
            )

            # Sometimes, GEO fails mysteriously.
            # Wait a few minutes and try again.
            if attempt >= 5:
                downloader_job.failure_reason = stderr
                logger.error(
                    "All attempts to download accession via ascp failed: %s\nCommand was: %s",
                    stderr,
                    formatted_command,
                    downloader_job=downloader_job.id,
                )
                return False
            else:
                time.sleep(30)
                return _download_file_aspera(
                    download_url, downloader_job, target_file_path, attempt + 1
                )
    except Exception:
        logger.exception(
            "Exception caught while downloading file from the URL via Aspera: %s",
            download_url,
            downloader_job=downloader_job.id,
        )
        downloader_job.failure_reason = (
            "Exception caught while downloading " "file from the URL via Aspera: {}"
        ).format(download_url)
        return False

    # If Aspera has given a zero-byte file for some reason, let's back off and retry.
    if os.path.getsize(target_file_path) < 1:
        os.remove(target_file_path)
        if attempt > 5:
            downloader_job.failure_reason = "Got zero byte file from aspera after 5 attempts."
            return False

        logger.error(
            "Got zero byte ascp download for target, retrying.",
            target_url=download_url,
            downloader_job=downloader_job.id,
        )
        time.sleep(10)
        return _download_file_aspera(download_url, downloader_job, target_file_path, attempt + 1)

    return True