Example #1
def _download_file_ftp(download_url: str, downloader_job: DownloaderJob, target_file_path: str) -> bool:
    """ Download a file to a location using FTP via urllib. """
    try:
        logger.debug("Downloading file from %s to %s via FTP.",
                     download_url,
                     target_file_path,
                     downloader_job=downloader_job.id)

        # Ancient unresolved bug. WTF python: https://bugs.python.org/issue27973
        urllib.request.urlcleanup()

        with closing(urllib.request.urlopen(download_url)) as request:
            with open(target_file_path, "wb") as target_file:
                shutil.copyfileobj(request, target_file, CHUNK_SIZE)

        urllib.request.urlcleanup()
    except Exception:
        logger.exception("Exception caught while downloading file from the URL via FTP: %s",
                         download_url,
                         downloader_job=downloader_job.id)
        downloader_job.failure_reason = ("Exception caught while downloading "
                                         "file from the URL via FTP: {}").format(download_url)
        return False

    return True
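
A minimal sketch of how this helper might be called from a downloader task; the URL, path, and job here are hypothetical stand-ins, not refinebio code:

# Hypothetical usage sketch; assumes the DownloaderJob model from above.
job = DownloaderJob()
job.save()

ok = _download_file_ftp(
    "ftp://ftp.example.org/vol1/data/sample.txt.gz",  # hypothetical URL
    job,
    "/tmp/sample.txt.gz",
)
if not ok:
    job.success = False
    job.save()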
Example #2
def _extract_files(file_path: str, accession_code: str,
                   job: DownloaderJob) -> List[Dict]:
    """Extract zip and return a list of the raw files.
    """
    logger.debug("Extracting %s!",
                 file_path,
                 file_path=file_path,
                 downloader_job=job.id)
    abs_with_code_raw = LOCAL_ROOT_DIR + "/" + accession_code + "/raw/"

    try:
        # This is technically an unsafe operation.
        # However, we're trusting AE as a data source.
        with zipfile.ZipFile(file_path, "r") as zip_ref:
            zip_ref.extractall(abs_with_code_raw)
            # Other zips for this same accession will go into this
            # directory too, so look at what's in the zip file rather than
            # what's in the directory it's being extracted to.
            files_in_zip = zip_ref.namelist()

        return [{
            "absolute_path": abs_with_code_raw + f,
            "filename": f
        } for f in files_in_zip]

    except Exception as e:
        reason = "Exception %s caught while extracting %s", str(e), str(
            file_path)
        logger.exception(reason, downloader_job=job.id)
        job.failure_reason = reason
        raise
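
The comment above concedes that `extractall` is technically unsafe, because zip member names can contain `..` segments or absolute paths. A hedged sketch of one possible guard against that kind of path traversal (the helper name is ours, not the project's):

import os
import zipfile

def _assert_no_traversal(zip_ref: zipfile.ZipFile, target_dir: str) -> None:
    # Reject any member whose resolved path would land outside target_dir.
    base = os.path.realpath(target_dir)
    for name in zip_ref.namelist():
        resolved = os.path.realpath(os.path.join(base, name))
        if not resolved.startswith(base + os.sep):
            raise ValueError("Zip member would extract outside target: " + name)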
Example #3
def _download_file(download_url: str, file_path: str,
                   job: DownloaderJob) -> DownloaderJob:
    """Download the file via FTP.

    I spoke to Erin from Ensembl about ways to improve this. They're looking into it,
    but have decided against adding an Aspera endpoint.

    She suggested using `rsync`, we could try shelling out to that.

    """
    try:
        logger.debug("Downloading file from %s to %s.",
                     download_url,
                     file_path,
                     downloader_job=job.id)
        urllib.request.urlcleanup()
        target_file = open(file_path, "wb")
        with closing(urllib.request.urlopen(download_url)) as request:
            shutil.copyfileobj(request, target_file, CHUNK_SIZE)

        # Ancient unresolved bug. WTF python: https://bugs.python.org/issue27973
        urllib.request.urlcleanup()
    except Exception:
        failure_template = "Exception caught while downloading file from: %s"
        logger.exception(failure_template, download_url, downloader_job=job.id)
        job.failure_reason = failure_template % download_url
        job.success = False
        return job

    job.success = True
    return job
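
The docstring above floats `rsync` as an alternative transport. Purely as a sketch of what shelling out to it could look like, assuming Ensembl exposed an rsync endpoint for the same paths (the function name and flags are our assumptions, not project code):

import subprocess

def _download_file_rsync(rsync_url: str, file_path: str) -> bool:
    # Hypothetical alternative transport suggested in the docstring above.
    completed = subprocess.run(
        ["rsync", "--times", "--partial", rsync_url, file_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return completed.returncode == 0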
Example #4
def _extract_files(file_path: str, accession_code: str, job: DownloaderJob) -> List[Dict]:
    """Extract zip and return a list of the raw files.
    """
    logger.debug("Extracting %s!", file_path, file_path=file_path, downloader_job=job.id)

    try:
        # This is technically an unsafe operation.
        # However, we're trusting AE as a data source.
        zip_ref = zipfile.ZipFile(file_path, "r")
        abs_with_code_raw = LOCAL_ROOT_DIR + '/' + accession_code + '/raw/'
        zip_ref.extractall(abs_with_code_raw)
        # Other zips for this same accession will go into this
        # directory too, so look at what's in the zip file rather than
        # what's in the directory it's being extracted to.
        files_in_zip = zip_ref.namelist()
        zip_ref.close()

        # os.path.abspath doesn't do what I thought it does, hence this monstrosity.
        files = [{'absolute_path': abs_with_code_raw + f, 'filename': f} for f in files_in_zip]

    except Exception as e:
        reason = "Exception %s caught while extracting %s", str(e), str(file_path)
        logger.exception(reason, downloader_job=job.id)
        job.failure_reason = reason
        raise

    os.remove(file_path)

    return files
Example #5
File: geo.py Project: erflynn/refinebio
def _download_file(download_url: str, file_path: str, job: DownloaderJob, force_ftp=False) -> bool:
    """ Download a file from GEO via Aspera unless `force_ftp` is True
    """

    # Ensure directory exists
    os.makedirs(file_path.rsplit("/", 1)[0], exist_ok=True)

    if not force_ftp:
        return _download_file_aspera(
            download_url=download_url, downloader_job=job, target_file_path=file_path
        )
    else:
        try:
            logger.debug(
                "Downloading file from %s to %s.", download_url, file_path, downloader_job=job.id
            )

            # Ancient unresolved bug. WTF python: https://bugs.python.org/issue27973
            urllib.request.urlcleanup()

            target_file = open(file_path, "wb")
            with closing(urllib.request.urlopen(download_url)) as request:
                shutil.copyfileobj(request, target_file, CHUNK_SIZE)

            urllib.request.urlcleanup()
        except Exception:
            logger.exception("Exception caught while downloading file.", downloader_job=job.id)
            job.failure_reason = "Exception caught while downloading file"
            raise

        return True
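
A side note on the directory handling: `file_path.rsplit("/", 1)[0]` is equivalent to `os.path.dirname` for absolute paths like these, so the makedirs line could also be written as:

import os

# Equivalent to os.makedirs(file_path.rsplit("/", 1)[0], exist_ok=True)
os.makedirs(os.path.dirname(file_path), exist_ok=True)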
Example #6
    def queue_downloader_job_for_original_files(
            self,
            original_files: List[OriginalFile],
            experiment_accession_code: str = None,
            is_transcriptome: bool = False):
        """Creates a single DownloaderJob with multiple files to download.
        """
        source_urls = [
            original_file.source_url for original_file in original_files
        ]
        # Check whether there is already a downloader job associated with these files.
        old_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
            original_file__source_url__in=source_urls)
        if len(old_assocs) > 0:
            logger.debug("We found an existing DownloaderJob for these urls.",
                         source_urls=source_urls)
            return False

        # Transcriptome is a special case because there's no sample_object.
        if is_transcriptome:
            downloader_task = job_lookup.Downloaders.TRANSCRIPTOME_INDEX
        else:
            sample_object = original_files[0].samples.first()
            downloader_task = job_lookup.determine_downloader_task(
                sample_object)

        if downloader_task == job_lookup.Downloaders.NONE:
            logger.info("No valid downloader task found for sample.",
                        sample=sample_object.id,
                        original_file=original_files[0].id)
        else:
            downloader_job = DownloaderJob()
            downloader_job.downloader_task = downloader_task.value
            downloader_job.accession_code = experiment_accession_code
            downloader_job.save()

            downloaded_urls = []
            for original_file in original_files:
                DownloaderJobOriginalFileAssociation.objects.get_or_create(
                    downloader_job=downloader_job, original_file=original_file)

                downloaded_urls.append(original_file.source_url)

            try:
                logger.info("Queuing downloader job.",
                            survey_job=self.survey_job.id,
                            downloader_job=downloader_job.id,
                            downloaded_urls=downloaded_urls)
                message_queue.send_job(downloader_task, downloader_job)
            except Exception as e:
                # If the task doesn't get sent we don't want the
                # downloader_job to be left floating
                logger.exception("Failed to enqueue downloader job.",
                                 survey_job=self.survey_job.id,
                                 downloader_job=downloader_job.id,
                                 error=str(e))
                downloader_job.success = False
                downloader_job.failure_reason = str(e)
                downloader_job.save()
Example #7
def _verify_batch_grouping(files: List[File], job: DownloaderJob) -> None:
    """All batches in the same job should have the same downloader url"""
    for file in files:
        if file.download_url != files[0].download_url:
            failure_message = ("A Batch's file doesn't have the same download "
                               "URL as the other batches' files.")
            logger.error(failure_message, downloader_job=job.id)
            job.failure_reason = failure_message
            raise ValueError(failure_message)
Example #8
def _verify_files(file1: File, file2: File, job: DownloaderJob) -> None:
    """Verifies that the two files are the same.

    This is useful for this downloader because each job has two
    batches which should each have the same two files.
    """
    if file1.download_url != file2.download_url:
        failure_message = ("A Batch's file doesn't have the same download "
                           "URL as the other batch's file.")
        logger.error(failure_message, downloader_job=job.id)
        job.failure_reason = failure_message
        raise ValueError(failure_message)
Example #9
def _upload_files(job_dir: str, files: List[File], job: DownloaderJob) -> None:
    try:
        for file in files:
            file.size_in_bytes = os.path.getsize(
                file.get_temp_pre_path(job_dir))
            file.save()
            file.upload_raw_file(job_dir)
    except Exception:
        logger.exception("Exception caught while uploading file.",
                         downloader_job=job.id,
                         batch=file.batch.id)
        job.failure_reason = "Exception caught while uploading file."
        raise
    finally:
        file.remove_temp_directory(job_dir)
Example #10
def _extract_file(files: List[File], job: DownloaderJob) -> None:
    """Extract zip from temp directory and move to raw directory.

    Additionally this function sets the size_in_bytes field of each
    Batch in batches. To save database calls it does not save the
    batch itself since it will be saved soon when its status
    changes in utils.end_job.
    """
    # zip_path and local_dir should be common to all batches in the group
    job_dir = utils.JOB_DIR_PREFIX + str(job.id)
    zip_path = files[0].get_temp_download_path(job_dir)
    local_dir = files[0].get_temp_dir(job_dir)
    dirs_to_clean = set()

    logger.debug("Extracting %s", zip_path, downloader_job=job.id)

    zip_ref = None
    try:
        zip_ref = zipfile.ZipFile(zip_path, "r")
        zip_ref.extractall(local_dir)

        for file in files:
            batch_directory = file.get_temp_dir(job_dir)
            raw_file_location = file.get_temp_pre_path(job_dir)

            # The platform is part of the batch's location so if the
            # batches in this job have different platforms then some
            # of them need to be moved to the directory corresponding
            # to their platform.
            if local_dir != batch_directory:
                os.makedirs(batch_directory, exist_ok=True)
                dirs_to_clean.add(batch_directory)
                incorrect_location = os.path.join(local_dir, file.name)
                os.rename(incorrect_location, raw_file_location)

            file.size_in_bytes = os.path.getsize(raw_file_location)
            file.save()
            file.upload_raw_file(job_dir)
    except Exception:
        logger.exception("Exception caught while extracting %s",
                         zip_path,
                         downloader_job=job.id)
        job.failure_reason = "Exception caught while extracting " + zip_path
        raise
    finally:
        if zip_ref:
            zip_ref.close()
        file.remove_temp_directory(job_dir)
        for directory in dirs_to_clean:
            shutil.rmtree(directory)
Example #11
def _download_file(download_url: str, file_path: str,
                   job: DownloaderJob) -> None:
    try:
        logger.debug("Downloading file from %s to %s.",
                     download_url,
                     file_path,
                     downloader_job=job.id)
        target_file = open(file_path, "wb")
        with closing(urllib.request.urlopen(download_url)) as request:
            shutil.copyfileobj(request, target_file, CHUNK_SIZE)
    except Exception:
        logger.exception("Exception caught while downloading batch.",
                         downloader_job=job.id)
        job.failure_reason = "Exception caught while downloading batch"
        raise
Example #12
def _download_file(download_url: str, file_path: str, job: DownloaderJob) -> None:
    """ Download a file from ArrayExpress via FTP. There is no Aspera endpoint
    which I can find. """
    try:
        logger.debug("Downloading file from %s to %s.",
                     download_url,
                     file_path,
                     downloader_job=job.id)
        target_file = open(file_path, "wb")
        with closing(urllib.request.urlopen(download_url, timeout=60)) as request:
            shutil.copyfileobj(request, target_file, CHUNK_SIZE)
    except Exception:
        logger.exception("Exception caught while downloading file.",
                         downloader_job=job.id)
        job.failure_reason = "Exception caught while downloading file"
        raise
Example #13
def _download_file_http(download_url: str, downloader_job: DownloaderJob,
                        target_file_path: str) -> bool:
    try:
        logger.debug(
            "Downloading file from %s to %s using HTTP.",
            download_url,
            target_file_path,
            downloader_job=downloader_job.id,
        )
        # This function will try to recover if the download fails
        download_file(download_url, target_file_path)
    except Exception as e:
        logger.exception("Exception caught while downloading file.",
                         downloader_job=downloader_job.id)
        downloader_job.failure_reason = "Exception caught while downloading file\\n " + str(
            e).replace("\n", "\\n")
        return False

    return True
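
`download_file` here comes from elsewhere in the project, and the comment only promises that it retries on failure. As an illustration of that idea (not the project's actual helper; names and retry policy are assumptions), a bounded-retry download might look like:

import shutil
import time
import urllib.request
from contextlib import closing

def download_with_retries(url: str, path: str, max_attempts: int = 3) -> None:
    # Illustrative sketch only, not the project's download_file implementation.
    for attempt in range(1, max_attempts + 1):
        try:
            with open(path, "wb") as target_file:
                with closing(urllib.request.urlopen(url, timeout=60)) as request:
                    shutil.copyfileobj(request, target_file)
            return
        except Exception:
            if attempt == max_attempts:
                raise
            time.sleep(2 ** attempt)  # simple exponential backoff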
Example #14
def _download_file(download_url: str, file_path: str,
                   job: DownloaderJob) -> None:
    failure_template = "Exception caught while downloading file from: %s"
    try:
        logger.debug("Downloading file from %s to %s.",
                     download_url,
                     file_path,
                     downloader_job=job.id)
        urllib.request.urlcleanup()
        target_file = open(file_path, "wb")
        with closing(urllib.request.urlopen(download_url)) as request:
            shutil.copyfileobj(request, target_file, CHUNK_SIZE)

        # Ancient unresolved bug. WTF python: https://bugs.python.org/issue27973
        urllib.request.urlcleanup()
    except Exception:
        logger.exception(failure_template, download_url, downloader_job=job.id)
        job.failure_reason = failure_template % download_url
        raise
Example #15
File: sra.py Project: dongbohu/ccdl_test
def _download_file_aspera(file: File, downloader_job: DownloaderJob,
                          target_file_path: str) -> bool:
    """ Download a file to a location using Aspera by shelling out to the `ascp` client. """

    try:
        logger.debug("Downloading file from %s to %s via Aspera.",
                     file.download_url,
                     target_file_path,
                     downloader_job=downloader_job.id)

        # aspera.sra.ebi.ac.uk uses port 33001 for SSH communication
        # We are also NOT using encryption (-T) to avoid slowdown,
        # and we are not using any kind of rate limiting.
        command_str = (
            ".aspera/cli/bin/ascp -P33001 -i .aspera/cli/etc/asperaweb_id_dsa.openssh {src} {dest}"
        )
        formatted_command = command_str.format(src=file.download_url,
                                               dest=target_file_path)
        completed_command = subprocess.run(formatted_command.split(),
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)

        # Something went wrong! Else, just fall through to returning True.
        if completed_command.returncode != 0:
            stderr = str(completed_command.stderr)
            logger.error(
                "Shell call to ascp failed with error message: %s\nCommand was: %s",
                stderr, formatted_command)
            return False

    except Exception:
        logger.exception(
            "Exception caught while downloading batch from the URL via Aspera: %s",
            file.download_url,
            downloader_job=downloader_job.id)
        downloader_job.failure_reason = (
            "Exception caught while downloading "
            "batch from the URL via Aspera: {}").format(file.download_url)
        return False
    return True
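
One caveat with `formatted_command.split()`: it would mangle any path containing whitespace. Building the argument list directly avoids that; a sketch of the same invocation:

# Same ascp invocation, but with an explicit argument list.
command = [
    ".aspera/cli/bin/ascp",
    "-P33001",
    "-i", ".aspera/cli/etc/asperaweb_id_dsa.openssh",
    file.download_url,
    target_file_path,
]
completed_command = subprocess.run(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)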
Example #16
def _download_file_http(download_url: str, downloader_job: DownloaderJob,
                        target_file_path: str) -> bool:
    try:
        target_file = open(target_file_path, "wb")
        logger.debug(
            "Downloading file from %s to %s using HTTP.",
            download_url,
            target_file_path,
            downloader_job=downloader_job.id,
        )

        with closing(urllib.request.urlopen(download_url,
                                            timeout=60)) as request:
            shutil.copyfileobj(request, target_file, CHUNK_SIZE)
    except Exception as e:
        logger.exception("Exception caught while downloading file.",
                         downloader_job=downloader_job.id)
        downloader_job.failure_reason = "Exception caught while downloading file\\n " + str(
            e).replace("\n", "\\n")
        return False

    return True
Example #17
def _download_file(original_file: OriginalFile, downloader_job: DownloaderJob,
                   target_file_path: str) -> bool:
    """ Download file dispatcher. Dispatches to the HTTP or Aspera downloader
    """
    download_url = original_file.source_url
    # SRA files have Aspera downloads.
    if "ftp.sra.ebi.ac.uk" in download_url:
        # From: ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz
        # To: [email protected]:/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz
        download_url = download_url.replace("ftp", "era-fasp@fasp")
        download_url = download_url.replace(".uk/", ".uk:/")
        original_file.source_url = download_url
        return _download_file_aspera(download_url,
                                     downloader_job,
                                     target_file_path,
                                     0,
                                     original_file,
                                     source="ENA")
    elif "ncbi.nlm.nih.gov" in download_url:
        # Try to convert old-style endpoints into new-style endpoints if possible
        try:
            if "anonftp" in download_url or "dbtest" in download_url:
                accession = download_url.split("/")[-1].split(".sra")[0]
                new_url = get_https_sra_download(accession)
                if new_url:
                    download_url = new_url
        except Exception:
            pass
        return _download_file_http(download_url, downloader_job,
                                   target_file_path)
    else:
        downloader_job.failure_reason = (
            "Unrecognized URL pattern: {}").format(download_url)
        return False
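
The ENA branch rewrites the FTP URL into Aspera's `user@host:path` form with two `replace` calls. Isolated as a sketch for readability (the helper name is ours; the behavior matches the branch above):

def _ena_ftp_to_aspera(download_url: str) -> str:
    # ftp.sra.ebi.ac.uk/vol1/... -> era-fasp@fasp.sra.ebi.ac.uk:/vol1/...
    download_url = download_url.replace("ftp", "era-fasp@fasp")
    return download_url.replace(".uk/", ".uk:/")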
Example #18
    def queue_downloader_jobs(self, experiment: Experiment,
                              samples: List[Sample]):
        """This enqueues DownloaderJobs on a per-file basis.

        There is a complementary function below for enqueueing multi-file
        DownloaderJobs.
        """
        files_to_download = []
        for sample in samples:
            files_for_sample = OriginalFile.objects.filter(sample=sample,
                                                           is_downloaded=False)
            for og_file in files_for_sample:
                files_to_download.append(og_file)

        download_urls_with_jobs = {}
        for original_file in files_to_download:

            # We don't need to create multiple downloaders for the same file.
            # However, we do want to associate original_files with the
            # DownloaderJobs that will download them.
            if original_file.source_url in download_urls_with_jobs.keys():
                DownloaderJobOriginalFileAssociation.objects.get_or_create(
                    downloader_job=download_urls_with_jobs[
                        original_file.source_url],
                    original_file=original_file)
                continue

            # Check whether there is already a downloader job associated with this file.
            old_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
                original_file__source_url=original_file.source_url)
            if len(old_assocs) > 0:
                logger.debug(
                    "We found an existing DownloaderJob for this file/url.",
                    original_file_id=original_file.id)
                continue

            sample_object = original_file.samples.first()
            downloader_task = job_lookup.determine_downloader_task(
                sample_object)

            if downloader_task == job_lookup.Downloaders.NONE:
                logger.info("No valid downloader task found for sample.",
                            sample=sample_object.id,
                            original_file=original_file.id)
            else:
                downloader_job = DownloaderJob()
                downloader_job.downloader_task = downloader_task.value
                downloader_job.accession_code = experiment.accession_code
                downloader_job.save()

                DownloaderJobOriginalFileAssociation.objects.get_or_create(
                    downloader_job=downloader_job, original_file=original_file)

                download_urls_with_jobs[
                    original_file.source_url] = downloader_job

                try:
                    logger.info("Queuing downloader job for URL: " +
                                original_file.source_url,
                                survey_job=self.survey_job.id,
                                downloader_job=downloader_job.id)
                    message_queue.send_job(downloader_task, downloader_job)
                except Exception as e:
                    # If the task doesn't get sent we don't want the
                    # downloader_job to be left floating
                    logger.exception(
                        "Failed to enqueue downloader job for URL: " +
                        original_file.source_url,
                        survey_job=self.survey_job.id,
                        downloader_job=downloader_job.id)
                    downloader_job.success = False
                    downloader_job.failure_reason = str(e)
                    downloader_job.save()
Example #19
def requeue_downloader_job(last_job: DownloaderJob) -> bool:
    """Queues a new downloader job.

    The new downloader job will have num_retries one greater than
    last_job.num_retries.

    Returns True upon successfully dispatching the new job, False otherwise.
    """
    num_retries = last_job.num_retries + 1

    ram_amount = last_job.ram_amount
    # If there's no start time then it's likely that the instance got
    # cycled which means we didn't get OOM-killed, so we don't need to
    # increase the RAM amount.
    if last_job.start_time and last_job.failure_reason is None:
        if ram_amount == 1024:
            ram_amount = 4096
        elif ram_amount == 4096:
            ram_amount = 16384

    original_file = last_job.original_files.first()

    if not original_file:
        last_job.no_retry = True
        last_job.success = False
        last_job.failure_reason = (
            "Foreman told to requeue a DownloaderJob without an OriginalFile - why?!"
        )
        last_job.save()
        logger.info(
            "Foreman told to requeue a DownloaderJob without an OriginalFile - why?!",
            last_job=str(last_job),
        )
        return False

    if not original_file.needs_processing():
        last_job.no_retry = True
        last_job.success = False
        last_job.failure_reason = "Foreman told to redownload job with prior successful processing."
        last_job.save()
        logger.info(
            "Foreman told to redownload job with prior successful processing.",
            last_job=str(last_job),
        )
        return False

    first_sample = original_file.samples.first()

    # This is a magic string that all the dbGaP studies appear to have
    if first_sample and ("in the dbGaP study" in first_sample.title):
        last_job.no_retry = True
        last_job.success = False
        last_job.failure_reason = "Sample is dbGaP access controlled."
        last_job.save()
        logger.info(
            "Avoiding requeuing for DownloaderJob for dbGaP run accession: " +
            str(first_sample.accession_code))
        return False

    new_job = DownloaderJob(
        num_retries=num_retries,
        downloader_task=last_job.downloader_task,
        ram_amount=ram_amount,
        accession_code=last_job.accession_code,
        was_recreated=last_job.was_recreated,
    )
    new_job.save()

    for original_file in last_job.original_files.all():
        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=new_job, original_file=original_file)

    logger.debug(
        "Requeuing Downloader Job which had ID %d with a new Downloader Job with ID %d.",
        last_job.id,
        new_job.id,
    )
    try:
        if send_job(Downloaders[last_job.downloader_task],
                    job=new_job,
                    is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with Batch just now, leave the job for a later loop.
            new_job.delete()
            return False
    except Exception:
        logger.error(
            "Failed to requeue DownloaderJob which had ID %d with a new DownloaderJob with ID %d.",
            last_job.id,
            new_job.id,
        )
        # Can't communicate with Batch just now, leave the job for a later loop.
        new_job.delete()
        return False

    return True
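
The RAM escalation in the middle of this function walks a fixed ladder, 1024 MB to 4096 MB to 16384 MB, and only when the previous job looks like it was OOM-killed. The same logic pulled into a helper, as a sketch (our naming, not the foreman's):

RAM_LADDER = [1024, 4096, 16384]

def _next_ram_amount(current: int) -> int:
    # Step up one rung; stay put at the top or for off-ladder values.
    try:
        return RAM_LADDER[RAM_LADDER.index(current) + 1]
    except (ValueError, IndexError):
        return current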
Example #20
def _download_file_aspera(download_url: str,
                          downloader_job: DownloaderJob,
                          target_file_path: str,
                          attempt: int = 0,
                          source="NCBI"
                          ) -> bool:
    """ Download a file to a location using Aspera by shelling out to the `ascp` client. """

    try:
        logger.debug("Downloading file from %s to %s via Aspera.",
                     download_url,
                     target_file_path,
                     downloader_job=downloader_job.id)

        if source is "ENA":
            # aspera.sra.ebi.ac.uk users port 33001 for SSH communication
            # We are also NOT using encryption (-T) to avoid slowdown,
            # and we are not using any kind of rate limiting.
            command_str = ".aspera/cli/bin/ascp -P33001 -i .aspera/cli/etc/asperaweb_id_dsa.openssh {src} {dest}"
            formatted_command = command_str.format(src=download_url,
                                                   dest=target_file_path)
            completed_command = subprocess.run(formatted_command.split(),
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.PIPE)
        else:
            # NCBI requires encryption and recommends -k1 resume.
            command_str = ".aspera/cli/bin/ascp -T -k1 -i .aspera/cli/etc/asperaweb_id_dsa.openssh {src} {dest}"
            formatted_command = command_str.format(src=download_url,
                                                   dest=target_file_path)
            completed_command = subprocess.run(formatted_command.split(),
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.PIPE)

        # Something went wrong! Else, just fall through to returning True.
        if completed_command.returncode != 0:

            stdout = completed_command.stdout.decode().strip()
            stderr = completed_command.stderr.decode().strip()
            logger.debug("Shell call of `%s` to ascp failed with error message: %s",
                         formatted_command,
                         stderr,
                         downloader_job=downloader_job.id)

            # Sometimes, Aspera fails mysteriously.
            # Wait a few minutes and try again.
            if attempt > 5:
                logger.info("Final shell call of `%s` to ascp failed with error message: %s",
                         formatted_command,
                         stderr + "\nSTDOUT: " + stdout,
                         downloader_job=downloader_job.id)
                downloader_job.failure_reason = "stderr:\n " + stderr + "\nstdout:\n " + stdout
                return False
            else:
                time.sleep(5)
                return _download_file_aspera(download_url,
                                             downloader_job,
                                             target_file_path,
                                             attempt + 1,
                                             source
                                             )
    except Exception:
        logger.exception("Exception caught while downloading file from the URL via Aspera: %s",
                         download_url,
                         downloader_job=downloader_job.id)
        downloader_job.failure_reason = ("Exception caught while downloading "
                                         "file from the URL via Aspera: {}").format(download_url)
        return False

    # If Aspera has given a zero-byte file for some reason, let's back off and retry.
    if (not os.path.exists(target_file_path)) or (os.path.getsize(target_file_path) < 1):
        if os.path.exists(target_file_path):
            os.remove(target_file_path)

        if attempt > 5:
            downloader_job.failure_reason = "Got zero byte file from aspera after 5 attempts."
            return False

        logger.error("Got zero byte ascp download for target, retrying.",
                     target_url=download_url,
                     downloader_job=downloader_job.id)
        time.sleep(10)
        return _download_file_aspera(download_url,
                                     downloader_job,
                                     target_file_path,
                                     attempt + 1,
                                     source
                                     )
    return True
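
The zero-byte guard at the end shows up both in this snippet and in the geo.py variant below; as a tiny shared-helper sketch (the name is an assumption):

import os

def _is_nonempty_file(path: str) -> bool:
    # Aspera occasionally writes nothing; treat missing or empty files as failures.
    return os.path.exists(path) and os.path.getsize(path) > 0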
Example #21
File: geo.py Project: erflynn/refinebio
def _download_file_aspera(
    download_url: str, downloader_job: DownloaderJob, target_file_path: str, attempt=0
) -> bool:
    """ Download a file to a location using Aspera by shelling out to the `ascp` client. """

    try:
        logger.debug(
            "Downloading file from %s to %s via Aspera.",
            download_url,
            target_file_path,
            downloader_job=downloader_job.id,
        )

        ascp = ".aspera/cli/bin/ascp"
        key = ".aspera/cli/etc/asperaweb_id_dsa.openssh"
        url = download_url
        user = "******"
        ftp = "ftp-trace.ncbi.nlm.nih.gov"
        if url.startswith("ftp://"):
            url = url.replace("ftp://", "")
        url = url.replace(ftp, "").replace("ftp.ncbi.nlm.nih.gov", "")

        # Resume level 1, use encryption, unlimited speed
        command_str = "{} -i {} -k1 -T {}@{}:{} {}".format(
            ascp, key, user, ftp, url, target_file_path
        )
        formatted_command = command_str.format(src=download_url, dest=target_file_path)
        completed_command = subprocess.run(
            formatted_command.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )

        # Something went wrong! Else, just fall through to returning True.
        if completed_command.returncode != 0:

            stderr = completed_command.stderr.decode().strip()
            logger.debug(
                "Shell call of `%s` to ascp failed with error message: %s",
                formatted_command,
                stderr,
                downloader_job=downloader_job.id,
            )

            # Sometimes, GEO fails mysteriously.
            # Wait a few minutes and try again.
            if attempt >= 5:
                downloader_job.failure_reason = stderr
                logger.error(
                    "All attempts to download accession via ascp failed: %s\nCommand was: %s",
                    stderr,
                    formatted_command,
                    downloader_job=downloader_job.id,
                )
                return False
            else:
                time.sleep(30)
                return _download_file_aspera(
                    download_url, downloader_job, target_file_path, attempt + 1
                )
    except Exception:
        logger.exception(
            "Exception caught while downloading file from the URL via Aspera: %s",
            download_url,
            downloader_job=downloader_job.id,
        )
        downloader_job.failure_reason = (
            "Exception caught while downloading " "file from the URL via Aspera: {}"
        ).format(download_url)
        return False

    # If Aspera has given a zero-byte file for some reason, let's back off and retry.
    if os.path.getsize(target_file_path) < 1:
        os.remove(target_file_path)
        if attempt > 5:
            downloader_job.failure_reason = "Got zero byte file from aspera after 5 attempts."
            return False

        logger.error(
            "Got zero byte ascp download for target, retrying.",
            target_url=download_url,
            downloader_job=downloader_job.id,
        )
        time.sleep(10)
        return _download_file_aspera(download_url, downloader_job, target_file_path, attempt + 1)
    return True
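
Both Aspera downloaders retry by recursing with `attempt + 1` and a fixed sleep. The same control flow as an iterative sketch, which keeps the call stack flat (a generic helper under assumed names, not project code):

import time

def run_with_retries(operation, max_attempts: int = 6, delay_seconds: int = 30) -> bool:
    # operation() should return True on success, False on a retryable failure.
    for attempt in range(max_attempts):
        if operation():
            return True
        if attempt < max_attempts - 1:
            time.sleep(delay_seconds)
    return False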