示例#1
0
 def test_download_file_ncbi(self, mock_send_job):
     """Run ``sra.download_sra`` for run DRR002116 and check the download.

     Creates a DownloaderJob, an archive OriginalFile and a Sample for the
     run, links them together, runs the download, and then asserts on the
     result flag, the SHA-1 of the first downloaded file and that the file
     exists on disk.
     """
     mock_send_job.return_value = None

     run_accession = "DRR002116"

     job = DownloaderJob(accession_code=run_accession)
     job.save()

     sra_file = OriginalFile()
     sra_file.source_filename = "DRR002116.sra"
     sra_file.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra"
     sra_file.is_archive = True
     sra_file.save()

     run_sample = Sample()
     run_sample.accession_code = run_accession
     run_sample.save()

     # Link the file to its sample first, then to the job that downloads it.
     OriginalFileSampleAssociation(
         sample=run_sample, original_file=sra_file).save()
     DownloaderJobOriginalFileAssociation(
         downloader_job=job, original_file=sra_file).save()

     result, downloaded_files = sra.download_sra(job.pk)
     # The job is closed out before any assertion runs.
     utils.end_downloader_job(job, result)

     self.assertTrue(result)
     first_download = downloaded_files[0]
     self.assertEqual(first_download.sha1,
                      'd5374e7fe047d4f76b165c3f5148ab2df9d42cea')
     self.assertTrue(os.path.exists(first_download.absolute_file_path))
示例#2
0
def create_downloader_job():
    """Create and save an SRA DownloaderJob linked to two placeholder files.

    Returns:
        The saved DownloaderJob.
    """
    job = DownloaderJob(
        downloader_task="SRA",
        batch_job_id="DEFAULT",
        num_retries=0,
        accession_code="NUNYA",
        success=None,
    )
    job.save()

    # Two identical placeholder files; each is saved and then associated
    # with the job before the next one is created.
    for _ in range(2):
        placeholder = OriginalFile()
        placeholder.source_filename = "doesn't matter"
        placeholder.filename = "this either"
        placeholder.absolute_file_path = "nor this"
        placeholder.save()

        DownloaderJobOriginalFileAssociation(
            original_file=placeholder, downloader_job=job).save()

    return job
示例#3
0
    def test_download_and_extract_file(self, mock_urlopen):
        """Download the E-GEOD-59071 raw zip and check the extracted names.

        ``urlopen`` is mocked with ``file_caching_urlopen``. The archive is
        downloaded to ``dlme.zip``, extracted, and the set of extracted
        filenames is compared (order-insensitively) with the expected CEL
        files.
        """
        mock_urlopen.side_effect = file_caching_urlopen

        job = DownloaderJob()
        job.save()

        archive_name = "dlme.zip"
        array_express._download_file(
            "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",
            archive_name,
            job,
        )
        extracted = array_express._extract_files(archive_name, "123", job)

        # Every CEL file in the archive must be reported exactly once.
        expected_names = [
            "GSM1426071_CD_colon_active_1.CEL",
            "GSM1426072_CD_colon_active_2.CEL",
            "GSM1426073_CD_colon_active_3.CEL",
            "GSM1426074_CD_colon_active_4.CEL",
            "GSM1426075_CD_colon_active_5.CEL",
            "GSM1426076_CD_colon_active_6.CEL",
            "GSM1426077_CD_colon_active_7.CEL",
            "GSM1426078_CD_colon_active_8.CEL",
            "GSM1426079_controle_colon_76.CEL",
            "GSM1426080_controle_colon_77.CEL",
            "GSM1426081_controle_colon_78.CEL",
            "GSM1426082_controle_colon_79.CEL",
            "GSM1426083_controle_colon_80.CEL",
            "GSM1426084_controle_colon_81.CEL",
            "GSM1426085_controle_colon_82.CEL",
            "GSM1426086_controle_colon_83.CEL",
            "GSM1426087_controle_colon_84.CEL",
            "GSM1426088_controle_colon_85.CEL",
            "GSM1426089_controle_colon_86.CEL",
        ]
        extracted_names = [entry["filename"] for entry in extracted]
        self.assertEqual(sorted(extracted_names), sorted(expected_names))
示例#4
0
    def create_downloader_job(self):
        """Create and save an SRA DownloaderJob linked to two placeholder files.

        Returns:
            The saved DownloaderJob.
        """
        job = DownloaderJob(
            downloader_task="SRA",
            nomad_job_id="DOWNLOADER/dispatch-1528945054-e8eaf540",
            num_retries=0,
            accession_code="NUNYA",
            success=None)
        job.save()

        # Two identical placeholder files; each is saved and then associated
        # with the job before the next one is created.
        for _ in range(2):
            placeholder = OriginalFile()
            placeholder.source_filename = "doesn't matter"
            placeholder.filename = "this either"
            placeholder.absolute_file_path = "nor this"
            placeholder.save()

            DownloaderJobOriginalFileAssociation(
                original_file=placeholder, downloader_job=job).save()

        return job
示例#5
0
    def test_download_file_swapper(self, mock_send_job):
        """Call ``sra._download_file`` directly with ``force_ftp=False``.

        Sets up a DownloaderJob, an archive OriginalFile and a Sample for
        run DRR002116, links them, and expects the download into ``/tmp``
        to return a truthy value.
        """
        mock_send_job.return_value = None

        run_accession = "DRR002116"

        job = DownloaderJob(accession_code=run_accession)
        job.save()

        sra_file = OriginalFile()
        sra_file.source_filename = "DRR002116.sra"
        sra_file.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra"
        sra_file.is_archive = True
        sra_file.save()

        run_sample = Sample()
        run_sample.accession_code = run_accession
        run_sample.save()

        # Link the file to its sample first, then to the job.
        OriginalFileSampleAssociation(
            sample=run_sample, original_file=sra_file).save()
        DownloaderJobOriginalFileAssociation(
            downloader_job=job, original_file=sra_file).save()

        download_ok = sra._download_file(
            sra_file.source_url, job, "/tmp", force_ftp=False)
        self.assertTrue(download_ok)
 def test_download_and_extract_file(self, mock_send_job):
     """Download the E-GEOD-59071 raw zip and extract it.

     No assertions are made; the test only passes if neither call raises.
     """
     job = DownloaderJob()
     job.save()

     archive_url = 'ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip'
     archive_name = 'dlme.zip'

     array_express._download_file(archive_url, archive_name, job)
     # The extraction result is bound but not inspected.
     files = array_express._extract_files(archive_name, '123', job)
示例#7
0
 def test_download_file_ncbi(self):
     """Run ``sra.download_sra`` for run SRR9117853 and check the download.

     Creates a DownloaderJob, an archive OriginalFile and a Sample for the
     run, links them together, runs the download, and then asserts on the
     result flag, the SHA-1 of the first downloaded file and that the file
     exists on disk.
     """
     run_accession = "SRR9117853"

     job = DownloaderJob(accession_code=run_accession)
     job.save()

     sra_file = OriginalFile()
     sra_file.source_filename = "SRR9117853.sra"
     sra_file.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/SRR/SRR9117/SRR9117853/SRR9117853.sra"
     sra_file.is_archive = True
     sra_file.save()

     run_sample = Sample()
     run_sample.accession_code = run_accession
     run_sample.save()

     # Link the file to its sample first, then to the job that downloads it.
     OriginalFileSampleAssociation(
         sample=run_sample, original_file=sra_file).save()
     DownloaderJobOriginalFileAssociation(
         downloader_job=job, original_file=sra_file).save()

     result, downloaded_files = sra.download_sra(job.pk)
     # The job is closed out before any assertion runs.
     utils.end_downloader_job(job, result)

     self.assertTrue(result)
     first_download = downloaded_files[0]
     self.assertEqual(first_download.sha1,
                      "e7ad484fe6f134ba7d1b2664e58cc15ae5a958cc")
     self.assertTrue(os.path.exists(first_download.absolute_file_path))
示例#8
0
    def test_aspera_downloader(self):
        """Expect ``sra._download_file`` to return a truthy value for a FASTQ file.

        A Batch describing an SRA experiment is saved; the File pointing at
        the FASTQ URL and the DownloaderJob are only built in memory before
        being handed to the downloader.
        """
        sra_batch = Batch(survey_job=self.survey_job,
                          source_type="SRA",
                          pipeline_required="SALMON",
                          platform_accession_code="IlluminaHiSeq2000",
                          experiment_accession_code="DRX001563",
                          experiment_title="It doesn't really matter.",
                          organism_id=9031,
                          organism_name="GALLUS GALLUS",
                          release_date="2013-07-19",
                          last_uploaded_date="2017-09-11",
                          status=BatchStatuses.NEW.value)
        sra_batch.save()

        # Original author's note: this FTP URL is converted to use Aspera.
        fastq_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"  # noqa
        fastq_file = File(
            size_in_bytes=0,
            download_url=fastq_url,
            raw_format="fastq.gz",
            processed_format="tar.gz",
            name="ERR036000_1.fastq.gz",
            internal_location="IlluminaHiSeq2000/SALMON",
            batch=sra_batch)
        job = DownloaderJob()

        download_ok = sra._download_file(fastq_file, job, fastq_file.name)
        self.assertTrue(download_ok)
示例#9
0
    def test_download_file(self):
        """Run ``sra.download_sra`` for an ENA FASTQ file and check the download.

        Creates a DownloaderJob, an archive OriginalFile and a Sample,
        links them, runs the download, and asserts on the result flag, the
        SHA-1 of the first downloaded file and that the file exists on disk.
        """
        job = DownloaderJob(accession_code="ERR036")
        job.save()

        fastq_file = OriginalFile()
        fastq_file.source_filename = "ERR036000.fastq.gz"
        fastq_file.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
        fastq_file.is_archive = True
        fastq_file.save()

        run_sample = Sample()
        run_sample.accession_code = "ERR036000"
        run_sample.save()

        # Link the file to its sample first, then to the job.
        OriginalFileSampleAssociation(
            sample=run_sample, original_file=fastq_file).save()
        DownloaderJobOriginalFileAssociation(
            downloader_job=job, original_file=fastq_file).save()

        result, downloaded_files = sra.download_sra(job.pk)

        self.assertTrue(result)
        first_download = downloaded_files[0]
        self.assertEqual(first_download.sha1,
                         "1dfe5460a4101fe87feeffec0cb2e053f6695961")
        self.assertTrue(os.path.exists(first_download.absolute_file_path))
示例#10
0
    def test_download_file(self, mock_send_job):
        """Run ``sra.download_sra`` for an ENA FASTQ file.

        Creates a DownloaderJob, an archive OriginalFile and a Sample,
        links them, and runs the download. No assertions are made; the test
        only passes if the call does not raise.
        """
        mock_send_job.return_value = None

        job = DownloaderJob(accession_code="ERR036")
        job.save()

        fastq_file = OriginalFile()
        fastq_file.source_filename = "ERR036000.fastq.gz"
        fastq_file.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
        fastq_file.is_archive = True
        fastq_file.save()

        run_sample = Sample()
        run_sample.accession_code = 'ERR036000'
        run_sample.save()

        # Link the file to its sample first, then to the job.
        OriginalFileSampleAssociation(
            sample=run_sample, original_file=fastq_file).save()
        DownloaderJobOriginalFileAssociation(
            downloader_job=job, original_file=fastq_file).save()

        # NOTE(review): the return value is bound but never checked.
        success = sra.download_sra(job.pk)
示例#11
0
    def queue_downloader_job_for_original_files(
        self,
        original_files: List[OriginalFile],
        experiment_accession_code: str = None,
        is_transcriptome: bool = False,
    ):
        """Create a single DownloaderJob with multiple files to download.

        Args:
            original_files: The files the new job should download. For the
                non-transcriptome case the first file's first sample is used
                to pick the downloader task.
            experiment_accession_code: Stored on the new job as its
                accession code.
            is_transcriptome: When True the transcriptome-index downloader
                is used and the existing-job check is skipped.

        Returns:
            False when a DownloaderJob association already exists for any of
            the files' source URLs; None in every other case.
        """
        # Transcriptome is a special case because there's no sample_object.
        # It's alright to re-process transcriptome indices.
        if is_transcriptome:
            downloader_task = job_lookup.Downloaders.TRANSCRIPTOME_INDEX
        else:
            source_urls = [
                original_file.source_url for original_file in original_files
            ]
            # There is already a downloader job associated with this file.
            old_assocs_count = DownloaderJobOriginalFileAssociation.objects.filter(
                original_file__source_url__in=source_urls).count()
            if old_assocs_count > 0:
                logger.debug(
                    "We found an existing DownloaderJob for these urls.",
                    source_urls=source_urls)
                return False

            sample_object = original_files[0].samples.first()
            downloader_task = job_lookup.determine_downloader_task(
                sample_object)

        if downloader_task == job_lookup.Downloaders.NONE:
            logger.info(
                "No valid downloader task found for sample.",
                sample=sample_object.id,
                original_file=original_files[0].id,
            )
        else:
            downloader_job = DownloaderJob()
            downloader_job.downloader_task = downloader_task.value
            downloader_job.accession_code = experiment_accession_code
            downloader_job.save()

            downloaded_urls = []
            for original_file in original_files:
                DownloaderJobOriginalFileAssociation.objects.get_or_create(
                    downloader_job=downloader_job, original_file=original_file)

                downloaded_urls.append(original_file.source_url)

            try:
                logger.info(
                    "Queuing downloader job.",
                    survey_job=self.survey_job.id,
                    downloader_job=downloader_job.id,
                    downloaded_urls=downloaded_urls,
                )
                message_queue.send_job(downloader_task, downloader_job)
            except Exception:
                # If we fail to queue the job, it will be requeued, so the
                # failure is still swallowed. It is logged rather than
                # silently dropped, and KeyboardInterrupt/SystemExit are no
                # longer trapped here.
                logger.exception(
                    "Failed to queue downloader job; leaving it to be requeued.",
                    downloader_job=downloader_job.id,
                )
示例#12
0
    def queue_downloader_job_for_original_files(
            self,
            original_files: List[OriginalFile],
            experiment_accession_code: str = None,
            is_transcriptome: bool = False):
        """Create a single DownloaderJob with multiple files to download.

        Args:
            original_files: The files the new job should download. For the
                non-transcriptome case the first file's first sample is used
                to pick the downloader task.
            experiment_accession_code: Stored on the new job as its
                accession code.
            is_transcriptome: When True the transcriptome-index downloader
                is used instead of one derived from a sample.

        Returns:
            False when a DownloaderJob association already exists for any of
            the files' source URLs; None in every other case.
        """
        source_urls = [
            original_file.source_url for original_file in original_files
        ]
        # There is already a downloader job associated with this file.
        # exists() asks the database a yes/no question instead of loading
        # every matching association just to count them.
        already_queued = DownloaderJobOriginalFileAssociation.objects.filter(
            original_file__source_url__in=source_urls).exists()
        if already_queued:
            logger.debug("We found an existing DownloaderJob for these urls.",
                         source_urls=source_urls)
            return False

        # Transcriptome is a special case because there's no sample_object.
        if is_transcriptome:
            downloader_task = job_lookup.Downloaders.TRANSCRIPTOME_INDEX
        else:
            sample_object = original_files[0].samples.first()
            downloader_task = job_lookup.determine_downloader_task(
                sample_object)

        if downloader_task == job_lookup.Downloaders.NONE:
            logger.info("No valid downloader task found for sample.",
                        sample=sample_object.id,
                        original_file=original_files[0].id)
        else:
            downloader_job = DownloaderJob()
            downloader_job.downloader_task = downloader_task.value
            downloader_job.accession_code = experiment_accession_code
            downloader_job.save()

            downloaded_urls = []
            for original_file in original_files:
                DownloaderJobOriginalFileAssociation.objects.get_or_create(
                    downloader_job=downloader_job, original_file=original_file)

                downloaded_urls.append(original_file.source_url)

            try:
                logger.info("Queuing downloader job.",
                            survey_job=self.survey_job.id,
                            downloader_job=downloader_job.id,
                            downloaded_urls=downloaded_urls)
                message_queue.send_job(downloader_task, downloader_job)
            except Exception as e:
                # If the task doesn't get sent we don't want the
                # downloader_job to be left floating
                logger.exception("Failed to enqueue downloader job.",
                                 survey_job=self.survey_job.id,
                                 downloader_job=downloader_job.id,
                                 error=str(e))
                downloader_job.success = False
                downloader_job.failure_reason = str(e)
                downloader_job.save()
示例#13
0
def create_processor_job(pipeline="AFFY_TO_PCL",
                         ram_amount=2048,
                         start_time=None):
    """Create and save a ProcessorJob linked to two placeholder files.

    When ``pipeline`` is "AFFY_TO_PCL" an SRA DownloaderJob is also created,
    linked to the same two files and attached to the ProcessorJob; for any
    other pipeline the ProcessorJob's ``downloader_job`` is None.

    Args:
        pipeline: Stored as the job's ``pipeline_applied``.
        ram_amount: Stored as the job's ``ram_amount``.
        start_time: Stored as the job's ``start_time``.

    Returns:
        The saved ProcessorJob.
    """
    placeholder_files = []
    for _ in range(2):
        placeholder = OriginalFile()
        placeholder.source_filename = "doesn't matter"
        placeholder.filename = "this either"
        placeholder.absolute_file_path = "nor this"
        placeholder.save()
        placeholder_files.append(placeholder)
    first_file, second_file = placeholder_files

    downloader_job = None
    if pipeline == "AFFY_TO_PCL":
        downloader_job = DownloaderJob(
            downloader_task="SRA",
            batch_job_id="DEFAULT",
            num_retries=0,
            accession_code="NUNYA",
            success=None,
        )
        downloader_job.save()

        # The second file is associated with the downloader job first.
        for linked_file in (second_file, first_file):
            DownloaderJobOriginalFileAssociation(
                original_file=linked_file,
                downloader_job=downloader_job).save()

    processor_job = ProcessorJob(
        downloader_job=downloader_job,
        pipeline_applied=pipeline,
        batch_job_id="PROCESSOR/dispatch-1528945054-e8eaf540",
        ram_amount=ram_amount,
        num_retries=0,
        success=None,
        start_time=start_time,
    )
    processor_job.save()

    for linked_file in (first_file, second_file):
        ProcessorJobOriginalFileAssociation(
            original_file=linked_file, processor_job=processor_job).save()

    return processor_job
示例#14
0
    def test_jobs_sanity(self):
        """Just makes sure creating Jobs doesn't fail"""
        # One of each job type is saved; the test passes if none raises.
        SurveyJob().save()

        ProcessorJob(pipeline_applied="test0").save()

        DownloaderJob(downloader_task="XYZ", accession_code="123").save()
    def test_download_file(self, mock_send_job):
        """Run ``transcriptome_index.download_transcriptome`` for one GTF file.

        The file's URL comes from ``self.gtf_download_url``. No assertions
        are made; the test only passes if the call does not raise.
        """
        mock_send_job.return_value = None

        job = DownloaderJob()
        job.save()

        gtf_file = OriginalFile()
        gtf_file.source_filename = "Aegilops_tauschii.ASM34733v1.37.gtf.gz"
        gtf_file.source_url = self.gtf_download_url
        gtf_file.is_archive = True
        gtf_file.save()

        DownloaderJobOriginalFileAssociation(
            downloader_job=job, original_file=gtf_file).save()

        transcriptome_index.download_transcriptome(job.pk)
示例#16
0
    def test_download_file_unmated_reads(self):
        """Run ``sra.download_sra`` for a run that has two FASTQ files.

        Both files of run SRR1603661 are registered with their expected MD5
        and size, attached to one Sample and one DownloaderJob, and then
        downloaded. The test asserts on the result flag, the SHA-1 of the
        first downloaded file and that the file exists on disk.
        """
        dlj = DownloaderJob()
        dlj.accession_code = "SRR1603661"
        dlj.save()
        og_1 = OriginalFile()
        og_1.source_filename = "SRR1603661_1.fastq.gz"
        og_1.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR160/001/SRR1603661/SRR1603661_1.fastq.gz"
        og_1.expected_md5 = "502a9a482bfa5aa75865ccc0105ad13c"
        og_1.expected_size_in_bytes = 6751980628
        og_1.is_archive = True
        og_1.save()
        og_2 = OriginalFile()
        og_2.source_filename = "SRR1603661_2.fastq.gz"
        og_2.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR160/001/SRR1603661/SRR1603661_2.fastq.gz"
        # These describe the second file, so they are set on og_2. They were
        # previously written to the already-saved og_1, leaving og_2 without
        # an expected checksum or size.
        og_2.expected_md5 = "fffd24457418d255991f54ec82a39d57"
        og_2.expected_size_in_bytes = 6949912932
        og_2.is_archive = True
        og_2.save()
        sample = Sample()
        sample.accession_code = "SRR1603661"
        sample.save()
        assoc = OriginalFileSampleAssociation()
        assoc.sample = sample
        assoc.original_file = og_1
        assoc.save()
        assoc = DownloaderJobOriginalFileAssociation()
        assoc.downloader_job = dlj
        assoc.original_file = og_1
        assoc.save()
        assoc = OriginalFileSampleAssociation()
        assoc.sample = sample
        assoc.original_file = og_2
        assoc.save()
        assoc = DownloaderJobOriginalFileAssociation()
        assoc.downloader_job = dlj
        assoc.original_file = og_2
        assoc.save()
        result, downloaded_files = sra.download_sra(dlj.pk)
        # The job is closed out before any assertion runs.
        utils.end_downloader_job(dlj, result)

        self.assertTrue(result)
        self.assertEqual(downloaded_files[0].sha1,
                         "52bf22472069d04fa7767429f6ab78ebd10c0152")
        self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
示例#17
0
    def test_no_rnaseq(self):
        """Makes sure that no RNA-Seq data gets downloaded even if there's a job for it.

        A GEO family archive is linked to an RNA-SEQ sample and a
        DownloaderJob; ``geo.download_geo`` must return a falsy value and
        leave the job unsuccessful with the expected failure reason.
        """
        series_accession = 'GSE103217'
        archive_name = "GSE103217_family.xml.tgz"

        job = DownloaderJob(accession_code=series_accession)
        job.save()

        family_archive = OriginalFile()
        family_archive.source_filename = archive_name
        family_archive.filename = archive_name
        family_archive.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103217/miniml/GSE103217_family.xml.tgz"
        family_archive.save()

        DownloaderJobOriginalFileAssociation(
            original_file=family_archive, downloader_job=job).save()

        rnaseq_sample = Sample()
        rnaseq_sample.accession_code = series_accession
        rnaseq_sample.technology = "RNA-SEQ"
        rnaseq_sample.manufacturer = "ILLUMINA"
        rnaseq_sample.platform_accession_code = "Illumina HiSeq 2500"
        rnaseq_sample.save()

        OriginalFileSampleAssociation(
            sample=rnaseq_sample, original_file=family_archive).save()

        download_result = geo.download_geo(job.id)
        self.assertFalse(download_result)

        # Reload the job so the assertions see what was written to the DB.
        job.refresh_from_db()
        self.assertFalse(job.success)

        # It's not necessarily that we didn't extract any files, but
        # none that were usable so it looks like none.
        self.assertEqual(job.failure_reason,
                         "Failed to extract any downloaded files.")
示例#18
0
 def test_download_file_swapper(self):
     """Call ``sra._download_file`` directly with ``force_ftp=False``.

     Sets up a DownloaderJob, an archive OriginalFile and a Sample for run
     SRR9117853, links them, and expects the download to ``/tmp/doomed`` to
     return a truthy value.
     """
     run_accession = "SRR9117853"

     job = DownloaderJob(accession_code=run_accession)
     job.save()

     sra_file = OriginalFile()
     sra_file.source_filename = "SRR9117853.sra"
     sra_file.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/SRR/SRR9117/SRR9117853/SRR9117853.sra"
     sra_file.is_archive = True
     sra_file.save()

     run_sample = Sample()
     run_sample.accession_code = run_accession
     run_sample.save()

     # Link the file to its sample first, then to the job.
     OriginalFileSampleAssociation(
         sample=run_sample, original_file=sra_file).save()
     DownloaderJobOriginalFileAssociation(
         downloader_job=job, original_file=sra_file).save()

     download_ok = sra._download_file(
         sra_file.source_url, job, "/tmp/doomed", force_ftp=False)
     self.assertTrue(download_ok)
示例#19
0
def requeue_downloader_job(last_job: DownloaderJob) -> None:
    """Queues a new downloader job.

    The new downloader job will have num_retries one greater than
    last_job.num_retries, the same downloader task and accession code, and
    associations to the same original files.

    If dispatch succeeds, ``last_job`` is marked as retried and
    unsuccessful and pointed at the new job. If dispatch returns a falsy
    value or raises, the new job is deleted and ``last_job`` is left
    untouched.

    Args:
        last_job: The downloader job to retry.
    """
    num_retries = last_job.num_retries + 1

    new_job = DownloaderJob(num_retries=num_retries,
                            downloader_task=last_job.downloader_task,
                            accession_code=last_job.accession_code)
    new_job.save()

    for original_file in last_job.original_files.all():
        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=new_job, original_file=original_file)

    logger.debug(
        "Requeuing Downloader Job which had ID %d with a new Downloader Job with ID %d.",
        last_job.id, new_job.id)
    try:
        if send_job(Downloaders[last_job.downloader_task],
                    job=new_job,
                    is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with nomad just now, leave the job for a later loop.
            new_job.delete()
    except Exception:
        # Only ordinary errors are handled here, so KeyboardInterrupt and
        # SystemExit propagate. logger.exception keeps the traceback.
        logger.exception(
            "Failed to requeue Downloader Job which had ID %d with a new Downloader Job with ID %d.",
            last_job.id, new_job.id)
        # Can't communicate with nomad just now, leave the job for a later loop.
        new_job.delete()
示例#20
0
    def test_download_multiple_zips(self, mock_send_job):
        """Tests that each sample gets one processor job no matter what.

        https://github.com/AlexsLemonade/refinebio/pull/351 deals with
        a bug where every file that was extracted to a directory got a
        processor job queued for it each time a downloader job ran
        which pointed to that directory. This test makes sure this bug
        stays squashed.

        It does so by running two downloader jobs for the same
        experiment which use two different zip files. Before this bug
        was squashed this would have resulted in the first sample
        getting a second processor job queued for it because the
        second downloader job would have found the file in the
        directory.
        """
        # (zip URL, CEL filename inside it, sample accession code)
        fixtures = (
            ("ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.1.zip",
             "Waldhof_020604_R30_01-2753_U133A.CEL",
             'E-MEXP-433-Waldhof_020604_R30_01-2753_U133A'),
            ("ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.2.zip",
             "N08_U133A.CEL",
             'E-MEXP-433-N08_U133A'),
        )

        jobs = []
        for zip_url, cel_name, sample_accession in fixtures:
            job = DownloaderJob(accession_code='E-MEXP-433')
            job.save()
            jobs.append(job)

            cel_file = OriginalFile()
            cel_file.source_url = zip_url
            cel_file.source_filename = cel_name
            cel_file.save()

            DownloaderJobOriginalFileAssociation(
                original_file=cel_file, downloader_job=job).save()

            cel_sample = Sample()
            cel_sample.accession_code = sample_accession
            cel_sample.technology = "MICROARRAY"
            cel_sample.manufacturer = "AFFYMETRIX"
            cel_sample.has_raw = True
            # This is fake, but we don't currently support any agilent
            # platforms so we're using a platform that is supported.
            cel_sample.platform_accession_code = "hgu133a"
            cel_sample.save()

            OriginalFileSampleAssociation.objects.get_or_create(
                sample=cel_sample, original_file=cel_file)

        # Both jobs are created before either download runs.
        for job in jobs:
            array_express.download_array_express(job.id)

        self.assertEqual(ProcessorJob.objects.all().count(), 2)
示例#21
0
    def test_dharma(self):
        """Check when ``utils.start_job`` exits based on the per-node job limit.

        Three DownloaderJobs are saved with this instance's worker id; the
        first two also have a start time. Starting the third job with
        ``max_downloader_jobs_per_node=2`` must raise SystemExit, while
        starting it with a limit of 15 must not.
        """
        dlj1 = DownloaderJob()
        dlj1.accession_code = 'D1'
        dlj1.worker_id = get_instance_id()
        dlj1.start_time = datetime.datetime.now()
        dlj1.save()

        dlj2 = DownloaderJob()
        dlj2.accession_code = 'D2'
        dlj2.worker_id = get_instance_id()
        dlj2.start_time = datetime.datetime.now()
        dlj2.save()

        dlj3 = DownloaderJob()
        dlj3.accession_code = 'D3'
        dlj3.worker_id = get_instance_id()
        dlj3.save()

        original_file = OriginalFile()
        original_file.source_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.1.zip"
        original_file.source_filename = "Waldhof_020604_R30_01-2753_U133A.CEL"
        original_file.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.downloader_job = dlj3
        assoc.save()

        sample = Sample()
        sample.accession_code = 'Blahblahblah'
        sample.technology = "MICROARRAY"
        sample.manufacturer = "AFFYMETRIX"
        sample.has_raw = True
        sample.platform_accession_code = "hgu133a"
        sample.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample, original_file=original_file)

        # With a limit of 2 the job is supposed to exit.
        with self.assertRaises(SystemExit):
            utils.start_job(dlj3.id,
                            max_downloader_jobs_per_node=2,
                            force_harakiri=True)

        # With a limit of 15 the job must not exit. Any other exception is
        # tolerated, as it was before.
        try:
            utils.start_job(dlj3.id,
                            max_downloader_jobs_per_node=15,
                            force_harakiri=True)
        except SystemExit:
            self.fail("start_job exited although the job limit was 15.")
        except Exception:
            pass
示例#22
0
    def test_download_geo(self, mock_send_task):
        """Tests the main 'download_geo' function.

        A gzipped supplementary file is linked to an Agilent microarray
        sample and a DownloaderJob. After the download the sample must have
        two file associations (the archive, which is no longer marked as
        downloaded, and one downloaded file), the job must have no failure
        reason, and a ProcessorJob for the Agilent two-colour pipeline must
        exist.
        """
        dlj = DownloaderJob()
        dlj.accession_code = 'GSE22427'
        dlj.save()

        original_file = OriginalFile()
        original_file.filename = "GSE22427_non-normalized.txt.gz"
        original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz"
        original_file.source_filename = "GSE22427_non-normalized.txt.gz"
        original_file.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.downloader_job = dlj
        assoc.save()

        sample = Sample()
        sample.accession_code = 'GSE22427'
        sample.technology = "MICROARRAY"
        sample.manufacturer = "AGILENT"
        sample.has_raw = True
        # This is fake, but we don't currently support any agilent
        # platforms so we're using a platform that is supported.
        sample.platform_accession_code = "Illumina_RatRef-12_V1.0"
        sample.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.sample = sample
        sample_annotation.data = {
            'label_protocol_ch1': 'Agilent',
            'label_protocol_ch2': 'Agilent'
        }
        sample_annotation.save()

        og_assoc = OriginalFileSampleAssociation()
        og_assoc.sample = sample
        og_assoc.original_file = original_file
        og_assoc.save()

        download_result = geo.download_geo(dlj.id)

        file_assocs = OriginalFileSampleAssociation.objects.filter(
            sample=sample)
        self.assertEqual(file_assocs.count(), 2)

        for file_assoc in file_assocs:
            original_file = file_assoc.original_file
            if original_file.filename.endswith(".gz"):
                # We delete the archive after we extract from it
                self.assertFalse(original_file.is_downloaded)
            else:
                self.assertTrue(original_file.is_downloaded)

        # Make sure it worked
        self.assertTrue(download_result)
        # Reload the job first: without this the check runs against the
        # stale in-memory object and can never fail.
        dlj.refresh_from_db()
        self.assertIsNone(dlj.failure_reason)
        self.assertTrue(ProcessorJob.objects.all().exists())
        processor_job = ProcessorJob.objects.all()[0]
        self.assertEqual(processor_job.pipeline_applied,
                         "AGILENT_TWOCOLOR_TO_PCL")
        self.assertEqual(processor_job.ram_amount, 2048)
示例#23
0
    def test_download_and_extract_file(self):
        """Download three GEO archives and check what extraction leaves on disk.

        Covers one ``*_family.xml.tgz`` archive (through ``geo._extract_tgz``)
        and two ``.txt.gz`` archives (through ``geo._extract_gz``).
        """
        # _download_file takes a DownloaderJob argument; a saved, otherwise
        # empty job is all this test hands it.
        job = DownloaderJob()
        job.save()

        # --- *_family.xml.tgz ---
        family_tgz = '/home/user/data_store/GSE10241/raw/GSE10241_family.xml.tgz'
        geo._download_file(
            'ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE10nnn/GSE10241/miniml/GSE10241_family.xml.tgz',
            family_tgz, job)
        extracted = geo._extract_tgz(family_tgz, 'GSE10241')
        self.assertEqual(8, len(extracted))

        expected_family_paths = (
            # GPL file
            '/home/user/data_store/GSE10241/raw/GPL6102-tbl-1.txt',
            # GSM files
            '/home/user/data_store/GSE10241/raw/GSM258515-tbl-1.txt',
            '/home/user/data_store/GSE10241/raw/GSM258516-tbl-1.txt',
            '/home/user/data_store/GSE10241/raw/GSM258530-tbl-1.txt',
            '/home/user/data_store/GSE10241/raw/GSM258517-tbl-1.txt',
            # Original family file, in each of its on-disk forms
            '/home/user/data_store/GSE10241/raw/GSE10241_family.xml',
            '/home/user/data_store/GSE10241/raw/GSE10241_family.xml.tgz',
            '/home/user/data_store/GSE10241/raw/GSE10241_family.xml.tar',
        )
        for expected_path in expected_family_paths:
            self.assertTrue(os.path.isfile(expected_path))

        # --- per-sample .txt.gz ---
        sample_gz = '/home/user/data_store/GSM254828/raw/GSM254828.txt.gz'
        geo._download_file(
            'ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM254nnn/GSM254828/suppl/GSM254828.txt.gz',
            sample_gz, job)
        extracted = geo._extract_gz(sample_gz, 'GSM254828')
        self.assertEqual(1, len(extracted))
        self.assertTrue(
            os.path.isfile('/home/user/data_store/GSM254828/raw/GSM254828.txt'))

        # --- series-level .txt.gz ---
        series_gz = '/home/user/data_store/GSE22427/raw/GSE22427_non-normalized.txt.gz'
        geo._download_file(
            "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz",
            series_gz, job)
        extracted = geo._extract_gz(series_gz, 'GSE22427')
        self.assertEqual(1, len(extracted))
        self.assertTrue(
            os.path.isfile(
                '/home/user/data_store/GSE22427/raw/GSE22427_non-normalized.txt'))
    def test_organism_shepherd_command(self, mock_nomad, mock_send_job,
                                       mock_get_active_volumes):
        """Tests that the organism shepherd requeues jobs in the right order.

        The situation we're setting up is basically this:
          * There are two experiments.
          * One of them has 1/2 samples processed, the other 0/1
          * One of them needs a DownloaderJob requeued and the other
            needs a ProcessorJob requeued.

        And what we're going to test for is:
          * Both of the jobs that need to be requeued are requeued.
          * The experiment with a processed sample is requeued first
            because it has a higher completion percentage.
        """
        # First, set up our mocks to prevent network calls.
        mock_send_job.return_value = True
        active_volumes = {"1", "2", "3"}
        mock_get_active_volumes.return_value = active_volumes

        # Replacement used as mock_nomad's side effect: it returns a mock
        # client whose jobs.get_jobs() always yields an empty list.
        def mock_init_nomad(host, port=0, timeout=0):
            ret_value = MagicMock()
            ret_value.jobs = MagicMock()
            ret_value.jobs.get_jobs = MagicMock()
            ret_value.jobs.get_jobs.side_effect = lambda: []
            return ret_value

        mock_nomad.side_effect = mock_init_nomad
        zebrafish = Organism(name="DANIO_RERIO",
                             taxonomy_id=1337,
                             is_scientific_name=True)
        zebrafish.save()

        # Experiment that is 0% complete.
        zero_percent_experiment = Experiment(accession_code='ERP037000')
        zero_percent_experiment.technology = 'RNA-SEQ'
        zero_percent_experiment.save()

        # The returned association is not used again; only its creation matters.
        organism_assoc = ExperimentOrganismAssociation.objects.create(
            organism=zebrafish, experiment=zero_percent_experiment)

        zero_percent = OriginalFile()
        zero_percent.filename = "ERR037001.fastq.gz"
        zero_percent.source_filename = "ERR037001.fastq.gz"
        zero_percent.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR037/ERR037001/ERR037001_1.fastq.gz"
        zero_percent.is_archive = True
        zero_percent.save()

        zero_percent_sample = Sample()
        zero_percent_sample.accession_code = 'ERR037001'
        zero_percent_sample.organism = zebrafish
        zero_percent_sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.sample = zero_percent_sample
        assoc.original_file = zero_percent
        assoc.save()

        assoc = ExperimentSampleAssociation()
        assoc.sample = zero_percent_sample
        assoc.experiment = zero_percent_experiment
        assoc.save()

        # TODO: fix names of all the variables to be appropriate for this test case.
        # A finished, failed DownloaderJob for the 0% experiment's only file.
        zero_percent_dl_job = DownloaderJob()
        zero_percent_dl_job.accession_code = zero_percent_sample.accession_code
        zero_percent_dl_job.downloader_task = "SRA"
        zero_percent_dl_job.start_time = timezone.now()
        zero_percent_dl_job.end_time = timezone.now()
        zero_percent_dl_job.success = False
        zero_percent_dl_job.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.downloader_job = zero_percent_dl_job
        assoc.original_file = zero_percent
        assoc.save()

        # Experiment that is 50% complete.
        # NOTE: "fify" in this local name is a typo for "fifty".
        fify_percent_experiment = Experiment(accession_code='ERP036000')
        fify_percent_experiment.technology = 'RNA-SEQ'
        fify_percent_experiment.save()

        organism_assoc = ExperimentOrganismAssociation.objects.create(
            organism=zebrafish, experiment=fify_percent_experiment)

        ## First sample, this one has been processed.
        successful_pj = ProcessorJob()
        successful_pj.accession_code = "ERR036000"
        successful_pj.pipeline_applied = "SALMON"
        successful_pj.ram_amount = 12288
        successful_pj.start_time = timezone.now()
        successful_pj.end_time = timezone.now()
        successful_pj.success = True
        successful_pj.save()

        successful_og = OriginalFile()
        successful_og.filename = "ERR036000.fastq.gz"
        successful_og.source_filename = "ERR036000.fastq.gz"
        successful_og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
        successful_og.is_archive = True
        successful_og.save()

        successful_sample = Sample()
        successful_sample.accession_code = 'ERR036000'
        successful_sample.organism = zebrafish
        successful_sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.sample = successful_sample
        assoc.original_file = successful_og
        assoc.save()

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.processor_job = successful_pj
        assoc.original_file = successful_og
        assoc.save()

        assoc = ExperimentSampleAssociation()
        assoc.sample = successful_sample
        assoc.experiment = fify_percent_experiment
        assoc.save()

        ## Second sample, this one hasn't been processed.
        fifty_percent_unprocessed_og = OriginalFile()
        fifty_percent_unprocessed_og.filename = "ERR036001.fastq.gz"
        fifty_percent_unprocessed_og.source_filename = "ERR036001.fastq.gz"
        fifty_percent_unprocessed_og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036001/ERR036001_1.fastq.gz"
        fifty_percent_unprocessed_og.is_archive = True
        fifty_percent_unprocessed_og.save()

        fifty_percent_unprocessed_sample = Sample()
        fifty_percent_unprocessed_sample.accession_code = 'ERR036001'
        fifty_percent_unprocessed_sample.organism = zebrafish
        fifty_percent_unprocessed_sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.sample = fifty_percent_unprocessed_sample
        assoc.original_file = fifty_percent_unprocessed_og
        assoc.save()

        assoc = ExperimentSampleAssociation()
        assoc.sample = fifty_percent_unprocessed_sample
        assoc.experiment = fify_percent_experiment
        assoc.save()

        # A finished, failed ProcessorJob for the unprocessed sample's file.
        fifty_percent_processor_job = ProcessorJob()
        fifty_percent_processor_job.pipeline_applied = "SALMON"
        fifty_percent_processor_job.accession_code = fifty_percent_unprocessed_sample.accession_code
        fifty_percent_processor_job.ram_amount = 12288
        fifty_percent_processor_job.start_time = timezone.now()
        fifty_percent_processor_job.end_time = timezone.now()
        fifty_percent_processor_job.success = False
        fifty_percent_processor_job.save()

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.processor_job = fifty_percent_processor_job
        assoc.original_file = fifty_percent_unprocessed_og
        assoc.save()

        # Setup is done, actually run the command.
        args = []
        options = {"organism_name": "DANIO_RERIO"}
        call_command("organism_shepherd", *args, **options)

        # Verify that the jobs were called in the correct order.
        # Each mock_calls entry unpacks as (name, args, kwargs): index [1][0]
        # is the first positional argument (the job type) and [2]["job"] is
        # the job object passed by keyword.
        mock_calls = mock_send_job.mock_calls

        first_call_job_type = mock_calls[0][1][0]
        first_call_job_object = mock_calls[0][2]["job"]
        self.assertEqual(first_call_job_type, ProcessorPipeline.SALMON)
        self.assertEqual(first_call_job_object.pipeline_applied,
                         fifty_percent_processor_job.pipeline_applied)
        self.assertEqual(first_call_job_object.ram_amount,
                         fifty_percent_processor_job.ram_amount)
        self.assertIn(first_call_job_object.volume_index, active_volumes)

        # Reload from the database so retried_job reflects what the command wrote.
        fifty_percent_processor_job.refresh_from_db()
        self.assertEqual(first_call_job_object,
                         fifty_percent_processor_job.retried_job)

        second_call_job_type = mock_calls[1][1][0]
        second_call_job_object = mock_calls[1][2]["job"]
        self.assertEqual(second_call_job_type, Downloaders.SRA)
        self.assertEqual(second_call_job_object.accession_code,
                         zero_percent_dl_job.accession_code)
        self.assertEqual(second_call_job_object.downloader_task,
                         zero_percent_dl_job.downloader_task)

        zero_percent_dl_job.refresh_from_db()
        self.assertEqual(second_call_job_object,
                         zero_percent_dl_job.retried_job)
示例#25
0
    def test_download_aspera_and_ftp(self):
        """Tests GEO file downloads over Aspera and FTP, plus an Aspera failure.

        Calls ``geo._download_file`` once with ``force_ftp=False`` and once
        with ``force_ftp=True``, checking each time that the file lands at
        the requested path. Then calls ``geo._download_file_aspera`` with a
        URL for which a falsy result is expected and checks that a failure
        reason has been recorded on the job.
        """

        dlj = DownloaderJob()
        dlj.accession_code = 'GSE22427'
        dlj.save()

        original_file = OriginalFile()
        original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz"
        original_file.source_filename = "GSE22427_non-normalized.txt.gz"
        original_file.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.downloader_job = dlj
        assoc.save()

        sample = Sample()
        sample.accession_code = 'GSE22427'
        sample.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.sample = sample
        sample_annotation.data = {
            'label_protocol_ch1': 'Agilent',
            'label_protocol_ch2': 'Agilent'
        }
        sample_annotation.save()

        og_assoc = OriginalFileSampleAssociation()
        og_assoc.sample = sample
        og_assoc.original_file = original_file
        og_assoc.save()

        # Download into <root>/<accession>/<last URL path segment>.
        LOCAL_ROOT_DIR = "/home/user/data_store"
        download_dir = os.path.join(LOCAL_ROOT_DIR, sample.accession_code)
        os.makedirs(download_dir, exist_ok=True)
        dl_file_path = os.path.join(
            download_dir, original_file.source_url.split('/')[-1])

        # Aspera
        result = geo._download_file(original_file.source_url,
                                    file_path=dl_file_path,
                                    job=dlj,
                                    force_ftp=False)
        self.assertTrue(result)
        self.assertTrue(os.path.exists(dl_file_path))
        # Remove the file so the next download is checked from a clean slate.
        os.remove(dl_file_path)

        # FTP
        result = geo._download_file(original_file.source_url,
                                    file_path=dl_file_path,
                                    job=dlj,
                                    force_ftp=True)
        self.assertTrue(result)
        self.assertTrue(os.path.exists(dl_file_path))
        os.remove(dl_file_path)

        # Aspera, fail
        result = geo._download_file_aspera("https://rich.zone/cool_horse.jpg",
                                           target_file_path=dl_file_path,
                                           downloader_job=dlj,
                                           attempt=5)
        self.assertFalse(result)
        # assertIsNotNone tests identity against None and reports the
        # offending value on failure, unlike assertTrue(x != None).
        self.assertIsNotNone(dlj.failure_reason)
示例#26
0
    def test_no_repeat_jobs(self):
        """Check that no second DownloaderJob is created for already-covered files.

        Two original files of a single sample are attached to an existing
        DownloaderJob before the surveyor is asked to queue jobs for them;
        afterwards exactly one DownloaderJob must exist.
        """
        # One experiment and one sample; the sample gets two original files.
        experiment = Experiment()
        experiment.accession_code = "Experiment1"
        experiment.save()

        sample = Sample()
        sample.accession_code = "Sample1"
        sample.platform_accession_code = "Illumina Genome Analyzer"
        sample.platform_accession_name = "Illumina Genome Analyzer"
        sample.technology = "RNA-SEQ"
        sample.manufacturer = "ILLUMINA"
        sample.source_database = "SRA"
        sample.save()

        # Create each file and immediately link it to the sample.
        file_specs = (
            ("first_url", "first_filename"),
            ("second_url", "second_filename"),
        )
        og_files = []
        for url, filename in file_specs:
            og_file = OriginalFile()
            og_file.source_url = url
            og_file.source_filename = filename
            og_file.is_downloaded = False
            og_file.has_raw = True
            og_file.save()

            sample_link = OriginalFileSampleAssociation()
            sample_link.original_file = og_file
            sample_link.sample = sample
            sample_link.save()

            og_files.append(og_file)

        # The pre-existing job that already covers both files.
        existing_job = DownloaderJob()
        existing_job.save()
        for og_file in og_files:
            DownloaderJobOriginalFileAssociation(
                downloader_job=existing_job, original_file=og_file
            ).save()

        survey_job = SurveyJob(source_type="SRA")
        survey_job.save()

        SraSurveyor(survey_job).queue_downloader_job_for_original_files(
            og_files, experiment.accession_code
        )

        # Only the job created above may exist: the surveyor had nothing
        # new to queue for these files.
        self.assertEqual(1, DownloaderJob.objects.all().count())
示例#27
0
    def queue_downloader_jobs(self, experiment: Experiment,
                              samples: List[Sample]):
        """This enqueues DownloaderJobs on a per-file basis.

        There is a complementary function below for enqueueing multi-file
        DownloaderJobs.

        For every not-yet-downloaded original file of ``samples``, at most
        one DownloaderJob is created per distinct ``source_url``. Files
        sharing a URL with a job created during this call are associated
        with that job; files whose URL already has any DownloaderJob
        association in the database are skipped.

        Args:
            experiment: Its ``accession_code`` is copied onto each new
                DownloaderJob.
            samples: Samples whose undownloaded original files are queued.
        """
        files_to_download = []
        for sample in samples:
            files_to_download.extend(
                OriginalFile.objects.filter(sample=sample,
                                            is_downloaded=False))

        # Maps source_url -> the DownloaderJob created for it in this call.
        download_urls_with_jobs = {}
        for original_file in files_to_download:

            # We don't need to create multiple downloaders for the same file.
            # However, we do want to associate original_files with the
            # DownloaderJobs that will download them.
            if original_file.source_url in download_urls_with_jobs:
                DownloaderJobOriginalFileAssociation.objects.get_or_create(
                    downloader_job=download_urls_with_jobs[
                        original_file.source_url],
                    original_file=original_file,
                )
                continue

            # There is already a downloader job associated with this file.
            old_assocs_count = DownloaderJobOriginalFileAssociation.objects.filter(
                original_file__source_url=original_file.source_url).count()
            if old_assocs_count > 0:
                logger.debug(
                    "We found an existing DownloaderJob for this file/url.",
                    original_file_id=original_file.id,
                )
                continue

            sample_object = original_file.samples.first()
            downloader_task = determine_downloader_task(sample_object)

            if downloader_task == Downloaders.NONE:
                logger.info(
                    "No valid downloader task found for sample.",
                    sample=sample_object.id,
                    original_file=original_file.id,
                )
                continue

            downloader_job = DownloaderJob()
            downloader_job.downloader_task = downloader_task.value
            downloader_job.accession_code = experiment.accession_code
            downloader_job.save()

            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=downloader_job, original_file=original_file)

            # Remember the job before dispatching so later files with the
            # same URL attach to it even if dispatch fails below.
            download_urls_with_jobs[original_file.source_url] = downloader_job

            try:
                logger.info(
                    "Queuing downloader job for URL: " +
                    original_file.source_url,
                    survey_job=self.survey_job.id,
                    downloader_job=downloader_job.id,
                )
                send_job(downloader_task, downloader_job)
            except Exception as e:
                # Queuing stays best-effort: if we fail to queue the job, it
                # will be requeued. Record the reason instead of dropping it
                # silently so the failure is visible in the logs.
                logger.info(
                    "Failed to queue downloader job; it will be requeued.",
                    downloader_job=downloader_job.id,
                    error=str(e),
                )
示例#28
0
    def test_download_and_extract_file(self):
        """Download three GEO archives and check what ``ArchivedFile`` extracts.

        Covers one ``*_family.xml.tgz`` archive and two ``.txt.gz`` archives,
        each read through ``geo.ArchivedFile(...).get_files()``.
        """
        # _download_file takes a DownloaderJob argument; a saved, otherwise
        # empty job is all this test hands it.
        job = DownloaderJob()
        job.save()

        # --- *_family.xml.tgz ---
        family_tgz = "/home/user/data_store/GSE10241/raw/GSE10241_family.xml.tgz"
        geo._download_file(
            "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE10nnn/GSE10241/miniml/GSE10241_family.xml.tgz",
            family_tgz,
            job,
        )
        extracted = list(geo.ArchivedFile(family_tgz).get_files())

        # The directory ends up holding 8 files (2 downloaded, 6 extracted),
        # but get_files() only yields the ones extracted from the archives
        # rather than enumerating the whole directory.
        self.assertEqual(6, len(extracted))

        expected_family_paths = (
            # GPL file
            "/home/user/data_store/GSE10241/raw/GPL6102-tbl-1.txt",
            # GSM files
            "/home/user/data_store/GSE10241/raw/GSM258515-tbl-1.txt",
            "/home/user/data_store/GSE10241/raw/GSM258516-tbl-1.txt",
            "/home/user/data_store/GSE10241/raw/GSM258530-tbl-1.txt",
            "/home/user/data_store/GSE10241/raw/GSM258517-tbl-1.txt",
            # Original family file, in each of its on-disk forms
            "/home/user/data_store/GSE10241/raw/GSE10241_family.xml",
            "/home/user/data_store/GSE10241/raw/GSE10241_family.xml.tgz",
            "/home/user/data_store/GSE10241/raw/GSE10241_family.xml.tar",
        )
        for expected_path in expected_family_paths:
            self.assertTrue(os.path.isfile(expected_path))

        # --- per-sample .txt.gz ---
        sample_gz = "/home/user/data_store/GSM254828/raw/GSM254828.txt.gz"
        geo._download_file(
            "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM254nnn/GSM254828/suppl/GSM254828.txt.gz",
            sample_gz,
            job,
        )
        extracted = list(geo.ArchivedFile(sample_gz).get_files())
        self.assertEqual(1, len(extracted))
        self.assertTrue(
            os.path.isfile("/home/user/data_store/GSM254828/raw/GSM254828.txt"))

        # --- series-level .txt.gz ---
        series_gz = "/home/user/data_store/GSE22427/raw/GSE22427_non-normalized.txt.gz"
        geo._download_file(
            "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz",
            series_gz,
            job,
        )
        extracted = list(geo.ArchivedFile(series_gz).get_files())
        self.assertEqual(1, len(extracted))
        self.assertTrue(
            os.path.isfile(
                "/home/user/data_store/GSE22427/raw/GSE22427_non-normalized.txt"))
示例#29
0
    def setUp(self):
        """Create the database fixtures used by the tests in this class.

        Builds two experiments (the second is kept as ``self.experiment``),
        29 organisms, two samples (the second is kept as ``self.sample``),
        and for that second sample an annotation, an original file with one
        downloader job and one processor job, two computational results and
        a link to the second experiment. Also creates a QN-target result for
        the samples' organism and a processor-backed organism index for
        ``self.danio_rerio``.
        """
        # Saving this for if we have protected endpoints
        # self.superuser = User.objects.create_superuser('john', '*****@*****.**', 'johnpassword')
        # self.client.login(username='******', password='******')
        # self.user = User.objects.create(username="******")

        # First experiment: saved but not referenced again in this method.
        experiment = Experiment()
        experiment.accession_code = "GSE000"
        experiment.alternate_accession_code = "E-GEOD-000"
        experiment.title = "NONONONO"
        experiment.description = "Boooooourns. Wasabi."
        experiment.technology = "RNA-SEQ"
        experiment.save()

        # Second experiment: the one the rest of the fixture hangs off.
        experiment = Experiment()
        experiment.accession_code = "GSE123"
        experiment.title = "Hey Ho Let's Go"
        experiment.description = (
            "This is a very exciting test experiment. Faygo soda. Blah blah blah."
        )
        experiment.technology = "MICROARRAY"
        experiment.save()
        self.experiment = experiment

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = {"hello": "world", "123": 456}
        experiment_annotation.experiment = experiment
        experiment_annotation.save()

        # Create 26 test organisms numbered 0-25 for pagination test, so there should be 29 organisms total (with the 3 others below)
        for i in range(26):
            Organism(name=("TEST_ORGANISM_{}".format(i)),
                     taxonomy_id=(1234 + i)).save()

        ailuropoda = Organism(name="AILUROPODA_MELANOLEUCA",
                              taxonomy_id=9646,
                              is_scientific_name=True)
        ailuropoda.save()
        self.homo_sapiens = Organism(name="HOMO_SAPIENS",
                                     taxonomy_id=9606,
                                     is_scientific_name=True)
        self.homo_sapiens.save()
        self.danio_rerio = Organism(name="DANIO_RERIO",
                                    taxonomy_id=1337,
                                    is_scientific_name=True)
        self.danio_rerio.save()

        # First sample: saved but not linked to anything else below.
        sample = Sample()
        sample.title = "123"
        sample.accession_code = "123"
        sample.is_processed = True
        sample.organism = ailuropoda
        sample.save()

        # Second sample: every association created below points at this one.
        sample = Sample()
        sample.title = "789"
        sample.accession_code = "789"
        sample.is_processed = True
        sample.organism = ailuropoda
        sample.save()
        self.sample = sample

        # add qn target for sample organism
        result = ComputationalResult()
        result.commands.append("create_qn_target.py")
        result.is_ccdl = True
        result.is_public = True
        result.processor = None
        result.save()

        cra = ComputationalResultAnnotation()
        cra.result = result
        cra.data = {"organism_id": ailuropoda.id, "is_qn": True}
        cra.save()

        ailuropoda.qn_target = result
        ailuropoda.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.data = {"goodbye": "world", "789": 123}
        sample_annotation.sample = sample
        sample_annotation.save()

        # One blank original file, tied to the sample and to one job of each kind.
        original_file = OriginalFile()
        original_file.save()

        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.sample = sample
        original_file_sample_association.original_file = original_file
        original_file_sample_association.save()

        downloader_job = DownloaderJob()
        downloader_job.save()

        download_assoc = DownloaderJobOriginalFileAssociation()
        download_assoc.original_file = original_file
        download_assoc.downloader_job = downloader_job
        download_assoc.save()

        processor_job = ProcessorJob()
        processor_job.save()

        processor_assoc = ProcessorJobOriginalFileAssociation()
        processor_assoc.original_file = original_file
        processor_assoc.processor_job = processor_job
        processor_assoc.save()

        experiment_sample_association = ExperimentSampleAssociation()
        experiment_sample_association.sample = sample
        experiment_sample_association.experiment = experiment
        experiment_sample_association.save()
        # The sample counts are set by hand here to match the single linked sample.
        experiment.num_total_samples = 1
        experiment.num_processed_samples = 1
        experiment.save()

        # Two blank computational results, each associated with the sample.
        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        processor = Processor()
        processor.name = "Salmon Quant"
        processor.version = "v9.9.9"
        processor.docker_image = "dr_salmon"
        processor.environment = '{"some": "environment"}'
        processor.save()

        # A result produced by that processor, used as the organism index's result.
        computational_result_short = ComputationalResult(processor=processor)
        computational_result_short.save()

        organism_index = OrganismIndex()
        organism_index.index_type = "TRANSCRIPTOME_SHORT"
        organism_index.organism = self.danio_rerio
        organism_index.result = computational_result_short
        organism_index.absolute_directory_path = (
            "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT")
        organism_index.is_public = True
        organism_index.s3_url = "not_blank"
        organism_index.save()

        return
示例#30
0
    def test_create_missing_jobs(self):
        """Verify the command adds a downloader job only for the file lacking one."""

        # --- Fixture 1: a sample whose original file already has a downloader job ---
        covered_file = OriginalFile()
        covered_file.filename = "processed.CEL"
        covered_file.source_filename = "processed.CEL"
        covered_file.is_downloaded = True
        covered_file.is_archive = False
        covered_file.save()

        covered_sample = Sample()
        covered_sample.accession_code = "MA_doesnt_need_processor"
        covered_sample.technology = "MICROARRAY"
        covered_sample.source_database = "GEO"
        covered_sample.platform_accession_code = "bovine"
        covered_sample.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=covered_sample, original_file=covered_file)

        existing_job = DownloaderJob()
        existing_job.success = True
        existing_job.worker_id = "worker_1"
        existing_job.volume_index = "1"
        existing_job.save()

        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=existing_job, original_file=covered_file)

        # --- Fixture 2: a sample whose original file has no downloader job ---
        orphan_file = OriginalFile()
        orphan_file.filename = "tarball.gz"
        orphan_file.source_filename = "tarball.gz"
        orphan_file.is_downloaded = True
        orphan_file.is_archive = True
        orphan_file.save()

        orphan_sample = Sample()
        orphan_sample.accession_code = "sample_no_downloader"
        orphan_sample.technology = "MICROARRAY"
        orphan_sample.source_database = "GEO"
        # The platform must be a supported one.
        orphan_sample.platform_accession_code = "bovine"
        orphan_sample.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=orphan_sample, original_file=orphan_file)

        # --- Run the command under test ---
        Command().handle()

        # The file that had no downloader job now has exactly one.
        orphan_links = DownloaderJobOriginalFileAssociation.objects.filter(
            original_file=orphan_file)
        self.assertEqual(1, orphan_links.count())

        # The file that already had a job (created in the fixture above)
        # did not gain another: the count is still 1.
        covered_links = DownloaderJobOriginalFileAssociation.objects.filter(
            original_file=covered_file)
        self.assertEqual(1, covered_links.count())