def test_download_file_ncbi(self, mock_send_job):
    """Downloads DRR002116.sra through the SRA downloader and checks the result."""
    mock_send_job.return_value = None

    downloader_job = DownloaderJob(accession_code="DRR002116")
    downloader_job.save()

    original_file = OriginalFile(
        source_filename="DRR002116.sra",
        source_url="[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra",
        is_archive=True,
    )
    original_file.save()

    sample = Sample(accession_code='DRR002116')
    sample.save()

    OriginalFileSampleAssociation(sample=sample, original_file=original_file).save()
    DownloaderJobOriginalFileAssociation(
        downloader_job=downloader_job, original_file=original_file
    ).save()

    result, downloaded_files = sra.download_sra(downloader_job.pk)
    utils.end_downloader_job(downloader_job, result)

    self.assertTrue(result)
    self.assertEqual(downloaded_files[0].sha1, 'd5374e7fe047d4f76b165c3f5148ab2df9d42cea')
    self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
def test_download_file(self, mock_send_job):
    """Kicks off an SRA download job for ERR036000 over FTP."""
    mock_send_job.return_value = None

    downloader_job = DownloaderJob(accession_code="ERR036")
    downloader_job.save()

    original_file = OriginalFile(
        source_filename="ERR036000.fastq.gz",
        source_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz",
        is_archive=True,
    )
    original_file.save()

    sample = Sample(accession_code='ERR036000')
    sample.save()

    OriginalFileSampleAssociation(sample=sample, original_file=original_file).save()
    DownloaderJobOriginalFileAssociation(
        downloader_job=downloader_job, original_file=original_file
    ).save()

    # NOTE(review): the result is never asserted on — presumably this test
    # only verifies that download_sra runs without raising; confirm intent.
    success = sra.download_sra(downloader_job.pk)
def test_download_file_ncbi(self):
    """Downloads SRR9117853.sra from NCBI and verifies the downloaded file."""
    downloader_job = DownloaderJob(accession_code="SRR9117853")
    downloader_job.save()

    original_file = OriginalFile(
        source_filename="SRR9117853.sra",
        source_url="[email protected]:/sra/sra-instant/reads/ByRun/sra/SRR/SRR9117/SRR9117853/SRR9117853.sra",
        is_archive=True,
    )
    original_file.save()

    sample = Sample(accession_code="SRR9117853")
    sample.save()

    OriginalFileSampleAssociation(sample=sample, original_file=original_file).save()
    DownloaderJobOriginalFileAssociation(
        downloader_job=downloader_job, original_file=original_file
    ).save()

    result, downloaded_files = sra.download_sra(downloader_job.pk)
    utils.end_downloader_job(downloader_job, result)

    self.assertTrue(result)
    self.assertEqual(downloaded_files[0].sha1, "e7ad484fe6f134ba7d1b2664e58cc15ae5a958cc")
    self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
def test_download_file(self):
    """Downloads ERR036000_1.fastq.gz over FTP and verifies the result."""
    downloader_job = DownloaderJob(accession_code="ERR036")
    downloader_job.save()

    original_file = OriginalFile(
        source_filename="ERR036000.fastq.gz",
        source_url="ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz",
        is_archive=True,
    )
    original_file.save()

    sample = Sample(accession_code="ERR036000")
    sample.save()

    OriginalFileSampleAssociation(sample=sample, original_file=original_file).save()
    DownloaderJobOriginalFileAssociation(
        downloader_job=downloader_job, original_file=original_file
    ).save()

    result, downloaded_files = sra.download_sra(downloader_job.pk)

    self.assertTrue(result)
    self.assertEqual(downloaded_files[0].sha1, "1dfe5460a4101fe87feeffec0cb2e053f6695961")
    self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
def test_download_file_swapper(self, mock_send_job):
    """Exercises sra._download_file directly with force_ftp=False."""
    mock_send_job.return_value = None

    downloader_job = DownloaderJob(accession_code="DRR002116")
    downloader_job.save()

    original_file = OriginalFile(
        source_filename="DRR002116.sra",
        source_url="[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra",
        is_archive=True,
    )
    original_file.save()

    sample = Sample(accession_code='DRR002116')
    sample.save()

    OriginalFileSampleAssociation(sample=sample, original_file=original_file).save()
    DownloaderJobOriginalFileAssociation(
        downloader_job=downloader_job, original_file=original_file
    ).save()

    result = sra._download_file(
        original_file.source_url, downloader_job, "/tmp", force_ftp=False)
    self.assertTrue(result)
def prepare_job():
    """Builds a SALMON ProcessorJob with a paired-end fastq file pair.

    Returns the job and the list of the two OriginalFiles.
    """
    processor_job = ProcessorJob(pipeline_applied="SALMON")
    processor_job.save()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    sample = Sample(
        accession_code="SALMON",  # So the test files go to the right place
        organism=c_elegans,
        source_database='SRA',
        technology='RNA-SEQ',
    )
    sample.save()

    prepare_organism_indices()

    original_files = []
    for read_number in ("1", "2"):
        fastq_name = "ERR1562482_" + read_number + ".fastq.gz"
        og_file = OriginalFile(
            source_filename=fastq_name,
            filename=fastq_name,
            absolute_file_path="/home/user/data_store/raw/TEST/SALMON/" + fastq_name,
            is_downloaded=True,
        )
        og_file.save()

        OriginalFileSampleAssociation(original_file=og_file, sample=sample).save()
        ProcessorJobOriginalFileAssociation(
            original_file=og_file, processor_job=processor_job
        ).save()
        original_files.append(og_file)

    return processor_job, original_files
def prepare_job(length):
    """Builds a TRANSCRIPTOME_INDEX_<LENGTH> ProcessorJob with a fasta + gtf pair."""
    processor_job = ProcessorJob(pipeline_applied="TRANSCRIPTOME_INDEX_" + length.upper())
    processor_job.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=1001)

    sample = Sample(organism=homo_sapiens, accession_code="derp" + length)
    sample.save()

    # NOTE(review): the absolute paths always point at the "short" fixture
    # files regardless of `length` — presumably only those exist on disk;
    # confirm before changing.
    fasta_file = OriginalFile(
        source_filename="aegilops_tauschii_" + length + ".fa.gz",
        filename="aegilops_tauschii_" + length + ".fa.gz",
        absolute_file_path="/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.fa.gz",
        is_downloaded=True,
    )
    fasta_file.save()

    gtf_file = OriginalFile(
        source_filename="aegilops_tauschii_" + length + ".gtf.gz",
        filename="aegilops_tauschii_" + length + ".gtf.gz",
        absolute_file_path="/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.gtf.gz",
        is_downloaded=True,
    )
    gtf_file.save()

    for og in (fasta_file, gtf_file):
        OriginalFileSampleAssociation(original_file=og, sample=sample).save()
        ProcessorJobOriginalFileAssociation(
            original_file=og, processor_job=processor_job
        ).save()

    return processor_job
def prepare_job():
    """Builds an AFFY_TO_PCL ProcessorJob around a single CEL original file."""
    pj = ProcessorJob(pipeline_applied="AFFY_TO_PCL")
    pj.save()

    original_file = OriginalFile(
        source_filename="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",
        filename="GSM1426071_CD_colon_active_1.CEL",
        absolute_file_path="/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL",
    )
    original_file.save()

    ProcessorJobOriginalFileAssociation(
        original_file=original_file, processor_job=pj
    ).save()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    sample = Sample(title="Heyo", organism=c_elegans, is_processed=False)
    sample.save()

    OriginalFileSampleAssociation(sample=sample, original_file=original_file).save()

    return pj
def prepare_dotsra_job(filename="ERR1562482.sra"):
    """Builds a SALMON ProcessorJob around a single .sra original file.

    Returns the job and a one-element list containing the OriginalFile.
    """
    pj = ProcessorJob(pipeline_applied="SALMON")
    # NOTE(review): a random id — presumably to avoid colliding with other
    # fixture jobs in the shared test database; confirm.
    pj.id = random.randint(111, 999999)
    pj.save()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    sample = Sample(
        accession_code="SALMON",  # So the test files go to the right place
        organism=c_elegans,
    )
    sample.save()

    prepare_organism_indices()

    og_file = OriginalFile(
        source_filename=filename,
        filename=filename,
        absolute_file_path="/home/user/data_store/raw/TEST/SALMON/" + filename,
    )
    og_file.save()

    OriginalFileSampleAssociation(original_file=og_file, sample=sample).save()
    ProcessorJobOriginalFileAssociation(original_file=og_file, processor_job=pj).save()

    return pj, [og_file]
def _get_actual_file_if_queueable(
        extracted_subfile: Dict,
        original_file: OriginalFile,
        samples: List[Sample]) -> OriginalFile:
    """Returns the actual file from the archive if it should be queued.

    If the file has been processed or has an unstarted DownloaderJob,
    None will be returned.

    `extracted_subfile` should be a Dict containing metadata about the
    file that was extracted from an archive.

    `original_file` should be the file associated with the CURRENT
    DownloaderJob.

    `samples` are the samples that the actual file should be associated
    with if it has to be created.
    """
    # Check to see if we've made this original file before.
    # .first() issues a single query instead of the separate
    # .count() and [0] lookups used previously.
    actual_file = OriginalFile.objects.filter(
        source_filename=original_file.source_filename,
        filename=extracted_subfile['filename'],
        is_archive=False,
    ).first()

    if actual_file is not None:
        # We've already created this record, let's see if we actually
        # needed to download it or if we just got it because we needed
        # a file in the same archive.
        if not actual_file.needs_processing():
            return None

        if not actual_file.is_downloaded:
            actual_file.is_downloaded = True
            actual_file.save()
        return actual_file

    # No existing record: create one from the extracted archive member
    # and associate it with every provided sample.
    actual_file = OriginalFile()
    actual_file.is_downloaded = True
    actual_file.is_archive = False
    actual_file.absolute_file_path = extracted_subfile['absolute_path']
    actual_file.filename = extracted_subfile['filename']
    actual_file.calculate_size()
    actual_file.calculate_sha1()
    actual_file.has_raw = True
    # Provenance points back at the archive we extracted it from.
    actual_file.source_url = original_file.source_url
    actual_file.source_filename = original_file.source_filename
    actual_file.save()

    for sample in samples:
        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.sample = sample
        original_file_sample_association.original_file = actual_file
        original_file_sample_association.save()

    return actual_file
def prepare_illumina_job(organism):
    """Builds an ILLUMINA_TO_PCL ProcessorJob for GSE22427 with twelve samples.

    The last sample is renamed so its title no longer matches a column in
    the data file.
    """
    pj = ProcessorJob(pipeline_applied="ILLUMINA_TO_PCL")
    pj.save()

    og_file = OriginalFile(
        source_filename="ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427%5Fnon%2Dnormalized%2Etxt.gz",
        filename="GSE22427_non-normalized.txt",
        absolute_file_path="/home/user/data_store/raw/TEST/ILLUMINA/GSE22427_non-normalized.txt",
        is_downloaded=True,
    )
    og_file.save()

    ProcessorJobOriginalFileAssociation(original_file=og_file, processor_job=pj).save()

    sample_names = [
        "LV-C&si-Control-1",
        "LV-C&si-Control-2",
        "LV-C&si-Control-3",
        "LV-C&si-EZH2-1",
        "LV-C&si-EZH2-2",
        "LV-C&si-EZH2-3",
        "LV-EZH2&si-EZH2-1",
        "LV-EZH2&si-EZH2-2",
        "LV-EZH2&si-EZH2-3",
        "LV-T350A&si-EZH2-1",
        "LV-T350A&si-EZH2-2",
        "LV-T350A&si-EZH2-3",
    ]

    for name in sample_names:
        sample = Sample(accession_code=name, title=name, organism=organism)
        sample.save()

        SampleAnnotation(
            sample=sample, data={"description": [name]}, is_ccdl=False
        ).save()

        OriginalFileSampleAssociation(original_file=og_file, sample=sample).save()

    sample = Sample.objects.get(title="LV-T350A&si-EZH2-3")
    sample.title = "ignoreme_for_description"
    sample.accession_code = "ignoreme_for_description"
    sample.save()

    return pj
def test_download_file_unmated_reads(self):
    """Downloads both mates of run SRR1603661 and verifies the first file."""
    dlj = DownloaderJob()
    dlj.accession_code = "SRR1603661"
    dlj.save()

    og_1 = OriginalFile()
    og_1.source_filename = "SRR1603661_1.fastq.gz"
    og_1.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR160/001/SRR1603661/SRR1603661_1.fastq.gz"
    og_1.expected_md5 = "502a9a482bfa5aa75865ccc0105ad13c"
    og_1.expected_size_in_bytes = 6751980628
    og_1.is_archive = True
    og_1.save()

    og_2 = OriginalFile()
    og_2.source_filename = "SRR1603661_2.fastq.gz"
    og_2.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR160/001/SRR1603661/SRR1603661_2.fastq.gz"
    # BUGFIX: these two assignments previously targeted og_1 (copy-paste
    # error), clobbering og_1's expected md5/size and leaving og_2's unset.
    og_2.expected_md5 = "fffd24457418d255991f54ec82a39d57"
    og_2.expected_size_in_bytes = 6949912932
    og_2.is_archive = True
    og_2.save()

    sample = Sample()
    sample.accession_code = "SRR1603661"
    sample.save()

    # Associate both mates with the sample and the downloader job.
    for og in (og_1, og_2):
        assoc = OriginalFileSampleAssociation()
        assoc.sample = sample
        assoc.original_file = og
        assoc.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.downloader_job = dlj
        assoc.original_file = og
        assoc.save()

    result, downloaded_files = sra.download_sra(dlj.pk)
    utils.end_downloader_job(dlj, result)

    self.assertTrue(result)
    self.assertEqual(downloaded_files[0].sha1, "52bf22472069d04fa7767429f6ab78ebd10c0152")
    self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
def test_download_file_swapper(self):
    """Exercises sra._download_file directly with force_ftp=False."""
    downloader_job = DownloaderJob(accession_code="SRR9117853")
    downloader_job.save()

    original_file = OriginalFile(
        source_filename="SRR9117853.sra",
        source_url="[email protected]:/sra/sra-instant/reads/ByRun/sra/SRR/SRR9117/SRR9117853/SRR9117853.sra",
        is_archive=True,
    )
    original_file.save()

    sample = Sample(accession_code="SRR9117853")
    sample.save()

    OriginalFileSampleAssociation(sample=sample, original_file=original_file).save()
    DownloaderJobOriginalFileAssociation(
        downloader_job=downloader_job, original_file=original_file
    ).save()

    result = sra._download_file(
        original_file.source_url, downloader_job, "/tmp/doomed", force_ftp=False)
    self.assertTrue(result)
def prepare_illumina_job(job_info: Dict) -> ProcessorJob:
    """Builds an ILLUMINA_TO_PCL ProcessorJob from a job_info spec dict.

    `job_info` must contain "source_filename", "filename",
    "absolute_file_path", "organism" and "samples".  Each entry of
    "samples" may be a plain accession string, an (accession, title)
    tuple, or an (accession, title, annotation_dict) tuple.
    """
    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = job_info["source_filename"]
    og_file.filename = job_info["filename"]
    og_file.absolute_file_path = job_info["absolute_file_path"]
    og_file.is_downloaded = True
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    for s in job_info["samples"]:
        # For convenience, if you give a list of strings we'll just use the
        # strings as both titles and accessions.
        annotation = None
        # isinstance() is the idiomatic check (was `type(s) == str` etc.);
        # the element-wise exact-type comparison for tuples is preserved.
        if isinstance(s, str):
            accession_code = s
            title = s
        elif isinstance(s, tuple) and list(map(type, s)) == [str, str]:
            accession_code, title = s
        elif isinstance(s, tuple) and list(map(type, s)) == [str, str, dict]:
            accession_code, title, annotation = s
        else:
            raise ValueError(f"Invalid sample type for sample {s}")

        sample = Sample()
        sample.accession_code = accession_code
        sample.title = title
        sample.organism = job_info["organism"]
        sample.save()

        sa = SampleAnnotation()
        sa.sample = sample
        sa.data = annotation if annotation is not None else {"description": [title]}
        sa.is_ccdl = False
        sa.save()

        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

    return pj
def test_convert_illumina_no_header(self):
    """NO_OP conversion of a headerless Illumina table (GSM1089291)."""
    job = ProcessorJob(pipeline_applied="NO_OP")
    job.save()

    # Input rows look like:
    # ILMN_1885639 10.0000 0.7931
    # ILMN_2209417 10.0000 0.2029
    # ILMN_1765401 152.0873 0.0000
    og_file = OriginalFile(
        source_filename="https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1.txt",
        filename="GSM1089291-tbl-1.txt",
        absolute_file_path="/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1.txt",
        is_downloaded=True,
    )
    og_file.save()

    homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    homo_sapiens.save()

    sample = Sample(
        accession_code="GSM557500",
        title="GSM557500",
        platform_accession_code="A-MEXP-1171",
        manufacturer="ILLUMINA",
        organism=homo_sapiens,
    )
    sample.save()

    OriginalFileSampleAssociation(original_file=og_file, sample=sample).save()
    ProcessorJobOriginalFileAssociation(original_file=og_file, processor_job=job).save()

    # Expected output rows look like:
    # ENSG00000105675 10
    # ENSG00000085721 152.0873
    # ENSG00000278494 152.0873
    final_context = no_op.no_op_processor(job.pk)
    self.assertTrue(final_context["success"])
    self.assertTrue(os.path.exists(final_context["output_file_path"]))
    self.assertEqual(os.path.getsize(final_context["output_file_path"]), 786207)
def test_convert_processed_illumina(self):
    """NO_OP conversion of an already-processed Illumina sample table."""
    job = ProcessorJob(pipeline_applied="NO_OP")
    job.save()

    # Input rows look like:
    # Reporter Identifier VALUE   Detection Pval
    # ILMN_1343291        14.943602   0
    # ILMN_1343295        13.528082   0
    og_file = OriginalFile(
        source_filename="https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-22433/",
        filename="GSM557500_sample_table.txt",
        absolute_file_path="/home/user/data_store/raw/TEST/NO_OP/GSM557500_sample_table.txt",
        is_downloaded=True,
    )
    og_file.save()

    homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    homo_sapiens.save()

    sample = Sample(
        accession_code="GSM557500",
        title="GSM557500",
        platform_accession_code="A-MEXP-1171",
        manufacturer="ILLUMINA",
        organism=homo_sapiens,
    )
    sample.save()

    OriginalFileSampleAssociation(original_file=og_file, sample=sample).save()
    ProcessorJobOriginalFileAssociation(original_file=og_file, processor_job=job).save()

    # Expected output rows look like:
    # ENSG00000156508 14.943602
    # ENSG00000111640 13.528082
    final_context = no_op.no_op_processor(job.pk)
    self.assertTrue(final_context["success"])
    self.assertTrue(os.path.exists(final_context["output_file_path"]))
    self.assertEqual(os.path.getsize(final_context["output_file_path"]), 920374)
    self.assertTrue(
        no_op.check_output_quality(final_context["output_file_path"]))
def test_good_detection(self):
    """GSE54661 appears to be mislabled (illuminaHumanv4) on GEO; our
    platform detector should identify it as illuminaHumanv3 instead.
    """
    from data_refinery_workers.processors import illumina

    pj = ProcessorJob(pipeline_applied="ILLUMINA_TO_PCL")
    pj.save()

    og_file = OriginalFile(
        source_filename="ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz",
        filename="GSE54661_non_normalized.txt",
        absolute_file_path="/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt",
        is_downloaded=True,
    )
    og_file.save()

    ProcessorJobOriginalFileAssociation(original_file=og_file, processor_job=pj).save()

    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    sample = Sample(accession_code="ABCD-1234", title="hypoxia_Signal", organism=organism)
    sample.save()

    OriginalFileSampleAssociation(original_file=og_file, sample=sample).save()

    final_context = illumina.illumina_to_pcl(pj.pk)
    self.assertEqual(final_context["platform"], "illuminaHumanv3")

    annotation_keys = final_context["samples"][0].sampleannotation_set.all()[0].data.keys()
    for key in annotation_keys:
        self.assertTrue(key in [
            "detected_platform", "detection_percentage", "mapped_percentage"
        ])

    # Cleanup after the job since it won't since we aren't running in cloud.
    shutil.rmtree(final_context["work_dir"], ignore_errors=True)
def test_convert_illumina_bad_cols(self):
    """
    In future, this test may be deprecated. For now it just alerts that it needs attention.
    """
    job = ProcessorJob(pipeline_applied="NO_OP")
    job.save()

    # Input rows look like (note the extra value columns):
    # ILMN_1885639 10.0000 0.7931 11.0000 0.123
    # ILMN_2209417 10.0000 0.2029 11.1234 0.543
    # LMN_1765401 152.0873 0.0000 99.999 0.19
    og_file = OriginalFile(
        source_filename="https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1-modified.txt",
        filename="GSM1089291-tbl-1-modified.txt",
        absolute_file_path="/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1-modified.txt",
        is_downloaded=True,
    )
    og_file.save()

    homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    homo_sapiens.save()

    sample = Sample(
        accession_code="GSM557500",
        title="GSM557500",
        platform_accession_code="A-MEXP-1171",
        manufacturer="ILLUMINA",
        organism=homo_sapiens,
    )
    sample.save()

    OriginalFileSampleAssociation(original_file=og_file, sample=sample).save()
    ProcessorJobOriginalFileAssociation(original_file=og_file, processor_job=job).save()

    final_context = no_op.no_op_processor(job.pk)
    self.assertFalse(final_context["success"])
    self.assertTrue("Tell Rich!" in final_context["job"].failure_reason)
def prepare_job(length):
    """Builds a TRANSCRIPTOME_INDEX_<LENGTH> ProcessorJob using the shared
    prepare_original_files helper for the fasta/gtf pair.
    """
    pj = ProcessorJob(pipeline_applied="TRANSCRIPTOME_INDEX_" + length.upper())
    pj.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=1001)

    samp = Sample(organism=homo_sapiens, accession_code="derp" + length)
    samp.save()

    [og_file, og_file2] = prepare_original_files(length)

    for og in (og_file, og_file2):
        OriginalFileSampleAssociation(original_file=og, sample=samp).save()
        ProcessorJobOriginalFileAssociation(original_file=og, processor_job=pj).save()

    return pj
def prepare_job(job_info: dict) -> ProcessorJob:
    """Builds a NO_OP ProcessorJob from a job_info spec dict."""
    job = ProcessorJob(pipeline_applied="NO_OP")
    job.save()

    og_file = OriginalFile(
        source_filename=job_info["source_filename"],
        filename=job_info["filename"],
        absolute_file_path=job_info["absolute_file_path"],
        is_downloaded=True,
    )
    og_file.save()

    sample = Sample(
        accession_code=job_info["accession_code"],
        title=job_info["accession_code"],
        platform_accession_code=job_info["platform_accession_code"],
    )

    manufacturer = job_info.get("manufacturer", None)
    if manufacturer is not None:
        sample.manufacturer = manufacturer
        # The illumina samples need the human organism
        if manufacturer == "ILLUMINA":
            homo_sapiens = Organism(
                name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
            homo_sapiens.save()
            sample.organism = homo_sapiens
    sample.save()

    OriginalFileSampleAssociation(original_file=og_file, sample=sample).save()
    ProcessorJobOriginalFileAssociation(original_file=og_file, processor_job=job).save()

    return job
def test_no_rnaseq(self):
    """Makes sure that no RNA-Seq data gets downloaded even if there's
    a job for it.
    """
    dlj = DownloaderJob(accession_code='GSE103217')
    dlj.save()

    original_file = OriginalFile(
        filename="GSE103217_family.xml.tgz",
        source_url="ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103217/miniml/GSE103217_family.xml.tgz",
        source_filename="GSE103217_family.xml.tgz",
    )
    original_file.save()

    DownloaderJobOriginalFileAssociation(
        original_file=original_file, downloader_job=dlj
    ).save()

    sample = Sample(
        accession_code='GSE103217',
        technology="RNA-SEQ",
        manufacturer="ILLUMINA",
        platform_accession_code="Illumina HiSeq 2500",
    )
    sample.save()

    OriginalFileSampleAssociation(sample=sample, original_file=original_file).save()

    download_result = geo.download_geo(dlj.id)
    self.assertFalse(download_result)

    dlj.refresh_from_db()
    self.assertFalse(dlj.success)
    # It's not necessarily that we didn't extract any files, but
    # none that were usable so it looks like none.
    self.assertEqual(dlj.failure_reason, "Failed to extract any downloaded files.")
def setUp(self):
    """Builds the shared fixture graph for the API tests: two experiments,
    three named organisms (plus 26 pagination-filler organisms), samples
    with annotations/files/jobs, a QN target, and a transcriptome index.
    """
    # Saving this for if we have protected endpoints
    # self.superuser = User.objects.create_superuser('john', '*****@*****.**', 'johnpassword')
    # self.client.login(username='******', password='******')
    # self.user = User.objects.create(username="******")

    # First experiment: RNA-SEQ, intentionally not kept on self.
    experiment = Experiment()
    experiment.accession_code = "GSE000"
    experiment.alternate_accession_code = "E-GEOD-000"
    experiment.title = "NONONONO"
    experiment.description = "Boooooourns. Wasabi."
    experiment.technology = "RNA-SEQ"
    experiment.save()

    # Second experiment: MICROARRAY; this one is exposed as self.experiment
    # and is the one linked to the sample below.
    experiment = Experiment()
    experiment.accession_code = "GSE123"
    experiment.title = "Hey Ho Let's Go"
    experiment.description = (
        "This is a very exciting test experiment. Faygo soda. Blah blah blah."
    )
    experiment.technology = "MICROARRAY"
    experiment.save()
    self.experiment = experiment

    experiment_annotation = ExperimentAnnotation()
    experiment_annotation.data = {"hello": "world", "123": 456}
    experiment_annotation.experiment = experiment
    experiment_annotation.save()

    # Create 26 test organisms numbered 0-25 for pagination test, so there
    # should be 29 organisms total (with the 3 others below)
    for i in range(26):
        Organism(name=("TEST_ORGANISM_{}".format(i)), taxonomy_id=(1234 + i)).save()

    ailuropoda = Organism(
        name="AILUROPODA_MELANOLEUCA", taxonomy_id=9646, is_scientific_name=True)
    ailuropoda.save()
    self.homo_sapiens = Organism(
        name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    self.homo_sapiens.save()
    self.danio_rerio = Organism(
        name="DANIO_RERIO", taxonomy_id=1337, is_scientific_name=True)
    self.danio_rerio.save()

    sample = Sample()
    sample.title = "123"
    sample.accession_code = "123"
    sample.is_processed = True
    sample.organism = ailuropoda
    sample.save()

    # The second sample is the one kept on self.sample and wired up to the
    # files/jobs/results below.
    sample = Sample()
    sample.title = "789"
    sample.accession_code = "789"
    sample.is_processed = True
    sample.organism = ailuropoda
    sample.save()
    self.sample = sample

    # add qn target for sample organism
    result = ComputationalResult()
    result.commands.append("create_qn_target.py")
    result.is_ccdl = True
    result.is_public = True
    result.processor = None
    result.save()

    cra = ComputationalResultAnnotation()
    cra.result = result
    cra.data = {"organism_id": ailuropoda.id, "is_qn": True}
    cra.save()

    ailuropoda.qn_target = result
    ailuropoda.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {"goodbye": "world", "789": 123}
    sample_annotation.sample = sample
    sample_annotation.save()

    # A bare original file plus one downloader job and one processor job
    # associated with it.
    original_file = OriginalFile()
    original_file.save()

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.sample = sample
    original_file_sample_association.original_file = original_file
    original_file_sample_association.save()

    downloader_job = DownloaderJob()
    downloader_job.save()

    download_assoc = DownloaderJobOriginalFileAssociation()
    download_assoc.original_file = original_file
    download_assoc.downloader_job = downloader_job
    download_assoc.save()

    processor_job = ProcessorJob()
    processor_job.save()

    processor_assoc = ProcessorJobOriginalFileAssociation()
    processor_assoc.original_file = original_file
    processor_assoc.processor_job = processor_job
    processor_assoc.save()

    experiment_sample_association = ExperimentSampleAssociation()
    experiment_sample_association.sample = sample
    experiment_sample_association.experiment = experiment
    experiment_sample_association.save()
    experiment.num_total_samples = 1
    experiment.num_processed_samples = 1
    experiment.save()

    # Two empty computational results tied to the sample.
    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    processor = Processor()
    processor.name = "Salmon Quant"
    processor.version = "v9.9.9"
    processor.docker_image = "dr_salmon"
    processor.environment = '{"some": "environment"}'
    processor.save()

    computational_result_short = ComputationalResult(processor=processor)
    computational_result_short.save()

    # Transcriptome index for DANIO_RERIO backed by the result above.
    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = self.danio_rerio
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = (
        "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT")
    organism_index.is_public = True
    organism_index.s3_url = "not_blank"
    organism_index.save()
    return
def test_queue_downloader_jobs_for_original_files(self, mock_send_task):
    """Make sure that queue_downloader_jobs queues all expected Downloader
    jobs for a given experiment.
    """
    # First, create an experiment with two samples associated with it
    # and create two original files for each of those samples.
    experiment_object = Experiment()
    experiment_object.accession_code = "Experiment1"
    experiment_object.save()

    sample_object_1 = Sample()
    sample_object_1.accession_code = "Sample1"
    sample_object_1.platform_accession_code = "Illumina Genome Analyzer"
    sample_object_1.platform_accession_name = "Illumina Genome Analyzer"
    sample_object_1.technology = "RNA-SEQ"
    sample_object_1.manufacturer = "ILLUMINA"
    sample_object_1.source_database = "SRA"
    sample_object_1.save()

    sample_object_2 = Sample()
    sample_object_2.accession_code = "Sample2"
    sample_object_2.platform_accession_code = "Illumina Genome Analyzer"
    sample_object_2.platform_accession_name = "Illumina Genome Analyzer"
    sample_object_2.technology = "RNA-SEQ"
    sample_object_2.manufacturer = "ILLUMINA"
    sample_object_2.source_database = "SRA"
    sample_object_2.save()

    for sample_object in (sample_object_1, sample_object_2):
        association = ExperimentSampleAssociation()
        association.experiment = experiment_object
        association.sample = sample_object
        association.save()

    sample_1_original_files = []
    sample_2_original_files = []

    # (url, filename, sample, collecting list) for each original file.
    # BUGFIX: "second_url" was previously appended to
    # sample_2_original_files while being associated with sample_object_1;
    # now each sample really gets two original files, matching the comment
    # at the top of this test.
    file_specs = [
        ("first_url", "first_filename", sample_object_1, sample_1_original_files),
        ("second_url", "second_filename", sample_object_1, sample_1_original_files),
        ("third_url", "third_filename", sample_object_2, sample_2_original_files),
        ("fourth_url", "fourth_filename", sample_object_2, sample_2_original_files),
    ]
    for source_url, source_filename, sample_object, collector in file_specs:
        original_file = OriginalFile()
        original_file.source_url = source_url
        original_file.source_filename = source_filename
        original_file.is_downloaded = False
        original_file.has_raw = True
        original_file.save()
        collector.append(original_file)

        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.original_file = original_file
        original_file_sample_association.sample = sample_object
        original_file_sample_association.save()

    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    surveyor = SraSurveyor(survey_job)

    surveyor.queue_downloader_job_for_original_files(
        sample_1_original_files, experiment_object.accession_code
    )
    surveyor.queue_downloader_job_for_original_files(
        sample_2_original_files, experiment_object.accession_code
    )

    # One DownloaderJob per queue_downloader_job_for_original_files call.
    self.assertEqual(DownloaderJob.objects.all().count(), 2)
def test_no_repeat_jobs(self):
    """Make sure queue_downloader_job_for_original_files doesn't queue a
    new DownloaderJob when one already exists for the same original files.
    """
    # Create an experiment with one sample that has two original files,
    # then pre-create a DownloaderJob already associated with both files.
    experiment_object = Experiment()
    experiment_object.accession_code = "Experiment1"
    experiment_object.save()

    sample_object = Sample()
    sample_object.accession_code = "Sample1"
    sample_object.platform_accession_code = "Illumina Genome Analyzer"
    sample_object.platform_accession_name = "Illumina Genome Analyzer"
    sample_object.technology = "RNA-SEQ"
    sample_object.manufacturer = "ILLUMINA"
    sample_object.source_database = "SRA"
    sample_object.save()

    original_file_1 = OriginalFile()
    original_file_1.source_url = "first_url"
    original_file_1.source_filename = "first_filename"
    original_file_1.is_downloaded = False
    original_file_1.has_raw = True
    original_file_1.save()

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.original_file = original_file_1
    original_file_sample_association.sample = sample_object
    original_file_sample_association.save()

    original_file_2 = OriginalFile()
    original_file_2.source_url = "second_url"
    original_file_2.source_filename = "second_filename"
    original_file_2.is_downloaded = False
    original_file_2.has_raw = True
    original_file_2.save()

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.original_file = original_file_2
    original_file_sample_association.sample = sample_object
    original_file_sample_association.save()

    # Pre-existing downloader job covering both original files.
    dlj = DownloaderJob()
    dlj.save()

    DownloaderJobOriginalFileAssociation(
        downloader_job=dlj, original_file=original_file_1
    ).save()

    DownloaderJobOriginalFileAssociation(
        downloader_job=dlj, original_file=original_file_2
    ).save()

    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()

    surveyor = SraSurveyor(survey_job)
    surveyor.queue_downloader_job_for_original_files(
        [original_file_1, original_file_2], experiment_object.accession_code
    )

    # We made one DownloaderJob in this test, so
    # queue_downloader_job_for_original_files didn't have anything
    # to do, so there should still be only one:
    self.assertEqual(1, DownloaderJob.objects.all().count())
def test_organism_shepherd_command(self, mock_nomad, mock_send_job, mock_get_active_volumes):
    """Tests that the organism shepherd requeues jobs in the right order.

    The situation we're setting up is basically this:
      * There are two experiments.
      * One of them has 1/2 samples processed, the other 0/1.
      * One of them needs a DownloaderJob requeued and the other needs
        a ProcessorJob requeued.

    And what we're going to test for is:
      * Both of the jobs that need to be requeued are requeued.
      * The experiment with a processed sample is requeued first
        because it has a higher completion percentage.
    """
    # First, set up our mocks to prevent network calls.
    mock_send_job.return_value = True
    active_volumes = {"1", "2", "3"}
    mock_get_active_volumes.return_value = active_volumes

    # Nomad client mock: report no running jobs so the shepherd sees
    # everything as needing to be requeued.
    def mock_init_nomad(host, port=0, timeout=0):
        ret_value = MagicMock()
        ret_value.jobs = MagicMock()
        ret_value.jobs.get_jobs = MagicMock()
        ret_value.jobs.get_jobs.side_effect = lambda: []
        return ret_value

    mock_nomad.side_effect = mock_init_nomad

    zebrafish = Organism(name="DANIO_RERIO", taxonomy_id=1337, is_scientific_name=True)
    zebrafish.save()

    # Experiment that is 0% complete: one sample whose DownloaderJob failed.
    zero_percent_experiment = Experiment(accession_code='ERP037000')
    zero_percent_experiment.technology = 'RNA-SEQ'
    zero_percent_experiment.save()

    organism_assoc = ExperimentOrganismAssociation.objects.create(
        organism=zebrafish, experiment=zero_percent_experiment)

    zero_percent = OriginalFile()
    zero_percent.filename = "ERR037001.fastq.gz"
    zero_percent.source_filename = "ERR037001.fastq.gz"
    zero_percent.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR037/ERR037001/ERR037001_1.fastq.gz"
    zero_percent.is_archive = True
    zero_percent.save()

    zero_percent_sample = Sample()
    zero_percent_sample.accession_code = 'ERR037001'
    zero_percent_sample.organism = zebrafish
    zero_percent_sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.sample = zero_percent_sample
    assoc.original_file = zero_percent
    assoc.save()

    assoc = ExperimentSampleAssociation()
    assoc.sample = zero_percent_sample
    assoc.experiment = zero_percent_experiment
    assoc.save()

    # TODO: fix names of all the variables to be appropriate for this test case.
    # Failed downloader job (success=False) — this is what the shepherd
    # should requeue for the 0% experiment.
    zero_percent_dl_job = DownloaderJob()
    zero_percent_dl_job.accession_code = zero_percent_sample.accession_code
    zero_percent_dl_job.downloader_task = "SRA"
    zero_percent_dl_job.start_time = timezone.now()
    zero_percent_dl_job.end_time = timezone.now()
    zero_percent_dl_job.success = False
    zero_percent_dl_job.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.downloader_job = zero_percent_dl_job
    assoc.original_file = zero_percent
    assoc.save()

    # Experiment that is 50% complete: one processed sample, one not.
    fify_percent_experiment = Experiment(accession_code='ERP036000')
    fify_percent_experiment.technology = 'RNA-SEQ'
    fify_percent_experiment.save()

    organism_assoc = ExperimentOrganismAssociation.objects.create(
        organism=zebrafish, experiment=fify_percent_experiment)

    ## First sample, this one has been processed.
    successful_pj = ProcessorJob()
    successful_pj.accession_code = "ERR036000"
    successful_pj.pipeline_applied = "SALMON"
    successful_pj.ram_amount = 12288
    successful_pj.start_time = timezone.now()
    successful_pj.end_time = timezone.now()
    successful_pj.success = True
    successful_pj.save()

    successful_og = OriginalFile()
    successful_og.filename = "ERR036000.fastq.gz"
    successful_og.source_filename = "ERR036000.fastq.gz"
    successful_og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
    successful_og.is_archive = True
    successful_og.save()

    successful_sample = Sample()
    successful_sample.accession_code = 'ERR036000'
    successful_sample.organism = zebrafish
    successful_sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.sample = successful_sample
    assoc.original_file = successful_og
    assoc.save()

    assoc = ProcessorJobOriginalFileAssociation()
    assoc.processor_job = successful_pj
    assoc.original_file = successful_og
    assoc.save()

    assoc = ExperimentSampleAssociation()
    assoc.sample = successful_sample
    assoc.experiment = fify_percent_experiment
    assoc.save()

    ## Second sample, this one hasn't been processed.
    fifty_percent_unprocessed_og = OriginalFile()
    fifty_percent_unprocessed_og.filename = "ERR036001.fastq.gz"
    fifty_percent_unprocessed_og.source_filename = "ERR036001.fastq.gz"
    fifty_percent_unprocessed_og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036001/ERR036001_1.fastq.gz"
    fifty_percent_unprocessed_og.is_archive = True
    fifty_percent_unprocessed_og.save()

    fifty_percent_unprocessed_sample = Sample()
    fifty_percent_unprocessed_sample.accession_code = 'ERR036001'
    fifty_percent_unprocessed_sample.organism = zebrafish
    fifty_percent_unprocessed_sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.sample = fifty_percent_unprocessed_sample
    assoc.original_file = fifty_percent_unprocessed_og
    assoc.save()

    assoc = ExperimentSampleAssociation()
    assoc.sample = fifty_percent_unprocessed_sample
    assoc.experiment = fify_percent_experiment
    assoc.save()

    # Failed processor job (success=False) — this is what the shepherd
    # should requeue for the 50% experiment.
    fifty_percent_processor_job = ProcessorJob()
    fifty_percent_processor_job.pipeline_applied = "SALMON"
    fifty_percent_processor_job.accession_code = fifty_percent_unprocessed_sample.accession_code
    fifty_percent_processor_job.ram_amount = 12288
    fifty_percent_processor_job.start_time = timezone.now()
    fifty_percent_processor_job.end_time = timezone.now()
    fifty_percent_processor_job.success = False
    fifty_percent_processor_job.save()

    assoc = ProcessorJobOriginalFileAssociation()
    assoc.processor_job = fifty_percent_processor_job
    assoc.original_file = fifty_percent_unprocessed_og
    assoc.save()

    # Setup is done, actually run the command.
    args = []
    options = {"organism_name": "DANIO_RERIO"}
    call_command("organism_shepherd", *args, **options)

    # Verify that the jobs were called in the correct order.
    mock_calls = mock_send_job.mock_calls

    # First requeued job must be the processor job for the 50% experiment
    # (higher completion percentage wins).
    first_call_job_type = mock_calls[0][1][0]
    first_call_job_object = mock_calls[0][2]["job"]
    self.assertEqual(first_call_job_type, ProcessorPipeline.SALMON)
    self.assertEqual(first_call_job_object.pipeline_applied,
                     fifty_percent_processor_job.pipeline_applied)
    self.assertEqual(first_call_job_object.ram_amount,
                     fifty_percent_processor_job.ram_amount)
    self.assertIn(first_call_job_object.volume_index, active_volumes)

    fifty_percent_processor_job.refresh_from_db()
    self.assertEqual(first_call_job_object, fifty_percent_processor_job.retried_job)

    # Second requeued job must be the downloader job for the 0% experiment.
    second_call_job_type = mock_calls[1][1][0]
    second_call_job_object = mock_calls[1][2]["job"]
    self.assertEqual(second_call_job_type, Downloaders.SRA)
    self.assertEqual(second_call_job_object.accession_code,
                     zero_percent_dl_job.accession_code)
    self.assertEqual(second_call_job_object.downloader_task,
                     zero_percent_dl_job.downloader_task)

    zero_percent_dl_job.refresh_from_db()
    self.assertEqual(second_call_job_object, zero_percent_dl_job.retried_job)
def create_samples_from_api(self, experiment: Experiment, platform_dict: Dict) -> List[Sample]:
    """Generates a Sample item for each sample in an AE experiment.

    There are many possible data situations for a sample:

        - If the sample only has raw data available:
            - If it is on a platform that we support:
                Download this raw data and process it
            - If it is not on a platform we support:
                Don't download anything, don't process anything
        - If the sample has both raw and derived data:
            - If the raw data is on a platform we support:
                Download the raw data and process it, abandon the derived data
            - If the raw data is not on a platform we support
                Download the derived data and no-op it, abandon the raw data
        - If the sample only has derived data:
            Download the derived data and no-op it.

    See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/samples
    """
    created_samples = []

    samples_endpoint = SAMPLES_URL.format(experiment.accession_code)
    r = utils.requests_retry_session().get(samples_endpoint, timeout=60)
    samples = r.json()["experiment"]["sample"]

    # The SDRF is the complete metadata record on a sample/property basis.
    # We run this through our harmonizer and then attach the properties
    # to our created samples.
    SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
    sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment.accession_code)
    sdrf_samples = harmony.parse_sdrf(sdrf_url)
    harmonized_samples = harmony.harmonize(sdrf_samples)

    # An experiment can have many samples
    for sample_data in samples:

        # For some reason, this sample has no files associated with it.
        if "file" not in sample_data or len(sample_data['file']) == 0:
            continue

        # Each sample is given an experimenatlly-unique title.
        flat_sample = utils.flatten(sample_data)
        title = harmony.extract_title(flat_sample)

        # A sample may actually have many sub files.
        # If there is raw data, take that.
        # If not, take the derived.
        has_raw = False
        for sub_file in sample_data['file']:

            # For ex: E-GEOD-15645
            if isinstance(sub_file['comment'], list):
                sub_file_mod = sub_file
                sub_file_mod['comment'] = sub_file['comment'][0]
            else:
                sub_file_mod = sub_file

            # Some have the 'data' field, but not the actual data
            # Ex: E-GEOD-9656
            # BUGFIX: was `!= None`; use `is not None` per PEP 8.
            if sub_file_mod['type'] == "data" and sub_file_mod['comment'].get('value', None) is not None:
                has_raw = True
            if 'raw' in sub_file_mod['comment'].get('value', ''):
                has_raw = True

        skip_sample = False
        for sub_file in sample_data['file']:

            # Don't get the raw data if it's only a 1-color sample.
            if 'Cy3' in str(sample_data) and 'Cy5' not in str(sample_data):
                has_raw = False

            # Skip derived data if we have it raw.
            if has_raw and "derived data" in sub_file['type']:
                continue

            download_url = None
            filename = sub_file["name"]

            # sub_file["comment"] is only a list if there's
            # more than one comment...
            comments = sub_file["comment"]
            if isinstance(comments, list):
                # Could be: "Derived ArrayExpress Data Matrix FTP
                # file" or: "ArrayExpress FTP file". If there is
                # no comment with a name including "FTP file" then
                # we don't know where to download it so we need to
                # mark this job as an error. Therefore don't catch
                # the potential exception where download_url
                # doesn't get defined.
                for comment in comments:
                    if "FTP file" in comment["name"]:
                        download_url = comment["value"]
                        break
            else:
                download_url = comments["value"]

            # BUGFIX: these two error paths previously interpolated
            # `sample_accession_code`, which is only assigned further
            # below — referencing it here raised NameError on the very
            # error path that was meant to be logged. Log the filename
            # (and the full sub_file payload) instead.
            if not download_url:
                logger.error(
                    "Sample file %s did not specify a download url, skipping.",
                    filename,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sub_file=sub_file)
                skip_sample = True
                continue

            if not filename:
                logger.error(
                    "Sample file with url %s did not specify a filename, skipping.",
                    download_url,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sub_file=sub_file)
                skip_sample = True
                continue

        if skip_sample:
            continue

        # The accession code is not a simple matter to determine.
        sample_source_name = sample_data["source"].get("name", "")
        sample_assay_name = sample_data["assay"].get("name", "")
        sample_accession_code = self.determine_sample_accession(
            experiment.accession_code, sample_source_name,
            sample_assay_name, filename)

        # Figure out the Organism for this sample
        organism_name = UNKNOWN
        for characteristic in sample_data["characteristic"]:
            if characteristic["category"].upper() == "ORGANISM":
                organism_name = characteristic["value"].upper()

        if organism_name == UNKNOWN:
            logger.error(
                "Sample %s did not specify the organism name.",
                sample_accession_code,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)
            organism = None
            continue
        else:
            organism = Organism.get_object_for_name(organism_name)

        # Create the sample object
        try:
            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)

            # If input experiment includes new protocol information,
            # update sample's protocol_info.
            existing_protocols = sample_object.protocol_info
            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols, experiment.protocol_description,
                experiment.source_url + '/protocols')
            if is_updated:
                sample_object.protocol_info = protocol_info
                # BUGFIX: was `sample_obejct.save()` (typo) which raised
                # NameError whenever protocol info needed updating.
                sample_object.save()

            logger.debug(
                "Sample %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)
        except Sample.DoesNotExist:
            sample_object = Sample()

            # The basics
            sample_object.source_database = "ARRAY_EXPRESS"
            sample_object.title = title
            sample_object.accession_code = sample_accession_code
            sample_object.source_archive_url = samples_endpoint
            sample_object.organism = organism
            sample_object.platform_name = platform_dict["platform_accession_name"]
            sample_object.platform_accession_code = platform_dict["platform_accession_code"]
            sample_object.manufacturer = platform_dict["manufacturer"]
            sample_object.technology = "MICROARRAY"

            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols=[],
                experiment_protocol=experiment.protocol_description,
                protocol_url=experiment.source_url + '/protocols')
            # Do not check is_updated the first time because we must
            # save a list so we can append to it later.
            sample_object.protocol_info = protocol_info

            sample_object.save()

            # Directly assign the harmonized properties
            harmonized_sample = harmonized_samples[title]
            for key, value in harmonized_sample.items():
                setattr(sample_object, key, value)
            sample_object.save()

            sample_annotation = SampleAnnotation()
            sample_annotation.data = sample_data
            sample_annotation.sample = sample_object
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            original_file = OriginalFile()
            original_file.filename = filename
            original_file.source_filename = filename
            original_file.source_url = download_url
            original_file.is_downloaded = False
            original_file.is_archive = True
            original_file.has_raw = has_raw
            original_file.save()

            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.original_file = original_file
            original_file_sample_association.sample = sample_object
            original_file_sample_association.save()

            created_samples.append(sample_object)

            logger.debug(
                "Created " + str(sample_object),
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id,
                sample=sample_object.id)

        # Create associations if they don't already exist
        ExperimentSampleAssociation.objects.get_or_create(
            experiment=experiment, sample=sample_object)

        ExperimentOrganismAssociation.objects.get_or_create(
            experiment=experiment, organism=organism)

    return created_samples
def test_download_aspera_and_ftp(self): """ Tests the main 'download_geo' function. """ dlj = DownloaderJob() dlj.accession_code = 'GSE22427' dlj.save() original_file = OriginalFile() original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz" original_file.source_filename = "GSE22427_non-normalized.txt.gz" original_file.save() assoc = DownloaderJobOriginalFileAssociation() assoc.original_file = original_file assoc.downloader_job = dlj assoc.save() sample = Sample() sample.accession_code = 'GSE22427' sample.save() sample_annotation = SampleAnnotation() sample_annotation.sample = sample sample_annotation.data = { 'label_protocol_ch1': 'Agilent', 'label_protocol_ch2': 'Agilent' } sample_annotation.save() og_assoc = OriginalFileSampleAssociation() og_assoc.sample = sample og_assoc.original_file = original_file og_assoc.save() LOCAL_ROOT_DIR = "/home/user/data_store" os.makedirs(LOCAL_ROOT_DIR + '/' + sample.accession_code, exist_ok=True) dl_file_path = LOCAL_ROOT_DIR + '/' + sample.accession_code + '/' + original_file.source_url.split( '/')[-1] # Aspera result = geo._download_file(original_file.source_url, file_path=dl_file_path, job=dlj, force_ftp=False) self.assertTrue(result) self.assertTrue(os.path.exists(dl_file_path)) os.remove(dl_file_path) # FTP result = geo._download_file(original_file.source_url, file_path=dl_file_path, job=dlj, force_ftp=True) self.assertTrue(result) self.assertTrue(os.path.exists(dl_file_path)) os.remove(dl_file_path) # Aspera, fail result = geo._download_file_aspera("https://rich.zone/cool_horse.jpg", target_file_path=dl_file_path, downloader_job=dlj, attempt=5) self.assertFalse(result) self.assertTrue(dlj.failure_reason != None)
def test_download_geo(self, mock_send_task): """ Tests the main 'download_geo' function. """ dlj = DownloaderJob() dlj.accession_code = 'GSE22427' dlj.save() original_file = OriginalFile() original_file.filename = "GSE22427_non-normalized.txt.gz" original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz" original_file.source_filename = "GSE22427_non-normalized.txt.gz" original_file.save() assoc = DownloaderJobOriginalFileAssociation() assoc.original_file = original_file assoc.downloader_job = dlj assoc.save() sample = Sample() sample.accession_code = 'GSE22427' sample.technology = "MICROARRAY" sample.manufacturer = "AGILENT" sample.has_raw = True # This is fake, but we don't currently support any agilent # platforms so we're using a platform that is supported. sample.platform_accession_code = "Illumina_RatRef-12_V1.0" sample.save() sample_annotation = SampleAnnotation() sample_annotation.sample = sample sample_annotation.data = { 'label_protocol_ch1': 'Agilent', 'label_protocol_ch2': 'Agilent' } sample_annotation.save() og_assoc = OriginalFileSampleAssociation() og_assoc.sample = sample og_assoc.original_file = original_file og_assoc.save() download_result = geo.download_geo(dlj.id) file_assocs = OriginalFileSampleAssociation.objects.filter( sample=sample) self.assertEqual(file_assocs.count(), 2) for file_assoc in file_assocs: original_file = file_assoc.original_file if original_file.filename.endswith(".gz"): # We delete the archive after we extract from it self.assertFalse(original_file.is_downloaded) else: self.assertTrue(original_file.is_downloaded) # Make sure it worked self.assertTrue(download_result) self.assertTrue(dlj.failure_reason is None) self.assertTrue(len(ProcessorJob.objects.all()) > 0) self.assertEqual(ProcessorJob.objects.all()[0].pipeline_applied, "AGILENT_TWOCOLOR_TO_PCL") self.assertEqual(ProcessorJob.objects.all()[0].ram_amount, 2048)
def download_geo(job_id: int) -> None:
    """The main function for the GEO Downloader.

    Downloads a single tar file containing the files representing samples
    relating to a single experiement stored in GEO.

    NOTE(review): despite the `-> None` annotation, this function actually
    returns the boolean `success` at the end — confirm which is intended.
    """
    job = utils.start_job(job_id)

    # The job is expected to have at least one associated original file;
    # indexing [0] will raise IndexError otherwise.
    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(downloader_job=job)
    original_file = file_assocs[0].original_file
    url = original_file.source_url
    accession_code = job.accession_code

    # All samples linked to the archive we are about to download; used
    # later to associate extracted files when no per-sample match exists.
    sample_assocs = OriginalFileSampleAssociation.objects.filter(original_file=original_file)
    related_samples = Sample.objects.filter(id__in=sample_assocs.values('sample_id'))

    # First, download the sample archive URL.
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    # The files for all of the samples are
    # contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + '/' + accession_code, exist_ok=True)
    dl_file_path = LOCAL_ROOT_DIR + '/' + accession_code + '/' + url.split('/')[-1]

    logger.debug("Starting to download: " + url, job_id=job_id, accession_code=accession_code)
    _download_file(url, dl_file_path, job)
    original_file.absolute_file_path = dl_file_path
    original_file.is_downloaded = True
    original_file.save()

    has_raw = True
    unpacked_sample_files = []

    # These files are tarred, and also subsequently gzipped
    if '.tar' in dl_file_path:
        try:
            extracted_files = _extract_tar(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occured while extracting tar file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:
            filename = og_file['filename']
            # Sample accession is the filename prefix before the first
            # '_' (or '.' when there is no underscore), e.g. "GSM123_x.gz".
            if '_' in filename:
                sample_id = filename.split('_')[0]
            else:
                sample_id = filename.split('.')[0]

            try:
                sample = Sample.objects.get(accession_code=sample_id)
            except Exception as e:
                # We don't have this sample, but it's not a total failure. This happens.
                continue

            try:
                # Files from the GEO supplemental file are gzipped inside of the tarball. Great!
                archive_file = OriginalFile.objects.get(source_filename__contains=sample_id)
                archive_file.is_downloaded = True
                archive_file.is_archive = True
                archive_file.absolute_file_path = og_file['absolute_path']
                archive_file.calculate_size()
                archive_file.calculate_sha1()
                archive_file.save()

                # Inner members may themselves be gzipped; unpack one level.
                if '.gz' in og_file['filename']:
                    extracted_subfile = _extract_gz(og_file['absolute_path'], accession_code)
                else:
                    extracted_subfile = [og_file]

                # Record the fully-extracted file as its own OriginalFile.
                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = extracted_subfile[0]['absolute_path']
                actual_file.filename = extracted_subfile[0]['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = True
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                original_file_sample_association = OriginalFileSampleAssociation()
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

                # The archive member has served its purpose; remove it
                # from disk and mark it as no longer downloaded.
                archive_file.delete_local_file()
                archive_file.is_downloaded = False
                archive_file.save()

                unpacked_sample_files.append(actual_file)
            except Exception as e:
                # TODO - is this worth failing a job for?
                logger.debug("Found a file we didn't have an OriginalFile for! Why did this happen?: "
                             + og_file['filename'],
                             exc_info=1,
                             file=og_file['filename'],
                             sample_id=sample_id,
                             accession_code=accession_code)
                # If we don't know why we have it, get rid of it.
                os.remove(og_file["absolute_path"])

    # This is a .tgz file.
    elif '.tgz' in dl_file_path:
        # If this is the MINiML file, it has been preprocessed
        if '_family.xml.tgz' in dl_file_path:
            has_raw = False

        try:
            extracted_files = _extract_tgz(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occured while extracting tgz file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:
            # Only .txt members matter here; the GSM accession is the
            # filename prefix before the first '-'.
            if '.txt' in og_file['filename']:
                try:
                    gsm_id = og_file['filename'].split('-')[0]
                    sample = Sample.objects.get(accession_code=gsm_id)
                except Exception as e:
                    # Unknown sample — discard the extracted file.
                    os.remove(og_file["absolute_path"])
                    continue

                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = og_file['absolute_path']
                actual_file.filename = og_file['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = has_raw
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                original_file_sample_association = OriginalFileSampleAssociation()
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

                unpacked_sample_files.append(actual_file)

    # These files are only gzipped.
    # These are generally the _actually_ raw (rather than the non-raw data in a RAW file) data
    elif '.gz' in dl_file_path:
        try:
            extracted_files = _extract_gz(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occured while extracting gz file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:
            filename = og_file['filename']
            sample_id = filename.split('.')[0]

            try:
                # The archive we downloaded
                archive_file = OriginalFile.objects.get(source_filename__contains=filename)
                archive_file.is_downloaded = True
                archive_file.is_archive = True
                archive_file.absolute_file_path = dl_file_path
                archive_file.calculate_size()
                archive_file.calculate_sha1()
                archive_file.save()

                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = og_file['absolute_path']
                actual_file.filename = og_file['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = True
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                # No per-file sample match here; associate the extracted
                # file with every sample related to the archive.
                for sample in related_samples:
                    new_association = OriginalFileSampleAssociation()
                    new_association.original_file = actual_file
                    new_association.sample = sample
                    new_association.save()

                archive_file.delete_local_file()
                archive_file.is_downloaded = False
                archive_file.save()

                unpacked_sample_files.append(actual_file)
            except Exception as e:
                logger.debug("Found a file we didn't have an OriginalFile for! Why did this happen?: "
                             + og_file['filename'],
                             exc_info=1,
                             file=og_file['filename'],
                             sample_id=sample_id,
                             accession_code=accession_code)
                os.remove(og_file["absolute_path"])

    # This is probably just a .txt file
    else:
        filename = dl_file_path.split('/')[-1]
        sample_id = filename.split('_')[0]

        actual_file = OriginalFile()
        actual_file.is_downloaded = True
        actual_file.is_archive = False
        actual_file.absolute_file_path = dl_file_path
        actual_file.filename = filename
        actual_file.calculate_size()
        actual_file.calculate_sha1()
        actual_file.has_raw = True
        actual_file.source_url = original_file.source_url
        actual_file.source_filename = original_file.source_filename
        actual_file.save()

        for sample in related_samples:
            new_association = OriginalFileSampleAssociation()
            new_association.original_file = actual_file
            new_association.sample = sample
            new_association.save()

        unpacked_sample_files.append(actual_file)

    # The job succeeds iff at least one usable file was extracted.
    if len(unpacked_sample_files) > 0:
        success = True
        logger.debug("File downloaded and extracted successfully.",
                     url=url,
                     dl_file_path=dl_file_path,
                     downloader_job=job_id)
    else:
        success = False
        logger.info("Unable to extract any files.",
                    url=url,
                    dl_file_path=dl_file_path,
                    downloader_job=job_id)
        job.failure_reason = "Failed to extract any downloaded files."

    if success:
        utils.create_processor_jobs_for_original_files(unpacked_sample_files, job)

    if original_file.is_archive:
        original_file.delete_local_file()

    utils.end_downloader_job(job, success)

    return success
def download_geo(job_id: int) -> bool:
    """The main function for the GEO Downloader.

    Downloads a single archive file containing the files representing samples
    relating to a single experiment stored in GEO, unpacks it, creates
    OriginalFile records for the extracted files, and queues processor jobs
    for them.

    Returns True if at least one file was extracted successfully, False
    otherwise. (The original annotation said ``-> None`` but the function has
    always returned a bool / falsy value; the annotation now matches.)
    """
    job = utils.start_job(job_id)
    accession_code = job.accession_code
    original_file = job.original_files.first()

    if not original_file:
        job.failure_reason = "No files associated with the job."
        logger.error("No files associated with the job.", downloader_job=job_id)
        utils.end_downloader_job(job, success=False)
        return False

    url = original_file.source_url
    # We don't want RNA-Seq data from GEO:
    # https://github.com/AlexsLemonade/refinebio/issues/966
    related_samples = original_file.samples.exclude(technology="RNA-SEQ")

    # First, download the sample archive URL.
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    # The files for all of the samples are contained within the same zip
    # file. Therefore only download the one.
    os.makedirs(LOCAL_ROOT_DIR + "/" + accession_code, exist_ok=True)
    dl_file_path = LOCAL_ROOT_DIR + "/" + accession_code + "/" + url.split("/")[-1]

    logger.debug("Starting to download: " + url, job_id=job_id, accession_code=accession_code)
    _download_file(url, dl_file_path, job)
    original_file.absolute_file_path = dl_file_path
    original_file.is_downloaded = True
    original_file.save()

    unpacked_sample_files = []

    try:
        # Enumerate all files inside the archive.
        archived_files = list(ArchivedFile(dl_file_path).get_files())
    except FileExtractionError as e:
        # failure_reason is treated as text everywhere else in this module,
        # so store the message rather than the exception object itself.
        job.failure_reason = str(e)
        logger.exception(
            "Error occurred while extracting file.", path=dl_file_path, exception=str(e)
        )
        utils.end_downloader_job(job, success=False)
        return False

    for og_file in archived_files:
        sample = og_file.get_sample()

        # We don't want RNA-Seq data from GEO:
        # https://github.com/AlexsLemonade/refinebio/issues/966
        if sample and sample.technology == "RNA-SEQ":
            logger.warning("RNA-Seq sample found in GEO downloader job.", sample=sample)
            continue

        if not sample and (
            not og_file.is_processable()
            or og_file.experiment_accession_code() != accession_code
        ):
            # Skip the files that we know are not processable and can't be
            # associated with a sample; also skip the files where we couldn't
            # find a sample and they don't mention the current experiment.
            continue

        potential_existing_file = OriginalFile.objects.filter(
            source_filename=original_file.source_filename,
            filename=og_file.filename,
            is_archive=False,
        ).first()
        if potential_existing_file:
            # We've already created this record; let's see if we actually
            # needed to download it or if we just got it because we needed
            # a file in the same archive.
            if potential_existing_file.needs_processing():
                if not potential_existing_file.is_downloaded:
                    potential_existing_file.is_downloaded = True
                    potential_existing_file.save()
                unpacked_sample_files.append(potential_existing_file)
            continue

        # Then this is a new file and we should create an OriginalFile for it.
        actual_file = OriginalFile()
        actual_file.is_downloaded = True
        actual_file.is_archive = False
        actual_file.absolute_file_path = og_file.file_path
        actual_file.filename = og_file.filename
        actual_file.calculate_size()
        actual_file.calculate_sha1()
        actual_file.has_raw = True
        actual_file.source_url = original_file.source_url
        actual_file.source_filename = original_file.source_filename
        actual_file.save()

        if sample:
            # Associate the file with the sample it belongs to.
            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.sample = sample
            original_file_sample_association.original_file = actual_file
            original_file_sample_association.save()
        else:
            # If we couldn't find a sample, associate this file with all
            # (non-RNA-Seq) samples in the experiment. Use a distinct loop
            # variable so we don't shadow the per-file `sample` above.
            for related_sample in related_samples:
                original_file_sample_association = OriginalFileSampleAssociation()
                original_file_sample_association.sample = related_sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

        unpacked_sample_files.append(actual_file)

    if len(unpacked_sample_files) > 0:
        success = True
        logger.debug(
            "File downloaded and extracted successfully.",
            url=url,
            dl_file_path=dl_file_path,
            downloader_job=job_id,
        )
    else:
        success = False
        logger.info(
            "Unable to extract any files.",
            url=url,
            dl_file_path=dl_file_path,
            downloader_job=job_id,
        )
        job.failure_reason = "Failed to extract any downloaded files."

    if success:
        create_processor_jobs_for_original_files(unpacked_sample_files, job)

    # The extracted per-sample files have been recorded, so the archive
    # itself is no longer needed locally.
    if original_file.is_archive:
        original_file.delete_local_file()

    utils.end_downloader_job(job, success)

    return success