def create_processor_job(self, pipeline="AFFY_TO_PCL", ram_amount=2048): job = ProcessorJob( pipeline_applied=pipeline, nomad_job_id="PROCESSOR/dispatch-1528945054-e8eaf540", ram_amount=ram_amount, num_retries=0, volume_index="1", success=None) job.save() og_file = OriginalFile() og_file.source_filename = "doesn't matter" og_file.filename = "this either" og_file.absolute_file_path = "nor this" og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() og_file = OriginalFile() og_file.source_filename = "doesn't matter" og_file.filename = "this either" og_file.absolute_file_path = "nor this" og_file.save() assoc = ProcessorJobOriginalFileAssociation() assoc.original_file = og_file assoc.processor_job = job assoc.save() return job
def prepare_original_files(length: str) -> List[OriginalFile]: og_file = OriginalFile() og_file.source_filename = "aegilops_tauschii_" + length + ".fa.gz" og_file.filename = "aegilops_tauschii_" + length + ".fa.gz" og_file.absolute_file_path = ( "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/" "AEGILOPS_TAUSCHII/aegilops_tauschii_short.fa.gz") og_file.source_url = ( "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/fasta/" "aegilops_tauschii/dna/Aegilops_tauschii.ASM34733v1.dna.toplevel.fa.gz" ) og_file.is_downloaded = True # We need to add the URL here so that _extract_assembly_information works properly og_file.source_url = "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/fasta/aegilops_tauschii/dna/Aegilops_tauschii.ASM34733v1.dna.toplevel.fa.gz" og_file.save() og_file2 = OriginalFile() og_file2.source_filename = "aegilops_tauschii_" + length + ".gtf.gz" og_file2.filename = "aegilops_tauschii_" + length + ".gtf.gz" og_file2.absolute_file_path = ( "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/" "AEGILOPS_TAUSCHII/aegilops_tauschii_short.gtf.gz") og_file2.source_url = ( "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/gtf/" "aegilops_tauschii/Aegilops_tauschii.ASM34733v1.39.gtf.gz") og_file2.is_downloaded = True # We need to add the URL here so that _extract_assembly_information works properly og_file2.source_url = "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/gtf/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.39.gtf.gz" og_file2.save() return [og_file, og_file2]
def _generate_files(self, species: Dict) -> None: url_builder = ensembl_url_builder_factory(species) fasta_download_url = url_builder.build_transcriptome_url() gtf_download_url = url_builder.build_gtf_url() platform_accession_code = species.pop("division") self._clean_metadata(species) all_new_files = [] fasta_filename = url_builder.filename_species + ".fa.gz" original_file = OriginalFile() original_file.source_filename = fasta_filename original_file.source_url = fasta_download_url original_file.is_archive = True original_file.is_downloaded = False original_file.save() all_new_files.append(original_file) gtf_filename = url_builder.filename_species + ".gtf.gz" original_file = OriginalFile() original_file.source_filename = gtf_filename original_file.source_url = gtf_download_url original_file.is_archive = True original_file.is_downloaded = False original_file.save() all_new_files.append(original_file) return all_new_files
def create_downloader_job(self): job = DownloaderJob( downloader_task="SRA", nomad_job_id="DOWNLOADER/dispatch-1528945054-e8eaf540", num_retries=0, accession_code="NUNYA", success=None) job.save() og_file = OriginalFile() og_file.source_filename = "doesn't matter" og_file.filename = "this either" og_file.absolute_file_path = "nor this" og_file.save() assoc1 = DownloaderJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.downloader_job = job assoc1.save() og_file = OriginalFile() og_file.source_filename = "doesn't matter" og_file.filename = "this either" og_file.absolute_file_path = "nor this" og_file.save() assoc = DownloaderJobOriginalFileAssociation() assoc.original_file = og_file assoc.downloader_job = job assoc.save() return job
def create_downloader_job(): job = DownloaderJob( downloader_task="SRA", batch_job_id="DEFAULT", num_retries=0, accession_code="NUNYA", success=None, ) job.save() og_file = OriginalFile() og_file.source_filename = "doesn't matter" og_file.filename = "this either" og_file.absolute_file_path = "nor this" og_file.save() assoc1 = DownloaderJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.downloader_job = job assoc1.save() og_file = OriginalFile() og_file.source_filename = "doesn't matter" og_file.filename = "this either" og_file.absolute_file_path = "nor this" og_file.save() assoc = DownloaderJobOriginalFileAssociation() assoc.original_file = og_file assoc.downloader_job = job assoc.save() return job
def _generate_files(self, species: Dict) -> None: url_builder = ensembl_url_builder_factory(species) fasta_download_url = url_builder.build_transcriptome_url() gtf_download_url = url_builder.build_gtf_url() # Getting the object will ensure it is created in the DB. Organism.get_or_create_object_for_id(url_builder.taxonomy_id) all_new_files = [] fasta_filename = url_builder.filename_species + ".fa.gz" original_file = OriginalFile() original_file.source_filename = fasta_filename original_file.source_url = fasta_download_url original_file.is_archive = True original_file.is_downloaded = False original_file.save() all_new_files.append(original_file) gtf_filename = url_builder.filename_species + ".gtf.gz" original_file = OriginalFile() original_file.source_filename = gtf_filename original_file.source_url = gtf_download_url original_file.is_archive = True original_file.is_downloaded = False original_file.save() all_new_files.append(original_file) return all_new_files
def create_processor_job(pipeline="AFFY_TO_PCL", ram_amount=2048, start_time=None): og_file_1 = OriginalFile() og_file_1.source_filename = "doesn't matter" og_file_1.filename = "this either" og_file_1.absolute_file_path = "nor this" og_file_1.save() og_file_2 = OriginalFile() og_file_2.source_filename = "doesn't matter" og_file_2.filename = "this either" og_file_2.absolute_file_path = "nor this" og_file_2.save() downloader_job = None if pipeline == "AFFY_TO_PCL": downloader_job = DownloaderJob( downloader_task="SRA", batch_job_id="DEFAULT", num_retries=0, accession_code="NUNYA", success=None, ) downloader_job.save() assoc = DownloaderJobOriginalFileAssociation() assoc.original_file = og_file_2 assoc.downloader_job = downloader_job assoc.save() assoc1 = DownloaderJobOriginalFileAssociation() assoc1.original_file = og_file_1 assoc1.downloader_job = downloader_job assoc1.save() processor_job = ProcessorJob( downloader_job=downloader_job, pipeline_applied=pipeline, batch_job_id="PROCESSOR/dispatch-1528945054-e8eaf540", ram_amount=ram_amount, num_retries=0, success=None, start_time=start_time, ) processor_job.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file_1 assoc1.processor_job = processor_job assoc1.save() assoc = ProcessorJobOriginalFileAssociation() assoc.original_file = og_file_2 assoc.processor_job = processor_job assoc.save() return processor_job
def prepare_job(): pj = ProcessorJob() pj.pipeline_applied = "SALMON" pj.save() c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS") samp = Sample() samp.accession_code = "SALMON" # So the test files go to the right place samp.organism = c_elegans samp.source_database = 'SRA' samp.technology = 'RNA-SEQ' samp.save() prepare_organism_indices() og_file = OriginalFile() og_file.source_filename = "ERR1562482_1.fastq.gz" og_file.filename = "ERR1562482_1.fastq.gz" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/ERR1562482_1.fastq.gz" og_file.is_downloaded = True og_file.save() og_file2 = OriginalFile() og_file2.source_filename = "ERR1562482_2.fastq.gz" og_file2.filename = "ERR1562482_2.fastq.gz" og_file2.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/ERR1562482_2.fastq.gz" og_file2.is_downloaded = True og_file2.save() og_file_samp_assoc = OriginalFileSampleAssociation() og_file_samp_assoc.original_file = og_file og_file_samp_assoc.sample = samp og_file_samp_assoc.save() og_file_samp_assoc2 = OriginalFileSampleAssociation() og_file_samp_assoc2.original_file = og_file2 og_file_samp_assoc2.sample = samp og_file_samp_assoc2.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file2 assoc1.processor_job = pj assoc1.save() return pj, [og_file, og_file2]
def prepare_job(length): pj = ProcessorJob() pj.pipeline_applied = "TRANSCRIPTOME_INDEX_" + length.upper() pj.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=1001) samp = Sample() samp.organism = homo_sapiens samp.accession_code = "derp" + length samp.save() og_file = OriginalFile() og_file.source_filename = "aegilops_tauschii_" + length + ".fa.gz" og_file.filename = "aegilops_tauschii_" + length + ".fa.gz" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.fa.gz" og_file.is_downloaded = True og_file.save() og_file2 = OriginalFile() og_file2.source_filename = "aegilops_tauschii_" + length + ".gtf.gz" og_file2.filename = "aegilops_tauschii_" + length + ".gtf.gz" og_file2.absolute_file_path = "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.gtf.gz" og_file2.is_downloaded = True og_file2.save() og_file_samp_assoc = OriginalFileSampleAssociation() og_file_samp_assoc.original_file = og_file og_file_samp_assoc.sample = samp og_file_samp_assoc.save() og_file_samp_assoc2 = OriginalFileSampleAssociation() og_file_samp_assoc2.original_file = og_file2 og_file_samp_assoc2.sample = samp og_file_samp_assoc2.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() assoc2 = ProcessorJobOriginalFileAssociation() assoc2.original_file = og_file2 assoc2.processor_job = pj assoc2.save() return pj
def prepare_job(): pj = ProcessorJob() pj.pipeline_applied = "AFFY_TO_PCL" pj.save() original_file = OriginalFile() original_file.source_filename = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip" original_file.filename = "GSM1426071_CD_colon_active_1.CEL" original_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL" original_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = original_file assoc1.processor_job = pj assoc1.save() c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS") sample = Sample() sample.title = "Heyo" sample.organism = c_elegans sample.is_processed = False sample.save() ogsa = OriginalFileSampleAssociation() ogsa.sample = sample ogsa.original_file = original_file ogsa.save() return pj
def prepare_dotsra_job(filename="ERR1562482.sra"): pj = ProcessorJob() pj.pipeline_applied = "SALMON" pj.id = random.randint(111, 999999) pj.save() c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS") samp = Sample() samp.accession_code = "SALMON" # So the test files go to the right place samp.organism = c_elegans samp.save() prepare_organism_indices() og_file = OriginalFile() og_file.source_filename = filename og_file.filename = filename og_file.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/" + filename og_file.save() og_file_samp_assoc = OriginalFileSampleAssociation() og_file_samp_assoc.original_file = og_file og_file_samp_assoc.sample = samp og_file_samp_assoc.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() return pj, [og_file]
def test_download_file_ncbi(self): dlj = DownloaderJob() dlj.accession_code = "SRR9117853" dlj.save() og = OriginalFile() og.source_filename = "SRR9117853.sra" og.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/SRR/SRR9117/SRR9117853/SRR9117853.sra" og.is_archive = True og.save() sample = Sample() sample.accession_code = "SRR9117853" sample.save() assoc = OriginalFileSampleAssociation() assoc.sample = sample assoc.original_file = og assoc.save() assoc = DownloaderJobOriginalFileAssociation() assoc.downloader_job = dlj assoc.original_file = og assoc.save() result, downloaded_files = sra.download_sra(dlj.pk) utils.end_downloader_job(dlj, result) self.assertTrue(result) self.assertEqual(downloaded_files[0].sha1, "e7ad484fe6f134ba7d1b2664e58cc15ae5a958cc") self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
def test_download_file_ncbi(self, mock_send_job): mock_send_job.return_value = None dlj = DownloaderJob() dlj.accession_code = "DRR002116" dlj.save() og = OriginalFile() og.source_filename = "DRR002116.sra" og.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra" og.is_archive = True og.save() sample = Sample() sample.accession_code = 'DRR002116' sample.save() assoc = OriginalFileSampleAssociation() assoc.sample = sample assoc.original_file = og assoc.save() assoc = DownloaderJobOriginalFileAssociation() assoc.downloader_job = dlj assoc.original_file = og assoc.save() result, downloaded_files = sra.download_sra(dlj.pk) utils.end_downloader_job(dlj, result) self.assertTrue(result) self.assertEqual(downloaded_files[0].sha1, 'd5374e7fe047d4f76b165c3f5148ab2df9d42cea') self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
def test_download_file(self, mock_send_job): mock_send_job.return_value = None dlj = DownloaderJob() dlj.accession_code = "ERR036" dlj.save() og = OriginalFile() og.source_filename = "ERR036000.fastq.gz" og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz" og.is_archive = True og.save() sample = Sample() sample.accession_code = 'ERR036000' sample.save() assoc = OriginalFileSampleAssociation() assoc.sample = sample assoc.original_file = og assoc.save() assoc = DownloaderJobOriginalFileAssociation() assoc.downloader_job = dlj assoc.original_file = og assoc.save() success = sra.download_sra(dlj.pk)
def _replace_dotsra_with_fastq_files( sample: Sample, downloader_job: DownloaderJob, original_file: OriginalFile) -> List[OriginalFile]: """Replaces a .SRA file with two .fastq files. This function should only be called on a sample which has unmated reads, so it makes the assumption that the sample passed into it has at least two read files in ENA. """ read_one_url = _build_ena_file_url(sample.accession_code, "_1") read_two_url = _build_ena_file_url(sample.accession_code, "_2") # Technically this is a different file, but deleting this one and # its associations just to recreate another with the same # associations seems rather pointless. original_file.source_url = read_one_url original_file.source_filename = read_one_url.split("/")[-1] original_file.save() read_two_original_file = OriginalFile.objects.get_or_create( source_url=read_two_url, source_filename=read_two_url.split("/")[-1], has_raw=True)[0] OriginalFileSampleAssociation.objects.get_or_create( original_file=read_two_original_file, sample=sample) DownloaderJobOriginalFileAssociation.objects.get_or_create( original_file=read_two_original_file, downloader_job=downloader_job) return [original_file, read_two_original_file]
def test_download_file_swapper(self, mock_send_job): mock_send_job.return_value = None dlj = DownloaderJob() dlj.accession_code = "DRR002116" dlj.save() og = OriginalFile() og.source_filename = "DRR002116.sra" og.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra" og.is_archive = True og.save() sample = Sample() sample.accession_code = 'DRR002116' sample.save() assoc = OriginalFileSampleAssociation() assoc.sample = sample assoc.original_file = og assoc.save() assoc = DownloaderJobOriginalFileAssociation() assoc.downloader_job = dlj assoc.original_file = og assoc.save() result = sra._download_file(og.source_url, dlj, "/tmp", force_ftp=False) self.assertTrue(result)
def test_download_file(self): dlj = DownloaderJob() dlj.accession_code = "ERR036" dlj.save() og = OriginalFile() og.source_filename = "ERR036000.fastq.gz" og.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz" og.is_archive = True og.save() sample = Sample() sample.accession_code = "ERR036000" sample.save() assoc = OriginalFileSampleAssociation() assoc.sample = sample assoc.original_file = og assoc.save() assoc = DownloaderJobOriginalFileAssociation() assoc.downloader_job = dlj assoc.original_file = og assoc.save() result, downloaded_files = sra.download_sra(dlj.pk) self.assertTrue(result) self.assertEqual(downloaded_files[0].sha1, "1dfe5460a4101fe87feeffec0cb2e053f6695961") self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
def test_assembly_information(self): og_file = OriginalFile() og_file.source_url = "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/fasta/aegilops_tauschii/dna/Aegilops_tauschii.ASM34733v1.dna.toplevel.fa.gz" og_file.source_filename = "aegilops_tauschii_short.fa.gz" og_file2 = OriginalFile() og_file2.source_url = "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/gtf/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.39.gtf.gz" og_file2.source_filename = "aegilops_tauschii_short.gtf.gz" job_context = { "original_files": [og_file, og_file2], "computed_files": [] } job_context = transcriptome_index._extract_assembly_information( job_context) self.assertEqual("39", job_context["assembly_version"]) self.assertEqual("ASM34733v1", job_context["assembly_name"])
def _get_actual_file_if_queueable( extracted_subfile: Dict, original_file: OriginalFile, samples: List[Sample]) -> OriginalFile: """Returns the actual file from the archive if it should be queued. If the file has been processed or has an unstarted DownloaderJob, None will be returned. `extracted_subfile` should be a Dict containing metadata about the file that was extracted from an archive. `original_file` should be the file associated with the CURRENT DownloaderJob. `samples` are the samples that the actual file should be associated with if it has to be created. """ # Check to see if we've made this original file before: potential_existing_files = OriginalFile.objects.filter( source_filename=original_file.source_filename, filename=extracted_subfile['filename'], is_archive=False ) if potential_existing_files.count() > 0: # We've already created this record, let's see if we actually # needed to download it or if we just got it because we needed # a file in the same archive. actual_file = potential_existing_files[0] if actual_file.needs_processing(): if not actual_file.is_downloaded: actual_file.is_downloaded = True actual_file.save() return actual_file else: return None else: actual_file = OriginalFile() actual_file.is_downloaded = True actual_file.is_archive = False actual_file.absolute_file_path = extracted_subfile['absolute_path'] actual_file.filename = extracted_subfile['filename'] actual_file.calculate_size() actual_file.calculate_sha1() actual_file.has_raw = True actual_file.source_url = original_file.source_url actual_file.source_filename = original_file.source_filename actual_file.save() for sample in samples: original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.sample = sample original_file_sample_association.original_file = actual_file original_file_sample_association.save() return actual_file
def prepare_illumina_job(organism): pj = ProcessorJob() pj.pipeline_applied = "ILLUMINA_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427%5Fnon%2Dnormalized%2Etxt.gz" og_file.filename = "GSE22427_non-normalized.txt" og_file.absolute_file_path = ( "/home/user/data_store/raw/TEST/ILLUMINA/GSE22427_non-normalized.txt") og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() sample_names = [ "LV-C&si-Control-1", "LV-C&si-Control-2", "LV-C&si-Control-3", "LV-C&si-EZH2-1", "LV-C&si-EZH2-2", "LV-C&si-EZH2-3", "LV-EZH2&si-EZH2-1", "LV-EZH2&si-EZH2-2", "LV-EZH2&si-EZH2-3", "LV-T350A&si-EZH2-1", "LV-T350A&si-EZH2-2", "LV-T350A&si-EZH2-3", ] for name in sample_names: sample = Sample() sample.accession_code = name sample.title = name sample.organism = organism sample.save() sa = SampleAnnotation() sa.sample = sample sa.data = {"description": [name]} sa.is_ccdl = False sa.save() sample_assoc = OriginalFileSampleAssociation() sample_assoc.original_file = og_file sample_assoc.sample = sample sample_assoc.save() sample = Sample.objects.get(title="LV-T350A&si-EZH2-3") sample.title = "ignoreme_for_description" sample.accession_code = "ignoreme_for_description" sample.save() return pj
def test_download_file_unmated_reads(self): dlj = DownloaderJob() dlj.accession_code = "SRR1603661" dlj.save() og_1 = OriginalFile() og_1.source_filename = "SRR1603661_1.fastq.gz" og_1.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR160/001/SRR1603661/SRR1603661_1.fastq.gz" og_1.expected_md5 = "502a9a482bfa5aa75865ccc0105ad13c" og_1.expected_size_in_bytes = 6751980628 og_1.is_archive = True og_1.save() og_2 = OriginalFile() og_2.source_filename = "SRR1603661_2.fastq.gz" og_2.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR160/001/SRR1603661/SRR1603661_2.fastq.gz" og_1.expected_md5 = "fffd24457418d255991f54ec82a39d57" og_1.expected_size_in_bytes = 6949912932 og_2.is_archive = True og_2.save() sample = Sample() sample.accession_code = "SRR1603661" sample.save() assoc = OriginalFileSampleAssociation() assoc.sample = sample assoc.original_file = og_1 assoc.save() assoc = DownloaderJobOriginalFileAssociation() assoc.downloader_job = dlj assoc.original_file = og_1 assoc.save() assoc = OriginalFileSampleAssociation() assoc.sample = sample assoc.original_file = og_2 assoc.save() assoc = DownloaderJobOriginalFileAssociation() assoc.downloader_job = dlj assoc.original_file = og_2 assoc.save() result, downloaded_files = sra.download_sra(dlj.pk) utils.end_downloader_job(dlj, result) self.assertTrue(result) self.assertEqual(downloaded_files[0].sha1, "52bf22472069d04fa7767429f6ab78ebd10c0152") self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
def _make_original_file_with_contents(contents: str) -> OriginalFile: _, path = tempfile.mkstemp(suffix=".txt") with open(path, "w") as f: f.write(contents) og_file = OriginalFile() og_file.source_filename = path og_file.filename = os.path.basename(path) og_file.absolute_file_path = os.path.realpath(path) og_file.is_downloaded = True og_file.save() return og_file
def prepare_illumina_job(job_info: Dict) -> ProcessorJob: pj = ProcessorJob() pj.pipeline_applied = "ILLUMINA_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = job_info["source_filename"] og_file.filename = job_info["filename"] og_file.absolute_file_path = job_info["absolute_file_path"] og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() for s in job_info["samples"]: # For convenience, if you give a list of strings we'll just use the # strings as both titles and accessions. annotation = None if type(s) == str: accession_code = s title = s elif type(s) == tuple and list(map(type, s)) == [str, str]: accession_code, title = s elif type(s) == tuple and list(map(type, s)) == [str, str, dict]: accession_code, title, annotation = s else: raise ValueError(f"Invalid sample type for sample {s}") sample = Sample() sample.accession_code = accession_code sample.title = title sample.organism = job_info["organism"] sample.save() sa = SampleAnnotation() sa.sample = sample sa.data = annotation if annotation is not None else { "description": [title] } sa.is_ccdl = False sa.save() sample_assoc = OriginalFileSampleAssociation() sample_assoc.original_file = og_file sample_assoc.sample = sample sample_assoc.save() return pj
def test_convert_illumina_no_header(self): job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() # ex: # ILMN_1885639 10.0000 0.7931 # ILMN_2209417 10.0000 0.2029 # ILMN_1765401 152.0873 0.0000 og_file = OriginalFile() og_file.source_filename = ( "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1.txt" ) og_file.filename = "GSM1089291-tbl-1.txt" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1.txt" og_file.is_downloaded = True og_file.save() homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) homo_sapiens.save() sample = Sample() sample.accession_code = "GSM557500" sample.title = "GSM557500" sample.platform_accession_code = "A-MEXP-1171" sample.manufacturer = "ILLUMINA" sample.organism = homo_sapiens sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() # To: # ENSG00000105675 10 # ENSG00000085721 152.0873 # ENSG00000278494 152.0873 final_context = no_op.no_op_processor(job.pk) self.assertTrue(final_context["success"]) self.assertTrue(os.path.exists(final_context["output_file_path"])) self.assertEqual(os.path.getsize(final_context["output_file_path"]), 786207)
def test_good_detection(self): """GSE54661 appears to be mislabled (illuminaHumanv4) on GEO. Shows our detector works. """ from data_refinery_workers.processors import illumina pj = ProcessorJob() pj.pipeline_applied = "ILLUMINA_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz" og_file.filename = "GSE54661_non_normalized.txt" og_file.absolute_file_path = ( "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt" ) og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) organism.save() sample = Sample() sample.accession_code = "ABCD-1234" sample.title = "hypoxia_Signal" sample.organism = organism sample.save() sample_assoc = OriginalFileSampleAssociation() sample_assoc.original_file = og_file sample_assoc.sample = sample sample_assoc.save() final_context = illumina.illumina_to_pcl(pj.pk) self.assertEqual(final_context["platform"], "illuminaHumanv3") for key in final_context["samples"][0].sampleannotation_set.all( )[0].data.keys(): self.assertTrue(key in [ "detected_platform", "detection_percentage", "mapped_percentage" ]) # Cleanup after the job since it won't since we aren't running in cloud. shutil.rmtree(final_context["work_dir"], ignore_errors=True)
def test_convert_processed_illumina(self): job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() # ex: # Reporter Identifier VALUE Detection Pval # ILMN_1343291 14.943602 0 # ILMN_1343295 13.528082 0 og_file = OriginalFile() og_file.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-22433/" og_file.filename = "GSM557500_sample_table.txt" og_file.absolute_file_path = ( "/home/user/data_store/raw/TEST/NO_OP/GSM557500_sample_table.txt") og_file.is_downloaded = True og_file.save() homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) homo_sapiens.save() sample = Sample() sample.accession_code = "GSM557500" sample.title = "GSM557500" sample.platform_accession_code = "A-MEXP-1171" sample.manufacturer = "ILLUMINA" sample.organism = homo_sapiens sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() # To: # ENSG00000156508 14.943602 # ENSG00000111640 13.528082 final_context = no_op.no_op_processor(job.pk) self.assertTrue(final_context["success"]) self.assertTrue(os.path.exists(final_context["output_file_path"])) self.assertEqual(os.path.getsize(final_context["output_file_path"]), 920374) self.assertTrue( no_op.check_output_quality(final_context["output_file_path"]))
def test_convert_illumina_bad_cols(self): """ In future, this test may be deprecated. For now it just alerts that it needs attention. """ job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() # ex: # ILMN_1885639 10.0000 0.7931 11.0000 0.123 # ILMN_2209417 10.0000 0.2029 11.1234 0.543 # LMN_1765401 152.0873 0.0000 99.999 0.19 og_file = OriginalFile() og_file.source_filename = ( "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1-modified.txt" ) og_file.filename = "GSM1089291-tbl-1-modified.txt" og_file.absolute_file_path = ( "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1-modified.txt" ) og_file.is_downloaded = True og_file.save() homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) homo_sapiens.save() sample = Sample() sample.accession_code = "GSM557500" sample.title = "GSM557500" sample.platform_accession_code = "A-MEXP-1171" sample.manufacturer = "ILLUMINA" sample.organism = homo_sapiens sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() final_context = no_op.no_op_processor(job.pk) self.assertFalse(final_context["success"]) self.assertTrue("Tell Rich!" in final_context["job"].failure_reason)
def test_download_file(self, mock_send_job): mock_send_job.return_value = None dlj = DownloaderJob() dlj.save() og = OriginalFile() og.source_filename = "Aegilops_tauschii.ASM34733v1.37.gtf.gz" og.source_url = self.gtf_download_url og.is_archive = True og.save() assoc = DownloaderJobOriginalFileAssociation() assoc.downloader_job = dlj assoc.original_file = og assoc.save() transcriptome_index.download_transcriptome(dlj.pk)
def prepare_ba_job(): pj = ProcessorJob() pj.pipeline_applied = "AFFY_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip" og_file.filename = "GSM1426071_CD_colon_active_1.CEL" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL" og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() return pj
def prepare_agilent_twocolor_job(): pj = ProcessorJob() pj.pipeline_applied = "AGILENT_TWOCOLOR_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE22900&format=file" og_file.filename = "GSM466597_95899_agilent.txt" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/AGILENT_TWOCOLOR/GSM466597_95899_agilent.txt" og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() return pj