def prepare_original_files(length: str) -> List[OriginalFile]: og_file = OriginalFile() og_file.source_filename = "aegilops_tauschii_" + length + ".fa.gz" og_file.filename = "aegilops_tauschii_" + length + ".fa.gz" og_file.absolute_file_path = ( "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/" "AEGILOPS_TAUSCHII/aegilops_tauschii_short.fa.gz") og_file.source_url = ( "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/fasta/" "aegilops_tauschii/dna/Aegilops_tauschii.ASM34733v1.dna.toplevel.fa.gz" ) og_file.is_downloaded = True # We need to add the URL here so that _extract_assembly_information works properly og_file.source_url = "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/fasta/aegilops_tauschii/dna/Aegilops_tauschii.ASM34733v1.dna.toplevel.fa.gz" og_file.save() og_file2 = OriginalFile() og_file2.source_filename = "aegilops_tauschii_" + length + ".gtf.gz" og_file2.filename = "aegilops_tauschii_" + length + ".gtf.gz" og_file2.absolute_file_path = ( "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/" "AEGILOPS_TAUSCHII/aegilops_tauschii_short.gtf.gz") og_file2.source_url = ( "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/gtf/" "aegilops_tauschii/Aegilops_tauschii.ASM34733v1.39.gtf.gz") og_file2.is_downloaded = True # We need to add the URL here so that _extract_assembly_information works properly og_file2.source_url = "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/gtf/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.39.gtf.gz" og_file2.save() return [og_file, og_file2]
def _generate_files(self, species: Dict) -> None: url_builder = ensembl_url_builder_factory(species) fasta_download_url = url_builder.build_transcriptome_url() gtf_download_url = url_builder.build_gtf_url() # Getting the object will ensure it is created in the DB. Organism.get_or_create_object_for_id(url_builder.taxonomy_id) all_new_files = [] fasta_filename = url_builder.filename_species + ".fa.gz" original_file = OriginalFile() original_file.source_filename = fasta_filename original_file.source_url = fasta_download_url original_file.is_archive = True original_file.is_downloaded = False original_file.save() all_new_files.append(original_file) gtf_filename = url_builder.filename_species + ".gtf.gz" original_file = OriginalFile() original_file.source_filename = gtf_filename original_file.source_url = gtf_download_url original_file.is_archive = True original_file.is_downloaded = False original_file.save() all_new_files.append(original_file) return all_new_files
def _generate_files(self, species: Dict) -> None: url_builder = ensembl_url_builder_factory(species) fasta_download_url = url_builder.build_transcriptome_url() gtf_download_url = url_builder.build_gtf_url() platform_accession_code = species.pop("division") self._clean_metadata(species) all_new_files = [] fasta_filename = url_builder.filename_species + ".fa.gz" original_file = OriginalFile() original_file.source_filename = fasta_filename original_file.source_url = fasta_download_url original_file.is_archive = True original_file.is_downloaded = False original_file.save() all_new_files.append(original_file) gtf_filename = url_builder.filename_species + ".gtf.gz" original_file = OriginalFile() original_file.source_filename = gtf_filename original_file.source_url = gtf_download_url original_file.is_archive = True original_file.is_downloaded = False original_file.save() all_new_files.append(original_file) return all_new_files
def prepare_job(): pj = ProcessorJob() pj.pipeline_applied = "SALMON" pj.save() c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS") samp = Sample() samp.accession_code = "SALMON" # So the test files go to the right place samp.organism = c_elegans samp.source_database = 'SRA' samp.technology = 'RNA-SEQ' samp.save() prepare_organism_indices() og_file = OriginalFile() og_file.source_filename = "ERR1562482_1.fastq.gz" og_file.filename = "ERR1562482_1.fastq.gz" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/ERR1562482_1.fastq.gz" og_file.is_downloaded = True og_file.save() og_file2 = OriginalFile() og_file2.source_filename = "ERR1562482_2.fastq.gz" og_file2.filename = "ERR1562482_2.fastq.gz" og_file2.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/ERR1562482_2.fastq.gz" og_file2.is_downloaded = True og_file2.save() og_file_samp_assoc = OriginalFileSampleAssociation() og_file_samp_assoc.original_file = og_file og_file_samp_assoc.sample = samp og_file_samp_assoc.save() og_file_samp_assoc2 = OriginalFileSampleAssociation() og_file_samp_assoc2.original_file = og_file2 og_file_samp_assoc2.sample = samp og_file_samp_assoc2.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file2 assoc1.processor_job = pj assoc1.save() return pj, [og_file, og_file2]
def prepare_job(length): pj = ProcessorJob() pj.pipeline_applied = "TRANSCRIPTOME_INDEX_" + length.upper() pj.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=1001) samp = Sample() samp.organism = homo_sapiens samp.accession_code = "derp" + length samp.save() og_file = OriginalFile() og_file.source_filename = "aegilops_tauschii_" + length + ".fa.gz" og_file.filename = "aegilops_tauschii_" + length + ".fa.gz" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.fa.gz" og_file.is_downloaded = True og_file.save() og_file2 = OriginalFile() og_file2.source_filename = "aegilops_tauschii_" + length + ".gtf.gz" og_file2.filename = "aegilops_tauschii_" + length + ".gtf.gz" og_file2.absolute_file_path = "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.gtf.gz" og_file2.is_downloaded = True og_file2.save() og_file_samp_assoc = OriginalFileSampleAssociation() og_file_samp_assoc.original_file = og_file og_file_samp_assoc.sample = samp og_file_samp_assoc.save() og_file_samp_assoc2 = OriginalFileSampleAssociation() og_file_samp_assoc2.original_file = og_file2 og_file_samp_assoc2.sample = samp og_file_samp_assoc2.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() assoc2 = ProcessorJobOriginalFileAssociation() assoc2.original_file = og_file2 assoc2.processor_job = pj assoc2.save() return pj
def _get_actual_file_if_queueable( extracted_subfile: Dict, original_file: OriginalFile, samples: List[Sample]) -> OriginalFile: """Returns the actual file from the archive if it should be queued. If the file has been processed or has an unstarted DownloaderJob, None will be returned. `extracted_subfile` should be a Dict containing metadata about the file that was extracted from an archive. `original_file` should be the file associated with the CURRENT DownloaderJob. `samples` are the samples that the actual file should be associated with if it has to be created. """ # Check to see if we've made this original file before: potential_existing_files = OriginalFile.objects.filter( source_filename=original_file.source_filename, filename=extracted_subfile['filename'], is_archive=False ) if potential_existing_files.count() > 0: # We've already created this record, let's see if we actually # needed to download it or if we just got it because we needed # a file in the same archive. actual_file = potential_existing_files[0] if actual_file.needs_processing(): if not actual_file.is_downloaded: actual_file.is_downloaded = True actual_file.save() return actual_file else: return None else: actual_file = OriginalFile() actual_file.is_downloaded = True actual_file.is_archive = False actual_file.absolute_file_path = extracted_subfile['absolute_path'] actual_file.filename = extracted_subfile['filename'] actual_file.calculate_size() actual_file.calculate_sha1() actual_file.has_raw = True actual_file.source_url = original_file.source_url actual_file.source_filename = original_file.source_filename actual_file.save() for sample in samples: original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.sample = sample original_file_sample_association.original_file = actual_file original_file_sample_association.save() return actual_file
def prepare_illumina_job(organism): pj = ProcessorJob() pj.pipeline_applied = "ILLUMINA_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427%5Fnon%2Dnormalized%2Etxt.gz" og_file.filename = "GSE22427_non-normalized.txt" og_file.absolute_file_path = ( "/home/user/data_store/raw/TEST/ILLUMINA/GSE22427_non-normalized.txt") og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() sample_names = [ "LV-C&si-Control-1", "LV-C&si-Control-2", "LV-C&si-Control-3", "LV-C&si-EZH2-1", "LV-C&si-EZH2-2", "LV-C&si-EZH2-3", "LV-EZH2&si-EZH2-1", "LV-EZH2&si-EZH2-2", "LV-EZH2&si-EZH2-3", "LV-T350A&si-EZH2-1", "LV-T350A&si-EZH2-2", "LV-T350A&si-EZH2-3", ] for name in sample_names: sample = Sample() sample.accession_code = name sample.title = name sample.organism = organism sample.save() sa = SampleAnnotation() sa.sample = sample sa.data = {"description": [name]} sa.is_ccdl = False sa.save() sample_assoc = OriginalFileSampleAssociation() sample_assoc.original_file = og_file sample_assoc.sample = sample sample_assoc.save() sample = Sample.objects.get(title="LV-T350A&si-EZH2-3") sample.title = "ignoreme_for_description" sample.accession_code = "ignoreme_for_description" sample.save() return pj
def _make_original_file_with_contents(contents: str) -> OriginalFile: _, path = tempfile.mkstemp(suffix=".txt") with open(path, "w") as f: f.write(contents) og_file = OriginalFile() og_file.source_filename = path og_file.filename = os.path.basename(path) og_file.absolute_file_path = os.path.realpath(path) og_file.is_downloaded = True og_file.save() return og_file
def prepare_illumina_job(job_info: Dict) -> ProcessorJob: pj = ProcessorJob() pj.pipeline_applied = "ILLUMINA_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = job_info["source_filename"] og_file.filename = job_info["filename"] og_file.absolute_file_path = job_info["absolute_file_path"] og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() for s in job_info["samples"]: # For convenience, if you give a list of strings we'll just use the # strings as both titles and accessions. annotation = None if type(s) == str: accession_code = s title = s elif type(s) == tuple and list(map(type, s)) == [str, str]: accession_code, title = s elif type(s) == tuple and list(map(type, s)) == [str, str, dict]: accession_code, title, annotation = s else: raise ValueError(f"Invalid sample type for sample {s}") sample = Sample() sample.accession_code = accession_code sample.title = title sample.organism = job_info["organism"] sample.save() sa = SampleAnnotation() sa.sample = sample sa.data = annotation if annotation is not None else { "description": [title] } sa.is_ccdl = False sa.save() sample_assoc = OriginalFileSampleAssociation() sample_assoc.original_file = og_file sample_assoc.sample = sample sample_assoc.save() return pj
def test_good_detection(self): """GSE54661 appears to be mislabled (illuminaHumanv4) on GEO. Shows our detector works. """ from data_refinery_workers.processors import illumina pj = ProcessorJob() pj.pipeline_applied = "ILLUMINA_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz" og_file.filename = "GSE54661_non_normalized.txt" og_file.absolute_file_path = ( "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt" ) og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) organism.save() sample = Sample() sample.accession_code = "ABCD-1234" sample.title = "hypoxia_Signal" sample.organism = organism sample.save() sample_assoc = OriginalFileSampleAssociation() sample_assoc.original_file = og_file sample_assoc.sample = sample sample_assoc.save() final_context = illumina.illumina_to_pcl(pj.pk) self.assertEqual(final_context["platform"], "illuminaHumanv3") for key in final_context["samples"][0].sampleannotation_set.all( )[0].data.keys(): self.assertTrue(key in [ "detected_platform", "detection_percentage", "mapped_percentage" ]) # Cleanup after the job since it won't since we aren't running in cloud. shutil.rmtree(final_context["work_dir"], ignore_errors=True)
def delete_if_blacklisted(original_file: OriginalFile) -> OriginalFile: extension = original_file.filename.split(".")[-1] if extension.lower() in BLACKLISTED_EXTENSIONS: logger.debug( "Original file had a blacklisted extension of %s, skipping", extension, original_file=original_file.id) original_file.delete_local_file() original_file.is_downloaded = False original_file.save() return None return original_file
def test_convert_illumina_no_header(self): job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() # ex: # ILMN_1885639 10.0000 0.7931 # ILMN_2209417 10.0000 0.2029 # ILMN_1765401 152.0873 0.0000 og_file = OriginalFile() og_file.source_filename = ( "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1.txt" ) og_file.filename = "GSM1089291-tbl-1.txt" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1.txt" og_file.is_downloaded = True og_file.save() homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) homo_sapiens.save() sample = Sample() sample.accession_code = "GSM557500" sample.title = "GSM557500" sample.platform_accession_code = "A-MEXP-1171" sample.manufacturer = "ILLUMINA" sample.organism = homo_sapiens sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() # To: # ENSG00000105675 10 # ENSG00000085721 152.0873 # ENSG00000278494 152.0873 final_context = no_op.no_op_processor(job.pk) self.assertTrue(final_context["success"]) self.assertTrue(os.path.exists(final_context["output_file_path"])) self.assertEqual(os.path.getsize(final_context["output_file_path"]), 786207)
def test_convert_processed_illumina(self): job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() # ex: # Reporter Identifier VALUE Detection Pval # ILMN_1343291 14.943602 0 # ILMN_1343295 13.528082 0 og_file = OriginalFile() og_file.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-22433/" og_file.filename = "GSM557500_sample_table.txt" og_file.absolute_file_path = ( "/home/user/data_store/raw/TEST/NO_OP/GSM557500_sample_table.txt") og_file.is_downloaded = True og_file.save() homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) homo_sapiens.save() sample = Sample() sample.accession_code = "GSM557500" sample.title = "GSM557500" sample.platform_accession_code = "A-MEXP-1171" sample.manufacturer = "ILLUMINA" sample.organism = homo_sapiens sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() # To: # ENSG00000156508 14.943602 # ENSG00000111640 13.528082 final_context = no_op.no_op_processor(job.pk) self.assertTrue(final_context["success"]) self.assertTrue(os.path.exists(final_context["output_file_path"])) self.assertEqual(os.path.getsize(final_context["output_file_path"]), 920374) self.assertTrue( no_op.check_output_quality(final_context["output_file_path"]))
def test_convert_illumina_bad_cols(self): """ In future, this test may be deprecated. For now it just alerts that it needs attention. """ job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() # ex: # ILMN_1885639 10.0000 0.7931 11.0000 0.123 # ILMN_2209417 10.0000 0.2029 11.1234 0.543 # LMN_1765401 152.0873 0.0000 99.999 0.19 og_file = OriginalFile() og_file.source_filename = ( "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1-modified.txt" ) og_file.filename = "GSM1089291-tbl-1-modified.txt" og_file.absolute_file_path = ( "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1-modified.txt" ) og_file.is_downloaded = True og_file.save() homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) homo_sapiens.save() sample = Sample() sample.accession_code = "GSM557500" sample.title = "GSM557500" sample.platform_accession_code = "A-MEXP-1171" sample.manufacturer = "ILLUMINA" sample.organism = homo_sapiens sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() final_context = no_op.no_op_processor(job.pk) self.assertFalse(final_context["success"]) self.assertTrue("Tell Rich!" in final_context["job"].failure_reason)
def prepare_agilent_twocolor_job(): pj = ProcessorJob() pj.pipeline_applied = "AGILENT_TWOCOLOR_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE22900&format=file" og_file.filename = "GSM466597_95899_agilent.txt" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/AGILENT_TWOCOLOR/GSM466597_95899_agilent.txt" og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() return pj
def prepare_non_ba_job(): pj = ProcessorJob() pj.pipeline_applied = "AFFY_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM45nnn/GSM45588/suppl/GSM45588.CEL.gz" og_file.filename = "GSM45588.CEL" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM45588.CEL" og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() return pj
def prepare_ba_job(): pj = ProcessorJob() pj.pipeline_applied = "AFFY_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip" og_file.filename = "GSM1426071_CD_colon_active_1.CEL" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL" og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() return pj
def prepare_job(job_info: dict) -> ProcessorJob: job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() og_file = OriginalFile() og_file.source_filename = job_info["source_filename"] og_file.filename = job_info["filename"] og_file.absolute_file_path = job_info["absolute_file_path"] og_file.is_downloaded = True og_file.save() sample = Sample() sample.accession_code = job_info["accession_code"] sample.title = job_info["accession_code"] sample.platform_accession_code = job_info["platform_accession_code"] manufacturer = job_info.get("manufacturer", None) if manufacturer is not None: sample.manufacturer = manufacturer # The illumina samples need the human organism if manufacturer == "ILLUMINA": homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) homo_sapiens.save() sample.organism = homo_sapiens sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() return job
def test_queue_downloader_jobs_for_original_files(self, mock_send_task): """Make sure that queue_downloader_jobs queues all expected Downloader jobs for a given experiment. """ # First, create an experiment with two samples associated with it # and create two original files for each of those samples. experiment_object = Experiment() experiment_object.accession_code = "Experiment1" experiment_object.save() sample_object_1 = Sample() sample_object_1.accession_code = "Sample1" sample_object_1.platform_accession_code = "Illumina Genome Analyzer" sample_object_1.platform_accession_name = "Illumina Genome Analyzer" sample_object_1.technology = "RNA-SEQ" sample_object_1.manufacturer = "ILLUMINA" sample_object_1.source_database = "SRA" sample_object_1.save() sample_object_2 = Sample() sample_object_2.accession_code = "Sample2" sample_object_2.platform_accession_code = "Illumina Genome Analyzer" sample_object_2.platform_accession_name = "Illumina Genome Analyzer" sample_object_2.technology = "RNA-SEQ" sample_object_2.manufacturer = "ILLUMINA" sample_object_2.source_database = "SRA" sample_object_2.save() association = ExperimentSampleAssociation() association.experiment = experiment_object association.sample = sample_object_1 association.save() association = ExperimentSampleAssociation() association.experiment = experiment_object association.sample = sample_object_2 association.save() sample_1_original_files = [] sample_2_original_files = [] original_file = OriginalFile() original_file.source_url = "first_url" original_file.source_filename = "first_filename" original_file.is_downloaded = False original_file.has_raw = True original_file.save() sample_1_original_files.append(original_file) original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.original_file = original_file original_file_sample_association.sample = sample_object_1 original_file_sample_association.save() original_file = OriginalFile() original_file.source_url = "second_url" original_file.source_filename = "second_filename" original_file.is_downloaded = False original_file.has_raw = True original_file.save() sample_2_original_files.append(original_file) original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.original_file = original_file original_file_sample_association.sample = sample_object_1 original_file_sample_association.save() original_file = OriginalFile() original_file.source_url = "third_url" original_file.source_filename = "third_filename" original_file.is_downloaded = False original_file.has_raw = True original_file.save() sample_2_original_files.append(original_file) original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.original_file = original_file original_file_sample_association.sample = sample_object_2 original_file_sample_association.save() original_file = OriginalFile() original_file.source_url = "fourth_url" original_file.source_filename = "fourth_filename" original_file.is_downloaded = False original_file.has_raw = True original_file.save() sample_2_original_files.append(original_file) original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.original_file = original_file original_file_sample_association.sample = sample_object_2 original_file_sample_association.save() survey_job = SurveyJob(source_type="SRA") survey_job.save() surveyor = SraSurveyor(survey_job) surveyor.queue_downloader_job_for_original_files( sample_1_original_files, experiment_object.accession_code ) surveyor.queue_downloader_job_for_original_files( sample_2_original_files, experiment_object.accession_code ) self.assertEqual(DownloaderJob.objects.all().count(), 2)
def create_samples_from_api(self, experiment: Experiment, platform_dict: Dict) -> List[Sample]: """Generates a Sample item for each sample in an AE experiment. There are many possible data situations for a sample: - If the sample only has raw data available: - If it is on a platform that we support: Download this raw data and process it - If it is not on a platform we support: Don't download anything, don't process anything - If the sample has both raw and derived data: - If the raw data is on a platform we support: Download the raw data and process it, abandon the derived data - If the raw data is not on a platform we support Download the derived data and no-op it, abandon the raw data - If the sample only has derived data: Download the derived data and no-op it. See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/samples """ created_samples = [] samples_endpoint = SAMPLES_URL.format(experiment.accession_code) r = utils.requests_retry_session().get(samples_endpoint, timeout=60) samples = r.json()["experiment"]["sample"] # The SDRF is the complete metadata record on a sample/property basis. # We run this through our harmonizer and then attach the properties # to our created samples. SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt" sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment.accession_code) sdrf_samples = harmony.parse_sdrf(sdrf_url) harmonized_samples = harmony.harmonize(sdrf_samples) # An experiment can have many samples for sample_data in samples: # For some reason, this sample has no files associated with it. if "file" not in sample_data or len(sample_data['file']) == 0: continue # Each sample is given an experimenatlly-unique title. flat_sample = utils.flatten(sample_data) title = harmony.extract_title(flat_sample) # A sample may actually have many sub files. # If there is raw data, take that. # If not, take the derived. has_raw = False for sub_file in sample_data['file']: # For ex: E-GEOD-15645 if isinstance(sub_file['comment'], list): sub_file_mod = sub_file sub_file_mod['comment'] = sub_file['comment'][0] else: sub_file_mod = sub_file # Some have the 'data' field, but not the actual data # Ex: E-GEOD-9656 if sub_file_mod['type'] == "data" and sub_file_mod[ 'comment'].get('value', None) != None: has_raw = True if 'raw' in sub_file_mod['comment'].get('value', ''): has_raw = True skip_sample = False for sub_file in sample_data['file']: # Don't get the raw data if it's only a 1-color sample. if 'Cy3' in str(sample_data) and 'Cy5' not in str(sample_data): has_raw = False # Skip derived data if we have it raw. if has_raw and "derived data" in sub_file['type']: continue download_url = None filename = sub_file["name"] # sub_file["comment"] is only a list if there's # more than one comment... comments = sub_file["comment"] if isinstance(comments, list): # Could be: "Derived ArrayExpress Data Matrix FTP # file" or: "ArrayExpress FTP file". If there is # no comment with a name including "FTP file" then # we don't know where to download it so we need to # mark this job as an error. Therefore don't catch # the potential exception where download_url # doesn't get defined. for comment in comments: if "FTP file" in comment["name"]: download_url = comment["value"] break else: download_url = comments["value"] if not download_url: logger.error( "Sample %s did not specify a download url, skipping.", sample_accession_code, experiment_accession_code=experiment.accession_code, survey_job=self.survey_job.id, sub_file=sub_file) skip_sample = True continue if not filename: logger.error( "Sample %s did not specify a filename, skipping.", sample_accession_code, experiment_accession_code=experiment.accession_code, survey_job=self.survey_job.id, sub_file=sub_file) skip_sample = True continue if skip_sample: continue # The accession code is not a simple matter to determine. sample_source_name = sample_data["source"].get("name", "") sample_assay_name = sample_data["assay"].get("name", "") sample_accession_code = self.determine_sample_accession( experiment.accession_code, sample_source_name, sample_assay_name, filename) # Figure out the Organism for this sample organism_name = UNKNOWN for characteristic in sample_data["characteristic"]: if characteristic["category"].upper() == "ORGANISM": organism_name = characteristic["value"].upper() if organism_name == UNKNOWN: logger.error( "Sample %s did not specify the organism name.", sample_accession_code, experiment_accession_code=experiment.accession_code, survey_job=self.survey_job.id) organism = None continue else: organism = Organism.get_object_for_name(organism_name) # Create the sample object try: # Associate it with the experiment, but since it # already exists it already has original files # associated with it and it's already been downloaded, # so don't add it to created_samples. sample_object = Sample.objects.get( accession_code=sample_accession_code) # If input experiment includes new protocol information, # update sample's protocol_info. existing_protocols = sample_object.protocol_info protocol_info, is_updated = self.update_sample_protocol_info( existing_protocols, experiment.protocol_description, experiment.source_url + '/protocols') if is_updated: sample_object.protocol_info = protocol_info sample_obejct.save() logger.debug( "Sample %s already exists, skipping object creation.", sample_accession_code, experiment_accession_code=experiment.accession_code, survey_job=self.survey_job.id) except Sample.DoesNotExist: sample_object = Sample() # The basics sample_object.source_database = "ARRAY_EXPRESS" sample_object.title = title sample_object.accession_code = sample_accession_code sample_object.source_archive_url = samples_endpoint sample_object.organism = organism sample_object.platform_name = platform_dict[ "platform_accession_name"] sample_object.platform_accession_code = platform_dict[ "platform_accession_code"] sample_object.manufacturer = platform_dict["manufacturer"] sample_object.technology = "MICROARRAY" protocol_info, is_updated = self.update_sample_protocol_info( existing_protocols=[], experiment_protocol=experiment.protocol_description, protocol_url=experiment.source_url + '/protocols') # Do not check is_updated the first time because we must # save a list so we can append to it later. sample_object.protocol_info = protocol_info sample_object.save() # Directly assign the harmonized properties harmonized_sample = harmonized_samples[title] for key, value in harmonized_sample.items(): setattr(sample_object, key, value) sample_object.save() sample_annotation = SampleAnnotation() sample_annotation.data = sample_data sample_annotation.sample = sample_object sample_annotation.is_ccdl = False sample_annotation.save() original_file = OriginalFile() original_file.filename = filename original_file.source_filename = filename original_file.source_url = download_url original_file.is_downloaded = False original_file.is_archive = True original_file.has_raw = has_raw original_file.save() original_file_sample_association = OriginalFileSampleAssociation( ) original_file_sample_association.original_file = original_file original_file_sample_association.sample = sample_object original_file_sample_association.save() created_samples.append(sample_object) logger.debug( "Created " + str(sample_object), experiment_accession_code=experiment.accession_code, survey_job=self.survey_job.id, sample=sample_object.id) # Create associations if they don't already exist ExperimentSampleAssociation.objects.get_or_create( experiment=experiment, sample=sample_object) ExperimentOrganismAssociation.objects.get_or_create( experiment=experiment, organism=organism) return created_samples
def test_no_repeat_jobs(self): """Make sure that queue_downloader_jobs queues all expected Downloader jobs for a given experiment. """ # First, create an experiment with two samples associated with it # and create two original files for each of those samples. experiment_object = Experiment() experiment_object.accession_code = "Experiment1" experiment_object.save() sample_object = Sample() sample_object.accession_code = "Sample1" sample_object.platform_accession_code = "Illumina Genome Analyzer" sample_object.platform_accession_name = "Illumina Genome Analyzer" sample_object.technology = "RNA-SEQ" sample_object.manufacturer = "ILLUMINA" sample_object.source_database = "SRA" sample_object.save() original_file_1 = OriginalFile() original_file_1.source_url = "first_url" original_file_1.source_filename = "first_filename" original_file_1.is_downloaded = False original_file_1.has_raw = True original_file_1.save() original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.original_file = original_file_1 original_file_sample_association.sample = sample_object original_file_sample_association.save() original_file_2 = OriginalFile() original_file_2.source_url = "second_url" original_file_2.source_filename = "second_filename" original_file_2.is_downloaded = False original_file_2.has_raw = True original_file_2.save() original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.original_file = original_file_2 original_file_sample_association.sample = sample_object original_file_sample_association.save() dlj = DownloaderJob() dlj.save() DownloaderJobOriginalFileAssociation( downloader_job=dlj, original_file=original_file_1 ).save() DownloaderJobOriginalFileAssociation( downloader_job=dlj, original_file=original_file_2 ).save() survey_job = SurveyJob(source_type="SRA") survey_job.save() surveyor = SraSurveyor(survey_job) surveyor.queue_downloader_job_for_original_files( [original_file_1, original_file_2], experiment_object.accession_code ) # We made one DownloaderJob in this test, so # queue_downloader_job_for_original_files didn't have anything # to do, so there should still be only one: self.assertEqual(1, DownloaderJob.objects.all().count())
def test_convert_simple_pcl(self): """ """ job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() # ID_REF, VALUE og_file = OriginalFile() og_file.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-51013/" og_file.filename = "GSM1234847_sample_table.txt" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1234847_sample_table.txt" og_file.is_downloaded = True og_file.save() sample = Sample() sample.accession_code = "GSM1234847" sample.title = "GSM1234847" sample.platform_accession_code = 'A-AFFY-38' sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() final_context = no_op.no_op_processor(job.pk) # No header - ex # AFFX-BioB-3_at 0.74218756 og_file = OriginalFile() og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE10nnn/GSE10188/miniml/GSE10188_family.xml.tgz" og_file.filename = "GSM269747-tbl-1.txt" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM269747-tbl-1.txt" og_file.is_downloaded = True og_file.save() sample = Sample() sample.accession_code = "GSM269747" sample.title = "GSM269747" sample.platform_accession_code = 'GPL1319' sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() final_context = no_op.no_op_processor(job.pk) self.assertTrue(final_context['success']) self.assertTrue(os.path.exists(final_context['output_file_path'])) self.assertEqual(os.path.getsize(final_context['output_file_path']), 346535)
def test_create_missing_jobs(self): """Tests that files which should have downloader jobs get them created.""" # 1. create a sample with an original file and a downloader job original_file_with_downloader = OriginalFile() original_file_with_downloader.filename = "processed.CEL" original_file_with_downloader.source_filename = "processed.CEL" original_file_with_downloader.is_downloaded = True original_file_with_downloader.is_archive = False original_file_with_downloader.save() sample_with_downloader = Sample() sample_with_downloader.accession_code = "MA_doesnt_need_processor" sample_with_downloader.technology = "MICROARRAY" sample_with_downloader.source_database = "GEO" sample_with_downloader.platform_accession_code = "bovine" sample_with_downloader.save() OriginalFileSampleAssociation.objects.get_or_create( sample=sample_with_downloader, original_file=original_file_with_downloader) downloader_job = DownloaderJob() downloader_job.success = True downloader_job.worker_id = "worker_1" downloader_job.volume_index = "1" downloader_job.save() DownloaderJobOriginalFileAssociation.objects.get_or_create( downloader_job=downloader_job, original_file=original_file_with_downloader) # 2. create a sample with an original file and no downloader job original_file = OriginalFile() original_file.filename = "tarball.gz" original_file.source_filename = "tarball.gz" original_file.is_downloaded = True original_file.is_archive = True original_file.save() sample_no_downloader = Sample() sample_no_downloader.accession_code = "sample_no_downloader" sample_no_downloader.technology = "MICROARRAY" sample_no_downloader.source_database = "GEO" sample_no_downloader.platform_accession_code = "bovine" # must be a supported platform sample_no_downloader.save() OriginalFileSampleAssociation.objects.get_or_create( sample=sample_no_downloader, original_file=original_file) # 3. Setup is done, actually run the command. command = Command() command.handle() ## Test that a missing downloader job was created. self.assertEqual( 1, DownloaderJobOriginalFileAssociation.objects.filter( original_file=original_file).count(), ) ## Test that a downloader job that wasn't missing wasn't created. ## Of course, we created one in test setup, so we're really ## checking that it's still only 1. self.assertEqual( 1, DownloaderJobOriginalFileAssociation.objects.filter( original_file=original_file_with_downloader).count(), )
def test_create_missing_jobs(self): """Tests that files which should have processor jobs get them created. Specifically files that fall into this category are files that had successful downloader jobs but for some reason do not have processor jobs. It's not yet known why this is happening, but part of this management command is logging about them to get a grasp of how many there are. We want this test to cover both Microarray and RNA-Seq. We also need to test both that files which need processor jobs have them created, but also that files which don't need them don't get them created. Therefore we need at least 4 original files: * Microarray needing processor job. * Microarray not needing processor job. * RNA-Seq needing processor job. * RNA-Seq not needing processor job. However Microarray can have files which shouldn't get processor jobs, so we're going to make one of those as well. Also Microarray jobs can download multiple files which get a processor job each, so we're going to make an additional Microarray file and associate it with the same downloader job so we can make sure two processor jobs are created based on that one downloader job. """ # Microarray File/Samples/Jobs ma_og_doesnt_need_processor = OriginalFile() ma_og_doesnt_need_processor.filename = "processed.CEL" ma_og_doesnt_need_processor.is_downloaded = True ma_og_doesnt_need_processor.is_archive = False ma_og_doesnt_need_processor.save() ma_sample_doesnt_need_processor = Sample() ma_sample_doesnt_need_processor.accession_code = "MA_doesnt_need_processor" ma_sample_doesnt_need_processor.save() OriginalFileSampleAssociation.objects.get_or_create( sample=ma_sample_doesnt_need_processor, original_file=ma_og_doesnt_need_processor) ma_dl_job_doesnt_need_processor = DownloaderJob() ma_dl_job_doesnt_need_processor.success = True ma_dl_job_doesnt_need_processor.worker_id = "worker_1" ma_dl_job_doesnt_need_processor.volume_index = "1" ma_dl_job_doesnt_need_processor.save() DownloaderJobOriginalFileAssociation.objects.get_or_create( downloader_job=ma_dl_job_doesnt_need_processor, original_file=ma_og_doesnt_need_processor) ma_processor_job = ProcessorJob() ma_processor_job.success = True ma_processor_job.worker_id = "worker_1" ma_dl_job_doesnt_need_processor.volume_index = "1" ma_processor_job.save() ProcessorJobOriginalFileAssociation.objects.get_or_create( processor_job=ma_processor_job, original_file=ma_og_doesnt_need_processor) ma_og_needs_processor_1 = OriginalFile() ma_og_needs_processor_1.filename = "something.CEL" ma_og_needs_processor_1.is_downloaded = True ma_og_needs_processor_1.is_archive = False ma_og_needs_processor_1.save() ma_og_needs_processor_2 = OriginalFile() ma_og_needs_processor_2.filename = "something_else.CEL" ma_og_needs_processor_2.is_downloaded = True ma_og_needs_processor_2.is_archive = False ma_og_needs_processor_2.save() ma_og_archive = OriginalFile() ma_og_archive.filename = "tarball.gz" ma_og_archive.is_downloaded = True ma_og_archive.is_archive = True ma_og_archive.save() ma_sample_needs_processor_1 = Sample() ma_sample_needs_processor_1.accession_code = "MA_needs_processor_1" ma_sample_needs_processor_1.save() OriginalFileSampleAssociation.objects.get_or_create( sample=ma_sample_needs_processor_1, original_file=ma_og_needs_processor_1) OriginalFileSampleAssociation.objects.get_or_create( sample=ma_sample_needs_processor_1, original_file=ma_og_archive) ma_sample_needs_processor_2 = Sample() ma_sample_needs_processor_2.accession_code = "MA_needs_processor_2" ma_sample_needs_processor_2.save() OriginalFileSampleAssociation.objects.get_or_create( sample=ma_sample_needs_processor_2, original_file=ma_og_needs_processor_2) OriginalFileSampleAssociation.objects.get_or_create( sample=ma_sample_needs_processor_2, original_file=ma_og_archive) ma_dl_job_needs_processor = DownloaderJob() ma_dl_job_needs_processor.success = True ma_dl_job_needs_processor.worker_id = "worker_1" ma_dl_job_doesnt_need_processor.volume_index = "1" ma_dl_job_needs_processor.save() DownloaderJobOriginalFileAssociation.objects.get_or_create( downloader_job=ma_dl_job_needs_processor, original_file=ma_og_needs_processor_1) DownloaderJobOriginalFileAssociation.objects.get_or_create( downloader_job=ma_dl_job_needs_processor, original_file=ma_og_needs_processor_2) DownloaderJobOriginalFileAssociation.objects.get_or_create( downloader_job=ma_dl_job_needs_processor, original_file=ma_og_archive) # RNA-Seq File/Samples/Jobs rna_og_doesnt_need_processor = OriginalFile() rna_og_doesnt_need_processor.filename = "processed.fastq" rna_og_doesnt_need_processor.is_downloaded = True rna_og_doesnt_need_processor.is_archive = False rna_og_doesnt_need_processor.save() rna_sample_doesnt_need_processor = Sample() rna_sample_doesnt_need_processor.accession_code = "RNA_doesnt_need_processor" rna_sample_doesnt_need_processor.save() OriginalFileSampleAssociation.objects.get_or_create( sample=rna_sample_doesnt_need_processor, original_file=rna_og_doesnt_need_processor) rna_dl_job_doesnt_need_processor = DownloaderJob() rna_dl_job_doesnt_need_processor.success = True rna_dl_job_doesnt_need_processor.worker_id = "worker_1" rna_dl_job_doesnt_need_processor.volume_index = "1" rna_dl_job_doesnt_need_processor.save() DownloaderJobOriginalFileAssociation.objects.get_or_create( downloader_job=rna_dl_job_doesnt_need_processor, original_file=rna_og_doesnt_need_processor) rna_processor_job = ProcessorJob() # Failed ProcessorJobs will be retried, so they still count. rna_processor_job.success = False rna_processor_job.worker_id = "worker_1" rna_dl_job_doesnt_need_processor.volume_index = "1" rna_processor_job.save() ProcessorJobOriginalFileAssociation.objects.get_or_create( processor_job=rna_processor_job, original_file=rna_og_doesnt_need_processor) rna_og_needs_processor = OriginalFile() rna_og_needs_processor.filename = "something.fastq" rna_og_needs_processor.is_downloaded = True rna_og_needs_processor.is_archive = False rna_og_needs_processor.save() rna_sample_needs_processor = Sample() rna_sample_needs_processor.accession_code = "RNA_needs_processor" rna_sample_needs_processor.save() OriginalFileSampleAssociation.objects.get_or_create( sample=rna_sample_needs_processor, original_file=rna_og_needs_processor) rna_dl_job_needs_processor = DownloaderJob() rna_dl_job_needs_processor.success = True rna_dl_job_needs_processor.worker_id = "worker_1" rna_dl_job_doesnt_need_processor.volume_index = "1" rna_dl_job_needs_processor.save() DownloaderJobOriginalFileAssociation.objects.get_or_create( downloader_job=rna_dl_job_needs_processor, original_file=rna_og_needs_processor) # Setup is done, actually run the command. command = Command() command.handle() # Test Microarray was handled correctly. ## Test that a missing processor job was created. self.assertEqual( 1, ProcessorJobOriginalFileAssociation.objects.filter( original_file=ma_og_needs_processor_1).count()) self.assertEqual( 1, ProcessorJobOriginalFileAssociation.objects.filter( original_file=ma_og_needs_processor_2).count()) self.assertEqual( 0, ProcessorJobOriginalFileAssociation.objects.filter( original_file=ma_og_archive).count()) ## Test that a processor job that wasn't missing wasn't created. ## Of course, we created one in test setup, so we're really ## checking that it's still only 1. self.assertEqual( 1, ProcessorJobOriginalFileAssociation.objects.filter( original_file=ma_og_doesnt_need_processor).count()) # Test Microarray was handled correctly. ## Test that the missing processor job was created. self.assertEqual( 1, ProcessorJobOriginalFileAssociation.objects.filter( original_file=rna_og_needs_processor).count()) ## Test that a processor job that wasn't missing wasn't created. ## Of course, we created one in test setup, so we're really ## checking that it's still only 1. self.assertEqual( 1, ProcessorJobOriginalFileAssociation.objects.filter( original_file=rna_og_doesnt_need_processor).count())
def download_geo(job_id: int) -> None: """The main function for the GEO Downloader. Downloads a single tar file containing the files representing samples relating to a single experiment stored in GEO. """ job = utils.start_job(job_id) accession_code = job.accession_code original_file = job.original_files.first() if not original_file: job.failure_reason = "No files associated with the job." logger.error("No files associated with the job.", downloader_job=job_id) utils.end_downloader_job(job, success=False) return url = original_file.source_url related_samples = original_file.samples.exclude(technology="RNA-SEQ") # First, download the sample archive URL. # Then, unpack all the ones downloaded. # Then create processor jobs! # The files for all of the samples are contained within the same zip file. Therefore only # download the one. os.makedirs(LOCAL_ROOT_DIR + "/" + accession_code, exist_ok=True) dl_file_path = LOCAL_ROOT_DIR + "/" + accession_code + "/" + url.split("/")[-1] logger.debug("Starting to download: " + url, job_id=job_id, accession_code=accession_code) _download_file(url, dl_file_path, job) original_file.absolute_file_path = dl_file_path original_file.is_downloaded = True original_file.save() unpacked_sample_files = [] try: # enumerate all files inside the archive archived_files = list(ArchivedFile(dl_file_path).get_files()) except FileExtractionError as e: job.failure_reason = e logger.exception( "Error occurred while extracting file.", path=dl_file_path, exception=str(e) ) utils.end_downloader_job(job, success=False) return for og_file in archived_files: sample = og_file.get_sample() # We don't want RNA-Seq data from GEO: # https://github.com/AlexsLemonade/refinebio/issues/966 if sample and sample.technology == "RNA-SEQ": logger.warn("RNA-Seq sample found in GEO downloader job.", sample=sample) continue if not sample and ( not og_file.is_processable() or og_file.experiment_accession_code() != accession_code ): # skip the files that we know are not processable and can't be associated with a sample # also skip the files were we couldn't find a sample and they don't mention the current experiment continue potential_existing_file = OriginalFile.objects.filter( source_filename=original_file.source_filename, filename=og_file.filename, is_archive=False, ).first() if potential_existing_file: # We've already created this record, let's see if we actually # needed to download it or if we just got it because we needed # a file in the same archive. if potential_existing_file.needs_processing(): if not potential_existing_file.is_downloaded: potential_existing_file.is_downloaded = True potential_existing_file.save() unpacked_sample_files.append(potential_existing_file) continue # Then this is a new file and we should create an original file for it actual_file = OriginalFile() actual_file.is_downloaded = True actual_file.is_archive = False actual_file.absolute_file_path = og_file.file_path actual_file.filename = og_file.filename actual_file.calculate_size() actual_file.calculate_sha1() actual_file.has_raw = True actual_file.source_url = original_file.source_url actual_file.source_filename = original_file.source_filename actual_file.save() # try to see if the file should be associated with a sample if sample: original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.sample = sample original_file_sample_association.original_file = actual_file original_file_sample_association.save() else: # if not, we can associate this file with all samples in the experiment for sample in related_samples: original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.sample = sample original_file_sample_association.original_file = actual_file original_file_sample_association.save() unpacked_sample_files.append(actual_file) if len(unpacked_sample_files) > 0: success = True logger.debug( "File downloaded and extracted successfully.", url=url, dl_file_path=dl_file_path, downloader_job=job_id, ) else: success = False logger.info( "Unable to extract any files.", url=url, dl_file_path=dl_file_path, downloader_job=job_id, ) job.failure_reason = "Failed to extract any downloaded files." if success: create_processor_jobs_for_original_files(unpacked_sample_files, job) if original_file.is_archive: original_file.delete_local_file() utils.end_downloader_job(job, success) return success
def download_geo(job_id: int) -> None: """The main function for the GEO Downloader. Downloads a single tar file containing the files representing samples relating to a single experiement stored in GEO. """ job = utils.start_job(job_id) file_assocs = DownloaderJobOriginalFileAssociation.objects.filter( downloader_job=job) original_file = file_assocs[0].original_file url = original_file.source_url accession_code = job.accession_code sample_assocs = OriginalFileSampleAssociation.objects.filter( original_file=original_file) related_samples = Sample.objects.filter( id__in=sample_assocs.values('sample_id')) # First, download the sample archive URL. # Then, unpack all the ones downloaded. # Then create processor jobs! # The files for all of the samples are # contained within the same zip file. Therefore only # download the one. os.makedirs(LOCAL_ROOT_DIR + '/' + accession_code, exist_ok=True) dl_file_path = LOCAL_ROOT_DIR + '/' + accession_code + '/' + url.split( '/')[-1] logger.debug("Starting to download: " + url, job_id=job_id, accession_code=accession_code) _download_file(url, dl_file_path, job) original_file.absolute_file_path = dl_file_path original_file.is_downloaded = True original_file.save() has_raw = True unpacked_sample_files = [] # These files are tarred, and also subsequently gzipped if '.tar' in dl_file_path: try: extracted_files = _extract_tar(dl_file_path, accession_code) except Exception as e: job.failure_reason = e logger.exception("Error occured while extracting tar file.", path=dl_file_path, exception=str(e)) utils.end_downloader_job(job, success=False) return for og_file in extracted_files: filename = og_file['filename'] if '_' in filename: sample_id = filename.split('_')[0] else: sample_id = filename.split('.')[0] try: sample = Sample.objects.get(accession_code=sample_id) except Exception as e: # We don't have this sample, but it's not a total failure. This happens. continue try: # Files from the GEO supplemental file are gzipped inside of the tarball. Great! archive_file = OriginalFile.objects.get( source_filename__contains=sample_id) archive_file.is_downloaded = True archive_file.is_archive = True archive_file.absolute_file_path = og_file['absolute_path'] archive_file.calculate_size() archive_file.calculate_sha1() archive_file.save() if '.gz' in og_file['filename']: extracted_subfile = _extract_gz(og_file['absolute_path'], accession_code) else: extracted_subfile = [og_file] actual_file = OriginalFile() actual_file.is_downloaded = True actual_file.is_archive = False actual_file.absolute_file_path = extracted_subfile[0][ 'absolute_path'] actual_file.filename = extracted_subfile[0]['filename'] actual_file.calculate_size() actual_file.calculate_sha1() actual_file.has_raw = True actual_file.source_url = original_file.source_url actual_file.source_filename = original_file.source_filename actual_file.save() original_file_sample_association = OriginalFileSampleAssociation( ) original_file_sample_association.sample = sample original_file_sample_association.original_file = actual_file original_file_sample_association.save() archive_file.delete_local_file() archive_file.is_downloaded = False archive_file.save() unpacked_sample_files.append(actual_file) except Exception as e: # TODO - is this worth failing a job for? logger.debug( "Found a file we didn't have an OriginalFile for! Why did this happen?: " + og_file['filename'], exc_info=1, file=og_file['filename'], sample_id=sample_id, accession_code=accession_code) # If we don't know why we have it, get rid of it. os.remove(og_file["absolute_path"]) # This is a .tgz file. elif '.tgz' in dl_file_path: # If this is the MINiML file, it has been preprocessed if '_family.xml.tgz' in dl_file_path: has_raw = False try: extracted_files = _extract_tgz(dl_file_path, accession_code) except Exception as e: job.failure_reason = e logger.exception("Error occured while extracting tgz file.", path=dl_file_path, exception=str(e)) utils.end_downloader_job(job, success=False) return for og_file in extracted_files: if '.txt' in og_file['filename']: try: gsm_id = og_file['filename'].split('-')[0] sample = Sample.objects.get(accession_code=gsm_id) except Exception as e: os.remove(og_file["absolute_path"]) continue actual_file = OriginalFile() actual_file.is_downloaded = True actual_file.is_archive = False actual_file.absolute_file_path = og_file['absolute_path'] actual_file.filename = og_file['filename'] actual_file.calculate_size() actual_file.calculate_sha1() actual_file.has_raw = has_raw actual_file.source_url = original_file.source_url actual_file.source_filename = original_file.source_filename actual_file.save() original_file_sample_association = OriginalFileSampleAssociation( ) original_file_sample_association.sample = sample original_file_sample_association.original_file = actual_file original_file_sample_association.save() unpacked_sample_files.append(actual_file) # These files are only gzipped. # These are generally the _actually_ raw (rather than the non-raw data in a RAW file) data elif '.gz' in dl_file_path: try: extracted_files = _extract_gz(dl_file_path, accession_code) except Exception as e: job.failure_reason = e logger.exception("Error occured while extracting gz file.", path=dl_file_path, exception=str(e)) utils.end_downloader_job(job, success=False) return for og_file in extracted_files: filename = og_file['filename'] sample_id = filename.split('.')[0] try: # The archive we downloaded archive_file = OriginalFile.objects.get( source_filename__contains=filename) archive_file.is_downloaded = True archive_file.is_archive = True archive_file.absolute_file_path = dl_file_path archive_file.calculate_size() archive_file.calculate_sha1() archive_file.save() actual_file = OriginalFile() actual_file.is_downloaded = True actual_file.is_archive = False actual_file.absolute_file_path = og_file['absolute_path'] actual_file.filename = og_file['filename'] actual_file.calculate_size() actual_file.calculate_sha1() actual_file.has_raw = True actual_file.source_url = original_file.source_url actual_file.source_filename = original_file.source_filename actual_file.save() for sample in related_samples: new_association = OriginalFileSampleAssociation() new_association.original_file = actual_file new_association.sample = sample new_association.save() archive_file.delete_local_file() archive_file.is_downloaded = False archive_file.save() unpacked_sample_files.append(actual_file) except Exception as e: logger.debug( "Found a file we didn't have an OriginalFile for! Why did this happen?: " + og_file['filename'], exc_info=1, file=og_file['filename'], sample_id=sample_id, accession_code=accession_code) os.remove(og_file["absolute_path"]) # This is probably just a .txt file else: filename = dl_file_path.split('/')[-1] sample_id = filename.split('_')[0] actual_file = OriginalFile() actual_file.is_downloaded = True actual_file.is_archive = False actual_file.absolute_file_path = dl_file_path actual_file.filename = filename actual_file.calculate_size() actual_file.calculate_sha1() actual_file.has_raw = True actual_file.source_url = original_file.source_url actual_file.source_filename = original_file.source_filename actual_file.save() for sample in related_samples: new_association = OriginalFileSampleAssociation() new_association.original_file = actual_file new_association.sample = sample new_association.save() unpacked_sample_files.append(actual_file) if len(unpacked_sample_files) > 0: success = True logger.debug("File downloaded and extracted successfully.", url=url, dl_file_path=dl_file_path, downloader_job=job_id) else: success = False logger.info("Unable to extract any files.", url=url, dl_file_path=dl_file_path, downloader_job=job_id) job.failure_reason = "Failed to extract any downloaded files." if success: utils.create_processor_jobs_for_original_files(unpacked_sample_files, job) if original_file.is_archive: original_file.delete_local_file() utils.end_downloader_job(job, success) return success