def prepare_job():
    """Build an AFFY_TO_PCL ProcessorJob wired to one CEL file and one
    (unprocessed) C. elegans sample, and return the saved job."""
    job = ProcessorJob(pipeline_applied="AFFY_TO_PCL")
    job.save()

    cel_file = OriginalFile(
        source_filename="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",
        filename="GSM1426071_CD_colon_active_1.CEL",
        absolute_file_path="/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL",
    )
    cel_file.save()

    job_file_link = ProcessorJobOriginalFileAssociation(
        original_file=cel_file, processor_job=job)
    job_file_link.save()

    worm = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    worm_sample = Sample(title="Heyo", organism=worm, is_processed=False)
    worm_sample.save()

    file_sample_link = OriginalFileSampleAssociation(
        sample=worm_sample, original_file=cel_file)
    file_sample_link.save()

    return job
def create_processor_job_for_original_files(original_files: List[OriginalFile],
                                            volume_index: int):
    """Create a processor job and queue a processor task for sample related
    to an experiment.

    If no pipeline applies, the local files are deleted and marked
    not-downloaded instead of queueing a job.
    """
    # If there's no original files then we've created all the jobs we need to!
    # (truthiness check instead of len() == 0 — same behavior, idiomatic)
    if not original_files:
        return

    # For anything that has raw data there should only be one Sample per OriginalFile
    sample_object = original_files[0].samples.first()
    pipeline_to_apply = determine_processor_pipeline(sample_object,
                                                     original_files[0])

    if pipeline_to_apply == ProcessorPipeline.NONE:
        logger.info("No valid processor pipeline found to apply to sample.",
                    sample=sample_object.id,
                    original_file=original_files[0].id)
        # Nothing can process these files, so drop them locally.
        for original_file in original_files:
            original_file.delete_local_file()
            original_file.is_downloaded = False
            original_file.save()
    else:
        processor_job = ProcessorJob()
        processor_job.pipeline_applied = pipeline_to_apply.value
        processor_job.ram_amount = determine_ram_amount(
            sample_object, processor_job)
        processor_job.volume_index = volume_index
        processor_job.save()

        # One association per original file so the processor sees all inputs.
        for original_file in original_files:
            assoc = ProcessorJobOriginalFileAssociation()
            assoc.original_file = original_file
            assoc.processor_job = processor_job
            assoc.save()

        logger.debug("Queuing processor job.", processor_job=processor_job.id)

        send_job(pipeline_to_apply, processor_job)
def prepare_dotsra_job(filename="ERR1562482.sra"):
    """SALMON job fixture for a single .sra file.

    Returns (processor_job, [original_file]).
    """
    job = ProcessorJob(pipeline_applied="SALMON")
    job.id = random.randint(111, 999999)
    job.save()

    worm = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    sample = Sample()
    sample.accession_code = "SALMON"  # So the test files go to the right place
    sample.organism = worm
    sample.save()

    prepare_organism_indices()

    sra_file = OriginalFile(
        source_filename=filename,
        filename=filename,
        absolute_file_path="/home/user/data_store/raw/TEST/SALMON/" + filename,
    )
    sra_file.save()

    OriginalFileSampleAssociation(original_file=sra_file, sample=sample).save()
    ProcessorJobOriginalFileAssociation(original_file=sra_file,
                                        processor_job=job).save()

    return job, [sra_file]
def _create_transcriptome_index_job(pipeline_applied: str, files_to_process):
    """Create, link, and dispatch one transcriptome-index processor job."""
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = pipeline_applied
    processor_job.ram_amount = 8192
    processor_job.save()

    for original_file in files_to_process:
        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job
        assoc.save()

    send_job(ProcessorPipeline[processor_job.pipeline_applied], processor_job)


def create_long_and_short_processor_jobs(files_to_process):
    """ Creates two processor jobs for the files needed for this transcriptome"""
    # The long and short jobs were verbatim copies of each other; the shared
    # logic now lives in _create_transcriptome_index_job.
    _create_transcriptome_index_job("TRANSCRIPTOME_INDEX_LONG", files_to_process)
    _create_transcriptome_index_job("TRANSCRIPTOME_INDEX_SHORT", files_to_process)
def create_processor_job(self, pipeline="AFFY_TO_PCL", ram_amount=2048):
    """Create and return a saved ProcessorJob linked to two placeholder
    OriginalFiles (contents of the files are irrelevant to the tests)."""
    job = ProcessorJob(pipeline_applied=pipeline,
                       nomad_job_id="PROCESSOR/dispatch-1528945054-e8eaf540",
                       ram_amount=ram_amount,
                       num_retries=0,
                       volume_index="1",
                       success=None)
    job.save()

    # Two identical dummy files, each linked to the job.
    for _ in range(2):
        dummy_file = OriginalFile()
        dummy_file.source_filename = "doesn't matter"
        dummy_file.filename = "this either"
        dummy_file.absolute_file_path = "nor this"
        dummy_file.save()

        link = ProcessorJobOriginalFileAssociation()
        link.original_file = dummy_file
        link.processor_job = job
        link.save()

    return job
def run_tximport():
    """Creates a tximport job for all eligible experiments.

    Eligibility (per the queryset below): exactly one organism, RNA-SEQ
    technology, and zero processed samples. Pages through all candidates
    and dispatches one TXIMPORT job per qualifying experiment.
    """
    eligible_experiments = (Experiment.objects.annotate(
        num_organisms=Count("organisms")).filter(
            num_organisms=1, technology="RNA-SEQ",
            num_processed_samples=0).prefetch_related("samples__results"))

    paginator = Paginator(eligible_experiments, PAGE_SIZE)
    page = paginator.page()

    # Next is to figure out how many samples were processed for
    # each experiment. Should be able to reuse code from salmon
    # cause it does this stuff.

    tximport_pipeline = ProcessorPipeline.TXIMPORT

    while True:
        # NOTE: the counter resets each page, so the log line below reports
        # per-page creation counts, not a running total.
        creation_count = 0

        for experiment in page.object_list:
            quant_results = get_quant_results_for_experiment(experiment)

            if should_run_tximport(experiment, quant_results, True):
                processor_job = ProcessorJob()
                processor_job.pipeline_applied = tximport_pipeline.value
                processor_job.ram_amount = 8192
                # This job doesn't need to run on a specific volume
                # but it uses the same Nomad job as Salmon jobs which
                # do require the volume index.
                processor_job.volume_index = random.choice(
                    list(get_active_volumes()))
                processor_job.save()

                assoc = ProcessorJobOriginalFileAssociation()
                # Any original file linked to any sample of the
                # experiment will work. Tximport is somewhat special
                # in that it doesn't actually use original files so
                # this is just used to point to the experiment.
                assoc.original_file = experiment.samples.all(
                )[0].original_files.all()[0]
                assoc.processor_job = processor_job
                assoc.save()

                creation_count += 1

                try:
                    send_job(tximport_pipeline, processor_job)
                except Exception:
                    # If we cannot queue the job now the Foreman will do
                    # it later.
                    pass

        logger.info(
            "Created %d tximport jobs for experiments past the thresholds.",
            creation_count)

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())
def prepare_illumina_job(organism):
    """ILLUMINA_TO_PCL job fixture: one GSE22427 raw table plus twelve
    samples; the last sample is then renamed so it no longer matches."""
    job = ProcessorJob(pipeline_applied="ILLUMINA_TO_PCL")
    job.save()

    raw_file = OriginalFile(
        source_filename="ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427%5Fnon%2Dnormalized%2Etxt.gz",
        filename="GSE22427_non-normalized.txt",
        absolute_file_path="/home/user/data_store/raw/TEST/ILLUMINA/GSE22427_non-normalized.txt",
        is_downloaded=True,
    )
    raw_file.save()

    ProcessorJobOriginalFileAssociation(original_file=raw_file,
                                        processor_job=job).save()

    sample_names = [
        "LV-C&si-Control-1",
        "LV-C&si-Control-2",
        "LV-C&si-Control-3",
        "LV-C&si-EZH2-1",
        "LV-C&si-EZH2-2",
        "LV-C&si-EZH2-3",
        "LV-EZH2&si-EZH2-1",
        "LV-EZH2&si-EZH2-2",
        "LV-EZH2&si-EZH2-3",
        "LV-T350A&si-EZH2-1",
        "LV-T350A&si-EZH2-2",
        "LV-T350A&si-EZH2-3",
    ]

    for name in sample_names:
        sample = Sample(accession_code=name, title=name, organism=organism)
        sample.save()

        SampleAnnotation(sample=sample,
                         data={"description": [name]},
                         is_ccdl=False).save()

        OriginalFileSampleAssociation(original_file=raw_file,
                                      sample=sample).save()

    # Rename the last sample so its title/accession no longer match its
    # annotation description.
    renamed = Sample.objects.get(title="LV-T350A&si-EZH2-3")
    renamed.title = "ignoreme_for_description"
    renamed.accession_code = "ignoreme_for_description"
    renamed.save()

    return job
def prepare_illumina_job(job_info: Dict) -> ProcessorJob:
    """ILLUMINA_TO_PCL job fixture built from a job_info dict.

    job_info keys: source_filename, filename, absolute_file_path, organism,
    and samples — each sample being a str (accession == title), a
    (accession, title) tuple, or a (accession, title, annotation) tuple.

    Raises ValueError for any other sample shape.
    """
    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = job_info["source_filename"]
    og_file.filename = job_info["filename"]
    og_file.absolute_file_path = job_info["absolute_file_path"]
    og_file.is_downloaded = True
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    for s in job_info["samples"]:
        # For convenience, if you give a list of strings we'll just use the
        # strings as both titles and accessions.
        # (isinstance instead of type(...) == ... — idiomatic and accepts
        # subclasses, which the old exact-type checks rejected.)
        annotation = None
        if isinstance(s, str):
            accession_code = s
            title = s
        elif (isinstance(s, tuple) and len(s) == 2
              and all(isinstance(field, str) for field in s)):
            accession_code, title = s
        elif (isinstance(s, tuple) and len(s) == 3
              and isinstance(s[0], str) and isinstance(s[1], str)
              and isinstance(s[2], dict)):
            accession_code, title, annotation = s
        else:
            raise ValueError(f"Invalid sample type for sample {s}")

        sample = Sample()
        sample.accession_code = accession_code
        sample.title = title
        sample.organism = job_info["organism"]
        sample.save()

        sa = SampleAnnotation()
        sa.sample = sample
        # Fall back to a description derived from the title when no explicit
        # annotation was supplied.
        sa.data = annotation if annotation is not None else {
            "description": [title]
        }
        sa.is_ccdl = False
        sa.save()

        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

    return pj
def prepare_job():
    """SALMON job fixture with a paired-end fastq pair.

    Returns (processor_job, [og_file, og_file2]).
    """
    job = ProcessorJob(pipeline_applied="SALMON")
    job.save()

    worm = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    sample = Sample()
    sample.accession_code = "SALMON"  # So the test files go to the right place
    sample.organism = worm
    sample.source_database = 'SRA'
    sample.technology = 'RNA-SEQ'
    sample.save()

    prepare_organism_indices()

    fastq_files = []
    for fastq_name in ("ERR1562482_1.fastq.gz", "ERR1562482_2.fastq.gz"):
        fastq = OriginalFile(
            source_filename=fastq_name,
            filename=fastq_name,
            absolute_file_path="/home/user/data_store/raw/TEST/SALMON/" + fastq_name,
            is_downloaded=True,
        )
        fastq.save()

        OriginalFileSampleAssociation(original_file=fastq,
                                      sample=sample).save()
        ProcessorJobOriginalFileAssociation(original_file=fastq,
                                            processor_job=job).save()
        fastq_files.append(fastq)

    return job, fastq_files
def test_convert_illumina_no_header(self):
    """NO_OP a headerless Illumina table and check the converted output.

    Asserts success, that the output file exists, and its exact byte size.
    """
    job = ProcessorJob()
    job.pipeline_applied = "NO_OP"
    job.save()

    # Input format (no header row):
    # ILMN_1885639    10.0000     0.7931
    # ILMN_2209417    10.0000     0.2029
    # ILMN_1765401    152.0873    0.0000
    og_file = OriginalFile()
    og_file.source_filename = (
        "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1.txt"
    )
    og_file.filename = "GSM1089291-tbl-1.txt"
    og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1.txt"
    og_file.is_downloaded = True
    og_file.save()

    homo_sapiens = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
    homo_sapiens.save()

    sample = Sample()
    sample.accession_code = "GSM557500"
    sample.title = "GSM557500"
    sample.platform_accession_code = "A-MEXP-1171"
    sample.manufacturer = "ILLUMINA"
    sample.organism = homo_sapiens
    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.original_file = og_file
    assoc.sample = sample
    assoc.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = job
    assoc1.save()

    # Expected output format (probe IDs mapped to Ensembl gene IDs):
    # ENSG00000105675 10
    # ENSG00000085721 152.0873
    # ENSG00000278494 152.0873
    final_context = no_op.no_op_processor(job.pk)
    self.assertTrue(final_context["success"])
    self.assertTrue(os.path.exists(final_context["output_file_path"]))
    # Exact byte size of the known-good converted file.
    self.assertEqual(os.path.getsize(final_context["output_file_path"]),
                     786207)
def test_convert_processed_illumina(self):
    """NO_OP an already-processed Illumina sample table.

    Asserts success, output existence, exact output size, and that the
    output passes the quality check.
    """
    job = ProcessorJob()
    job.pipeline_applied = "NO_OP"
    job.save()

    # Input format (with header):
    # Reporter Identifier VALUE   Detection Pval
    # ILMN_1343291    14.943602   0
    # ILMN_1343295    13.528082   0
    og_file = OriginalFile()
    og_file.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-22433/"
    og_file.filename = "GSM557500_sample_table.txt"
    og_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/NO_OP/GSM557500_sample_table.txt")
    og_file.is_downloaded = True
    og_file.save()

    homo_sapiens = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
    homo_sapiens.save()

    sample = Sample()
    sample.accession_code = "GSM557500"
    sample.title = "GSM557500"
    sample.platform_accession_code = "A-MEXP-1171"
    sample.manufacturer = "ILLUMINA"
    sample.organism = homo_sapiens
    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.original_file = og_file
    assoc.sample = sample
    assoc.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = job
    assoc1.save()

    # Expected output format (probe IDs mapped to Ensembl gene IDs):
    # ENSG00000156508 14.943602
    # ENSG00000111640 13.528082
    final_context = no_op.no_op_processor(job.pk)
    self.assertTrue(final_context["success"])
    self.assertTrue(os.path.exists(final_context["output_file_path"]))
    # Exact byte size of the known-good converted file.
    self.assertEqual(os.path.getsize(final_context["output_file_path"]),
                     920374)
    self.assertTrue(
        no_op.check_output_quality(final_context["output_file_path"]))
def test_good_detection(self):
    """GSE54661 appears to be mislabled (illuminaHumanv4) on GEO. Shows our
    detector works.

    Runs the full illumina_to_pcl pipeline and asserts that the platform
    detector overrides the mislabeled platform with illuminaHumanv3.
    """
    from data_refinery_workers.processors import illumina

    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz"
    og_file.filename = "GSE54661_non_normalized.txt"
    og_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt"
    )
    og_file.is_downloaded = True
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    organism = Organism(name="HOMO_SAPIENS",
                        taxonomy_id=9606,
                        is_scientific_name=True)
    organism.save()

    sample = Sample()
    sample.accession_code = "ABCD-1234"
    sample.title = "hypoxia_Signal"
    sample.organism = organism
    sample.save()

    sample_assoc = OriginalFileSampleAssociation()
    sample_assoc.original_file = og_file
    sample_assoc.sample = sample
    sample_assoc.save()

    final_context = illumina.illumina_to_pcl(pj.pk)
    self.assertEqual(final_context["platform"], "illuminaHumanv3")

    # The detector should have annotated the sample with only these keys.
    for key in final_context["samples"][0].sampleannotation_set.all(
    )[0].data.keys():
        self.assertTrue(key in [
            "detected_platform", "detection_percentage", "mapped_percentage"
        ])

    # Cleanup after the job since it won't since we aren't running in cloud.
    shutil.rmtree(final_context["work_dir"], ignore_errors=True)
def test_convert_illumina_bad_cols(self):
    """
    In future, this test may be deprecated. For now it just alerts that it
    needs attention.

    A table with extra value/p-value column pairs should make NO_OP fail
    with the distinctive "Tell Rich!" failure reason.
    """
    job = ProcessorJob()
    job.pipeline_applied = "NO_OP"
    job.save()

    # Input format (too many columns):
    # ILMN_1885639    10.0000     0.7931  11.0000     0.123
    # ILMN_2209417    10.0000     0.2029  11.1234     0.543
    # LMN_1765401     152.0873    0.0000  99.999      0.19
    og_file = OriginalFile()
    og_file.source_filename = (
        "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1-modified.txt"
    )
    og_file.filename = "GSM1089291-tbl-1-modified.txt"
    og_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1-modified.txt"
    )
    og_file.is_downloaded = True
    og_file.save()

    homo_sapiens = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
    homo_sapiens.save()

    sample = Sample()
    sample.accession_code = "GSM557500"
    sample.title = "GSM557500"
    sample.platform_accession_code = "A-MEXP-1171"
    sample.manufacturer = "ILLUMINA"
    sample.organism = homo_sapiens
    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.original_file = og_file
    assoc.sample = sample
    assoc.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = job
    assoc1.save()

    final_context = no_op.no_op_processor(job.pk)
    self.assertFalse(final_context["success"])
    self.assertTrue("Tell Rich!" in final_context["job"].failure_reason)
def prepare_job(length):
    """Transcriptome-index job fixture for `length` ('long' or 'short').

    NOTE(review): both absolute_file_path values point at the *_short.*
    fixtures on disk regardless of `length` — presumably only the short
    test files exist; confirm before "fixing".
    """
    job = ProcessorJob(pipeline_applied="TRANSCRIPTOME_INDEX_" + length.upper())
    job.save()

    human = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=1001)

    sample = Sample(organism=human, accession_code="derp" + length)
    sample.save()

    fasta_file = OriginalFile(
        source_filename="aegilops_tauschii_" + length + ".fa.gz",
        filename="aegilops_tauschii_" + length + ".fa.gz",
        absolute_file_path="/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.fa.gz",
        is_downloaded=True,
    )
    fasta_file.save()

    gtf_file = OriginalFile(
        source_filename="aegilops_tauschii_" + length + ".gtf.gz",
        filename="aegilops_tauschii_" + length + ".gtf.gz",
        absolute_file_path="/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.gtf.gz",
        is_downloaded=True,
    )
    gtf_file.save()

    # Link both files to the sample and the job.
    for original_file in (fasta_file, gtf_file):
        OriginalFileSampleAssociation(original_file=original_file,
                                      sample=sample).save()
        ProcessorJobOriginalFileAssociation(original_file=original_file,
                                            processor_job=job).save()

    return job
def prepare_non_ba_job():
    """AFFY_TO_PCL job fixture using a CEL file with no BrainArray package."""
    job = ProcessorJob(pipeline_applied="AFFY_TO_PCL")
    job.save()

    cel_file = OriginalFile(
        source_filename="ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM45nnn/GSM45588/suppl/GSM45588.CEL.gz",
        filename="GSM45588.CEL",
        absolute_file_path="/home/user/data_store/raw/TEST/CEL/GSM45588.CEL",
    )
    cel_file.save()

    ProcessorJobOriginalFileAssociation(original_file=cel_file,
                                        processor_job=job).save()

    return job
def handle(self, *args, **options):
    """Run tximport on an experiment for the samples that have been
    processed by Salmon.

    Exits 1 if --experiment-accession is missing or no sample in the
    experiment has a successful Salmon job; exits 0 after running tximport.
    """
    if options["experiment_accession"] is None:
        logger.error(
            "The --experiment-accession argument must be provided")
        sys.exit(1)
    else:
        accession_code = options["experiment_accession"]

    # Find an OriginalFile associated with one of the Experiment's
    # samples which had Salmon run successfully on it.
    original_file = None
    experiment = Experiment.objects.get(accession_code=accession_code)
    for sample in experiment.samples.all():
        # Only need to loop until we actually find an
        # original_file with a successful job.
        if original_file:
            break

        pjs_for_sample = sample.get_processor_jobs()
        for processor_job in list(pjs_for_sample):
            if processor_job.success:
                original_file = processor_job.original_files.first()
                if original_file:
                    break

    if not original_file:
        logger.error(
            "Could not find a single sample in the experiment that had a successful Salmon job.",
            experiment=accession_code,
        )
        sys.exit(1)

    # Build and run a TXIMPORT job pointed at the found file synchronously.
    job = ProcessorJob()
    job.pipeline_applied = "TXIMPORT"
    job.save()

    pjofa = ProcessorJobOriginalFileAssociation()
    pjofa.processor_job = job
    pjofa.original_file = original_file
    pjofa.save()

    tximport.tximport(job.id)

    sys.exit(0)
def prepare_ba_job():
    """AFFY_TO_PCL job fixture using a CEL file with a BrainArray package."""
    job = ProcessorJob(pipeline_applied="AFFY_TO_PCL")
    job.save()

    cel_file = OriginalFile(
        source_filename="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",
        filename="GSM1426071_CD_colon_active_1.CEL",
        absolute_file_path="/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL",
    )
    cel_file.save()

    ProcessorJobOriginalFileAssociation(original_file=cel_file,
                                        processor_job=job).save()

    return job
def create_processor_jobs_for_original_files(
        original_files: List[OriginalFile],
        downloader_job: DownloaderJob = None):
    """
    Create a processor jobs and queue a processor task for samples related
    to an experiment.

    One job is created per original file (unlike the batched variant that
    groups all files into one job). Blacklisted files are deleted and
    skipped; files with no applicable pipeline are deleted locally.
    """
    for original_file in original_files:
        sample_object = original_file.samples.first()

        if not delete_if_blacklisted(original_file):
            continue

        pipeline_to_apply = determine_processor_pipeline(
            sample_object, original_file)

        if pipeline_to_apply == ProcessorPipeline.NONE:
            logger.info(
                "No valid processor pipeline found to apply to sample.",
                sample=sample_object.id,
                # BUGFIX: previously logged original_files[0].id, which named
                # the wrong file for every iteration after the first.
                original_file=original_file.id)
            original_file.delete_local_file()
            original_file.is_downloaded = False
            original_file.save()
        else:
            processor_job = ProcessorJob()
            processor_job.pipeline_applied = pipeline_to_apply.value
            processor_job.ram_amount = determine_ram_amount(
                sample_object, processor_job)
            processor_job.save()

            assoc = ProcessorJobOriginalFileAssociation()
            assoc.original_file = original_file
            assoc.processor_job = processor_job
            assoc.save()

            if downloader_job:
                logger.debug("Queuing processor job.",
                             processor_job=processor_job.id,
                             original_file=original_file.id,
                             downloader_job=downloader_job.id)
            else:
                logger.debug("Queuing processor job.",
                             processor_job=processor_job.id,
                             original_file=original_file.id)

            send_job(pipeline_to_apply, processor_job)
def prepare_agilent_twocolor_job():
    """AGILENT_TWOCOLOR_TO_PCL job fixture with one Agilent two-color file."""
    job = ProcessorJob(pipeline_applied="AGILENT_TWOCOLOR_TO_PCL")
    job.save()

    agilent_file = OriginalFile(
        source_filename="https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE22900&format=file",
        filename="GSM466597_95899_agilent.txt",
        absolute_file_path="/home/user/data_store/raw/TEST/AGILENT_TWOCOLOR/GSM466597_95899_agilent.txt",
    )
    agilent_file.save()

    ProcessorJobOriginalFileAssociation(original_file=agilent_file,
                                        processor_job=job).save()

    return job
def create_long_and_short_processor_jobs(downloader_job, long_files_to_process,
                                         short_files_to_process):
    """ Creates two processor jobs for the files needed for this transcriptome"""
    processor_job_long = ProcessorJob()
    processor_job_long.downloader_job = downloader_job
    processor_job_long.pipeline_applied = "TRANSCRIPTOME_INDEX_LONG"
    processor_job_long.ram_amount = 4096
    processor_job_long.save()

    for original_file in long_files_to_process:
        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_long
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job_long.pipeline_applied],
                 processor_job_long)
    except Exception:
        # This is fine, the foreman will requeue these later.
        logger.exception(
            "Problem with submitting a long transcriptome index job.")

    processor_job_short = ProcessorJob()
    processor_job_short.downloader_job = downloader_job
    processor_job_short.pipeline_applied = "TRANSCRIPTOME_INDEX_SHORT"
    processor_job_short.ram_amount = 4096
    processor_job_short.save()

    for original_file in short_files_to_process:
        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_short
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job_short.pipeline_applied],
                 processor_job_short)
    except Exception:
        # This is fine, the foreman will requeue these later.
        # BUGFIX: this message previously said "long" (copy-paste error),
        # misattributing failures of the short job.
        logger.exception(
            "Problem with submitting a short transcriptome index job.")
def prepare_job(job_info: dict) -> ProcessorJob:
    """NO_OP job fixture built from a job_info dict (filenames, accession,
    platform, optional manufacturer)."""
    job = ProcessorJob(pipeline_applied="NO_OP")
    job.save()

    og_file = OriginalFile(
        source_filename=job_info["source_filename"],
        filename=job_info["filename"],
        absolute_file_path=job_info["absolute_file_path"],
        is_downloaded=True,
    )
    og_file.save()

    sample = Sample(
        accession_code=job_info["accession_code"],
        title=job_info["accession_code"],
        platform_accession_code=job_info["platform_accession_code"],
    )

    manufacturer = job_info.get("manufacturer", None)
    if manufacturer is not None:
        sample.manufacturer = manufacturer

        # The illumina samples need the human organism
        if manufacturer == "ILLUMINA":
            homo_sapiens = Organism(name="HOMO_SAPIENS",
                                    taxonomy_id=9606,
                                    is_scientific_name=True)
            homo_sapiens.save()
            sample.organism = homo_sapiens

    sample.save()

    OriginalFileSampleAssociation(original_file=og_file, sample=sample).save()
    ProcessorJobOriginalFileAssociation(original_file=og_file,
                                        processor_job=job).save()

    return job
def prepare_huex_v1_job():
    """AFFY_TO_PCL job fixture with a HuEx v1 CEL file."""
    job = ProcessorJob(pipeline_applied="AFFY_TO_PCL")
    job.save()

    huex_file = OriginalFile(
        source_filename="ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1364nnn/GSM1364667/suppl/GSM1364667_U_110208_7-02-10_S2.CEL.gz",
        filename="GSM1364667_U_110208_7-02-10_S2.CEL",
        absolute_file_path="/home/user/data_store/raw/TEST/CEL/GSM1364667_U_110208_7-02-10_S2.CEL",
        is_downloaded=True,
    )
    huex_file.save()

    ProcessorJobOriginalFileAssociation(original_file=huex_file,
                                        processor_job=job).save()

    return job
def create_long_and_short_processor_jobs(files_to_process):
    """ Creates two processor jobs for the files needed for this transcriptome"""
    # Identical work for the long and short indices — loop instead of
    # duplicating the block.
    for pipeline_name in ("TRANSCRIPTOME_INDEX_LONG",
                          "TRANSCRIPTOME_INDEX_SHORT"):
        processor_job = ProcessorJob()
        processor_job.pipeline_applied = pipeline_name
        processor_job.ram_amount = 4096
        processor_job.save()

        for original_file in files_to_process:
            link = ProcessorJobOriginalFileAssociation()
            link.original_file = original_file
            link.processor_job = processor_job
            link.save()

        try:
            send_job(ProcessorPipeline[processor_job.pipeline_applied],
                     processor_job)
        except Exception:
            # This is fine, the foreman will requeue these later.
            pass
def test_no_job_created_when_failed_job_exists(self):
    """A pre-existing failed Salmon job should block creation of new jobs."""
    experiment = setup_experiment([], ["GSM001"])

    # create a failed job for that experiment
    processor_job = ProcessorJob()
    # Store the pipeline's string value. Assigning the enum member itself
    # (as this test previously did) persists "ProcessorPipeline.SALMON" in
    # the CharField, which defeats any lookup filtering on
    # pipeline_applied="SALMON" — every other call site uses .value.
    processor_job.pipeline_applied = ProcessorPipeline.SALMON.value
    processor_job.ram_amount = 1024
    processor_job.success = False
    processor_job.retried = False
    processor_job.no_retry = False
    processor_job.save()

    assoc = ProcessorJobOriginalFileAssociation()
    assoc.original_file = experiment.samples.first().original_files.first()
    assoc.processor_job = processor_job
    assoc.save()

    # Run command
    update_salmon_all_experiments()

    dl_jobs = DownloaderJob.objects.all()
    self.assertEqual(dl_jobs.count(), 0)
def run_tximport_if_eligible(experiment: Experiment, dispatch_jobs=True) -> bool:
    """Checks if an experiment is eligible to have tximport run on it and
    creates a job for it.

    If the dispatch_jobs parameter is True a Batch job will be dispatched
    for it.

    Returns the ProcessorJob if a job was created or None if one was not.
    """
    # Guard clause: bail out immediately for ineligible experiments.
    if not get_tximport_inputs_if_eligible(experiment, True):
        return None

    tximport_pipeline = ProcessorPipeline.TXIMPORT

    processor_job = ProcessorJob()
    processor_job.pipeline_applied = tximport_pipeline.value
    processor_job.ram_amount = 32768
    processor_job.save()

    assoc = ProcessorJobOriginalFileAssociation()
    # Any original file linked to any sample of the experiment will work.
    # Tximport is somewhat special in that it doesn't actually use original
    # files, so this is just used to point to the experiment.
    assoc.original_file = experiment.samples.all()[0].original_files.all()[0]
    assoc.processor_job = processor_job
    assoc.save()

    if dispatch_jobs:
        try:
            send_job(tximport_pipeline, processor_job)
        except Exception:
            # If we cannot queue the job now the Foreman will do it later.
            pass

    return processor_job
def prepare_job(length):
    """Transcriptome-index job fixture for `length`, reusing
    prepare_original_files() for the fasta/gtf pair."""
    job = ProcessorJob(pipeline_applied="TRANSCRIPTOME_INDEX_" + length.upper())
    job.save()

    human = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=1001)

    sample = Sample(organism=human, accession_code="derp" + length)
    sample.save()

    fasta_file, gtf_file = prepare_original_files(length)

    # Link both files to the sample and the job.
    for original_file in (fasta_file, gtf_file):
        OriginalFileSampleAssociation(original_file=original_file,
                                      sample=sample).save()
        ProcessorJobOriginalFileAssociation(original_file=original_file,
                                            processor_job=job).save()

    return job
def setUp(self):
    """Build the shared API-test fixture: two experiments, organisms
    (including 26 for pagination), samples, jobs, results, a QN target,
    a processor, and a short transcriptome index."""
    # Saving this for if we have protected endpoints
    # self.superuser = User.objects.create_superuser('john', '*****@*****.**', 'johnpassword')
    # self.client.login(username='******', password='******')
    # self.user = User.objects.create(username="******")

    experiment = Experiment()
    experiment.accession_code = "GSE000"
    experiment.alternate_accession_code = "E-GEOD-000"
    experiment.title = "NONONONO"
    experiment.description = "Boooooourns. Wasabi."
    experiment.technology = "RNA-SEQ"
    experiment.save()

    # Second experiment; this one is kept on self for the tests.
    experiment = Experiment()
    experiment.accession_code = "GSE123"
    experiment.title = "Hey Ho Let's Go"
    experiment.description = (
        "This is a very exciting test experiment. Faygo soda. Blah blah blah."
    )
    experiment.technology = "MICROARRAY"
    experiment.save()
    self.experiment = experiment

    experiment_annotation = ExperimentAnnotation()
    experiment_annotation.data = {"hello": "world", "123": 456}
    experiment_annotation.experiment = experiment
    experiment_annotation.save()

    # Create 26 test organisms numbered 0-25 for pagination test, so there should be 29 organisms total (with the 3 others below)
    for i in range(26):
        Organism(name=("TEST_ORGANISM_{}".format(i)),
                 taxonomy_id=(1234 + i)).save()

    ailuropoda = Organism(name="AILUROPODA_MELANOLEUCA",
                          taxonomy_id=9646,
                          is_scientific_name=True)
    ailuropoda.save()
    self.homo_sapiens = Organism(name="HOMO_SAPIENS",
                                 taxonomy_id=9606,
                                 is_scientific_name=True)
    self.homo_sapiens.save()
    self.danio_rerio = Organism(name="DANIO_RERIO",
                                taxonomy_id=1337,
                                is_scientific_name=True)
    self.danio_rerio.save()

    sample = Sample()
    sample.title = "123"
    sample.accession_code = "123"
    sample.is_processed = True
    sample.organism = ailuropoda
    sample.save()

    # Second sample; this one is kept on self for the tests.
    sample = Sample()
    sample.title = "789"
    sample.accession_code = "789"
    sample.is_processed = True
    sample.organism = ailuropoda
    sample.save()
    self.sample = sample

    # add qn target for sample organism
    result = ComputationalResult()
    result.commands.append("create_qn_target.py")
    result.is_ccdl = True
    result.is_public = True
    result.processor = None
    result.save()

    cra = ComputationalResultAnnotation()
    cra.result = result
    cra.data = {"organism_id": ailuropoda.id, "is_qn": True}
    cra.save()

    ailuropoda.qn_target = result
    ailuropoda.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {"goodbye": "world", "789": 123}
    sample_annotation.sample = sample
    sample_annotation.save()

    # Wire self.sample up to an original file with both a downloader job
    # and a processor job.
    original_file = OriginalFile()
    original_file.save()

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.sample = sample
    original_file_sample_association.original_file = original_file
    original_file_sample_association.save()

    downloader_job = DownloaderJob()
    downloader_job.save()

    download_assoc = DownloaderJobOriginalFileAssociation()
    download_assoc.original_file = original_file
    download_assoc.downloader_job = downloader_job
    download_assoc.save()

    processor_job = ProcessorJob()
    processor_job.save()

    processor_assoc = ProcessorJobOriginalFileAssociation()
    processor_assoc.original_file = original_file
    processor_assoc.processor_job = processor_job
    processor_assoc.save()

    experiment_sample_association = ExperimentSampleAssociation()
    experiment_sample_association.sample = sample
    experiment_sample_association.experiment = experiment
    experiment_sample_association.save()
    experiment.num_total_samples = 1
    experiment.num_processed_samples = 1
    experiment.save()

    # Two bare computational results associated with the sample.
    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    processor = Processor()
    processor.name = "Salmon Quant"
    processor.version = "v9.9.9"
    processor.docker_image = "dr_salmon"
    processor.environment = '{"some": "environment"}'
    processor.save()

    computational_result_short = ComputationalResult(processor=processor)
    computational_result_short.save()

    # A short transcriptome index for danio rerio, backed by the result above.
    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = self.danio_rerio
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = (
        "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT")
    organism_index.is_public = True
    organism_index.s3_url = "not_blank"
    organism_index.save()
    return
def _detect_platform(job_context: Dict) -> Dict:
    """
    Determine the platform/database to process this sample with.

    They often provide something like "V2" or "V 2", but we don't trust them so
    we detect it ourselves by running an external R script against the input
    file for every candidate Illumina annotation database and keeping the best
    match. Related: https://github.com/AlexsLemonade/refinebio/issues/232

    Reads from job_context: 'samples', 'input_file_path', 'probeId', 'job',
    'original_files', 'job_id'.
    Writes to job_context: 'platform' (on a good match) or 'abort' (on a poor
    match, after dispatching a NO_OP processor job instead).
    """
    # Candidate annotation databases keyed by organism name.
    all_databases = {
        'HOMO_SAPIENS': [
            'illuminaHumanv1',
            'illuminaHumanv2',
            'illuminaHumanv3',
            'illuminaHumanv4',
        ],
        'MUS_MUSCULUS': [
            'illuminaMousev1',
            'illuminaMousev1p1',
            'illuminaMousev2',
        ],
        'RATTUS_NORVEGICUS': ['illuminaRatv1']
    }

    # All samples in the job are assumed to share one organism; the first
    # sample picks the candidate database list.
    # NOTE(review): an organism outside the three keys above raises KeyError —
    # presumably upstream filtering guarantees this; confirm.
    sample0 = job_context['samples'][0]
    databases = all_databases[sample0.organism.name]

    # Loop over all of the possible platforms and find the one with the best match.
    highest = 0.0
    high_mapped_percent = 0.0
    high_db = None
    for platform in databases:
        try:
            # The R script prints (at least) two lines: a detection score on
            # the first line and a mapped percentage on the second.
            result = subprocess.check_output([
                "/usr/bin/Rscript",
                "--vanilla",
                "/home/user/data_refinery_workers/processors/detect_database.R",
                "--platform", platform,
                "--inputFile", job_context['input_file_path'],
                "--column", job_context['probeId'],
            ])
            results = result.decode().split('\n')
            cleaned_result = float(results[0].strip())

            # Track the best-scoring platform; the mapped percentage is only
            # recorded for the current best candidate.
            if cleaned_result > highest:
                highest = cleaned_result
                high_db = platform
                high_mapped_percent = float(results[1].strip())
        except Exception as e:
            # A failed subprocess or unparseable output disqualifies this
            # platform only; keep trying the remaining candidates.
            logger.exception(e, processor_job_id=job_context["job"].id)
            continue

    # Record our sample detection outputs for every sample.
    for sample in job_context['samples']:
        sa = SampleAnnotation()
        sa.sample = sample
        sa.is_ccdl = True
        sa.data = {
            "detected_platform": high_db,
            "detection_percentage": highest,
            "mapped_percentage": high_mapped_percent
        }
        sa.save()

    # If the match is over 75%, record this and process it on that platform.
    if high_mapped_percent > 75.0:
        job_context['platform'] = high_db
    # The match percentage is too low - send this to the no-opper instead.
    else:
        # NOTE(review): this uses job_context['job_id'] while the rest of the
        # function uses job_context["job"].id — confirm the 'job_id' key exists.
        logger.info("Match percentage too low, NO_OP'ing and aborting.",
                    job=job_context['job_id'])

        # Requeue the work as a NO_OP processor job, inheriting this job's
        # volume and RAM so it lands on the same worker class.
        processor_job = ProcessorJob()
        processor_job.pipeline_applied = "NO_OP"
        processor_job.volume_index = job_context["job"].volume_index
        processor_job.ram_amount = job_context["job"].ram_amount
        processor_job.save()

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = job_context["original_files"][0]
        assoc.processor_job = processor_job
        assoc.save()

        try:
            send_job(ProcessorPipeline.NO_OP, processor_job)
        except Exception as e:
            # Nomad dispatch error, likely during local test.
            logger.error(e, job=processor_job)

        # Tell the caller to stop processing this job on the current pipeline.
        job_context['abort'] = True

    return job_context
def create_processor_jobs_for_original_files( original_files: List[OriginalFile], downloader_job: DownloaderJob = None): """ Create a processor jobs and queue a processor task for samples related to an experiment. """ for original_file in original_files: sample_object = original_file.samples.first() if not delete_if_blacklisted(original_file): continue # Fix for: https://github.com/AlexsLemonade/refinebio/issues/968 # Basically, we incorrectly detected technology/manufacturers # for many Affymetrix samples and this is a good place to fix # some of them. if original_file.is_affy_data(): # Only Affymetrix Microarrays produce .CEL files sample_object.technology = 'MICROARRAY' sample_object.manufacturer = 'AFFYMETRTIX' sample_object.save() pipeline_to_apply = determine_processor_pipeline( sample_object, original_file) if pipeline_to_apply == ProcessorPipeline.NONE: logger.info( "No valid processor pipeline found to apply to sample.", sample=sample_object.id, original_file=original_files[0].id) original_file.delete_local_file() original_file.is_downloaded = False original_file.save() else: processor_job = ProcessorJob() processor_job.pipeline_applied = pipeline_to_apply.value processor_job.ram_amount = determine_ram_amount( sample_object, processor_job) processor_job.save() assoc = ProcessorJobOriginalFileAssociation() assoc.original_file = original_file assoc.processor_job = processor_job assoc.save() if downloader_job: logger.debug("Queuing processor job.", processor_job=processor_job.id, original_file=original_file.id, downloader_job=downloader_job.id) else: logger.debug("Queuing processor job.", processor_job=processor_job.id, original_file=original_file.id) try: send_job(pipeline_to_apply, processor_job) except: # If we cannot queue the job now the Foreman will do # it later. pass
def test_convert_simple_pcl(self): """ """ job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() # ID_REF, VALUE og_file = OriginalFile() og_file.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-51013/" og_file.filename = "GSM1234847_sample_table.txt" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1234847_sample_table.txt" og_file.is_downloaded = True og_file.save() sample = Sample() sample.accession_code = "GSM1234847" sample.title = "GSM1234847" sample.platform_accession_code = 'A-AFFY-38' sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() final_context = no_op.no_op_processor(job.pk) # No header - ex # AFFX-BioB-3_at 0.74218756 og_file = OriginalFile() og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE10nnn/GSE10188/miniml/GSE10188_family.xml.tgz" og_file.filename = "GSM269747-tbl-1.txt" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM269747-tbl-1.txt" og_file.is_downloaded = True og_file.save() sample = Sample() sample.accession_code = "GSM269747" sample.title = "GSM269747" sample.platform_accession_code = 'GPL1319' sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() final_context = no_op.no_op_processor(job.pk) self.assertTrue(final_context['success']) self.assertTrue(os.path.exists(final_context['output_file_path'])) self.assertEqual(os.path.getsize(final_context['output_file_path']), 346535)