def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append("SCAN.UPC::SCAN_TwoColor")
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "AGILENT_TWOCOLOR"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the sample,
    # sync it to S3 and save it.
    try:
        computed_file = ComputedFile()
        computed_file.absolute_file_path = job_context["output_file_path"]
        computed_file.filename = os.path.split(job_context["output_file_path"])[-1]
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.save()
        job_context["computed_files"].append(computed_file)
    except Exception:
        logger.exception(
            "Exception caught while moving file %s to S3",
            computed_file.filename,
            processor_job=job_context["job_id"],
        )
        failure_reason = "Exception caught while moving file to S3"
        job_context["job"].failure_reason = failure_reason
        job_context["success"] = False
        return job_context

    for sample in job_context["samples"]:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

    logger.info("Created %s", result)
    job_context["success"] = True
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    result = ComputationalResult()
    result.commands.append(" ".join(job_context['formatted_command']))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']

    try:
        processor_key = "QN_REFERENCE"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context['target_file']
    computed_file.filename = job_context['target_file'].split('/')[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.is_smashable = False
    computed_file.is_qn_target = True
    computed_file.result = result
    computed_file.save()

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context['samples']['ALL'][0].organism_id,
        "is_qn": True,
        "platform_accession_code": job_context['samples']['ALL'][0].platform_accession_code,
        "samples": [sample.accession_code for sample in job_context["samples"]["ALL"]],
        "geneset": str(job_context["geneset"]),
        "num_valid_inputs": job_context["num_valid_inputs"]
    }
    annotation.save()

    # TODO: upload this to a public read bucket.
    # https://github.com/AlexsLemonade/refinebio/issues/586
    job_context['result'] = result
    job_context['computed_files'] = [computed_file]
    job_context['annotation'] = annotation
    job_context['success'] = True
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    if not job_context["create_results"]:
        return job_context

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "QN_REFERENCE"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["target_file"]
    computed_file.filename = job_context["target_file"].split("/")[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.is_smashable = False
    computed_file.is_qn_target = True
    computed_file.result = result
    computed_file.save()

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context["samples"]["ALL"][0].organism_id,
        "is_qn": True,
        "platform_accession_code": job_context["samples"]["ALL"][0].platform_accession_code,
        "samples": [sample.accession_code for sample in job_context["samples"]["ALL"]],
        "geneset": str(job_context["geneset"]),
        "num_valid_inputs": job_context["num_valid_inputs"],
    }
    annotation.save()

    job_context["result"] = result
    job_context["computed_files"] = [computed_file]
    job_context["annotation"] = annotation
    job_context["success"] = True
    return job_context
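# How a step like _create_result_objects gets invoked: the refinebio processor
# modules chain job_context-transforming functions through utils.run_pipeline.
# A minimal sketch, assuming that pattern; the entry-point name and the
# _prepare_input step are hypothetical placeholders, not this module's
# confirmed functions.
def create_qn_reference(job_id: int) -> Dict:
    job_context = utils.run_pipeline(
        {"job_id": job_id},
        [
            utils.start_job,  # loads the ProcessorJob and seeds job_context
            _prepare_input,  # hypothetical: builds job_context["target_file"] etc.
            _create_result_objects,
            utils.end_job,  # persists success/failure_reason from job_context
        ],
    )
    return job_context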
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append('SCAN.UPC::SCANfast')
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']

    try:
        processor_key = "AFFYMETRIX_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Create a ComputedFile for the sample
    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["output_file_path"]
    computed_file.filename = os.path.split(job_context["output_file_path"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()
    job_context['computed_files'].append(computed_file)

    for sample in job_context['samples']:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

    logger.debug("Created %s", result, processor_job=job_context["job_id"])
    job_context["success"] = True
    return job_context
def test_qn_endpoints(self):
    # create two additional qn endpoints
    result = ComputationalResult()
    result.commands.append("create_qn_target.py")
    result.is_ccdl = True
    result.is_public = True
    result.processor = None
    result.save()

    cra = ComputationalResultAnnotation()
    cra.result = result
    cra.data = {
        "organism_id": self.danio_rerio.id,  # Danio
        "is_qn": True,
        "platform_accession_code": "zebrafish",
        "samples": [],
        "geneset": str(["RWWJ000001", "RWWJ000002"]),
    }
    cra.save()

    cra = ComputationalResultAnnotation()
    cra.result = result
    cra.data = {
        "organism_id": self.homo_sapiens.id,  # IDK
        "is_qn": True,
        "platform_accession_code": "zebrafishplusone",
        "samples": [],
        "geneset": str(["RWWJ000003", "RWWJ000004"]),
    }
    cra.save()

    self.homo_sapiens.qn_target = result
    self.homo_sapiens.save()
    self.danio_rerio.qn_target = result
    self.danio_rerio.save()

    response = self.client.get(
        reverse("qn_targets_available", kwargs={"version": API_VERSION}))
    # there's another qn endpoint that is created in the setup method of this test case
    self.assertEqual(len(response.json()), 3)
def setUp(self):
    # Saving this for if we have protected endpoints
    # self.superuser = User.objects.create_superuser('john', '*****@*****.**', 'johnpassword')
    # self.client.login(username='******', password='******')
    # self.user = User.objects.create(username="******")

    experiment = Experiment()
    experiment.accession_code = "GSE000"
    experiment.alternate_accession_code = "E-GEOD-000"
    experiment.title = "NONONONO"
    experiment.description = "Boooooourns. Wasabi."
    experiment.technology = "RNA-SEQ"
    experiment.save()

    experiment = Experiment()
    experiment.accession_code = "GSE123"
    experiment.title = "Hey Ho Let's Go"
    experiment.description = (
        "This is a very exciting test experiment. Faygo soda. Blah blah blah."
    )
    experiment.technology = "MICROARRAY"
    experiment.save()
    self.experiment = experiment

    experiment_annotation = ExperimentAnnotation()
    experiment_annotation.data = {"hello": "world", "123": 456}
    experiment_annotation.experiment = experiment
    experiment_annotation.save()

    # Create 26 test organisms numbered 0-25 for pagination test,
    # so there should be 29 organisms total (with the 3 others below)
    for i in range(26):
        Organism(name=("TEST_ORGANISM_{}".format(i)), taxonomy_id=(1234 + i)).save()

    ailuropoda = Organism(
        name="AILUROPODA_MELANOLEUCA", taxonomy_id=9646, is_scientific_name=True)
    ailuropoda.save()
    self.homo_sapiens = Organism(
        name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    self.homo_sapiens.save()
    self.danio_rerio = Organism(
        name="DANIO_RERIO", taxonomy_id=1337, is_scientific_name=True)
    self.danio_rerio.save()

    sample = Sample()
    sample.title = "123"
    sample.accession_code = "123"
    sample.is_processed = True
    sample.organism = ailuropoda
    sample.save()

    sample = Sample()
    sample.title = "789"
    sample.accession_code = "789"
    sample.is_processed = True
    sample.organism = ailuropoda
    sample.save()
    self.sample = sample

    # add qn target for sample organism
    result = ComputationalResult()
    result.commands.append("create_qn_target.py")
    result.is_ccdl = True
    result.is_public = True
    result.processor = None
    result.save()

    cra = ComputationalResultAnnotation()
    cra.result = result
    cra.data = {"organism_id": ailuropoda.id, "is_qn": True}
    cra.save()

    ailuropoda.qn_target = result
    ailuropoda.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {"goodbye": "world", "789": 123}
    sample_annotation.sample = sample
    sample_annotation.save()

    original_file = OriginalFile()
    original_file.save()

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.sample = sample
    original_file_sample_association.original_file = original_file
    original_file_sample_association.save()

    downloader_job = DownloaderJob()
    downloader_job.save()

    download_assoc = DownloaderJobOriginalFileAssociation()
    download_assoc.original_file = original_file
    download_assoc.downloader_job = downloader_job
    download_assoc.save()

    processor_job = ProcessorJob()
    processor_job.save()

    processor_assoc = ProcessorJobOriginalFileAssociation()
    processor_assoc.original_file = original_file
    processor_assoc.processor_job = processor_job
    processor_assoc.save()

    experiment_sample_association = ExperimentSampleAssociation()
    experiment_sample_association.sample = sample
    experiment_sample_association.experiment = experiment
    experiment_sample_association.save()
    experiment.num_total_samples = 1
    experiment.num_processed_samples = 1
    experiment.save()

    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    processor = Processor()
    processor.name = "Salmon Quant"
    processor.version = "v9.9.9"
    processor.docker_image = "dr_salmon"
    processor.environment = '{"some": "environment"}'
    processor.save()

    computational_result_short = ComputationalResult(processor=processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = self.danio_rerio
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = (
        "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT")
    organism_index.is_public = True
    organism_index.s3_url = "not_blank"
    organism_index.save()
    return
def _create_result_objects(job_context: Dict) -> Dict:
    """ Store and host the result as a ComputationalResult object. """
    result = ComputationalResult()
    result.commands.append(" ".join(job_context['formatted_command']))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']

    try:
        processor_key = "COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    # Write the compendia dataframe to a file, overwriting the previous smash
    job_context['merged_qn'].to_csv(job_context['smash_outfile'], sep='\t', encoding='utf-8')

    compendia_tsv_computed_file = ComputedFile()
    compendia_tsv_computed_file.absolute_file_path = job_context['smash_outfile']
    compendia_tsv_computed_file.filename = job_context['smash_outfile'].split('/')[-1]
    compendia_tsv_computed_file.calculate_sha1()
    compendia_tsv_computed_file.calculate_size()
    compendia_tsv_computed_file.is_smashable = False
    compendia_tsv_computed_file.is_qn_target = False
    compendia_tsv_computed_file.result = result
    compendia_tsv_computed_file.save()

    organism_key = list(job_context['samples'].keys())[0]
    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context['samples'][organism_key][0].organism_id,
        "organism_name": job_context['samples'][organism_key][0].organism.name,
        "is_qn": False,
        "is_compendia": True,
        "samples": [sample.accession_code for sample in job_context["samples"][organism_key]],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions": [e.accession_code for e in job_context['experiments']]
    }
    annotation.save()

    # Save the related metadata file
    metadata_computed_file = ComputedFile()
    metadata_computed_file.absolute_file_path = job_context['metadata_tsv_paths'][0]
    metadata_computed_file.filename = job_context['metadata_tsv_paths'][0].split('/')[-1]
    metadata_computed_file.calculate_sha1()
    metadata_computed_file.calculate_size()
    metadata_computed_file.is_smashable = False
    metadata_computed_file.is_qn_target = False
    metadata_computed_file.result = result
    metadata_computed_file.save()

    # Create the resulting archive
    final_zip_base = "/home/user/data_store/smashed/" + str(job_context["dataset"].pk) + "_compendia"
    archive_path = shutil.make_archive(final_zip_base, 'zip', job_context["output_dir"])

    # Figure out the next compendia version for this organism
    organism = job_context['samples'][organism_key][0].organism
    try:
        # QuerySets don't support negative indexing, and this one is sorted
        # newest-first, so the latest version is element [0].
        last_compendia = ComputedFile.objects.filter(
            is_compendia=True, compendia_organism=organism).order_by('-compendia_version')[0]
        compendia_version = last_compendia.compendia_version + 1
    except Exception:
        # This is the first compendia for this Organism
        compendia_version = 1

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split('/')[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.compendia_organism = job_context['samples'][organism_key][0].organism
    archive_computed_file.compendia_version = compendia_version
    archive_computed_file.save()

    logger.info("Compendia created!",
                archive_path=archive_path,
                organism_name=job_context['samples'][organism_key][0].organism.name
                )

    # Upload the result to S3
    key = job_context['samples'][organism_key][0].organism.name + "_" + \
        str(compendia_version) + "_" + str(int(time.time())) + ".zip"
    archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, key)

    job_context['result'] = result
    job_context['computed_files'] = [
        compendia_tsv_computed_file, metadata_computed_file, archive_computed_file]
    job_context['success'] = True
    return job_context
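# Note that the snippet above ignores the return value of sync_to_s3, while
# the quantpendia and compendia snippets later in this section check it and
# fail the job when the upload does not succeed. A minimal sketch of applying
# that same guard here, using only calls already shown in this section; the
# helper name is hypothetical.
def _upload_compendia_archive(archive_computed_file: ComputedFile, key: str) -> None:
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, key)
    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )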
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append(job_context['formatted_command'])
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']

    try:
        processor_key = "ILLUMINA_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Split the result into smashable subfiles
    big_tsv = job_context["output_file_path"]
    data = pd.read_csv(big_tsv, sep='\t', header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        filename = frame.columns.values[0].replace('&', '').replace(
            "*", '').replace(";", '') + '.tsv'
        frame_path = job_context["work_dir"] + filename
        frame.to_csv(frame_path, sep='\t', encoding='utf-8')

        # This needs to be the same as the ones in the job context!
        try:
            sample = job_context['samples'].get(title=frame.columns.values[0])
        except Sample.DoesNotExist:
            logger.error(
                "Could not find sample for column while splitting Illumina file.",
                title=frame.columns.values[0],
                processor_job=job_context["job_id"],
                file_path=big_tsv,
            )
            continue

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = frame_path.split('/')[-1]
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context['computed_files'].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

        individual_files.append(computed_file)

    logger.debug("Created %s", result)
    job_context["success"] = True
    job_context["individual_files"] = individual_files
    job_context["result"] = result
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Store and host the result as a ComputationalResult object. """
    archive_path = job_context["archive_path"]
    compendia_organism = _get_organisms(job_context["samples"]).first()
    compendia_version = _get_next_compendia_version(compendia_organism)

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "CREATE_QUANTPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = FileUtils.get_filename(archive_path)
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.quant_sf_only = True
    archive_computed_file.compendia_organism = compendia_organism
    archive_computed_file.compendia_version = compendia_version
    archive_computed_file.save()

    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = True
    compendium_result.result = result
    compendium_result.primary_organism = compendia_organism
    compendium_result.compendium_version = compendia_version
    compendium_result.save()

    logger.info(
        "Quantpendia created! Uploading to S3.",
        job_id=job_context["job_id"],
        archive_path=archive_path,
        organism_name=compendia_organism.name,
        **get_process_stats()
    )

    # Upload the result to S3
    timestamp = str(int(time.time()))
    s3_key = compendia_organism.name + "_" + str(compendia_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, s3_key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True
    return job_context
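# _get_next_compendia_version is called above but not defined in this section.
# A plausible reconstruction, grounded in the count-based versioning used by
# the normalized-compendia snippet below
# (CompendiumResult.objects.filter(...).count() + 1); treat this as an
# assumption, not the module's confirmed implementation.
def _get_next_compendia_version(compendia_organism: Organism) -> int:
    # Quantpendia results are stored with quant_sf_only=True, so count only those.
    return (
        CompendiumResult.objects.filter(
            primary_organism=compendia_organism, quant_sf_only=True
        ).count()
        + 1
    )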
def _create_result_objects(job_context: Dict) -> Dict:
    """ Store and host the result as a ComputationalResult object. """
    result_start = log_state("start create result object", job_context["job"].id)
    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    # Temporary until we re-enable the QN test step.
    result.is_public = False
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "CREATE_COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    # Write the compendia dataframe to a file
    job_context["csv_outfile"] = job_context["output_dir"] + job_context["organism_name"] + ".tsv"
    job_context["merged_qn"].to_csv(job_context["csv_outfile"], sep="\t", encoding="utf-8")

    organism_key = list(job_context["samples"].keys())[0]
    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context["samples"][organism_key][0].organism_id,
        "organism_name": job_context["organism_name"],
        "is_qn": False,
        "is_compendia": True,
        "samples": [sample.accession_code for sample in job_context["samples"][organism_key]],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions": [e.accession_code for e in job_context["experiments"]],
        "total_percent_imputed": job_context["total_percent_imputed"],
    }
    annotation.save()

    # Create the resulting archive
    final_zip_base = SMASHING_DIR + str(job_context["dataset"].pk) + "_compendia"
    # Copy LICENSE.txt and correct README.md files.
    if job_context["dataset"].quant_sf_only:
        readme_file = "/home/user/README_QUANT.md"
    else:
        readme_file = "/home/user/README_NORMALIZED.md"
    shutil.copy(readme_file, job_context["output_dir"] + "/README.md")
    shutil.copy("/home/user/LICENSE_DATASET.txt", job_context["output_dir"] + "/LICENSE.TXT")
    archive_path = shutil.make_archive(final_zip_base, "zip", job_context["output_dir"])

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split("/")[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.save()

    # Compendia Result Helpers
    primary_organism = Organism.get_object_for_name(job_context["primary_organism"])
    organisms = [
        Organism.get_object_for_name(organism) for organism in job_context["all_organisms"]
    ]
    compendium_version = (
        CompendiumResult.objects.filter(
            primary_organism=primary_organism, quant_sf_only=False
        ).count()
        + 1
    )

    # Save Compendia Result
    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = job_context["dataset"].quant_sf_only
    compendium_result.svd_algorithm = job_context["dataset"].svd_algorithm
    compendium_result.compendium_version = compendium_version
    compendium_result.result = result
    compendium_result.primary_organism = primary_organism
    compendium_result.save()

    # create relations to all organisms contained in the compendia
    compendium_result_organism_associations = []
    for compendium_organism in organisms:
        compendium_result_organism_association = CompendiumResultOrganismAssociation()
        compendium_result_organism_association.compendium_result = compendium_result
        compendium_result_organism_association.organism = compendium_organism
        compendium_result_organism_associations.append(compendium_result_organism_association)

    CompendiumResultOrganismAssociation.objects.bulk_create(
        compendium_result_organism_associations)

    job_context["compendium_result"] = compendium_result

    logger.info("Compendium created!",
                archive_path=archive_path,
                organism_name=job_context["organism_name"])

    # Upload the result to S3
    timestamp = str(int(time.time()))
    key = job_context["organism_name"] + "_" + str(compendium_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True
    log_state("end create result object", job_context["job"].id, result_start)

    # TEMPORARY for iterating on compendia more quickly.
    # Reset this so the end_job does clean up the job's non-input-data stuff.
    job_context["work_dir"] = job_context["old_work_dir"]

    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append(job_context["formatted_command"])
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "ILLUMINA_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Split the result into smashable subfiles
    big_tsv = job_context["output_file_path"]
    data = pd.read_csv(big_tsv, sep="\t", header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        filename = (
            frame.columns.values[0].replace("&", "").replace("*", "").replace(";", "") + ".tsv"
        )
        frame_path = job_context["work_dir"] + filename
        frame.to_csv(frame_path, sep="\t", encoding="utf-8")

        # This needs to be the same as the ones in the job context!
        sample = _get_sample_for_column(frame.columns.values[0], job_context)
        if sample is None:
            job_context["job"].failure_reason = (
                "Could not find sample for column "
                + frame.columns.values[0]
                + " while splitting Illumina file "
                + big_tsv
            )
            job_context["success"] = False
            job_context["job"].no_retry = True
            return job_context

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = frame_path.split("/")[-1]
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context["computed_files"].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

        individual_files.append(computed_file)

    logger.debug("Created %s", result)
    job_context["success"] = True
    job_context["individual_files"] = individual_files
    job_context["result"] = result
    return job_context
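# _get_sample_for_column is called above but not defined in this section. A
# minimal sketch reconstructed from the inline lookup in the earlier Illumina
# snippet (job_context["samples"].get(title=...) guarded by
# Sample.DoesNotExist); the real helper may try additional title variants, so
# treat this as an assumption rather than the confirmed implementation.
def _get_sample_for_column(column_title: str, job_context: Dict) -> Optional[Sample]:
    try:
        return job_context["samples"].get(title=column_title)
    except Sample.DoesNotExist:
        return None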