def _populate_index_object(job_context: Dict) -> Dict:
    """Record the freshly built transcriptome index in the database.

    Creates a ComputationalResult, a ComputedFile for the index archive and
    an OrganismIndex row, and uploads the archive to
    S3_TRANSCRIPTOME_INDEX_BUCKET_NAME when that setting is truthy.

    Args:
        job_context: Job state. Keys read here: "salmon_formatted_command",
            "time_start", "time_end", "pipeline", "computed_archive",
            "organism_name", "database_name", "assembly_version",
            "assembly_name", "salmon_version", "length" and "job_id".

    Returns:
        job_context with "result", "computed_file" and "index" set and
        "computed_files" emptied. "original_files" is emptied as well while
        the organism lacks either a short or a long index for this assembly
        version. If the processor lookup fails, the return value of
        utils.handle_processor_exception is returned instead.

    Raises:
        utils.ProcessorJobError: the upload to S3 reported failure.
    """
    result = ComputationalResult()
    result.commands.append(job_context["salmon_formatted_command"])

    processor_key = "TX_INDEX"
    try:
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.is_ccdl = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    result.save()
    job_context["pipeline"].steps.append(result.id)

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["computed_archive"]
    computed_file.filename = os.path.split(job_context["computed_archive"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = False
    computed_file.is_qc = False
    computed_file.save()

    organism_object = Organism.get_object_for_name(job_context["organism_name"])
    index_object = OrganismIndex()
    index_object.organism = organism_object
    index_object.database_name = job_context["database_name"]
    index_object.release_version = job_context["assembly_version"]
    index_object.assembly_name = job_context["assembly_name"]
    index_object.salmon_version = job_context["salmon_version"]
    index_object.index_type = "TRANSCRIPTOME_" + job_context["length"].upper()
    # This is where the index will be extracted to.
    index_object.absolute_directory_path = (
        LOCAL_ROOT_DIR
        + "/TRANSCRIPTOME_INDEX/"
        + organism_object.name
        + "/"
        + job_context["length"]
    )
    index_object.result = result

    if S3_TRANSCRIPTOME_INDEX_BUCKET_NAME:
        logger.info(
            "Uploading %s %s to s3",
            job_context["organism_name"],
            job_context["length"],
            processor_job=job_context["job_id"],
        )
        timestamp = str(timezone.now().timestamp()).split(".")[0]
        s3_key = (
            organism_object.name
            + "_"
            + index_object.index_type
            + "_"
            + timestamp
            + ".tar.gz"
        )
        sync_result = computed_file.sync_to_s3(
            S3_TRANSCRIPTOME_INDEX_BUCKET_NAME, s3_key, public=True
        )
        if not sync_result:
            # Grab the id before deleting: Django's Model.delete() resets the
            # instance's primary key to None, which would otherwise leave the
            # error context without any reference to the failed file.
            computed_file_id = computed_file.id
            computed_file.delete()
            raise utils.ProcessorJobError(
                "Failed to upload transcriptome index to S3",
                success=False,
                computed_file_id=computed_file_id,
            )
        computed_file.delete_local_file()
    else:
        logger.warning(
            "S3_TRANSCRIPTOME_INDEX_BUCKET_NAME not configured, therefore %s %s will not be uploaded.",
            job_context["organism_name"],
            job_context["length"],
            processor_job=job_context["job_id"],
        )

    index_object.save()

    # We uploaded the file ourselves since we wanted it to go to a
    # different bucket than end_job would put it in, therefore empty
    # this list so end_job doesn't try to upload it again.
    job_context["computed_files"] = []

    job_context["result"] = result
    job_context["computed_file"] = computed_file
    job_context["index"] = index_object

    # If there's not a long and a short index for this organism yet,
    # don't delete the input.
    # XXX: This will break once we introduce additional versions of these.
    short_indices = OrganismIndex.objects.filter(
        organism=organism_object,
        index_type="TRANSCRIPTOME_SHORT",
        release_version=job_context["assembly_version"],
    )
    long_indices = OrganismIndex.objects.filter(
        organism=organism_object,
        index_type="TRANSCRIPTOME_LONG",
        release_version=job_context["assembly_version"],
    )
    if not (short_indices.exists() and long_indices.exists()):
        # utils.end_job deletes these, so remove them so it doesn't.
        job_context["original_files"] = []

    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """Store and host the result as a ComputationalResult object.

    Writes the merged dataframe to a TSV in the output directory, zips that
    directory together with a README and license, records the archive as a
    ComputedFile plus a CompendiumResult (with one organism association per
    entry in job_context["all_organisms"]) and uploads the archive to
    S3_COMPENDIA_BUCKET_NAME.

    Args:
        job_context: Job state. Keys read here: "job", "formatted_command",
            "time_start", "time_end", "output_dir", "organism_name",
            "merged_qn", "samples", "experiments", "total_percent_imputed",
            "dataset", "all_organisms", "compendium_version",
            "organism_object" and "old_work_dir".

    Returns:
        job_context with "csv_outfile", "compendium_result", "result" and
        "success" set and "work_dir" reset to "old_work_dir". If the
        processor lookup fails, the return value of
        utils.handle_processor_exception is returned instead.

    Raises:
        utils.ProcessorJobError: the upload to S3 reported failure.
    """
    result_start = log_state("start create result object", job_context["job"].id)

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    processor_key = "CREATE_COMPENDIA"
    try:
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    # Write the compendia dataframe to a file
    job_context["csv_outfile"] = (
        job_context["output_dir"] + job_context["organism_name"] + ".tsv"
    )
    job_context["merged_qn"].to_csv(
        job_context["csv_outfile"], sep="\t", encoding="utf-8"
    )

    # Only the first key of the samples mapping is annotated.
    organism_key = list(job_context["samples"].keys())[0]
    organism_samples = job_context["samples"][organism_key]

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": organism_samples[0].organism_id,
        "organism_name": job_context["organism_name"],
        "is_qn": False,
        "is_compendia": True,
        "samples": [sample.accession_code for sample in organism_samples],
        "num_samples": len(organism_samples),
        "experiment_accessions": [
            e.accession_code for e in job_context["experiments"]
        ],
        "total_percent_imputed": job_context["total_percent_imputed"],
    }
    annotation.save()

    # Create the resulting archive
    final_zip_base = SMASHING_DIR + str(job_context["dataset"].pk) + "_compendia"

    # Copy LICENSE.txt and correct README.md files.
    if job_context["dataset"].quant_sf_only:
        readme_file = "/home/user/README_QUANT.md"
    else:
        readme_file = "/home/user/README_NORMALIZED.md"
    shutil.copy(readme_file, job_context["output_dir"] + "/README.md")
    shutil.copy(
        "/home/user/LICENSE_DATASET.txt", job_context["output_dir"] + "/LICENSE.TXT"
    )
    archive_path = shutil.make_archive(
        final_zip_base, "zip", job_context["output_dir"]
    )

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split("/")[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.save()

    # Compendia Result Helpers
    organisms = [
        Organism.get_object_for_name(organism)
        for organism in job_context["all_organisms"]
    ]
    compendium_version = job_context["compendium_version"]

    # Save Compendia Result
    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = job_context["dataset"].quant_sf_only
    compendium_result.svd_algorithm = job_context["dataset"].svd_algorithm
    compendium_result.compendium_version = compendium_version
    compendium_result.result = result
    compendium_result.primary_organism = job_context["organism_object"]
    compendium_result.save()

    # create relations to all organisms contained in the compendia
    compendium_result_organism_associations = []
    for compendium_organism in organisms:
        association = CompendiumResultOrganismAssociation()
        association.compendium_result = compendium_result
        association.organism = compendium_organism
        compendium_result_organism_associations.append(association)

    CompendiumResultOrganismAssociation.objects.bulk_create(
        compendium_result_organism_associations
    )

    job_context["compendium_result"] = compendium_result

    logger.info(
        "Compendium created!",
        archive_path=archive_path,
        organism_name=job_context["organism_name"],
    )

    # Upload the result to S3
    timestamp = str(int(time.time()))
    key = (
        job_context["organism_name"]
        + "_"
        + str(compendium_version)
        + "_"
        + timestamp
        + ".zip"
    )
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, key)

    if not uploaded_to_s3:
        # Grab the id before deleting: Django's Model.delete() resets the
        # instance's primary key to None, which would otherwise leave the
        # error context without any reference to the failed file.
        archive_computed_file_id = archive_computed_file.id
        archive_computed_file.delete()
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file_id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True

    log_state("end create result object", job_context["job"].id, result_start)

    # TEMPORARY for iterating on compendia more quickly.
    # Reset this so the end_job does clean up the job's non-input-data stuff.
    job_context["work_dir"] = job_context["old_work_dir"]

    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """Store and host the result as a ComputationalResult object.

    Records the already-built quantpendia archive as a ComputedFile and a
    CompendiumResult (both flagged quant_sf_only) and uploads the archive to
    S3_COMPENDIA_BUCKET_NAME.

    Args:
        job_context: Job state. Keys read here: "archive_path",
            "compendia_organism", "compendium_version", "formatted_command",
            "time_start", "time_end" and "job_id".

    Returns:
        job_context with "result" set and "success" set to True. If the
        processor lookup fails, the return value of
        utils.handle_processor_exception is returned instead.

    Raises:
        utils.ProcessorJobError: the upload to S3 reported failure.
    """
    archive_path = job_context["archive_path"]
    compendia_organism = job_context["compendia_organism"]
    compendium_version = job_context["compendium_version"]

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    processor_key = "CREATE_QUANTPENDIA"
    try:
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = FileUtils.get_filename(archive_path)
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.quant_sf_only = True
    archive_computed_file.compendia_organism = compendia_organism
    archive_computed_file.compendium_version = compendium_version
    archive_computed_file.save()

    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = True
    compendium_result.result = result
    compendium_result.primary_organism = compendia_organism
    compendium_result.compendium_version = compendium_version
    compendium_result.save()

    logger.info(
        "Quantpendia created! Uploading to S3.",
        job_id=job_context["job_id"],
        archive_path=archive_path,
        organism_name=compendia_organism.name,
        **get_process_stats()
    )

    # Upload the result to S3
    timestamp = str(int(time.time()))
    s3_key = (
        compendia_organism.name
        + "_"
        + str(compendium_version)
        + "_"
        + timestamp
        + ".zip"
    )
    uploaded_to_s3 = archive_computed_file.sync_to_s3(
        S3_COMPENDIA_BUCKET_NAME, s3_key
    )

    if not uploaded_to_s3:
        # Grab the id before deleting: Django's Model.delete() resets the
        # instance's primary key to None, which would otherwise leave the
        # error context without any reference to the failed file.
        archive_computed_file_id = archive_computed_file.id
        archive_computed_file.delete()
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file_id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True

    return job_context