Example #1
def setup_experiment(new_version_accessions: List[str],
                     old_version_accessions: List[str]) -> Dict:
    """ Create an experiment where some samples were processed with the newest version of salmon and
    other with an older one.
    """
    # Create the experiment
    experiment_accession = "SRP095529"
    data_dir = "/home/user/data_store/"
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession,
                                           technology="RNA-SEQ")

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")

    # Create the transcriptome processor and result:
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.9.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()
    computational_result_short = ComputationalResult(
        processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.9.1"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    quant_processor = Processor()
    quant_processor.name = "Salmon Quant"
    quant_processor.version = "salmon 0.9.1"
    quant_processor.docker_image = "dr_salmon"
    quant_processor.environment = '{"some": "environment"}'
    quant_processor.save()

    for accession_code in old_version_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
            platform_accession_code="IlluminaHiSeq1000",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index  # associate with OLD organism index
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir,
                                                       archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=quant_result)

    # Create another OrganismIndex with a newer version of salmon.
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.13.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()
    computational_result_short = ComputationalResult(
        processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.13.1"  # DIFFERENT SALMON VERSION
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    for accession_code in new_version_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
            platform_accession_code="IlluminaHiSeq1000",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index  # NEWER VERSION
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir,
                                                       archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=quant_result)

    return experiment
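
A minimal sketch of how this fixture might be driven from a test; the accession codes are invented for illustration and the checks use only the association models that already appear above:

# Hypothetical usage -- not part of the fixture itself.
experiment = setup_experiment(new_version_accessions=["SRR0000001"],
                              old_version_accessions=["SRR0000002"])
for assoc in ExperimentSampleAssociation.objects.filter(experiment=experiment):
    sra = SampleResultAssociation.objects.get(sample=assoc.sample)
    # Samples from the two input lists point at organism indices with
    # different salmon_version values ("salmon 0.13.1" vs. "salmon 0.9.1").
    print(assoc.sample.accession_code,
          sra.result.organism_index.salmon_version)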
Example #2
def _run_salmon(job_context: Dict) -> Dict:
    """Runs Salmon Quant."""
    logger.debug("Running Salmon..")

    # Salmon needs to be run differently for different sample types.
    if "input_file_path_2" in job_context:
        second_read_str = " -2 {}".format(job_context["input_file_path_2"])

        # Rob recommends 16 threads/process, which fits snugly on an x1 at 8GB RAM per Salmon container:
        # (2 threads/core * 16 cores/socket * 64 vCPU) / (1TB/8GB) = ~17
        command_str = ("salmon --no-version-check quant -l A --biasSpeedSamp 5 -i {index}"
                       " -1 {input_one}{second_read_str} -p 16 -o {output_directory}"
                       " --seqBias --gcBias --dumpEq --writeUnmappedNames")

        formatted_command = command_str.format(index=job_context["index_directory"],
                                               input_one=job_context["input_file_path"],
                                               second_read_str=second_read_str,
                                               output_directory=job_context["output_directory"])
    else:
        # Related: https://github.com/COMBINE-lab/salmon/issues/83
        command_str = ("salmon --no-version-check quant -l A -i {index}"
                       " -r {input_one} -p 16 -o {output_directory}"
                       " --seqBias --dumpEq --writeUnmappedNames")

        formatted_command = command_str.format(index=job_context["index_directory"],
                                               input_one=job_context["input_file_path"],
                                               output_directory=job_context["output_directory"])

    logger.debug("Running Salmon Quant using the following shell command: %s",
                 formatted_command,
                 processor_job=job_context["job_id"])

    job_context['time_start'] = timezone.now()
    completed_command = subprocess.run(formatted_command.split(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
    job_context['time_end'] = timezone.now()

    ## To me, this looks broken: error codes are anything non-zero.
    ## However, Salmon seems to exit with negative status codes
    ## even on successful executions.
    ## Possibly related: https://github.com/COMBINE-lab/salmon/issues/55
    if completed_command.returncode == 1:
        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error("Shell call to salmon failed with error message: %s",
                     stderr[error_start:],
                     processor_job=job_context["job_id"])

        job_context["job"].failure_reason = ("Shell call to salmon failed because: "
                                             + stderr[error_start:])
        job_context["success"] = False
    else:
        result = ComputationalResult()
        result.commands.append(formatted_command)
        result.time_start = job_context['time_start']
        result.time_end = job_context['time_end']
        result.organism_index = job_context["organism_index"]
        result.is_ccdl = True

        try:
            processor_key = "SALMON_QUANT"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key, e)

        # Zip up the output of Salmon Quant
        try:
            with tarfile.open(job_context['output_archive'], "w:gz") as tar:
                tar.add(job_context["output_directory"], arcname=os.sep)
        except Exception:
            logger.exception("Exception caught while zipping processed directory %s",
                             job_context["output_directory"],
                             processor_job=job_context["job_id"]
            )
            failure_template = "Exception caught while zipping processed directory {}"
            job_context["job"].failure_reason = failure_template.format(job_context['output_archive'])
            job_context["success"] = False
            return job_context

        salmon_quant_archive = ComputedFile()
        salmon_quant_archive.absolute_file_path = job_context["output_archive"]
        salmon_quant_archive.filename = os.path.split(job_context["output_archive"])[-1]
        salmon_quant_archive.calculate_sha1()
        salmon_quant_archive.calculate_size()
        salmon_quant_archive.is_public = True
        salmon_quant_archive.is_smashable = False
        salmon_quant_archive.is_qc = False

        quant_file = ComputedFile()
        quant_file.s3_bucket = S3_BUCKET_NAME
        quant_file.s3_key = "quant_files/sample_" + str(job_context["sample"].id) + "_quant.sf"
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = job_context["output_directory"] + "quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.calculate_sha1()
        quant_file.calculate_size()

        # If we're running in the cloud we need to upload the quant.sf
        # file so that it can be used by a job running on any machine
        # to run tximport. We can't use sync_to_s3 though because we
        # have to sync it before we can save the file so it cannot be
        # discovered by other jobs before it is uploaded.
        if settings.RUNNING_IN_CLOUD:
            try:
                S3.upload_file(
                    quant_file.absolute_file_path,
                    quant_file.s3_bucket,
                    quant_file.s3_key,
                    ExtraArgs={
                        'ACL': 'public-read',
                        'StorageClass': 'STANDARD_IA'
                    }
                )
            except Exception as e:
                logger.exception(e, processor_job=job_context["job_id"], sample=job_context["sample"].id)
                failure_template = "Exception caught while uploading quantfile to S3: {}"
                job_context["job"].failure_reason = failure_template.format(quant_file.absolute_file_path)
                job_context["success"] = False
                return job_context

        # Here select_for_update() is used as a mutex that forces multiple
        # jobs to execute this block of code in a serial manner. See:
        # https://docs.djangoproject.com/en/1.11/ref/models/querysets/#select-for-update
        # Theoretically any rows in any table could be locked here; we're
        # locking all existing rows in the ComputationalResult table.
        with transaction.atomic():
            ComputationalResult.objects.select_for_update()
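            # NOTE: select_for_update() returns a lazy queryset; the
            # SELECT ... FOR UPDATE that actually locks rows is only
            # issued once the queryset is evaluated.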
            result.save()
            job_context["quant_result"] = result
            quant_file.result = result
            quant_file.save()

            job_context["result"] = result

            job_context['pipeline'].steps.append(result.id)
            SampleResultAssociation.objects.get_or_create(sample=job_context['sample'],
                                                          result=result)

            salmon_quant_archive.result = result
            salmon_quant_archive.save()
            job_context['computed_files'].append(salmon_quant_archive)

            tximport_inputs = _get_tximport_inputs(job_context)

        # tximport analysis is done outside of the transaction so that
        # the mutex wouldn't hold the other jobs too long.
        for experiment, quant_files in tximport_inputs.items():
            _tximport(job_context, experiment, quant_files)
            # If `tximport` on any related experiment fails, exit immediately.
            if not job_context["success"]:
                return job_context

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": job_context["index_length"]}
        kv.result = result
        kv.is_public = True
        kv.save()

        with open(os.path.join(job_context['output_directory'], 'lib_format_counts.json')) as lfc_file:
            format_count_data = json.load(lfc_file)
            kv = ComputationalResultAnnotation()
            kv.data = format_count_data
            kv.result = result
            kv.is_public = True
            kv.save()
        with open(os.path.join(job_context['output_directory'], 'aux_info', 'meta_info.json')) as mi_file:
            meta_info = json.load(mi_file)
            kv = ComputationalResultAnnotation()
            kv.data = meta_info
            kv.result = result
            kv.is_public = True
            kv.save()

        job_context["success"] = True

    return job_context
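
An aside on the comment above about negative status codes: in Python's subprocess module a negative returncode means the child was terminated by a signal (returncode == -SIGNUM), so testing `returncode == 1` instead of `returncode != 0` can both miss real failures and tolerate signal deaths. A tiny self-contained illustration (POSIX only):

import signal
import subprocess

# A child that kills itself with SIGTERM reports returncode -15,
# not a positive error code.
completed = subprocess.run(
    ["python3", "-c", "import os, signal; os.kill(os.getpid(), signal.SIGTERM)"])
print(completed.returncode)                      # -15 on POSIX
print(completed.returncode == -signal.SIGTERM)   # True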
Example #3
def _run_salmon(job_context: Dict) -> Dict:
    """Runs Salmon Quant."""
    logger.debug("Running Salmon..")

    # Salmon needs to be run differently for different sample types.
    # SRA files also get processed differently as we don't want to use fasterq-dump to extract
    # them to disk.
    if job_context.get("sra_input_file_path", None):

        # Single reads
        if job_context["sra_num_reads"] == 1:

            fifo = "/tmp/barney"
            os.mkfifo(fifo)

            dump_str = "fastq-dump --stdout {input_sra_file} > {fifo} &"
            formatted_dump_command = dump_str.format(
                input_sra_file=job_context["sra_input_file_path"], fifo=fifo)
            subprocess.Popen(formatted_dump_command,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)

            command_str = (
                "salmon --no-version-check quant -l A -i {index} "
                "-r {fifo} -p 16 -o {output_directory} --seqBias --dumpEq --writeUnmappedNames"
            )
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                fifo=fifo,
                output_directory=job_context["output_directory"],
            )
        # Paired are trickier
        else:

            # For some reason I can't explain, this only works when the FIFOs
            # live in the temp directory; otherwise the `tee` part writes to
            # one or the other of the streams (non-deterministically), but
            # not both. The problem doesn't appear to happen if the FIFOs
            # are in /tmp.
            alpha = "/tmp/alpha"
            os.mkfifo(alpha)
            beta = "/tmp/beta"
            os.mkfifo(beta)

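            # --split-files -I makes fastq-dump tag read ids with ".1"/".2";
            # with --stdout the mates come out interleaved, and `tee` plus
            # bash process substitution greps each mate's 4-line record into
            # its own FIFO without writing anything to disk.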
            dump_str = "fastq-dump --stdout --split-files -I {input_sra_file} | tee >(grep '@.*\.1\s' -A3 --no-group-separator > {fifo_alpha}) >(grep '@.*\.2\s' -A3 --no-group-separator > {fifo_beta}) > /dev/null &"
            formatted_dump_command = dump_str.format(
                input_sra_file=job_context["sra_input_file_path"],
                fifo_alpha=alpha,
                fifo_beta=beta)
            subprocess.Popen(
                formatted_dump_command,
                shell=True,
                executable="/bin/bash",
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )

            command_str = (
                "salmon --no-version-check quant -l A -i {index} "
                "-1 {fifo_alpha} -2 {fifo_beta} -p 16 -o {output_directory} --seqBias --dumpEq --writeUnmappedNames"
            )
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                fifo_alpha=alpha,
                fifo_beta=beta,
                output_directory=job_context["output_directory"],
            )

    else:
        if "input_file_path_2" in job_context:
            second_read_str = " -2 {}".format(job_context["input_file_path_2"])

            # Rob recommends 16 threads/process, which fits snugly on an x1 at 8GB RAM per Salmon container:
            # (2 threads/core * 16 cores/socket * 64 vCPU) / (1TB/8GB) = ~17
            command_str = (
                "salmon --no-version-check quant -l A --biasSpeedSamp 5 -i {index}"
                " -1 {input_one}{second_read_str} -p 16 -o {output_directory}"
                " --seqBias --gcBias --dumpEq --writeUnmappedNames")

            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_one=job_context["input_file_path"],
                second_read_str=second_read_str,
                output_directory=job_context["output_directory"],
            )
        else:
            # Related: https://github.com/COMBINE-lab/salmon/issues/83
            command_str = ("salmon --no-version-check quant -l A -i {index}"
                           " -r {input_one} -p 16 -o {output_directory}"
                           " --seqBias --dumpEq --writeUnmappedNames")

            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_one=job_context["input_file_path"],
                output_directory=job_context["output_directory"],
            )

    logger.debug(
        "Running Salmon Quant using the following shell command: %s",
        formatted_command,
        processor_job=job_context["job_id"],
    )

    # Salmon probably shouldn't take longer than three hours.
    timeout = 60 * 60 * 3
    job_context["time_start"] = timezone.now()
    try:
        completed_command = subprocess.run(
            formatted_command.split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        failure_reason = "Salmon timed out because it failed to complete within 3 hours."
        logger.error(
            failure_reason,
            sample_accession_code=job_context["sample"].accession_code,
            processor_job=job_context["job_id"],
        )
        job_context["job"].failure_reason = failure_reason
        job_context["job"].no_retry = True
        job_context["success"] = False
        return job_context

    job_context["time_end"] = timezone.now()

    if completed_command.returncode == 1:
        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error(
            "Shell call to salmon failed with error message: %s",
            stderr[error_start:],
            processor_job=job_context["job_id"],
        )

        # If salmon has an error exit code then we don't want to retry it.
        job_context["job"].no_retry = True
        job_context["job"].failure_reason = (
            "Shell call to salmon failed because: " + stderr[error_start:])
        job_context["success"] = False
    else:
        result = ComputationalResult()
        result.commands.append(formatted_command)
        result.time_start = job_context["time_start"]
        result.time_end = job_context["time_end"]
        result.organism_index = job_context["organism_index"]
        result.is_ccdl = True

        try:
            processor_key = "SALMON_QUANT"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key,
                                                    e)

        # Zip up the output of Salmon Quant
        try:
            with tarfile.open(job_context["output_archive"], "w:gz") as tar:
                tar.add(job_context["output_directory"], arcname=os.sep)
        except Exception:
            logger.exception(
                "Exception caught while zipping processed directory %s",
                job_context["output_directory"],
                processor_job=job_context["job_id"],
            )
            failure_template = "Exception caught while zipping processed directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context["output_archive"])
            job_context["success"] = False
            return job_context

        salmon_quant_archive = ComputedFile()
        salmon_quant_archive.absolute_file_path = job_context["output_archive"]
        salmon_quant_archive.filename = os.path.split(
            job_context["output_archive"])[-1]
        salmon_quant_archive.calculate_sha1()
        salmon_quant_archive.calculate_size()
        salmon_quant_archive.is_public = True
        salmon_quant_archive.is_smashable = False
        salmon_quant_archive.is_qc = False

        quant_file = ComputedFile()
        quant_file.s3_bucket = S3_BUCKET_NAME
        timestamp = str(timezone.now().timestamp()).split(".")[0]
        quant_file.s3_key = "quant_files/sample_{0}_{1}_quant.sf".format(
            job_context["sample"].id, timestamp)
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = job_context[
            "output_directory"] + "quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.calculate_sha1()
        quant_file.calculate_size()

        # If we're running in the cloud we need to upload the quant.sf
        # file so that it can be used by a job running on any machine
        # to run tximport. We can't use sync_to_s3 though because we
        # have to sync it before we can save the file so it cannot be
        # discovered by other jobs before it is uploaded.
        if settings.RUNNING_IN_CLOUD:
            try:
                S3.upload_file(
                    quant_file.absolute_file_path,
                    quant_file.s3_bucket,
                    quant_file.s3_key,
                    ExtraArgs={
                        "ACL": "public-read",
                        "StorageClass": "STANDARD_IA"
                    },
                )
            except Exception as e:
                logger.exception(e,
                                 processor_job=job_context["job_id"],
                                 sample=job_context["sample"].id)
                failure_template = "Exception caught while uploading quantfile to S3: {}"
                job_context["job"].failure_reason = failure_template.format(
                    quant_file.absolute_file_path)
                job_context["success"] = False
                return job_context

        # Here select_for_update() is used as a mutex that forces multiple
        # jobs to execute this block of code in a serial manner. See:
        # https://docs.djangoproject.com/en/1.11/ref/models/querysets/#select-for-update
        # Theoretically any rows in any table could be locked here; we're
        # locking all existing rows in the ComputationalResult table.
        with transaction.atomic():
            ComputationalResult.objects.select_for_update()
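            # NOTE: select_for_update() returns a lazy queryset; the
            # SELECT ... FOR UPDATE that actually locks rows is only
            # issued once the queryset is evaluated.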
            result.save()
            job_context["quant_result"] = result
            quant_file.result = result
            quant_file.save()

            job_context["result"] = result

            job_context["pipeline"].steps.append(result.id)
            SampleResultAssociation.objects.get_or_create(
                sample=job_context["sample"], result=result)

            salmon_quant_archive.result = result
            salmon_quant_archive.save()
            job_context["computed_files"].append(salmon_quant_archive)

        kv = ComputationalResultAnnotation()
        kv.data = {
            "index_length": job_context["index_length"],
            "index_length_get": job_context.get("index_length_raw", None),
        }
        kv.result = result
        kv.is_public = True
        kv.save()

        try:
            with open(
                    os.path.join(job_context["output_directory"],
                                 "lib_format_counts.json")) as lfc_file:
                format_count_data = json.load(lfc_file)
                kv = ComputationalResultAnnotation()
                kv.data = format_count_data
                kv.result = result
                kv.is_public = True
                kv.save()
        except Exception:
            # See: https://github.com/AlexsLemonade/refinebio/issues/1167
            logger.exception(
                "Error parsing Salmon lib_format_counts JSON output!",
                processor_job=job_context["job_id"],
            )

        try:
            with open(
                    os.path.join(job_context["output_directory"], "aux_info",
                                 "meta_info.json")) as mi_file:
                meta_info = json.load(mi_file)
                kv = ComputationalResultAnnotation()
                kv.data = meta_info
                kv.result = result
                kv.is_public = True
                kv.save()
        except Exception:
            # See: https://github.com/AlexsLemonade/refinebio/issues/1167
            logger.exception("Error parsing Salmon meta_info JSON output!",
                             processor_job=job_context["job_id"])

        job_context["success"] = True

    return job_context
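
A stripped-down, self-contained sketch of the named-pipe technique Example #3 relies on: a background writer streams into a FIFO while a downstream reader consumes it as if it were a regular file, so the intermediate data never touches disk. The paths and commands below are illustrative only, not taken from the pipeline:

import os
import subprocess
import tempfile

workdir = tempfile.mkdtemp()  # somewhere /tmp-like, per the comment in the example
fifo = os.path.join(workdir, "reads.fastq")
os.mkfifo(fifo)

# Writer: stream two lines into the FIFO in the background.
writer = subprocess.Popen("seq 1 2 > {}".format(fifo), shell=True)

# Reader: blocks until the writer opens the pipe, sees EOF when it closes.
reader = subprocess.run(["wc", "-l", fifo], stdout=subprocess.PIPE)
print(reader.stdout.decode().strip())  # e.g. "2 /tmp/.../reads.fastq"

writer.wait()
os.remove(fifo)
os.rmdir(workdir)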
Example #4
def prep_tximport_at_progress_point(complete_accessions: List[str],
                                    incomplete_accessions: List[str]) -> Dict:
    """Create an experiment and associated objects that tximport needs to run on it.

    Creates a sample for each accession contained in either input
    list. The samples in complete_accessions will be simulated as
    already having salmon quant run on them. The samples in
    incomplete_accessions won't.
    """
    # Create the experiment
    experiment_accession = "SRP095529"
    data_dir = "/home/user/data_store/"
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession,
                                           technology="RNA-SEQ")

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")

    ExperimentOrganismAssociation.objects.get_or_create(experiment=experiment,
                                                        organism=zebrafish)

    # Create the transcriptome processor and result:
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.13.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()
    computational_result_short = ComputationalResult(
        processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.13.1"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    for accession_code in incomplete_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

    quant_processor = Processor()
    quant_processor.name = "Salmon Quant"
    quant_processor.version = "salmon 0.13.1"
    quant_processor.docker_image = "dr_salmon"
    quant_processor.environment = '{"some": "environment"}'
    quant_processor.save()
    tximport_processor = Processor()
    tximport_processor.name = "Tximport"
    tximport_processor.version = "salmon 0.13.1"
    tximport_processor.docker_image = "dr_salmon"
    tximport_processor.environment = '{"some": "environment"}'
    tximport_processor.save()

    # Create the already processed samples along with their
    # ComputationalResults and ComputedFiles. They don't need
    # original files for this test because we aren't going to run
    # salmon quant on them.
    for accession_code in complete_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir,
                                                       archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        sample.most_recent_quant_file = quant_file
        sample.save()
        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=quant_result)

    return experiment
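
As with Example #1, a hypothetical test might call this helper and then check which samples were marked as already quantified; the accession codes are invented for illustration:

# Hypothetical usage -- not part of the fixture itself.
prep_tximport_at_progress_point(complete_accessions=["SRR0000010"],
                                incomplete_accessions=["SRR0000011"])
done = Sample.objects.filter(most_recent_quant_file__isnull=False)
# Only the complete accessions carry a most_recent_quant_file.
print(sorted(s.accession_code for s in done))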