Example #1
    def test_qn_management_command(self):
        """Test that the management command fires off and then does not create
        a job for an organism that does not have enough samples on the same
        platform."""

        homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606)
        homo_sapiens.save()

        experiment = Experiment()
        experiment.accession_code = "12345"
        experiment.save()
        codes = ["1", "2", "3", "4", "5", "6"]
        # We don't have a 0.tsv

        for code in codes:
            sample = Sample()
            sample.accession_code = code
            sample.title = code
            sample.platform_accession_code = "A-MEXP-1171"
            sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
            sample.organism = homo_sapiens
            sample.technology = "MICROARRAY"
            sample.is_processed = True
            sample.save()

            cr = ComputationalResult()
            cr.save()

            computed_file = ComputedFile()
            computed_file.filename = code + ".tsv"
            computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
            computed_file.size_in_bytes = int(code)
            computed_file.result = cr
            computed_file.is_smashable = True
            computed_file.save()

            scfa = SampleComputedFileAssociation()
            scfa.sample = sample
            scfa.computed_file = computed_file
            scfa.save()

            exsa = ExperimentSampleAssociation()
            exsa.experiment = experiment
            exsa.sample = sample
            exsa.save()

        out = StringIO()
        try:
            call_command("create_qn_target", organism="homo_sapiens", min=1, stdout=out)
        except SystemExit as e:  # this is okay!
            pass

        stdout = out.getvalue()
        self.assertFalse("Target file" in stdout)

        # There's not enough samples available in this scenario so we
        # shouldn't have even made a processor job.
        self.assertEqual(ProcessorJob.objects.count(), 0)
Example #2
def prepare_organism_indices():
    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    # This is a lie, but this image doesn't have the dependencies for TRANSCRIPTOME_INDEX
    computational_result_short = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = c_elegans
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT/celgans_short.tar.gz"
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.save()

    # This is a lie, but this image doesn't have the dependencies for TX_IMPORT
    computational_result_long = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result_long.save()
Example #3
    def test_qn_reference(self, mock_send_job):
        organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606)
        organism.save()

        experiment = Experiment()
        experiment.accession_code = "12345"
        experiment.save()

        for code in [str(i) for i in range(1, 401)]:
            sample = Sample()
            sample.accession_code = code
            sample.title = code
            sample.platform_name = f"Affymetrix {organism.name}"
            sample.platform_accession_code = f"A-MEXP-{organism.name}"
            sample.manufacturer = "AFFYMETRIX"
            sample.organism = organism
            sample.technology = "MICROARRAY"
            sample.is_processed = True
            sample.has_raw = True
            sample.save()

            cr = ComputationalResult()
            cr.save()

            computed_file = ComputedFile()
            computed_file.filename = code + ".tsv"
            computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
            computed_file.size_in_bytes = int(code)
            computed_file.result = cr
            computed_file.is_smashable = True
            computed_file.save()

            scfa = SampleComputedFileAssociation()
            scfa.sample = sample
            scfa.computed_file = computed_file
            scfa.save()

            exsa = ExperimentSampleAssociation()
            exsa.experiment = experiment
            exsa.sample = sample
            exsa.save()

            # We need more than one organism for the tests, but can't
            # repeat accession codes, so halfway through just change the organism.
            if int(code) == 200:
                organism = Organism(name="MUS_MUSCULUS", taxonomy_id=111)
                organism.save()

        # Setup is done, actually run the command.
        command = Command()
        command.handle(organisms="HOMO_SAPIENS,MUS_MUSCULUS")

        self.assertEqual(len(mock_send_job.mock_calls), 2)
        self.assertEqual(ProcessorJob.objects.count(), 2)
Example #4
    def test_fail(self):
        """ Test our ability to fail """

        result = ComputationalResult()
        result.save()

        sample = Sample()
        sample.accession_code = 'XXX'
        sample.title = 'XXX'
        sample.organism = Organism.get_object_for_name("HOMO_SAPIENS")
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        computed_file = ComputedFile()
        computed_file.filename = "NOT_REAL.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['XXX']}
        ds.aggregate_by = 'EXPERIMENT'
        ds.scale_by = 'MINMAX'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()
        dsid = ds.id

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(job.pk, upload=False)
        ds = Dataset.objects.get(id=dsid)
        print(ds.failure_reason)
        print(final_context['dataset'].failure_reason)
        self.assertNotEqual(final_context['unsmashable_files'], [])
Example #5
def get_organism_with_qn_target():
    result = ComputationalResult()
    result.save()

    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()
    return danio_rerio
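
A minimal usage sketch, assuming the helper above is importable in a test module:

# Hypothetical usage in a test body:
danio_rerio = get_organism_with_qn_target()
assert danio_rerio.qn_target is not None  # points at the ComputationalResult saved above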
Example #6
def prepare_job():

    # Create 10 job directories
    for i in range(JOBS):
        os.makedirs(LOCAL_ROOT_DIR + "/processor_job_" + str(i), exist_ok=True)

        # These live on prod volumes at locations such as:
        # /var/ebs/SRP057116/SRR1972985/SRR1972985.sra
        os.makedirs(LOCAL_ROOT_DIR + "/SRP" + str(i), exist_ok=True)
        os.makedirs(LOCAL_ROOT_DIR + "/SRP" + str(i) + "/SRR" + str(i),
                    exist_ok=True)

        sample = Sample()
        sample.accession_code = "SRR" + str(i)
        sample.save()

        cr = ComputationalResult()
        cr.save()

        cf = ComputedFile()
        cf.result = cr
        cf.size_in_bytes = 666
        cf.save()

        scfa = SampleComputedFileAssociation()
        scfa.sample = sample
        scfa.computed_file = cf
        scfa.save()

    # Create a job directory outside of the loop range with "index" in its
    # name to make sure we don't delete index directories, since that's where
    # transcriptome indices get downloaded to.
    os.makedirs(LOCAL_ROOT_DIR + "/processor_job_" + str(JOBS + 1) + "_index",
                exist_ok=True)

    os.makedirs(LOCAL_ROOT_DIR + "/SRP" + str(JOBS + 1) + "/SRR" +
                str(JOBS + 1),
                exist_ok=True)
    sample = Sample()
    sample.accession_code = "SRR" + str(JOBS + 1)
    sample.save()

    # Save two jobs so that we trigger two special circumstances, one
    # where the job is still running and the other where the job isn't
    # in Batch anymore.
    pj = ProcessorJob()
    pj.pipeline_applied = "SALMON"
    pj.batch_job_id = "running_job"
    pj.save()

    pj = ProcessorJob()
    pj.pipeline_applied = "SALMON"
    pj.batch_job_id = "missing_job"
    pj.save()

    pj = ProcessorJob()
    pj.pipeline_applied = "JANITOR"
    pj.save()

    return pj
Example #7
def prepare_experiment(ids: List[int]) -> Experiment:
    (homo_sapiens, _) = Organism.objects.get_or_create(name="HOMO_SAPIENS",
                                                       taxonomy_id=9606)

    experiment = Experiment()
    experiment.accession_code = "12345"
    experiment.save()
    codes = [str(i) for i in ids]

    for code in codes:
        sample = Sample()
        sample.accession_code = code
        sample.title = code
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
        sample.organism = homo_sapiens
        sample.technology = "MICROARRAY"
        sample.is_processed = True
        sample.save()

        cr = ComputationalResult()
        cr.save()

        computed_file = ComputedFile()
        computed_file.filename = code + ".tsv"
        computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
        computed_file.size_in_bytes = int(code)
        computed_file.result = cr
        computed_file.is_smashable = True
        computed_file.save()

        scfa = SampleComputedFileAssociation()
        scfa.sample = sample
        scfa.computed_file = computed_file
        scfa.save()

        exsa = ExperimentSampleAssociation()
        exsa.experiment = experiment
        exsa.sample = sample
        exsa.save()

    return experiment
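
A minimal usage sketch with hypothetical sample IDs, assuming the helper above is importable:

# Hypothetical usage in a test body: creates experiment "12345" with samples
# "1" through "6", each backed by a smashable .tsv ComputedFile under /home/user/data_store/QN/.
prepare_experiment([1, 2, 3, 4, 5, 6])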
Example #8
def create_sample_for_experiment(sample_info: Dict,
                                 experiment: Experiment) -> Sample:
    result = ComputationalResult()
    result.save()

    sample = Sample()
    sample.accession_code = sample_info["accession_code"]
    sample.title = sample_info.get("title",
                                   None) or sample_info["accession_code"]
    sample.organism = sample_info["organism"]
    sample.technology = sample_info["technology"]
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    if sample_info.get("filename") is not None:
        computed_file = ComputedFile()
        computed_file.filename = sample_info["filename"]
        computed_file.absolute_file_path = (
            sample_info["data_dir"] + sample_info["filename"]
        )
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

    return sample
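
A minimal usage sketch with hypothetical values, assuming an existing Experiment and Organism. Per the lookups above, "accession_code", "organism", and "technology" are required; "title" falls back to the accession code, and "filename"/"data_dir" are only needed when a ComputedFile should be created:

# Hypothetical usage in a test body:
info = {
    "accession_code": "GSM0000001",
    "organism": homo_sapiens,
    "technology": "MICROARRAY",
    "filename": "GSM0000001.PCL",             # optional; triggers ComputedFile creation
    "data_dir": "/home/user/data_store/PCL/",
}
sample = create_sample_for_experiment(info, experiment)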
Example #9
def setup_experiment(new_version_accessions: List[str],
                     old_version_accessions: List[str]) -> Experiment:
    """ Create an experiment where some samples were processed with the newest version of salmon
    and others with an older one.
    """
    # Create the experiment
    experiment_accession = "SRP095529"
    data_dir = "/home/user/data_store/"
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession,
                                           technology="RNA-SEQ")

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")

    # Create the transcriptome processor and result:
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.9.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()
    computational_result_short = ComputationalResult(
        processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.9.1"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    quant_processor = Processor()
    quant_processor.name = "Salmon Quant"
    quant_processor.version = "salmon 0.9.1"
    quant_processor.docker_image = "dr_salmon"
    quant_processor.environment = '{"some": "environment"}'
    quant_processor.save()

    for accession_code in old_version_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
            platform_accession_code="IlluminaHiSeq1000",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index  # associate with OLD organism index
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir,
                                                       archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=quant_result)

    # Create another OrganismIndex with a newer version of salmon.
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.13.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()
    computational_result_short = ComputationalResult(
        processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.13.1"  # DIFFERENT SALMON VERSION
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    for accession_code in new_version_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
            platform_accession_code="IlluminaHiSeq1000",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index  # NEWER VERSION
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir,
                                                       archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=quant_result)

    return experiment
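
A minimal usage sketch with hypothetical SRR accessions, assuming the fixture paths referenced above exist:

# Hypothetical usage in a test body:
experiment = setup_experiment(
    new_version_accessions=["SRR0000003", "SRR0000004"],  # quantified against salmon 0.13.1
    old_version_accessions=["SRR0000001", "SRR0000002"],  # quantified against salmon 0.9.1
)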
Example #10
def _create_result_objects(job_context: Dict) -> Dict:
    """
    Store and host the result as a ComputationalResult object.
    """
    result_start = log_state("start create result object",
                             job_context["job"].id)
    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    # Temporary until we re-enable the QN test step.
    result.is_public = False
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "CREATE_COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    # Write the compendia dataframe to a file
    job_context["csv_outfile"] = job_context["output_dir"] + job_context[
        "organism_name"] + ".tsv"
    job_context["merged_qn"].to_csv(job_context["csv_outfile"],
                                    sep="\t",
                                    encoding="utf-8")

    organism_key = list(job_context["samples"].keys())[0]
    annotation = ComputationalResultAnnotation()
    annotation.result = result

    annotation.data = {
        "organism_id": job_context["samples"][organism_key][0].organism_id,
        "organism_name": job_context["organism_name"],
        "is_qn": False,
        "is_compendia": True,
        "samples": [
            sample.accession_code for sample in job_context["samples"][organism_key]
        ],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions": [e.accession_code for e in job_context["experiments"]],
        "total_percent_imputed": job_context["total_percent_imputed"],
    }
    annotation.save()

    # Create the resulting archive
    final_zip_base = SMASHING_DIR + str(
        job_context["dataset"].pk) + "_compendia"
    # Copy LICENSE.txt and correct README.md files.
    if job_context["dataset"].quant_sf_only:
        readme_file = "/home/user/README_QUANT.md"
    else:
        readme_file = "/home/user/README_NORMALIZED.md"

    shutil.copy(readme_file, job_context["output_dir"] + "/README.md")
    shutil.copy("/home/user/LICENSE_DATASET.txt",
                job_context["output_dir"] + "/LICENSE.TXT")
    archive_path = shutil.make_archive(final_zip_base, "zip",
                                       job_context["output_dir"])

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split("/")[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.save()

    # Compendia Result Helpers
    primary_organism = Organism.get_object_for_name(
        job_context["primary_organism"])
    organisms = [
        Organism.get_object_for_name(organism)
        for organism in job_context["all_organisms"]
    ]
    compendium_version = (CompendiumResult.objects.filter(
        primary_organism=primary_organism, quant_sf_only=False).count() + 1)
    # Save Compendia Result
    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = job_context["dataset"].quant_sf_only
    compendium_result.svd_algorithm = job_context["dataset"].svd_algorithm
    compendium_result.compendium_version = compendium_version
    compendium_result.result = result
    compendium_result.primary_organism = primary_organism
    compendium_result.save()

    # create relations to all organisms contained in the compendia

    compendium_result_organism_associations = []
    for compendium_organism in organisms:
        compendium_result_organism_association = CompendiumResultOrganismAssociation()
        compendium_result_organism_association.compendium_result = compendium_result
        compendium_result_organism_association.organism = compendium_organism
        compendium_result_organism_associations.append(
            compendium_result_organism_association)

    CompendiumResultOrganismAssociation.objects.bulk_create(
        compendium_result_organism_associations)

    job_context["compendium_result"] = compendium_result

    logger.info("Compendium created!",
                archive_path=archive_path,
                organism_name=job_context["organism_name"])

    # Upload the result to S3
    timestamp = str(int(time.time()))
    key = job_context["organism_name"] + "_" + str(
        compendium_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME,
                                                      key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True

    log_state("end create result object", job_context["job"].id, result_start)

    # TEMPORARY for iterating on compendia more quickly.
    # Reset this so the end_job does clean up the job's non-input-data stuff.
    job_context["work_dir"] = job_context["old_work_dir"]

    return job_context
Example #11
    def test_qn_reference(self):
        job = ProcessorJob()
        job.pipeline_applied = "QN_REFERENCE"
        job.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606)
        homo_sapiens.save()

        experiment = Experiment()
        experiment.accession_code = "12345"
        experiment.save()
        # We don't have a 0.tsv
        codes = [str(i) for i in range(1, 201)]

        for code in codes:
            sample = Sample()
            sample.accession_code = code
            sample.title = code
            sample.platform_accession_code = "A-MEXP-1171"
            sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
            sample.organism = homo_sapiens
            sample.technology = "MICROARRAY"
            sample.is_processed = True
            sample.save()

            cr = ComputationalResult()
            cr.save()

            computed_file = ComputedFile()
            computed_file.filename = code + ".tsv"
            computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
            computed_file.size_in_bytes = int(code)
            computed_file.result = cr
            computed_file.is_smashable = True
            computed_file.save()

            scfa = SampleComputedFileAssociation()
            scfa.sample = sample
            scfa.computed_file = computed_file
            scfa.save()

            exsa = ExperimentSampleAssociation()
            exsa.experiment = experiment
            exsa.sample = sample
            exsa.save()

        dataset = Dataset()
        dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]}
        dataset.aggregate_by = "ALL"
        dataset.scale_by = "NONE"
        dataset.quantile_normalize = False  # We don't QN because we're creating the target now
        dataset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dataset
        pjda.save()

        final_context = qn_reference.create_qn_reference(job.pk)
        self.assertTrue(final_context["success"])
        self.assertTrue(os.path.exists(final_context["target_file"]))
        self.assertEqual(os.path.getsize(final_context["target_file"]), 562)

        homo_sapiens.refresh_from_db()
        target = homo_sapiens.qn_target.computedfile_set.latest()
        self.assertEqual(target.sha1, "de69d348f8b239479e2330d596c4013a7b0b2b6a")

        # Create and run a smasher job that will use the QN target we just made.
        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        ds = Dataset()
        ds.data = {"12345": ["1", "2", "3", "4", "5"]}
        ds.aggregate_by = "SPECIES"
        ds.scale_by = "STANDARD"
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = True
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(final_context["success"])

        np.testing.assert_almost_equal(final_context["merged_qn"]["1"][0], -0.4379488527774811)
        np.testing.assert_almost_equal(final_context["original_merged"]["1"][0], -0.5762109)
Example #12
    def test_get_synced_files(self):
        """ """
        result = ComputationalResult()
        result.save()

        computed_file = ComputedFile()
        computed_file.s3_key = "all_the_things.jpg"
        computed_file.s3_bucket = "data-refinery-test-assets"
        computed_file.filename = "all_the_things.jpg"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 9001
        computed_file.is_smashable = False
        computed_file.sha1 = "36cf21c08d461f74ddb0f2edb6257afee309c4a4"
        computed_file.save()

        # Make sure it's not there
        try:
            os.remove("/home/user/data_store/PCL/" + computed_file.filename)
        except OSError:
            pass

        # We do this twice, once to get from S3 and once to get from local disk.
        afp = computed_file.get_synced_file_path(force=True)
        self.assertTrue(os.path.exists(afp))

        afp = computed_file.get_synced_file_path(force=True)
        self.assertTrue(os.path.exists(afp))
Example #13
def _create_result_objects(job_context: Dict) -> Dict:

    result = ComputationalResult()
    result.commands.append(" ".join(job_context['formatted_command']))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']
    try:
        processor_key = "QN_REFERENCE"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context['target_file']
    computed_file.filename = job_context['target_file'].split('/')[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.is_smashable = False
    computed_file.is_qn_target = True
    computed_file.result = result
    computed_file.save()

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id":
        job_context['samples']['ALL'][0].organism_id,
        "is_qn":
        True,
        "platform_accession_code":
        job_context['samples']['ALL'][0].platform_accession_code,
        "samples":
        [sample.accession_code for sample in job_context["samples"]["ALL"]],
        "geneset":
        str(job_context["geneset"]),
        "num_valid_inputs":
        job_context["num_valid_inputs"]
    }
    annotation.save()

    # TODO: upload this to a public read bucket.
    # https://github.com/AlexsLemonade/refinebio/issues/586
    job_context['result'] = result
    job_context['computed_files'] = [computed_file]
    job_context['annotation'] = annotation
    job_context['success'] = True
    return job_context
Example #14
def _create_result_objects(job_context: Dict) -> Dict:
    """
    Store and host the result as a ComputationalResult object.
    """

    result = ComputationalResult()
    result.commands.append(" ".join(job_context['formatted_command']))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']
    try:
        processor_key = "COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    # Write the compendia dataframe to a file, overwriting the previous smash
    job_context['merged_qn'].to_csv(job_context['smash_outfile'], sep='\t', encoding='utf-8')
    compendia_tsv_computed_file = ComputedFile()
    compendia_tsv_computed_file.absolute_file_path = job_context['smash_outfile']
    compendia_tsv_computed_file.filename = job_context['smash_outfile'].split('/')[-1]
    compendia_tsv_computed_file.calculate_sha1()
    compendia_tsv_computed_file.calculate_size()
    compendia_tsv_computed_file.is_smashable = False
    compendia_tsv_computed_file.is_qn_target = False
    compendia_tsv_computed_file.result = result
    compendia_tsv_computed_file.save()

    organism_key = list(job_context['samples'].keys())[0]
    annotation = ComputationalResultAnnotation()
    annotation.result = result

    annotation.data = {
        "organism_id": job_context['samples'][organism_key][0].organism_id,
        "organism_name": job_context['samples'][organism_key][0].organism.name,
        "is_qn": False,
        "is_compendia": True,
        "samples": [sample.accession_code for sample in job_context["samples"][organism_key]],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions": [e.accession_code for e in job_context['experiments']]
    }
    annotation.save()

    # Save the related metadata file
    metadata_computed_file = ComputedFile()
    metadata_computed_file.absolute_file_path = job_context['metadata_tsv_paths'][0]
    metadata_computed_file.filename = job_context['metadata_tsv_paths'][0].split('/')[-1]
    metadata_computed_file.calculate_sha1()
    metadata_computed_file.calculate_size()
    metadata_computed_file.is_smashable = False
    metadata_computed_file.is_qn_target = False
    metadata_computed_file.result = result
    metadata_computed_file.save()

    # Create the resulting archive
    final_zip_base = "/home/user/data_store/smashed/" + str(job_context["dataset"].pk) + "_compendia"
    archive_path = shutil.make_archive(final_zip_base, 'zip', job_context["output_dir"])

    # Figure out the next compendia version for this organism.
    organism = job_context['samples'][organism_key][0].organism

    try:
        # order_by('-compendia_version') puts the latest compendia first;
        # QuerySets don't support negative indexing, so take index 0.
        last_compendia = ComputedFile.objects.filter(
            is_compendia=True,
            compendia_organism=organism).order_by('-compendia_version')[0]
        compendia_version = last_compendia.compendia_version + 1
    except Exception:
        # This is the first compendia for this Organism
        compendia_version = 1

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split('/')[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.compendia_organism = job_context['samples'][organism_key][0].organism
    archive_computed_file.compendia_version = compendia_version
    archive_computed_file.save()

    logger.info("Compendia created!",
        archive_path=archive_path,
        organism_name=job_context['samples'][organism_key][0].organism.name
    )

    # Upload the result to S3
    key = job_context['samples'][organism_key][0].organism.name + "_" + str(compendia_version) + "_" + str(int(time.time())) + ".zip"
    archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, key)

    job_context['result'] = result
    job_context['computed_files'] = [compendia_tsv_computed_file, metadata_computed_file, archive_computed_file]
    job_context['success'] = True

    return job_context
Example #15
    def test_dualtech_smash(self):
        """ """

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1487313"
        experiment.save()

        result = ComputationalResult()
        result.save()

        gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")

        sample = Sample()
        sample.accession_code = 'GSM1487313'
        sample.title = 'GSM1487313'
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487313_liver.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # RNASEQ TECH
        experiment2 = Experiment()
        experiment2.accession_code = "SRS332914"
        experiment2.save()

        result2 = ComputationalResult()
        result2.save()

        sample2 = Sample()
        sample2.accession_code = 'SRS332914'
        sample2.title = 'SRS332914'
        sample2.organism = gallus_gallus
        sample2.technology = "RNA-SEQ"
        sample2.save()

        sra2 = SampleResultAssociation()
        sra2.sample = sample2
        sra2.result = result2
        sra2.save()

        esa2 = ExperimentSampleAssociation()
        esa2.experiment = experiment2
        esa2.sample = sample2
        esa2.save()

        computed_file2 = ComputedFile()
        computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
        computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
        computed_file2.result = result2
        computed_file2.size_in_bytes = 234
        computed_file2.is_smashable = True
        computed_file2.save()

        assoc2 = SampleComputedFileAssociation()
        assoc2.sample = sample2
        assoc2.computed_file = computed_file2
        assoc2.save()

        # CROSS-SMASH BY SPECIES
        ds = Dataset()
        ds.data = {'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']}
        ds.aggregate_by = 'SPECIES'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        self.assertTrue(ds.is_cross_technology())
        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(os.path.exists(final_context['output_file']))
        os.remove(final_context['output_file'])
        self.assertEqual(len(final_context['final_frame'].columns), 2)

        # THEN BY EXPERIMENT
        ds.aggregate_by = 'EXPERIMENT'
        ds.save()

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)

        pj.start_time = None
        pj.end_time = None
        pj.save()

        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(os.path.exists(final_context['output_file']))
        os.remove(final_context['output_file'])
        self.assertEqual(len(final_context['final_frame'].columns), 1)

        # THEN BY ALL
        ds.aggregate_by = 'ALL'
        ds.save()

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)

        pj.start_time = None
        pj.end_time = None
        pj.save()
        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(os.path.exists(final_context['output_file']))
        self.assertEqual(len(final_context['final_frame'].columns), 2)
Example #16
def _create_result_objects(job_context: Dict) -> Dict:
    """
    Store and host the result as a ComputationalResult object.
    """
    archive_path = job_context["archive_path"]
    compendia_organism = _get_organisms(job_context["samples"]).first()
    compendia_version = _get_next_compendia_version(compendia_organism)

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "CREATE_QUANTPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = FileUtils.get_filename(archive_path)
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.quant_sf_only = True
    archive_computed_file.compendia_organism = compendia_organism
    archive_computed_file.compendia_version = compendia_version
    archive_computed_file.save()

    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = True
    compendium_result.result = result
    compendium_result.primary_organism = compendia_organism
    compendium_result.compendium_version = compendia_version
    compendium_result.save()

    logger.info(
        "Quantpendia created! Uploading to S3.",
        job_id=job_context["job_id"],
        archive_path=archive_path,
        organism_name=compendia_organism.name,
        **get_process_stats()
    )

    # Upload the result to S3
    timestamp = str(int(time.time()))
    s3_key = compendia_organism.name + "_" + str(compendia_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, s3_key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True

    return job_context
Example #17
    def test_no_smash_dupe_two(self):
        """ Tests the SRP051449 case, where the titles collide. Also uses a real QN target file."""

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        experiment = Experiment()
        experiment.accession_code = "SRP051449"
        experiment.save()

        result = ComputationalResult()
        result.save()

        danio_rerio = Organism.get_object_for_name("DANIO_RERIO")

        sample = Sample()
        sample.accession_code = 'SRR1731761'
        sample.title = 'Danio rerio'
        sample.organism = danio_rerio
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "SRR1731761_output_gene_lengthScaledTPM.tsv"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'SRR1731762'
        sample.title = 'Danio rerio'
        sample.organism = danio_rerio
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "SRR1731762_output_gene_lengthScaledTPM.tsv"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'SRP051449': ['SRR1731761', 'SRR1731762']}
        ds.aggregate_by = 'SPECIES'
        ds.scale_by = 'NONE'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = True
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        cr = ComputationalResult()
        cr.save()

        computed_file = ComputedFile()
        computed_file.filename = "danio_target.tsv"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = cr
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = False
        computed_file.save()

        cra = ComputationalResultAnnotation()
        cra.data = {'organism_id': danio_rerio.id, 'is_qn': True}
        cra.result = cr
        cra.save()

        final_context = smasher.smash(job.pk, upload=False)
        self.assertTrue(final_context['success'])
Example #18
def _create_result(job_context: Dict) -> Dict:
    """ Create the actual Result object"""

    # This is a NO-OP, but we make a ComputationalResult regardless.
    result = ComputationalResult()
    result.commands.append(job_context["script_name"])
    result.is_ccdl = True
    try:
        processor_key = "SUBMITTER_PROCESSED"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the output file,
    # sync it to S3 and save it.
    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["output_file_path"]
    computed_file.filename = job_context["output_file_path"].split("/")[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()

    # utils.end_job will sync this to S3 for us.
    job_context["computed_files"] = [computed_file]

    for sample in job_context["samples"]:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

    logger.debug("Created %s", result)
    job_context["success"] = True
    return job_context
Example #19
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """

    result = ComputationalResult()
    result.commands.append("SCAN.UPC::SCAN_TwoColor")
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "AGILENT_TWOCOLOR"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the sample,
    # sync it to S3 and save it.
    try:
        computed_file = ComputedFile()
        computed_file.absolute_file_path = job_context["output_file_path"]
        computed_file.filename = os.path.split(
            job_context["output_file_path"])[-1]
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.save()
        job_context["computed_files"].append(computed_file)
    except Exception:
        logger.exception(
            "Exception caught while moving file %s to S3",
            computed_file.filename,
            processor_job=job_context["job_id"],
        )
        failure_reason = "Exception caught while moving file to S3"
        job_context["job"].failure_reason = failure_reason
        job_context["success"] = False
        return job_context

    for sample in job_context["samples"]:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

    logger.info("Created %s", result)
    job_context["success"] = True

    return job_context
Example #20
    def test_get_results(self):
        """ Test our ability to collect the appropriate samples. """

        sample = Sample()
        sample.accession_code = 'GSM45588'
        sample.save()

        result = ComputationalResult()
        result.save()

        computed_file1 = ComputedFile()
        computed_file1.filename = "oh_boy.txt"
        computed_file1.result = result
        computed_file1.size_in_bytes = 123
        computed_file1.is_smashable = True
        computed_file1.save()

        computed_file2 = ComputedFile()
        computed_file2.filename = "gee_whiz.bmp"
        computed_file2.result = result
        computed_file2.size_in_bytes = 123
        computed_file2.is_smashable = False
        computed_file2.save()

        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file1
        assoc.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file2
        assoc.save()

        computed_files = sample.get_result_files()
        self.assertEqual(computed_files.count(), 2)
Example #21
def prepare_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51081"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1237810'
    sample.title = 'GSM1237810'
    sample.organism = homo_sapiens
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {'hi': 'friend'}
    sample_annotation.sample = sample
    sample_annotation.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237810_T09-1084.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1237812'
    sample.title = 'GSM1237812'
    sample.organism = homo_sapiens
    sample.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237812_S97-PURE.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237812_S97-PURE.DAT"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = False
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']}
    ds.aggregate_by = 'EXPERIMENT'  # [ALL or SPECIES or EXPERIMENT]
    ds.scale_by = 'STANDARD'  # [NONE or MINMAX or STANDARD or ROBUST]
    ds.email_address = "*****@*****.**"
    #ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    return pj
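
A minimal usage sketch, assuming the smasher module as imported in the other tests in this listing:

# Hypothetical usage in a test body:
pj = prepare_job()
final_context = smasher.smash(pj.pk, upload=False)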
Example #22
    def test_bad_overlap(self):

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51081"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1237810'
        sample.title = 'GSM1237810'
        sample.organism = homo_sapiens
        sample.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.data = {'hi': 'friend'}
        sample_annotation.sample = sample
        sample_annotation.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "big.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'GSM1237812'
        sample.title = 'GSM1237812'
        sample.organism = homo_sapiens
        sample.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        computed_file = ComputedFile()
        computed_file.filename = "small.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']}
        ds.aggregate_by = 'ALL'  # [ALL or SPECIES or EXPERIMENT]
        ds.scale_by = 'NONE'  # [NONE or MINMAX or STANDARD or ROBUST]
        ds.email_address = "*****@*****.**"
        #ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        ds = Dataset.objects.get(id=ds.id)

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        # Now, make sure the bad can't zero this out.
        sample = Sample()
        sample.accession_code = 'GSM999'
        sample.title = 'GSM999'
        sample.organism = homo_sapiens
        sample.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        computed_file = ComputedFile()
        computed_file.filename = "bad.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812', 'GSM999']}
        ds.aggregate_by = 'ALL'  # [ALL or SPECIES or EXPERIMENT]
        ds.scale_by = 'NONE'  # [NONE or MINMAX or STANDARD or ROBUST]
        ds.email_address = "*****@*****.**"
        #ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        ds = Dataset.objects.get(id=ds.id)

        self.assertEqual(len(final_context['final_frame']), 4)
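
The assertion above expects four rows even though `bad.PCL` barely overlaps the other inputs. A minimal pandas sketch (hypothetical gene identifiers, not the smasher's actual merge code) of why an outer join keeps the union of rows instead of letting a sparse file shrink the matrix:

import pandas as pd

# Two hypothetical per-sample columns that share only one gene.
good = pd.DataFrame({"GSM1237810": [1.0, 2.0, 3.0, 4.0]},
                    index=["GENE_A", "GENE_B", "GENE_C", "GENE_D"])
bad = pd.DataFrame({"GSM999": [9.0]}, index=["GENE_A"])

# An outer concat keeps the union of genes; rows missing from `bad` become
# NaN rather than being dropped, so a low-overlap file cannot zero out the frame.
merged = pd.concat([good, bad], axis=1, join="outer")
assert len(merged) == 4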
Exemplo n.º 23
0
    def test_compendia(self):
        result = ComputationalResult()
        result.save()

        hsc1 = ComputedFile()
        hsc1.absolute_file_path = "/null/1.tsv"
        hsc1.filename = "1.tsv"
        hsc1.sha1 = "abc"
        hsc1.size_in_bytes = 1
        hsc1.is_smashable = False
        hsc1.is_qn_target = False
        hsc1.result = result
        hsc1.is_compendia = True
        hsc1.compendia_organism = self.homo_sapiens
        hsc1.compendia_version = 1
        hsc1.s3_bucket = "dr-compendia"
        hsc1.s3_key = "hsc1.tsv"
        hsc1.save()

        hsc2 = ComputedFile()
        hsc2.absolute_file_path = "/null/2.tsv"
        hsc2.filename = "2.tsv"
        hsc2.sha1 = "abc"
        hsc2.size_in_bytes = 1
        hsc2.is_smashable = False
        hsc2.is_qn_target = False
        hsc2.result = result
        hsc2.is_compendia = True
        hsc2.compendia_organism = self.homo_sapiens
        hsc2.compendia_version = 2
        hsc2.s3_bucket = "dr-compendia"
        hsc2.s3_key = "hsc2.tsv"
        hsc2.save()

        drc1 = ComputedFile()
        drc1.absolute_file_path = "/null/1.tsv"
        drc1.filename = "1.tsv"
        drc1.sha1 = "abc"
        drc1.size_in_bytes = 1
        drc1.is_smashable = False
        drc1.is_qn_target = False
        drc1.result = result
        drc1.is_compendia = True
        drc1.compendia_organism = self.danio_rerio
        drc1.compendia_version = 1
        drc1.s3_bucket = "dr-compendia"
        drc1.s3_key = "drc2.tsv"
        drc1.save()

        response = self.client.get(
            reverse("computed_files", kwargs={"version": API_VERSION}),
            {"is_compendia": True})
        response_json = response.json()["results"]
        self.assertEqual(3, len(response_json))
        # Prove that the download_url field is missing and not None.
        self.assertEqual("NotPresent",
                         response_json[0].get("download_url", "NotPresent"))

        # We don't actually want AWS to generate a temporary URL for
        # us, and it won't unless we're running in the cloud, but if
        # we provide an API Token and use the WithUrl serializer then
        # it will set the download_url field to None rather than
        # generate one.

        # Create a token first
        response = self.client.post(
            reverse("token", kwargs={"version": API_VERSION}),
            json.dumps({"is_activated": True}),
            content_type="application/json",
        )
        token_id = response.json()["id"]

        response = self.client.get(
            reverse("computed_files", kwargs={"version": API_VERSION}),
            {"is_compendia": True},
            HTTP_API_KEY=token_id,
        )
        response_json = response.json()["results"]
        self.assertEqual(3, len(response_json))
        self.assertIsNone(response_json[0]["download_url"])
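
The comments in this test describe the token behavior of the endpoint: without a key the `download_url` field is omitted, and with an activated key it is present (populated only in the cloud). A hedged sketch of how an external client might mirror the same flow; the base URL is an assumption for illustration, and the paths are inferred from the `token` and `computed_files` route names used above:

import requests

BASE = "https://api.refine.bio/v1"  # assumed base URL/prefix for illustration

# Create and activate an API token, then request compendia files with it so
# the response includes the download_url field (None outside the cloud).
token_id = requests.post(BASE + "/token/", json={"is_activated": True}).json()["id"]
results = requests.get(
    BASE + "/computed_files/",
    params={"is_compendia": True},
    headers={"API-KEY": token_id},
).json()["results"]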
Exemplo n.º 24
0
    def test_log2(self):
        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        # Has non-log2 data:
        # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE44421
        # ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE44nnn/GSE44421/miniml/GSE44421_family.xml.tgz
        experiment = Experiment()
        experiment.accession_code = "GSE44421"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1084806'
        sample.title = 'GSM1084806'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1084806-tbl-1.txt"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'GSM1084807'
        sample.title = 'GSM1084807'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1084807-tbl-1.txt"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE44421': ['GSM1084806', 'GSM1084807']}
        ds.aggregate_by = 'EXPERIMENT'
        ds.scale_by = 'MINMAX'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        ds = Dataset.objects.get(id=ds.id)

        self.assertTrue(final_context['success'])
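
GSE44421 is called out above as non-log2 data, so the pipeline is expected to log-transform it before scaling. A hedged sketch of one common heuristic for that check; this is illustrative only and not necessarily the rule the smasher applies:

import numpy as np
import pandas as pd

def ensure_log2(frame: pd.DataFrame, threshold: float = 100.0) -> pd.DataFrame:
    """If the values look like linear-scale intensities, apply a log2 transform.

    Illustrative heuristic only: treat any frame whose maximum exceeds the
    threshold as unlogged, and shift by 1 to avoid log2(0).
    """
    if frame.max().max() > threshold:
        return np.log2(frame + 1)
    return frame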
Exemplo n.º 25
0
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """

    result = ComputationalResult()
    result.commands.append('SCAN.UPC::SCANfast')
    result.is_ccdl = True
    result.is_public = True

    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']
    try:
        processor_key = "AFFYMETRIX_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Create a ComputedFile for the sample
    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["output_file_path"]
    computed_file.filename = os.path.split(job_context["output_file_path"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()
    job_context['computed_files'].append(computed_file)

    for sample in job_context['samples']:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

    logger.debug("Created %s", result, processor_job=job_context["job_id"])
    job_context["success"] = True

    return job_context
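
The block above leans on `ComputedFile.calculate_sha1()` and `calculate_size()` to fill in file metadata before saving. Those model methods are not shown in this excerpt; a minimal standalone sketch of what they presumably compute (a chunked SHA-1 digest and the byte size):

import hashlib
import os

def calculate_sha1(path: str, chunk_size: int = 1024 * 1024) -> str:
    """Hash the file in chunks so large outputs don't have to fit in memory."""
    digest = hashlib.sha1()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

def calculate_size(path: str) -> int:
    """Return the file size in bytes, as stored on ComputedFile.size_in_bytes."""
    return os.path.getsize(path)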
Exemplo n.º 26
0
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """

    result = ComputationalResult()
    result.commands.append(job_context['formatted_command'])
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']
    try:
        processor_key = "ILLUMINA_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Split the result into smashable subfiles
    big_tsv = job_context["output_file_path"]
    data = pd.read_csv(big_tsv, sep='\t', header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        filename = frame.columns.values[0].replace('&', '').replace(
            "*", '').replace(";", '') + '.tsv'
        frame_path = job_context["work_dir"] + filename
        frame.to_csv(frame_path, sep='\t', encoding='utf-8')

        # This needs to be the same as the ones in the job context!
        try:
            sample = job_context['samples'].get(title=frame.columns.values[0])
        except Sample.DoesNotExist:
            logger.error(
                "Could not find sample for column while splitting Illumina file.",
                title=frame.columns.values[0],
                processor_job=job_context["job_id"],
                file_path=big_tsv,
            )
            continue

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = frame_path.split('/')[-1]
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context['computed_files'].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=result)

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

        individual_files.append(computed_file)

    logger.debug("Created %s", result)
    job_context["success"] = True
    job_context["individual_files"] = individual_files
    job_context["result"] = result

    return job_context
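
The splitting step above turns one wide Illumina expression matrix into a single-column TSV per sample. A self-contained sketch of the same column-split idea, written as a plain column loop instead of `np.split` and using hypothetical sample titles:

import pandas as pd

# Hypothetical wide matrix: genes as rows, one column per sample title.
data = pd.DataFrame(
    {"Sample A": [1.0, 2.0], "Sample*B;": [3.0, 4.0]},
    index=["GENE_1", "GENE_2"],
)

# Write one single-column TSV per sample, stripping characters that would be
# awkward in filenames ('&', '*', ';'), as the processor above does.
for title in data.columns:
    frame = data[[title]]
    filename = title.replace('&', '').replace('*', '').replace(';', '') + '.tsv'
    frame.to_csv('/tmp/' + filename, sep='\t', encoding='utf-8')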
Exemplo n.º 27
0
def _create_result_objects(job_context: Dict) -> Dict:
    if not job_context["create_results"]:
        return job_context

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "QN_REFERENCE"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["target_file"]
    computed_file.filename = job_context["target_file"].split("/")[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.is_smashable = False
    computed_file.is_qn_target = True
    computed_file.result = result
    computed_file.save()

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context["samples"]["ALL"][0].organism_id,
        "is_qn": True,
        "platform_accession_code": job_context["samples"]["ALL"][0].platform_accession_code,
        "samples": [sample.accession_code for sample in job_context["samples"]["ALL"]],
        "geneset": str(job_context["geneset"]),
        "num_valid_inputs": job_context["num_valid_inputs"],
    }
    annotation.save()

    job_context["result"] = result
    job_context["computed_files"] = [computed_file]
    job_context["annotation"] = annotation
    job_context["success"] = True
    return job_context
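
The `ComputationalResultAnnotation` written above is what marks this result as a QN target for the organism. A hedged sketch of how such a target could later be looked up through that JSON annotation; the import path and the `created_at` ordering field are assumptions based on the conventions in this codebase, not code from this excerpt:

from data_refinery_common.models import ComputationalResultAnnotation  # assumed import path

def latest_qn_annotation(organism_id: int):
    """Return the newest annotation flagged as a QN target for an organism."""
    return (
        ComputationalResultAnnotation.objects
        .filter(data__is_qn=True, data__organism_id=organism_id)
        .order_by("-created_at")  # assumed timestamp field
        .first()
    )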
Exemplo n.º 28
0
    def test_no_smash_all_diff_species(self):
        """ Smashing together with 'ALL' with different species is a really weird behavior. 
        This test isn't really testing a normal case, just make sure that it's marking the
        unsmashable files.
        """

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51081"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1237810'
        sample.title = 'GSM1237810'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1237810_T09-1084.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51084"
        experiment.save()

        mus_mus = Organism.get_object_for_name("MUS_MUSCULUS")

        sample = Sample()
        sample.accession_code = 'GSM1238108'
        sample.title = 'GSM1238108'
        # Use the Mus musculus organism fetched above so the two samples really are different species.
        sample.organism = mus_mus
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1238108-tbl-1.txt"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810'], 'GSE51084': ['GSM1238108']}
        ds.aggregate_by = 'ALL'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(job.pk, upload=False)

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)
        print(ds.failure_reason)
        print(final_context['dataset'].failure_reason)

        self.assertEqual(final_context['unsmashable_files'], ['GSM1238108'])
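
With `aggregate_by = 'ALL'`, every requested sample ends up in a single group (the same "ALL" key seen in `job_context["samples"]["ALL"]` in the QN example above). A tiny sketch of that grouping; the real smasher also resolves Sample objects and their smashable computed files, which this omits:

# Collapse the per-experiment dataset mapping into one 'ALL' group of accessions.
data = {'GSE51081': ['GSM1237810'], 'GSE51084': ['GSM1238108']}

groups = {'ALL': [acc for accessions in data.values() for acc in accessions]}
assert groups == {'ALL': ['GSM1237810', 'GSM1238108']}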
Exemplo n.º 29
0
def _populate_index_object(job_context: Dict) -> Dict:
    """ """

    result = ComputationalResult()
    result.commands.append(job_context["salmon_formatted_command"])
    try:
        processor_key = "TX_INDEX"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.is_ccdl = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    result.save()
    job_context['pipeline'].steps.append(result.id)

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["computed_archive"]
    computed_file.filename = os.path.split(job_context["computed_archive"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = False
    computed_file.is_qc = False
    computed_file.save()

    organism_object = Organism.get_object_for_name(job_context['organism_name'])
    index_object = OrganismIndex()
    index_object.organism = organism_object
    index_object.source_version = job_context["assembly_version"]
    index_object.assembly_name = job_context["assembly_name"]
    index_object.salmon_version = job_context["salmon_version"]
    index_object.index_type = "TRANSCRIPTOME_" + job_context['length'].upper()
    # This is where the index will be extracted to.
    index_object.absolute_directory_path = LOCAL_ROOT_DIR + "/TRANSCRIPTOME_INDEX/" \
                                           + organism_object.name + "/" + job_context['length']
    index_object.result = result

    if S3_TRANSCRIPTOME_INDEX_BUCKET_NAME:
        logger.info("Uploading %s %s to s3", job_context['organism_name'], job_context['length'], processor_job=job_context["job_id"])
        timestamp = str(timezone.now().timestamp()).split('.')[0]
        s3_key = organism_object.name + '_' + index_object.index_type + "_" + timestamp + '.tar.gz'
        sync_result = computed_file.sync_to_s3(S3_TRANSCRIPTOME_INDEX_BUCKET_NAME, s3_key)
        if sync_result:
            computed_file.delete_local_file()
    else:
        logger.warn("S3_TRANSCRIPTOME_INDEX_BUCKET_NAME not configured, therefore %s %s will not be uploaded.",
                    job_context['organism_name'],
                    job_context['length'],
                    processor_job=job_context["job_id"])

    index_object.save()

    # We uploaded the file ourselves since we wanted it to go to a
    # different bucket than end_job would put it in, therefore empty
    # this list so end_job doesn't try to upload it again.
    job_context['computed_files'] = []

    job_context['result'] = result
    job_context['computed_file'] = computed_file
    job_context['index'] = index_object

    # If there's not a long and a short index for this organism yet,
    # don't delete the input.
    # XXX: This will break once we introduce additional versions of these.
    short_indices = OrganismIndex.objects.filter(organism=organism_object,
                                                 index_type="TRANSCRIPTOME_SHORT",
                                                 source_version=job_context["assembly_version"])
    long_indices = OrganismIndex.objects.filter(organism=organism_object,
                                                index_type="TRANSCRIPTOME_LONG",
                                                source_version=job_context["assembly_version"])
    if short_indices.count() < 1 or long_indices.count() < 1:
        # utils.end_job deletes these, so remove them so it doesn't.
        job_context["original_files"] = []

    return job_context
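
Downstream Salmon jobs need to find the index this function registers. A hedged sketch of such a lookup; the field names come from the `OrganismIndex` created above, while the import path and `created_at` ordering column are assumptions:

from data_refinery_common.models import OrganismIndex  # assumed import path

def latest_short_index(organism):
    """Fetch the most recent short transcriptome index for an organism."""
    return (
        OrganismIndex.objects
        .filter(organism=organism, index_type="TRANSCRIPTOME_SHORT")
        .order_by("-created_at")  # assumed timestamp field
        .first()
    )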
Exemplo n.º 30
0
    def test_no_smash_dupe(self):
        """ """

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51081"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1237810'
        sample.title = 'GSM1237810'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1237810_T09-1084.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'GSM1237811'
        sample.title = 'GSM1237811'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810', 'GSM1237811']}
        ds.aggregate_by = 'ALL'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(job.pk, upload=False)

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)

        self.assertTrue(ds.success)
        for column in final_context['original_merged'].columns:
            self.assertTrue('_x' not in column)
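
The final loop guards against pandas' default merge suffixes: if the same column were merged in twice, `merge` would emit `_x`/`_y` duplicates instead of a single de-duplicated column. A tiny sketch of that failure mode with hypothetical frames:

import pandas as pd

a = pd.DataFrame({"GENE": ["G1", "G2"], "GSM1237810": [1.0, 2.0]})
b = pd.DataFrame({"GENE": ["G1", "G2"], "GSM1237810": [1.0, 2.0]})

# Merging two frames that share a non-key column name produces suffixed
# duplicates, which is exactly what the assertion above must never see.
merged = a.merge(b, on="GENE")
assert list(merged.columns) == ["GENE", "GSM1237810_x", "GSM1237810_y"]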