Example #1
    def test_value_passing(self):
        """The keys added to job_context and returned by processors will be
        passed through to other processors.
        """
        batch, _ = init_objects()
        processor_job = ProcessorJob.create_job_and_relationships(
            batches=[batch])

        mock_processor = MagicMock()
        mock_context = {
            "something_to_pass_along": True,
            "job": processor_job,
            "batches": [batch]
        }
        mock_processor.return_value = mock_context

        def processor_function(job_context):
            self.assertTrue(job_context["something_to_pass_along"])
            return job_context

        test_processor = MagicMock(side_effect=processor_function)

        utils.run_pipeline(
            {"job_id": processor_job.id},
            [utils.start_job, mock_processor, test_processor, utils.end_job])

        processor_job.refresh_from_db()
        self.assertTrue(processor_job.success)
        self.assertIsNotNone(processor_job.end_time)

        batch.refresh_from_db()
        self.assertEqual(batch.status, BatchStatuses.PROCESSED.value)
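This test pins down the core contract of utils.run_pipeline: each processor receives the job_context dict returned by the previous one, so any key a processor adds flows downstream. Below is a minimal sketch of that contract, assuming a Django ProcessorJob model; it is an illustration inferred from Examples #1, #13, and #27, not the actual refinebio implementation.

from django.utils import timezone

from data_refinery_common.models import ProcessorJob  # assumed import path

def run_pipeline(start_value: dict, pipeline: list) -> dict:
    # Resolve the job up front; Example #27 shows that no processor is
    # called when the job_id does not exist.
    try:
        job = ProcessorJob.objects.get(id=start_value["job_id"])
    except ProcessorJob.DoesNotExist:
        return start_value

    job_context = {**start_value, "job": job}
    for processor in pipeline:
        job_context = processor(job_context)
        if job_context.get("success") is False:
            # Example #13: the pipeline halts and the job is marked as
            # failed with an end_time, even without utils.end_job.
            job.success = False
            job.end_time = timezone.now()
            job.save()
            break
    return job_context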
Example #2
def affy_to_pcl(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.ARRAY_EXPRESS.value)
    utils.run_pipeline({
        "job_id": job_id,
        "pipeline": pipeline
    }, [
        utils.start_job, _prepare_files, _determine_brainarray_package,
        _run_scan_upc, _create_result_objects, utils.end_job
    ])
Example #3
def agilent_twocolor_to_pcl(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.AGILENT_TWOCOLOR.value)
    utils.run_pipeline({
        "job_id": job_id,
        "pipeline": pipeline
    }, [
        utils.start_job, _prepare_files, _run_scan_twocolor,
        _create_result_objects, utils.end_job
    ])
Example #4
def salmon(job_id: int) -> None:
    """Main processor function for the Salmon Processor.

    Runs salmon quant command line tool, specifying either a long or
    short read length.
    """
    utils.run_pipeline({"job_id": job_id}, [
        utils.start_job, _set_job_prefix, _prepare_files,
        _determine_index_length, _download_index, _run_salmon, _zip_and_upload,
        utils.cleanup_raw_files, utils.end_job
    ])
Example #5
def salmon(job_id: int) -> None:
    """Main processor function for the Salmon Processor.

    Runs salmon quant command line tool, specifying either a long or
    short read length. Also runs Salmontools and Tximport.
    """
    pipeline = Pipeline(name=PipelineEnum.SALMON.value)
    final_context = utils.run_pipeline(
        {
            "job_id": job_id,
            "pipeline": pipeline
        },
        [
            utils.start_job,
            _set_job_prefix,
            _prepare_files,
            _determine_index_length,
            _find_or_download_index,
            _run_salmon,
            get_tximport_inputs,
            tximport,
            _run_salmontools,
            utils.end_job,
        ],
    )
    return final_context
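The long-versus-short distinction in the docstring is decided by _determine_index_length before the index is fetched. A hedged sketch of the idea: compute the mean read length of the input and pick an index size from it. The 75 bp cutoff here is an assumed example value, not necessarily the one refinebio uses.

def choose_index_length(read_lengths: list) -> str:
    # Short reads quantify better against an index built with a smaller
    # k-mer size; long reads use the default. The cutoff is illustrative.
    mean_length = sum(read_lengths) / len(read_lengths)
    return "long" if mean_length > 75 else "short"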
Example #6
def create_compendia(job_id: int) -> None:
    pipeline = Pipeline(name=PipelineEnum.CREATE_COMPENDIA.value)
    job_context = utils.run_pipeline({
        "job_id": job_id,
        "pipeline": pipeline
    }, COMPENDIA_PIPELINE)
    return job_context
Example #7
def build_transcriptome_index(job_id: int, length="long") -> None:
    """The main function for the Transcriptome Index Processor.

    The steps in this process are as follows:
      * First, files are retrieved from Temporary Storage.
      * Next, they are prepared by removing pseudogenes from the gtf file.
      * Next, the tool RSEM's prepare-reference is run.
      * Finally, the salmon index command is run.
    The output of salmon index is a directory, which is pushed in full
    to Permanent Storage.
    """
    pipeline = Pipeline(name=PipelineEnum.TX_INDEX.value)
    return utils.run_pipeline(
        {
            "job_id": job_id,
            "length": length,
            "pipeline": pipeline
        },
        [
            utils.start_job,
            _compute_paths,
            _prepare_files,
            _extract_assembly_information,
            _process_gtf,
            _create_index,
            _zip_index,
            _populate_index_object,
            utils.end_job,
        ],
    )
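The pseudogene-removal step named in the docstring amounts to dropping GTF records flagged with a pseudogene biotype. A minimal sketch of that idea follows; the real _process_gtf lives in refinebio and does more than this.

def filter_pseudogenes(gtf_in_path: str, gtf_out_path: str) -> None:
    # Column 9 of a GTF line carries attributes such as gene_biotype;
    # a crude substring test is enough to illustrate the step.
    with open(gtf_in_path) as gtf_in, open(gtf_out_path, "w") as gtf_out:
        for line in gtf_in:
            if line.startswith("#") or "pseudogene" not in line:
                gtf_out.write(line)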
Example #8
def run_janitor(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.JANITOR.value)
    job_context = utils.run_pipeline({
        "job_id": job_id,
        "pipeline": pipeline
    }, [utils.start_job, _find_and_remove_expired_jobs, utils.end_job])
    return job_context
Example #9
def no_op_processor(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.NO_OP.value)
    return utils.run_pipeline({"job_id": job_id, "pipeline": pipeline},
                              [utils.start_job,
                               _prepare_files,
                               _convert_genes,
                               _create_result,
                               utils.end_job])
Example #10
def create_compendia(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.COMPENDIA.value)
    job_context = utils.run_pipeline({"job_id": job_id, "pipeline": pipeline},
                       [utils.start_job,
                        _prepare_input,
                        _perform_imputation,
                        _create_result_objects,
                        utils.end_job])
    return job_context
Example #11
def illumina_to_pcl(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.ILLUMINA.value)
    return utils.run_pipeline({
        "job_id": job_id,
        "pipeline": pipeline
    }, [
        utils.start_job, _prepare_files, _detect_columns, _detect_platform,
        _run_illumina, _create_result_objects, utils.end_job
    ])
Example #12
def create_qn_reference(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.QN_REFERENCE.value)
    job_context = utils.run_pipeline({"job_id": job_id, "pipeline": pipeline},
                       [utils.start_job,
                        _prepare_input,
                        _quantile_normalize,
                        _verify_result,
                        _create_result_objects,
                        utils.end_job])
    return job_context
Example #13
    def test_processor_failure(self):
        processor_job = ProcessorJob()
        processor_job.save()
        job_context = {
            "job_id": processor_job.id,
            "job": processor_job,
            "batches": []
        }

        mock_processor = MagicMock()
        mock_processor.__name__ = "Fake processor."
        return_context = copy.copy(job_context)
        return_context["success"] = False
        mock_processor.return_value = return_context

        utils.run_pipeline(job_context, [mock_processor])
        self.assertEqual(mock_processor.call_count, 1)
        processor_job.refresh_from_db()
        self.assertFalse(processor_job.success)
        self.assertIsNotNone(processor_job.end_time)
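The mock above simulates the convention a real processor uses to signal failure: return the job_context with success set to False, at which point run_pipeline stops and marks the job failed. A hypothetical processor following that convention (the input_file_path key and failure_reason field are assumptions for illustration):

def _example_step(job_context: dict) -> dict:
    input_file = job_context.get("input_file_path")  # hypothetical key
    if input_file is None:
        # Returning success=False makes run_pipeline stop early and mark
        # the ProcessorJob as failed, as the assertions above verify.
        job_context["job"].failure_reason = "missing input file"  # assumed field
        job_context["success"] = False
        return job_context
    # ... real processing would happen here ...
    job_context["success"] = True
    return job_context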
Example #14
def build_transcriptome_index(job_id: int) -> None:
    """The main function for the Transcriptome Index Processor.

    The steps in this process are as follows:
      * First, files are retrieved from Temporary Storage.
      * Next, they are prepared by removing pseudogenes from the gtf file.
      * Next, the tool RSEM's prepare-reference is run.
      * Finally, the salmon index command is run.
    The output of salmon index is a directory, which is pushed in full
    to Permanent Storage.
    """
    utils.run_pipeline({"job_id": job_id},
                       [utils.start_job,
                        _set_job_prefix,
                        _prepare_files,
                        _process_gtf,
                        _create_index,
                        _zip_index,
                        utils.upload_processed_files,
                        utils.cleanup_raw_files,
                        utils.end_job])
Example #15
def smash(job_id: int, upload=True) -> None:
    """ Main Smasher interface """

    pipeline = Pipeline(name=utils.PipelineEnum.SMASHER.value)
    return utils.run_pipeline(
        {
            "job_id": job_id,
            "upload": upload,
            "pipeline": pipeline
        }, [
            utils.start_job, _prepare_files, _smash, _upload, _notify,
            _update_result_objects, utils.end_job
        ])
Example #16
def create_qn_reference(job_id: int, create_results=True) -> None:
    pipeline = Pipeline(name=PipelineEnum.QN_REFERENCE.value)
    job_context = utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline, "create_results": create_results},
        [
            utils.start_job,
            _prepare_input,
            _build_qn_target,
            _create_result_objects,
            _update_caches,
            utils.end_job,
        ],
    )
    return job_context
Example #17
def tximport(job_id: int) -> None:
    """Main processor function for the Tximport Processor.

    Runs tximport command line tool on an experiment.
    """
    pipeline = Pipeline(name=utils.PipelineEnum.TXIMPORT.value)
    final_context = utils.run_pipeline({
        "job_id": job_id,
        "pipeline": pipeline
    }, [
        utils.start_job, _set_job_prefix, _prepare_files,
        salmon._find_or_download_index, salmon.tximport, utils.end_job
    ])
    return final_context
Example #18
def _perform_imputation(job_context: Dict) -> Dict:
    """
    Take the inputs and perform the primary imputation.

    Via https://github.com/AlexsLemonade/refinebio/issues/508#issuecomment-435879283:
     - Calculate the sum of the lengthScaledTPM values for each row (gene) of
       the rnaseq_expression_matrix (rnaseq_row_sums).
     - Calculate the 10th percentile of rnaseq_row_sums
     - Drop all rows in rnaseq_expression_matrix with a row sum < 10th percentile of
       rnaseq_row_sums; this is now filtered_rnaseq_matrix
     - log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix
     - Set all zero values in log2_rnaseq_matrix to NA, but make sure to keep track of
       where these zeroes are
     - Perform a full outer join of microarray_expression_matrix and
       log2_rnaseq_matrix; combined_matrix
     - Remove genes (rows) with >30% missing values in combined_matrix
     - Remove samples (columns) with >50% missing values in combined_matrix
     - "Reset" zero values that were set to NA in RNA-seq samples (i.e., make these zero
       again) in combined_matrix
     - Transpose combined_matrix; transposed_matrix
     - Perform imputation of missing values with IterativeSVD (rank=10) on
       the transposed_matrix; imputed_matrix
        -- run with the specified SVD algorithm, or skipped if none is given
     - Untranspose imputed_matrix (genes are now rows, samples are now columns)
    """
    imputation_start = log_state("start perform imputation",
                                 job_context["job"].id)
    job_context["time_start"] = timezone.now()

    job_context = utils.run_pipeline(
        job_context,
        [
            _filter_rnaseq_matrix,
            _log2_transform_matrix,
            _cached_remove_zeroes,
            _full_outer_join_gene_matrices,
            _filter_rows_and_columns,
            _reset_zero_values,
            _run_iterativesvd,
        ],
    )

    job_context["time_end"] = timezone.now()
    job_context["formatted_command"] = ["create_compendia.py"]
    log_state("end perform imputation", job_context["job"].id,
              imputation_start)

    return job_context
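The first few docstring bullets (row sums, the 10th percentile cutoff, then the log2(x + 1) transform) translate almost directly into pandas. A sketch under the assumption that the matrix has genes as rows and samples as columns, as the description states; the real _filter_rnaseq_matrix and _log2_transform_matrix steps may differ in detail.

import numpy as np
import pandas as pd

def filter_and_log2(rnaseq_expression_matrix: pd.DataFrame) -> pd.DataFrame:
    # Drop rows whose sum falls below the 10th percentile of row sums.
    rnaseq_row_sums = rnaseq_expression_matrix.sum(axis=1)
    tenth_percentile = rnaseq_row_sums.quantile(0.1)
    filtered_rnaseq_matrix = rnaseq_expression_matrix.loc[
        rnaseq_row_sums >= tenth_percentile
    ]
    # log2(x + 1); zeroes map to 0 and are later set to NA (see above).
    return np.log2(filtered_rnaseq_matrix + 1)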
Example #19
def create_quantpendia(job_id: int) -> None:
    pipeline = Pipeline(name=PipelineEnum.CREATE_QUANTPENDIA.value)
    job_context = utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline},
        [
            utils.start_job,
            _make_dirs,
            _download_files,
            _add_metadata,
            _make_archive,
            _create_result_objects,
            _remove_job_dir,
            utils.end_job,
        ],
    )
    return job_context
Example #20
def create_compendia(job_id: int) -> None:
    pipeline = Pipeline(name=PipelineEnum.CREATE_COMPENDIA.value)
    job_context = utils.run_pipeline(
        {
            "job_id": job_id,
            "pipeline": pipeline
        },
        [
            utils.start_job,
            _prepare_input,
            _prepare_frames,
            _perform_imputation,
            smashing_utils.write_non_data_files,
            _create_result_objects,
            utils.end_job,
        ],
    )
    return job_context
Example #21
    def test_detect_columns(self):
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        job = prepare_illumina_job({**GSE22427, "organism": organism})

        pipeline = Pipeline(name=PipelineEnum.ILLUMINA.value)

        final_context = utils.run_pipeline(
            {
                "job_id": job.id,
                "pipeline": pipeline
            },
            [
                utils.start_job,
                illumina._prepare_files,
                illumina._detect_encoding,
                illumina._sanitize_input_file,
                illumina._convert_sanitized_to_tsv,
                illumina._detect_columns,
            ],
        )

        self.assertNotEqual(final_context.get("success"), False)

        # For this experiment, the probe ID is the first column
        self.assertEqual(final_context.get("probeId"), GSE22427_HEADER[0])

        expected_column_ids = ",".join(
            map(
                lambda t: str(t[0]),
                filter(
                    # For this header file, the samples all have the prefix LV-
                    lambda t: t[1].startswith("LV-"),
                    # We use start=1 here because the column IDs are formatted
                    # for R code so they treat the header as a 1-indexed list
                    enumerate(GSE22427_HEADER, start=1),
                ),
            ))
        self.assertEqual(final_context.get("columnIds"), expected_column_ids)
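To make the 1-indexing concrete: with a hypothetical three-column header ["ID_REF", "LV-1", "LV-2"], the two sample columns sit at R-style positions 2 and 3, so the joined string is "2,3".

header = ["ID_REF", "LV-1", "LV-2"]  # hypothetical, not GSE22427's real header
column_ids = ",".join(
    str(i) for i, name in enumerate(header, start=1) if name.startswith("LV-")
)
assert column_ids == "2,3"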
Example #22
def smash(job_id: int, upload=True) -> None:
    """Main Smasher interface"""
    pipeline = Pipeline(name=PipelineEnum.SMASHER.value)
    job_context = utils.run_pipeline(
        {
            "job_id": job_id,
            "upload": upload,
            "pipeline": pipeline
        },
        [
            utils.start_job,
            smashing_utils.prepare_files,
            _smash_all,
            _upload,
            _update_result_objects,
            utils.end_job,
        ],
    )
    # Ensure that `_notify` is always called, so that users get emails
    # whether processing fails or succeeds.
    job_context = _notify(job_context)
    return job_context
Example #23
def salmon(job_id: int) -> None:
    """Main processor function for the Salmon Processor.

    Runs salmon quant command line tool, specifying either a long or
    short read length. Also runs FastQC, MultiQC, and Salmontools.
    """
    pipeline = Pipeline(name=utils.PipelineEnum.SALMON.value)
    final_context = utils.run_pipeline({"job_id": job_id, "pipeline": pipeline},
                       [utils.start_job,
                        _set_job_prefix,
                        _prepare_files,
                        _extract_sra,

                        _determine_index_length,
                        _find_or_download_index,

                        _run_fastqc,
                        _run_salmon,
                        _run_salmontools,
                        _run_multiqc,
                        utils.end_job])
    return final_context
Example #24
    def test_imputation(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        danio_rerio = Organism(name="DANIO_RERIO",
                               taxonomy_id=1,
                               qn_target=result)
        danio_rerio.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data["organism_id"] = danio_rerio.id
        cra.data["is_qn"] = True
        cra.result = result
        cra.save()

        result = ComputationalResult()
        result.save()

        micros = []
        for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):

            if "microarray.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "MICROARRAY",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
                },
                experiment,
            )

            micros.append(file)

        experiment = Experiment()
        experiment.accession_code = "GSE5678"
        experiment.save()

        result = ComputationalResult()
        result.save()
        rnas = []
        for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):

            if "rnaseq.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "RNA-SEQ",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/RNASEQ/",
                },
                experiment,
            )

            rnas.append(file)

        # Missing sample that will be filtered
        sample = create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": "GSM1487222",
                "title": "this sample will be filtered",
                "technology": "RNA-SEQ",
                "filename": None,
            },
            experiment,
        )
        rnas.append(sample.accession_code)

        dset = Dataset()
        dset.data = {"GSE1234": micros, "GSE5678": rnas}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = True
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        imputation_index = create_compendia.COMPENDIA_PIPELINE.index(
            create_compendia._perform_imputation)

        pipeline = Pipeline(name=PipelineEnum.CREATE_COMPENDIA.value)
        job_context = utils.run_pipeline(
            {
                "job_id": job.id,
                "pipeline": pipeline
            },
            create_compendia.COMPENDIA_PIPELINE[:imputation_index],
        )

        # First, run the imputation step without removing anything to get a baseline
        expected_context = utils.run_pipeline(
            job_context.copy(),
            [create_compendia.COMPENDIA_PIPELINE[imputation_index]])

        # Now pick some rows to remove according to the instructions from
        # https://github.com/AlexsLemonade/refinebio/pull/2879#issuecomment-895143336

        random.seed(42)

        # Select some rows randomly and mask a little bit less than 30% of the values
        rare_rows = random.sample(list(job_context["microarray_matrix"].index),
                                  k=25)
        rare_genes = {}
        for row in rare_rows:
            cols = random.sample(
                list(job_context["microarray_matrix"].columns),
                # There are around 840 samples, and we want to pick a little bit
                # less than 30% of them
                k=int(0.28 * 840),
            )
            rare_genes[row] = cols
            for col in cols:
                job_context["microarray_matrix"].loc[row, col] = np.nan

        # Now randomly select some entries from the other rows to mask
        individual_indices = random.sample(
            list(
                itertools.product(
                    set(job_context["microarray_matrix"].index) -
                    set(rare_rows),
                    job_context["microarray_matrix"].columns,
                )),
            k=1000,
        )
        for row, col in individual_indices:
            job_context["microarray_matrix"].loc[row, col] = np.nan

        final_context = utils.run_pipeline(
            job_context,
            [create_compendia.COMPENDIA_PIPELINE[imputation_index]])
        self.assertDidNotFail(job)

        index = set(final_context["merged_no_qn"].index) & set(
            expected_context["merged_no_qn"].index)
        columns = set(final_context["merged_no_qn"].columns) & set(
            expected_context["merged_no_qn"].columns)

        # Calculate the Root-Mean-Square Error (RMSE) of the imputed values.
        # See https://en.wikipedia.org/wiki/Root-mean-square_deviation
        # for a description of the formula.

        N = 0
        squared_error = 0
        affected_entries = {
            *individual_indices,
            *((row, col) for row, cols in rare_genes.items() for col in cols),
        }
        for row, col in affected_entries:
            if row in index and col in columns:
                actual = final_context["merged_no_qn"].loc[row, col]
                expected = expected_context["merged_no_qn"].loc[row, col]

                N += 1
                squared_error += (actual - expected)**2

        rmse = math.sqrt(squared_error / N)

        # The results of a previous run plus a little bit of leeway
        self.assertLess(abs(rmse - 0.2868600293662542), 0.05)
Example #25
def affy_to_pcl(job_id: int) -> None:
    utils.run_pipeline({"job_id": job_id}, [
        utils.start_job, _prepare_files, _determine_brainarray_package,
        _run_scan_upc, utils.upload_processed_files, utils.cleanup_raw_files,
        utils.end_job
    ])
Example #26
def no_op_processor(job_id: int) -> None:
    utils.run_pipeline({"job_id": job_id},
                       [utils.start_job, _no_op_processor_fn, utils.end_job])
Example #27
    def test_no_job(self):
        mock_processor = MagicMock()
        utils.run_pipeline({"job_id": 100}, [mock_processor])
        mock_processor.assert_not_called()