示例#1
0
    def test_from_existing_bam(self):
        """Build an AlignedSample on an existing BAM and check its accessors.

        Covers the attributes set by the constructor, the rejection of a
        second sample with the same name, the BAM handles, the reported
        file names, the read counts and the pruned state of the QC jobs.
        """
        bam_path = get_sample_data(Path("mbf_align/ex2.bam"))
        invariant = ppg.FileInvariant(bam_path)
        dummy_genome = object()
        sample = mbf_align.AlignedSample(
            "test_lane", invariant, dummy_genome, False, "AA123"
        )

        # Constructor arguments must be retrievable unchanged.
        assert sample.name == "test_lane"
        assert sample.load()[0] is invariant
        assert isinstance(sample.load()[1], ppg.FileInvariant)
        assert sample.genome is dummy_genome
        assert not sample.is_paired
        assert sample.vid == "AA123"

        # Re-using the name "test_lane" has to be refused.
        with pytest.raises(ValueError):
            mbf_align.AlignedSample(
                "test_lane", invariant, dummy_genome, False, "AA123"
            )
        paired_sample = mbf_align.AlignedSample(
            "test_lane2", invariant, dummy_genome, True, "AA123"
        )
        assert paired_sample.is_paired

        all_reads_handle = sample.get_bam()
        assert isinstance(all_reads_handle, pysam.Samfile)
        unique_reads_handle = sample.get_unique_aligned_bam()
        assert isinstance(unique_reads_handle, pysam.Samfile)
        assert sample.get_bam_names()[0] == bam_path
        assert sample.get_bam_names()[1] == bam_path + ".bai"

        assert sample.mapped_reads() == 8
        assert sample.unmapped_reads() == 0
        for qc_job in get_qc_jobs():
            assert qc_job._pruned
示例#2
0
    def test_to_fastq(self):
        bam_path = get_sample_data(Path("mbf_align/ex2.bam"))
        bam_job = ppg.FileInvariant(bam_path)
        genome = object()
        lane = mbf_align.AlignedSample("test_lane", bam_job, genome, False,
                                       "AA123")
        fastq_path = "out.fastq"
        lane.to_fastq(fastq_path)
        ppg.run_pipegraph()
        assert Path(fastq_path).exists()
        assert (Path(fastq_path).read_text() == """@read_28833_29006_6945
AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG
+
<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<
@read_28701_28881_323b
TGCAAGGCCGCATCGGCCAAGGCCAAGATATAGGT
+
<<<<7<<<<<<<<<<<<;6<<<:;7<<<<;<<<<<
@read_28701_28881_323c
TGCAAGGCCGCATCGGCCAAGGCCAAGATATAGGT
+
<<<<7<<<<<<<<<<<<;6<<<:;7<<<<;<<<<<
@read_28701_28881_324a
TGCAAGGCCGCATCGGCCAAGGCCAAGATATAGGT
+
<<<<7<<<<<<<<<<<<;6<<<:;7<<<<;<<<<<
@read_28701_28881_324b
TGCAAGGCCGCATCGGCCAAGGCCAAGATATAGGT
+
<<<<7<<<<<<<<<<<<;6<<<:;7<<<<;<<<<<
@read_28701_28881_324c
TGCAAGGCCGCATCGGCCAAGGCCAAGATATAGGT
+
<<<<7<<<<<<<<<<<<;6<<<:;7<<<<;<<<<<
@test_clipped1
AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG
+
<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<
@test_clipped1
AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG
+
<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<
""")
        lane2 = mbf_align.AlignedSample("test_lane2",
                                        bam_job,
                                        genome,
                                        is_paired=True,
                                        vid="AA123")
        with pytest.raises(ValueError):
            lane2.to_fastq(
                "nope.fastq")  # no support for paired end data at this point
示例#3
0
def job_reheader_and_rename_chromosomes(input_bam_path, output_bam_path,
                                        replacements):
    """Create the pipegraph job that runs reheader_and_rename_chromosomes.

    Parameters
    ----------
    input_bam_path
        BAM file to read. It is handed on unchanged to
        reheader_and_rename_chromosomes and tracked with a FileInvariant.
    output_bam_path
        Target BAM file (str or Path). Its parent directory is created
        right away, when the job is defined.
    replacements
        Passed through to reheader_and_rename_chromosomes.

    Returns
    -------
    The ppg.MultiFileGeneratingJob declaring ``output_bam_path`` and the
    same path with the suffix ``.bam.bai`` as its outputs. It depends on
    the input file and on the implementation of
    reheader_and_rename_chromosomes.
    """
    output_bam_path = Path(output_bam_path)

    # Bind replacements as a default so the callback keeps the value that
    # was current when the job was defined.
    def do_replace(replacements=replacements):
        reheader_and_rename_chromosomes(input_bam_path, output_bam_path,
                                        replacements)

    output_bam_path.parent.mkdir(exist_ok=True, parents=True)
    output_files = [
        output_bam_path,
        output_bam_path.with_suffix(".bam.bai"),
    ]
    return ppg.MultiFileGeneratingJob(output_files, do_replace).depends_on(
        ppg.FileInvariant(input_bam_path),
        ppg.FunctionInvariant("mbf_bam.reheader_and_rename_chromosomes",
                              reheader_and_rename_chromosomes),
    )
示例#4
0
    def __init__(
        self,
        name,
        genome_fasta_filename,
        cdna_fasta_filename,
        protein_fasta_filename,
        gtf_filename,
        cache_dir,
    ):
        """
        Set up a genome for interactive work.

        It relies on files that a FileBasedGenome has created in a
        previous ppg run: the four input files are used as given and the
        msgpack lookups are expected below ``cache_dir / "lookup"``.
        """
        super().__init__()
        self.name = name
        self.cache_dir = Path(cache_dir)

        self.genome_fasta_filename = genome_fasta_filename
        self.cdna_fasta_filename = cdna_fasta_filename
        self.protein_fasta_filename = protein_fasta_filename
        self.gtf_filename = gtf_filename

        # Logical file name -> actual location on disk.
        lookup_dir = self.cache_dir / "lookup"
        self._filename_lookups = {
            "genome.fasta": self.genome_fasta_filename,
            "cdna.fasta": self.cdna_fasta_filename,
            "protein.fasta": self.protein_fasta_filename,
            "genes.gtf": self.gtf_filename,
            "df_genes.msgpack": lookup_dir / "df_genes.msgpack",
            "df_transcripts.msgpack": lookup_dir / "df_transcripts.msgpack",
        }

        # Only track the GTF as a dependency when a pipegraph is active.
        self.gene_gtf_dependencies = (
            ppg.FileInvariant(self.gtf_filename)
            if ppg.util.inside_ppg()
            else []
        )
示例#5
0
def PseudoNotebookRun(notebook_python_file, target_object, chdir=False):
    """Create a job that executes a notebook-style Python file and caches
    the data frames it hands to ``marburg_biobank.create.write_dfs``.

    Parameters
    ----------
    notebook_python_file
        Path of the Python file to execute (converted with ``str``).
    target_object
        Object passed on to ppg.CachedAttributeLoadingJob together with
        the attribute name ``"data"``.
    chdir
        If true, the file is executed with its own directory as the
        working directory.

    Returns
    -------
    The ppg.CachedAttributeLoadingJob (cache file
    ``notebook_python_file + ".result"``), depending on a FileInvariant
    of the notebook file.
    """
    notebook_python_file = str(notebook_python_file)
    inv = ppg.FileInvariant(notebook_python_file)

    def run():
        import marburg_biobank.create

        notebook_path = Path(notebook_python_file)
        source = notebook_path.read_text()
        collector = {}

        def write_dfs(d):
            # Stand-in for marburg_biobank.create.write_dfs: collect the
            # values instead of writing them.
            for k, v in d.items():
                if isinstance(v, tuple):
                    collector[k] = v[0]  # throw away description
                else:
                    collector[k] = v
            return {}

        def get_dummy_ipython():
            class DummyIpython:
                def run_line_magic(self, *args, **kwargs):
                    pass

            return DummyIpython()

        marburg_biobank.create.write_dfs = write_dfs
        g = globals().copy()
        g["get_ipython"] = get_dummy_ipython
        g['here'] = notebook_path.parent.absolute()
        ppg.util.global_pipegraph = None
        # Remember where we started so the working directory is put back
        # even if the notebook raises or changes directory itself.
        previous_cwd = os.getcwd()
        try:
            if chdir:
                os.chdir(notebook_path.parent)
            # NOTE: exec runs the file with full privileges - only use this
            # on trusted notebook files.
            exec(source, g)
        finally:
            os.chdir(previous_cwd)
        return collector

    return ppg.CachedAttributeLoadingJob(notebook_python_file + ".result",
                                         target_object, "data",
                                         run).depends_on(inv)
示例#6
0
    def test_chromosome_mapping(self):
        """Check an AlignedSample built with a DummyGenome.

        The visible part checks the constructor attributes, the
        rejection of a duplicate sample name and the BAM handle type.
        """
        bam_path = get_sample_data(Path("mbf_align/ex2.bam"))
        bam_job = ppg.FileInvariant(bam_path)
        genome = DummyGenome()
        lane = mbf_align.AlignedSample("test_lane", bam_job, genome, False,
                                       "AA123")
        # Constructor arguments must be retrievable unchanged.
        assert lane.name == "test_lane"
        assert lane.load()[0] is bam_job
        assert isinstance(lane.load()[1], ppg.FileInvariant)
        assert lane.genome is genome
        assert not lane.is_paired
        assert lane.vid == "AA123"

        # A second sample with the name "test_lane" has to be refused.
        with pytest.raises(ValueError):
            mbf_align.AlignedSample("test_lane", bam_job, genome, False,
                                    "AA123")
        lane2 = mbf_align.AlignedSample("test_lane2", bam_job, genome, True,
                                        "AA123")
        assert lane2.is_paired

        b = lane.get_bam()
        assert isinstance(b, pysam.Samfile)
        # NOTE(review): bare expression with no effect - the test body looks
        # truncated here; confirm against the full file.
        b
示例#7
0
    def _parse_alignment_job_input(self, alignment_job):
        """Normalize *alignment_job* and find or create its index job.

        Parameters
        ----------
        alignment_job
            A str / Path (wrapped in a ppg.FileInvariant), a
            ppg.FileInvariant or a ppg.FileGeneratingJob. Its
            ``filenames`` must contain exactly one ``.bam`` entry and at
            most one ``.bai`` entry.

        Returns
        -------
        tuple
            ``(alignment_job, index_job, Path(bam_name), Path(index_fn))``

        Raises
        ------
        ValueError
            If the job has an unsupported type, has no ``.bam`` file
            name, or has more than one ``.bam`` / ``.bai`` file name.
        """
        if isinstance(alignment_job, (str, Path)):
            alignment_job = ppg.FileInvariant(alignment_job)
        if not isinstance(alignment_job,
                          (ppg.FileInvariant, ppg.FileGeneratingJob)):
            raise ValueError(
                "alignment_job must be a ppg.FileGeneratingJob or "
                "FileChecksumInvariant, was %s" % (type(alignment_job), ))

        bam_name = None
        bai_name = None
        for fn in alignment_job.filenames:
            fn = str(fn)
            if fn.endswith(".bam"):
                if bam_name is not None:
                    raise ValueError(
                        "Job passed to AlignedSample had multiple .bam filenames"
                    )
                bam_name = fn
            elif fn.endswith(".bai"):
                if bai_name is not None:
                    raise ValueError(
                        "Job passed to AlignedSample had multiple .bai filenames"
                    )
                bai_name = fn

        if bam_name is None:
            raise ValueError(
                "Job passed to AlignedSample had no .bam filenames")

        if (isinstance(alignment_job, ppg.MultiFileGeneratingJob)
                and bai_name is not None):
            # The alignment job already produces the index itself.
            index_fn = bai_name
            index_job = alignment_job
        elif isinstance(alignment_job, ppg.FileGeneratingJob):
            # Generating job without an index of its own (single-file, or
            # multi-file with no .bai): build it next to the BAM.
            index_fn = bam_name + ".bai"
            index_job = ppg.FileGeneratingJob(index_fn,
                                              self._index(bam_name, index_fn))
            index_job.depends_on(alignment_job)
        elif isinstance(alignment_job, ppg.FileInvariant):
            index_fn = bam_name + ".bai"
            if Path(index_fn).exists():
                index_job = ppg.FileInvariant(index_fn)
            else:
                # Pre-existing BAM without an index: write the index into
                # the pipegraph cache folder instead of next to the BAM.
                cache_dir = Path(
                    ppg.util.global_pipegraph.cache_folder) / "bam_indices"
                # parents=True: the cache folder itself may not exist yet.
                cache_dir.mkdir(exist_ok=True, parents=True)
                index_fn = cache_dir / (self.name + "_" + Path(bam_name).name +
                                        ".bai")
                index_job = ppg.FileGeneratingJob(
                    index_fn, self._index(bam_name, index_fn))
                index_job.depends_on(alignment_job)
        else:
            raise NotImplementedError(
                "Should not happen / covered by earlier if")
        return alignment_job, index_job, Path(bam_name), Path(index_fn)
示例#8
0
 def deps(self, ddf):
     """Return the ppg job to depend on: a FileInvariant of self.tablepath.

     The ``ddf`` argument is not used.
     """
     table_invariant = ppg.FileInvariant(self.tablepath)
     return table_invariant