Пример #1
0
    def test_dataset_split_multi_movie(self):
        """Split a two-movie SubreadSet with the ``dataset split`` command.

        The "subreads-sequel" and "subreads-xml" test datasets are merged
        into one temporary SubreadSet XML.  ``dataset split`` is then run on
        that file with different option sets, and ``len()`` of every dataset
        found in the output directory is compared with the expected sizes.
        """
        ds1 = pbtestdata.get_file("subreads-sequel")
        ds2 = pbtestdata.get_file("subreads-xml")
        tmp_ds = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        with SubreadSet(ds1, ds2) as ds:
            ds.write(tmp_ds)
        base_args = [
            "dataset", "split", "--maxChunks", "4", "--targetSize", "1",
            "--prefix", "tst_multi_ds"
        ]

        def run_and_validate(args, ds_sizes):
            # A fresh output directory per run, so the listing below only
            # reflects what this particular command wrote.
            outdir = tempfile.mkdtemp(suffix="dataset-unittest")
            final_args = base_args + args + ["--outdir", outdir, tmp_ds]
            self._check_cmd(" ".join(final_args))
            chunk_names = sorted(os.listdir(outdir))
            chunks = [openDataSet(op.join(outdir, fn)) for fn in chunk_names]
            assert [len(chunk) for chunk in chunks] == ds_sizes

        run_and_validate(["--zmws"], [52, 22, 42, 21])
        #run_and_validate(["--auto"], [8, 12, 54, 63])
        run_and_validate(["--zmws", "--keepReadGroups"], [8, 12, 54, 63])
Пример #2
0
 def test_dataset_create_set_sample_names(self):
     """Verify that ``dataset create`` applies CLI-supplied sample names.

     The command is run twice with the same ``--well-sample-name`` and
     ``--bio-sample-name`` options: first on the "subreads-bam" test file,
     then on the "barcoded-subreadset" test file (the "existing samples"
     case).  Each output must hold exactly one collection whose well
     sample is named WELLSAMPLE and whose biosample names are BIOSAMPLE.
     """
     name_opts = [
         "--well-sample-name", "WELLSAMPLE",
         "--bio-sample-name", "BIOSAMPLE",
     ]

     # Case 1: plain BAM input
     xml_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     create_args = ["dataset", "create", "--force", xml_out,
                    pbtestdata.get_file("subreads-bam")]
     self._run_cmd_with_output(" ".join(create_args + name_opts), xml_out)
     with SubreadSet(xml_out) as ds:
         assert len(ds.metadata.collections) == 1
         well_sample = ds.metadata.collections[0].wellSample
         assert well_sample.name == "WELLSAMPLE"
         assert well_sample.bioSamples[0].name == "BIOSAMPLE"
         assert len(well_sample.bioSamples) == 1

     # Case 2: now with existing samples
     xml_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     create_args = ["dataset", "create", "--force", xml_out,
                    pbtestdata.get_file("barcoded-subreadset")]
     self._run_cmd_with_output(" ".join(create_args + name_opts), xml_out)
     with SubreadSet(xml_out) as ds:
         assert len(ds.metadata.collections) == 1
         well_sample = ds.metadata.collections[0].wellSample
         assert well_sample.name == "WELLSAMPLE"
         # Compared as a set, so repeated names collapse to one entry.
         assert {bs.name for bs in well_sample.bioSamples} == {"BIOSAMPLE"}
Пример #3
0
 def test_get_dataset_size(self):
     """Check the metrics returned by ``get_dataset_size`` for several inputs.

     The second and third positional arguments are passed through exactly
     as boolean flags; their meaning is defined by ``get_dataset_size``
     and is not visible from this test.
     """
     sequel_xml = pbtestdata.get_file("subreads-sequel")

     # small dataset, both flags on
     size = get_dataset_size(sequel_xml, True, True)
     assert size.numRecords == 20
     assert size.totalLengthMb == 1
     assert size.indexSizeGb == 2
     assert (size.numResources, size.numFilters) == (1, 0)

     # same dataset, both flags off: only the index size differs
     size = get_dataset_size(sequel_xml, False, False)
     assert size.numRecords == 20
     assert size.totalLengthMb == 1
     assert size.indexSizeGb == 1

     # large dataset referenced through the class attribute BIG_DATA
     size = get_dataset_size(self.BIG_DATA, True, True)
     assert size.numRecords == 805580876
     assert size.totalLengthMb == 271330
     assert size.indexSizeGb == 45
     assert (size.numResources, size.numFilters) == (1, 0)

     # inputs referenced through TINY_REF / BIG_REF
     size = get_dataset_size(self.TINY_REF, False, False)
     assert size.numRecords == 1
     assert size.totalLengthMb == 1
     size = get_dataset_size(self.BIG_REF, False, False)
     assert size.numRecords == 86
     assert size.totalLengthMb == 2993

     # the "aligned-ds-2" test dataset reports two resources
     aligned_xml = pbtestdata.get_file("aligned-ds-2")
     size = get_dataset_size(aligned_xml, True, True)
     assert size.numRecords == 21
     assert size.numResources == 2
 def setUpClass(cls):
     """Open the "aligned-xml" AlignmentSet and cache shared fixtures on ``cls``.

     Stores the XML path, the strict AlignmentSet reader (opened with the
     "lambda-fasta" reference), its resource readers, and the interval
     lists built from those readers.
     """
     # NOTE(review): any @classmethod decorator would sit above this line,
     # outside the visible snippet -- confirm in the full file.
     cls.xml_path = pbtestdata.get_file("aligned-xml")
     reader_opts = dict(strict=True,
                        reference=pbtestdata.get_file("lambda-fasta"))
     cls.ds_reader = AlignmentSet(cls.xml_path, **reader_opts)
     cls.bam_readers = cls.ds_reader.resourceReaders()
     build_lists = summarize_coverage.build_interval_lists
     cls.interval_lists = build_lists(cls.bam_readers)
Пример #5
0
 def test_split_zmws_around_read_groups(self):
     """Check ZMW splitting with and without breaking up read groups.

     A SubreadSet built from two test datasets is split by ZMW.  With
     ``breakReadGroups=True`` chunks are balanced by size; with
     ``breakReadGroups=False`` every chunk must carry a single ``qId``.
     A single-input dataset serves as control, where both settings must
     give the same chunk sizes.
     """
     xml_a = pbtestdata.get_file("subreads-xml")
     xml_b = pbtestdata.get_file("subreads-sequel")
     # qId values expected in the chunks when read groups are kept intact
     first_qid = -2081539485
     second_qid = -1197849594

     ds = SubreadSet(xml_a, xml_b)
     assert len(ds) == 137

     # this is still the default behavior
     balanced = list(ds.split(chunks=2, zmws=True, breakReadGroups=True))
     assert len(balanced[0]) == 72
     assert len(balanced[1]) == 65

     # don't break up movies
     per_movie = list(ds.split(chunks=2, zmws=True, breakReadGroups=False))
     assert len(per_movie[0]) == 20
     assert len(per_movie[1]) == 117
     assert np.all(per_movie[0].index.qId == first_qid)
     assert np.all(per_movie[1].index.qId == second_qid)

     finer = list(
         ds.split(chunks=4, targetSize=1, zmws=True, breakReadGroups=False))
     assert [len(chunk) for chunk in finer] == [8, 12, 54, 63]
     expected_qids = (first_qid, first_qid, second_qid, second_qid)
     for chunk, qid in zip(finer, expected_qids):
         assert np.all(chunk.index.qId == qid)

     # control: single-movie dataset
     ds = SubreadSet(xml_a)
     kept = list(ds.split(chunks=4, zmws=True, breakReadGroups=False))
     broken = list(ds.split(chunks=4, zmws=True, breakReadGroups=True))
     assert [len(x) for x in kept] == [len(y) for y in broken]
class TestToolContractHgap(pbcommand.testkit.core.PbTestApp):
    """Test-app configuration for ``pbreports.report.coverage_hgap``."""

    DRIVER_BASE = "python -m pbreports.report.coverage_hgap"
    # Inputs, in order: the "lambda-fasta" and "alignment-summary-gff"
    # test files.
    INPUT_FILES = list(map(pbtestdata.get_file,
                           ("lambda-fasta", "alignment-summary-gff")))
    # Same value for the declared and the resolved setting.
    IS_DISTRIBUTED = True
    RESOLVED_IS_DISTRIBUTED = IS_DISTRIBUTED
 def test_get_index_size_bytes(self):
     """Check ``get_index_size_bytes`` on the .pbi of three test datasets."""
     import pbtestdata
     # (pbtestdata file id, expected value for the first resource's pbi)
     expected_sizes = (
         ("subreads-sequel", 580),
         ("ccs-barcoded", 68),
         ("aligned-xml", 7504),
     )
     for file_id, n_bytes in expected_sizes:
         ds = openDataSet(pbtestdata.get_file(file_id))
         assert get_index_size_bytes(ds.externalResources[0].pbi) == n_bytes
 def test_run_bamsieve_extract_unmapped(self):
     """Run ``bamsieve`` with a blacklist and check the outputs are disjoint.

     The blacklist is produced by ``_make_filtered`` from the "aligned-xml"
     test file; the output BAM must share no reads with it.
     """
     out_bam = "unmapped.subreads.bam"
     blacklist = _make_filtered(pbtestdata.get_file("aligned-xml"))
     subreads_xml = pbtestdata.get_file("subreads-xml")
     cmd = ["bamsieve", "--subreads", "--blacklist", blacklist,
            subreads_xml, out_bam]
     self._check_call(cmd)
     assert_no_reads_in_common(self, blacklist, out_bam)
 def setUpClass(cls):
     """Build the class-wide AlignmentSet reader and its coverage intervals.

     Sets ``xml_path``, ``ds_reader``, ``bam_readers`` and
     ``interval_lists`` on the class.
     """
     # NOTE(review): any @classmethod decorator would sit above this line,
     # outside the visible snippet -- confirm in the full file.
     aligned_xml = pbtestdata.get_file("aligned-xml")
     cls.xml_path = aligned_xml
     lambda_fasta = pbtestdata.get_file("lambda-fasta")
     cls.ds_reader = AlignmentSet(aligned_xml, strict=True,
                                  reference=lambda_fasta)
     readers = cls.ds_reader.resourceReaders()
     cls.bam_readers = readers
     cls.interval_lists = summarize_coverage.build_interval_lists(readers)
class TestScatterCCSReference(pbcommand.testkit.core.PbTestScatterApp):
    """Scatter test-app configuration for ``scatter_ccs_reference``."""

    DRIVER_BASE = "python -m pbcoretools.tasks.scatter_ccs_reference"
    # Inputs, in order: the "rsii-ccs" and "lambdaNEB" test files.
    INPUT_FILES = list(map(pbtestdata.get_file, ("rsii-ccs", "lambdaNEB")))
    # Declared and resolved chunk limits are kept equal.
    MAX_NCHUNKS = 8
    RESOLVED_MAX_NCHUNKS = MAX_NCHUNKS
    # One chunk key per input file.
    CHUNK_KEYS = (
        "$chunk.ccsset_id",
        "$chunk.reference_id",
    )
class TestScatterSubreadReference(pbcommand.testkit.core.PbTestScatterApp):
    """Scatter test-app configuration for ``scatter_subread_reference``."""

    DRIVER_BASE = "python -m pbcoretools.tasks.scatter_subread_reference"
    # Inputs, in order: the "subreads-xml" and "lambdaNEB" test files.
    INPUT_FILES = list(map(pbtestdata.get_file, ("subreads-xml", "lambdaNEB")))
    # Declared and resolved chunk limits are kept equal.
    MAX_NCHUNKS = 3
    RESOLVED_MAX_NCHUNKS = MAX_NCHUNKS
    # One chunk key per input file.
    CHUNK_KEYS = (
        "$chunk.subreadset_id",
        "$chunk.reference_id",
    )
class TestSummarizeCoverage(pbcommand.testkit.PbTestApp):
    """Test-app configuration for the ``summarize_coverage`` report module."""

    # The trailing space is part of the command prefix.
    DRIVER_BASE = "python -m pbreports.report.summarize_coverage.summarize_coverage "
    # Both derived commands are DRIVER_BASE plus a mode flag.
    DRIVER_EMIT = "{} --emit-tool-contract ".format(DRIVER_BASE)
    DRIVER_RESOLVE = "{} --resolved-tool-contract ".format(DRIVER_BASE)
    REQUIRES_PBCORE = True
    # Inputs, in order: the "aligned-xml" and "lambda-fasta" test files.
    INPUT_FILES = list(map(pbtestdata.get_file,
                           ("aligned-xml", "lambda-fasta")))
    # No task options are overridden.
    TASK_OPTIONS = dict()
Пример #13
0
class TestSummarizeConsensus(pbcommand.testkit.PbTestApp):
    """Test-app configuration for the ``summarizeConsensus`` command."""

    DRIVER_BASE = "summarizeConsensus"
    # Both derived commands are DRIVER_BASE plus a mode flag.
    DRIVER_EMIT = "{} --emit-tool-contract ".format(DRIVER_BASE)
    DRIVER_RESOLVE = "{} --resolved-tool-contract ".format(DRIVER_BASE)
    REQUIRES_PBCORE = True
    # Inputs, in order: the "alignment-summary-gff" and "variants-gff"
    # test files.
    INPUT_FILES = list(map(pbtestdata.get_file,
                           ("alignment-summary-gff", "variants-gff")))
    # No task options are overridden.
    TASK_OPTIONS = dict()
Пример #14
0
 def test_trust_counts(self):
     """Open two aligned datasets with ``trustCounts=True`` and check totals.

     After the counts are read, the dataset must still have no index
     loaded (``_index is None``) and no open readers.
     """
     import pbtestdata
     inputs = (
         pbtestdata.get_file("aligned-xml"),
         pbtestdata.get_file("aligned-ds-2"),
     )
     merged = openDataFile(*inputs, trustCounts=True)
     n_expected = 133
     assert merged.numRecords == n_expected
     assert len(merged) == n_expected
     assert merged.totalLength == 274217
     # private state: nothing should have been loaded or opened so far
     assert merged._index is None
     assert len(merged._openReaders) == 0
class TestScatterSubreadsBarcoding(pbcommand.testkit.core.PbTestScatterApp):
    """Scatter test-app configuration for ``scatter_subreads_bam2bam``."""

    DRIVER_BASE = "python -m pbcoretools.tasks.scatter_subreads_bam2bam"
    # XXX not actually barcoded data, but it doesn't matter here
    INPUT_FILES = list(map(pbtestdata.get_file,
                           ("subreads-bam", "barcodeset")))
    # Declared and resolved chunk limits are kept equal ...
    MAX_NCHUNKS = 8
    RESOLVED_MAX_NCHUNKS = MAX_NCHUNKS
    # ... while only two chunks are expected.
    NCHUNKS_EXPECTED = 2
    # One chunk key per input file.
    CHUNK_KEYS = (
        "$chunk.subreadset_id",
        "$chunk.barcodeset_id",
    )
Пример #16
0
    def test_get_dataset_metadata(self):
        """Check metatype and UUID reported by ``get_dataset_metadata``.

        The UUID is compared with the one pbcore reads from the same file;
        passing ``None`` instead of a path must raise.
        """
        import pbtestdata
        metadata = get_dataset_metadata(pbtestdata.get_file("subreads-xml"))
        assert metadata.metatype == "PacBio.DataSet.SubreadSet"

        from pbcore.io import SubreadSet
        subreads = SubreadSet(pbtestdata.get_file("subreads-xml"))
        assert metadata.uuid == subreads.uuid

        # Any exception type is accepted for the invalid input.
        with pytest.raises(Exception):
            get_dataset_metadata(None)
Пример #17
0
class TestPbalignMinorVariants(pbcommand.testkit.PbTestApp):
    """Test-app configuration for ``pbalign.tasks.align_minorvariants``."""

    DRIVER_BASE = "python -m pbalign.tasks.align_minorvariants"
    # Inputs, in order: the "rsii-ccs" and "lambdaNEB" test files.
    INPUT_FILES = list(map(pbtestdata.get_file, ("rsii-ccs", "lambdaNEB")))

    def run_after(self, rtc, output_dir):
        """Check that the first output file opens as a ConsensusAlignmentSet."""
        first_output = rtc.task.output_files[0]
        ds_out = openDataSet(first_output)
        is_expected_type = isinstance(ds_out, ConsensusAlignmentSet)
        # The actual type name is used as the failure message.
        self.assertTrue(is_expected_type, type(ds_out).__name__)
Пример #18
0
 def test_gather_alignments_trust_counts(self):
     """Merge two aligned datasets with ``dataset create --trustCounts``.

     The merged AlignmentSet is reopened with ``trustCounts=True`` and its
     record count and total length are checked.
     """
     inputs = [
         pbtestdata.get_file("aligned-xml"),
         pbtestdata.get_file("aligned-ds-2"),
     ]
     merged_xml = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
     cmd = ["dataset", "create", "--trustCounts", merged_xml] + inputs
     assert subprocess.check_call(cmd) == 0
     merged = AlignmentSet(merged_xml, trustCounts=True)
     assert merged.numRecords == 133
     assert merged.totalLength == 274217
class TestPbreportMappingStatsHGAP(pbcommand.testkit.PbTestApp):
    """Test-app configuration for ``pbreports.report.mapping_stats_hgap``."""

    DRIVER_BASE = "python -m pbreports.report.mapping_stats_hgap"
    REQUIRES_PBCORE = True
    # Inputs, in order: the "aligned-internal-subreads" and
    # "internal-subreads" test files.
    INPUT_FILES = list(map(pbtestdata.get_file,
                           ("aligned-internal-subreads", "internal-subreads")))

    def run_after(self, rtc, output_dir):
        """Check id and value of the first attribute in the report JSON."""
        report = load_report_from_json(rtc.task.output_files[0])
        first_attr = report.attributes[0]
        self.assertEqual(first_attr.id, Constants.A_PCT_MAPPED)
        self.assertAlmostEqual(first_attr.value, 0.9137, delta=0.0001)
class TestPbreportTopVariants(pbcommand.testkit.PbTestApp):
    """Test-app configuration for ``pbreports.report.top_variants``."""

    # Imported in the class body so the option ids below resolve against
    # this report's own Constants.
    from pbreports.report.top_variants import Constants

    # The trailing space is part of the command prefix.
    DRIVER_BASE = "python -m pbreports.report.top_variants "
    DRIVER_EMIT = "{} --emit-tool-contract ".format(DRIVER_BASE)
    DRIVER_RESOLVE = "{} --resolved-tool-contract ".format(DRIVER_BASE)
    REQUIRES_PBCORE = True
    # Inputs, in order: the "variants-gff" and "lambda-fasta" test files.
    INPUT_FILES = list(map(pbtestdata.get_file,
                           ("variants-gff", "lambda-fasta")))
    # Each option is set explicitly to its default value.
    TASK_OPTIONS = dict([
        (Constants.HOW_MANY_ID, Constants.HOW_MANY_DEFAULT),
        (Constants.BATCH_SORT_SIZE_ID, Constants.BATCH_SORT_SIZE_DEFAULT),
    ])
Пример #21
0
class TestEstimateLimaMemory(PbIntegrationBase):
    """Tests for ``estimate_lima_memory`` and its command-line module.

    Several cases read datasets from absolute paths under ``/pbi/dept``,
    so they can only pass where those paths are available.
    """
    # Small inputs shipped with pbtestdata.
    TINY_DATA = pbtestdata.get_file("subreads-sequel")
    TINY_BARCODES = pbtestdata.get_file("barcodeset")
    # Large inputs referenced by absolute path.
    BIG_BARCODES = "/pbi/dept/secondary/siv/barcodes/Sequel_RSII_384_barcodes_v1/Sequel_RSII_384_barcodes_v1.barcodeset.xml"
    BIG_DATA = "/pbi/dept/secondary/siv/testdata/Spider/all4mers/rSPOC1_20180629_223342/1_A01/mSPOC1_180629_223410.subreadset.xml"
    CCS_DATA = "/pbi/dept/secondary/siv/testdata/SA3-Sequel/bcol/m54119_161211_175055.consensusreadset.xml"

    def test_estimate_lima_memory(self):
        """Check the estimate (in GB) for tiny, huge and realistic inputs."""
        mem_gb = estimate_lima_memory(self.TINY_BARCODES, self.TINY_DATA, True)
        assert mem_gb == 2
        # this is silly of course.  but it's technically possible with the
        # Sequel II system, so we might as well just deal with it
        mem_gb = estimate_lima_memory(self.BIG_BARCODES, self.BIG_DATA, False)
        assert mem_gb == 2752
        # this is a more realistic case - 147K barcode pairs but the BAM file
        # is small enough to fit in the default footprint
        mem_gb = estimate_lima_memory(self.BIG_BARCODES, self.CCS_DATA, False)
        assert mem_gb == 7

    def test_integration_tiny(self):
        """Run the CLI module with ``--symmetric`` on the tiny inputs."""
        args = [
            "python3", "-m", "pbcoretools.tasks.memory.estimate_lima_memory",
            self.TINY_BARCODES, self.TINY_DATA, "--symmetric"
        ]
        self._check_call(args)
        # The result is read back from a file in the current directory.
        with open("lima_mem_gb.txt") as txt_out:
            assert txt_out.read() == "2"

    def test_integration_big(self):
        """Run the CLI module with ``--asymmetric`` on the big inputs."""
        args = [
            "python3", "-m", "pbcoretools.tasks.memory.estimate_lima_memory",
            self.BIG_BARCODES, self.BIG_DATA, "--asymmetric"
        ]
        self._check_call(args)
        with open("lima_mem_gb.txt") as txt_out:
            assert txt_out.read() == "2752"

    def test_defined_biosamples(self):
        """Check the estimate once bio samples are defined on the dataset.

        A copy of BIG_DATA with 384 barcode/sample pairs set through
        ``set_bio_samples`` yields 2, whereas the unmodified dataset
        yields 2752 for the same barcodes in ``test_estimate_lima_memory``.
        """
        # XXX awful dependency but it makes testing easier
        from pbcoretools.file_utils import set_bio_samples
        ds_tmp = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        # The barcode set is only handed to estimate_lima_memory by path;
        # it is deliberately not opened here (the handle was never used
        # and never closed).
        with openDataSet(self.BIG_DATA, trustCounts=True) as ds:
            bcs = [("bc1001--bc1{:03d}".format(x), "Sample {}".format(x))
                   for x in range(384)]
            set_bio_samples(ds, bcs)
            ds.write(ds_tmp)
        mem_gb = estimate_lima_memory(self.BIG_BARCODES, ds_tmp, False)
        assert mem_gb == 2
Пример #22
0
 def test_get_dataset_metadata(self):
     """Check metatype and UUID from ``get_dataset_metadata``.

     The test is skipped when ``pbtestdata`` is missing.  If ``pbcore`` is
     missing, the skip happens only after the metatype check has run.
     """
     try:
         import pbtestdata
     except ImportError:
         raise unittest.SkipTest("pbtestdata not available, skipping")

     subreads_md = get_dataset_metadata(pbtestdata.get_file("subreads-xml"))
     self.assertEqual(subreads_md.metatype, "PacBio.DataSet.SubreadSet")

     try:
         from pbcore.io import SubreadSet
     except ImportError:
         raise unittest.SkipTest("pbcore not available, skipping")

     subreads = SubreadSet(pbtestdata.get_file("subreads-xml"))
     self.assertEqual(subreads_md.uuid, subreads.uuid)
Пример #23
0
class TestPbalign(pbcommand.testkit.PbTestApp):
    """Test-app configuration for the ``pbalign`` command."""

    # The trailing space is part of the command prefix.
    DRIVER_BASE = "pbalign "
    REQUIRES_PBCORE = True
    # Inputs, in order: the "subreads-xml" and "lambdaNEB" test files.
    INPUT_FILES = list(map(pbtestdata.get_file, ("subreads-xml", "lambdaNEB")))
    # Algorithm options passed through as a single string.
    TASK_OPTIONS = dict([
        ("pbalign.task_options.algorithm_options",
         "--holeNumbers 1-1000,30000-30500,60000-60600,100000-100500"),
    ])

    def run_after(self, rtc, output_dir):
        """Check that the first output file opens as an AlignmentSet."""
        first_output = rtc.task.output_files[0]
        ds_out = openDataSet(first_output)
        is_expected_type = isinstance(ds_out, AlignmentSet)
        # The actual type name is used as the failure message.
        self.assertTrue(is_expected_type, type(ds_out).__name__)
Пример #24
0
class TestToolContract(pbcommand.testkit.PbTestApp):
    """Test-app configuration for ``pbreports.report.variants``."""

    DATA_DIR = op.join(LOCAL_DATA, "variants")
    DRIVER_BASE = "python -m pbreports.report.variants"
    # Both derived commands are DRIVER_BASE plus a mode flag.
    DRIVER_EMIT = "{} --emit-tool-contract ".format(DRIVER_BASE)
    DRIVER_RESOLVE = "{} --resolved-tool-contract ".format(DRIVER_BASE)
    REQUIRES_PBCORE = True
    # Inputs, in order: the "lambda-fasta", "consensus-summary-gff" and
    # "variants-gff" test files.
    INPUT_FILES = list(map(
        pbtestdata.get_file,
        ("lambda-fasta", "consensus-summary-gff", "variants-gff")))
    TASK_OPTIONS = dict([
        ("pbreports.task_options.max_contigs", 25),
        ("pbreports.task_options.dpi", 60),
        ("pbreports.task_options.dumpdata", True),
    ])
    def test_make_filter_stats_report_sts_xml(self):
        """
        Test the content of the filter report generated from an sts.xml
        """
        sts_xml = pbtestdata.get_file("stats-xml")
        rpt = make_filter_report(sts_xml, self.get_output_dir())
        d = json.loads(rpt.to_json())
        self._compare_attribute_values(report_d=d,
                                       expected_d={
                                           Constants.A_NBASES: 1672335649,
                                           Constants.A_NREADS: 394658,
                                           Constants.A_READ_N50: 7750,
                                           Constants.A_READ_LENGTH: 4237
                                       })
        # The read-length plot must have been written to the output dir.
        self.assertTrue(
            os.path.exists(
                os.path.join(self.get_output_dir(), 'readLenDist0.png')))
        # self.assertTrue(os.path.exists(os.path.join(
        #    self.get_output_dir(),
        #    'readQualDist0.png')))

        # these are from a raw STS file
        self.assertEqual(len(rpt._dataset_uuids), 0,
                         "Incorrect report datasets uuids")
        # Function-call form: valid on Python 3 and, with a single
        # argument, prints the same text on Python 2.
        print(pformat(rpt.to_dict()))
        validate_report_complete(self, rpt)
Пример #26
0
 def setup_class(cls):
     """Write a SubreadSet built from the BAMs of "barcoded-subreadset".

     The BAM path of every external resource is collected, and a new
     strict SubreadSet made from those paths is written to
     ``cls.INPUT_FILE``.
     """
     source_xml = pbtestdata.get_file("barcoded-subreadset")
     with SubreadSet(source_xml) as ds_in:
         bam_files = [res.bam for res in ds_in.externalResources]
     with SubreadSet(*bam_files, strict=True) as ds_out:
         ds_out.write(cls.INPUT_FILE)
 def test_integration(self):
     """Run ``make_trimmed_dataset`` on a datastore plus a CCS dataset.

     Two copies of the "ccs-barcoded" dataset are written: one named
     "My Data (filtered)" with tags "ccs,filtered" as the input, and one
     named "lima out" that is registered in a datastore JSON.  The output
     ``trimmed.consensusreadset.xml`` must be non-empty, be named
     "My Data (trimmed)" and carry only the "ccs" tag.
     """
     ccs_barcoded = pbtestdata.get_file("ccs-barcoded")
     xml_suffix = ".consensusreadset.xml"
     datastore_json = tempfile.NamedTemporaryFile(
         suffix=".datastore.json").name
     lima_xml = tempfile.NamedTemporaryFile(suffix=xml_suffix).name
     input_xml = tempfile.NamedTemporaryFile(suffix=xml_suffix).name

     # Write both copies from a single open dataset, renaming in between.
     with ConsensusReadSet(ccs_barcoded) as ccs_tmp:
         ccs_tmp.name = "My Data (filtered)"
         ccs_tmp.tags = "ccs,filtered"
         ccs_tmp.write(input_xml)
         ccs_tmp.name = "lima out"
         ccs_tmp.write(lima_xml)

     lima_file = DataStoreFile(uuid.uuid4(), "lima",
                               FileTypes.DS_CCS.file_type_id, lima_xml)
     DataStore([lima_file]).write_json(datastore_json)

     cmd = ["python3", "-m", "pbcoretools.tasks.make_trimmed_dataset",
            datastore_json, input_xml]
     self._check_call(cmd)

     with ConsensusReadSet("trimmed.consensusreadset.xml",
                           trustCounts=True) as trimmed:
         assert trimmed.numRecords > 0
         assert trimmed.name == "My Data (trimmed)"
         assert trimmed.tags == "ccs"
Пример #28
0
 def test__read_in_indexed_alignmentset(self):
     """Check the rows ``_read_in_indexed_alignmentset`` returns for "aligned-bam"."""
     rows = _read_in_indexed_alignmentset(pbtestdata.get_file("aligned-bam"))
     # third field is 254 in every row
     self.assertTrue(all(row[2] == 254 for row in rows))
     self.assertEqual(len(rows), 112)
     # spot-check the first two fields of the last row
     last_row = rows[-1]
     self.assertEqual(last_row[0], 605)
     self.assertTrue(0.927 < last_row[1] < 0.928)
 def test__read_in_indexed_alignmentset(self):
     """Spot-check the data read from the "aligned-bam" test file."""
     aligned_bam = pbtestdata.get_file("aligned-bam")
     records = _read_in_indexed_alignmentset(aligned_bam)
     # every record carries 254 in its third field
     third_field_ok = all(rec[2] == 254 for rec in records)
     self.assertTrue(third_field_ok)
     self.assertEqual(len(records), 112)
     # first two fields of the final record
     final = records[-1]
     self.assertEqual(final[0], 605)
     self.assertTrue(0.927 < final[1] < 0.928)
class TestIntegrationMappingStatsReport(unittest.TestCase):
    """Run the command built by ``_to_cmd`` and validate the report JSON."""

    ALIGNMENTS = pbtestdata.get_file("aligned-bam")

    def setUp(self):
        """Create an output directory and reserve a report JSON path."""
        self.output_dir = tempfile.mkdtemp(suffix="_mapping_stats")
        self.aligned_reads_bam = self.ALIGNMENTS
        # delete=False: the file must outlive the handle so that only its
        # path is carried forward.
        with tempfile.NamedTemporaryFile(
                delete=False, suffix="mapping_report.json") as handle:
            self.report_json = handle.name

    def test_basic(self):
        """The command exits 0 and writes a complete one-table report."""
        rcode = run_backticks(_to_cmd(self.ALIGNMENTS, self.report_json))
        self.assertEqual(rcode, 0)

        with open(self.report_json, 'r') as f:
            report_d = json.load(f)
        log.info("JsonReport: ")
        log.info(pprint.pformat(report_d, indent=4))

        report = dict_to_report(report_d)
        self.assertIsNotNone(report)
        self.assertEqual(len(report.tables), 1)
        log.info(str(report.tables[0]))
        validate_report_metadata(self, report, spec)
        validate_report_complete(self, report)
Пример #31
0
 def test_update_barcoded_sample_metadata(self):
     """Exercise ``update_barcoded_sample_metadata`` in three variants.

     1. default arguments,
     2. ``use_barcode_uuids=False``,
     3. a datastore split from a SubreadSet whose collection metadata
        was removed.
     Each resulting datastore is checked with
     ``validate_barcoded_datastore_files``.
     """
     datastore_json = tempfile.NamedTemporaryFile(
         suffix=".datastore.json").name
     barcodes = pbtestdata.get_file("barcodeset")
     split_ds = split_barcoded_dataset(self.SUBREADS)
     split_ds.write_json(datastore_json)
     work_dir = tempfile.mkdtemp()

     # default behavior
     updated = update_barcoded_sample_metadata(
         work_dir, datastore_json, self.SUBREADS, barcodes)
     validate_barcoded_datastore_files(self, self.SUBREADS, updated)

     # now with use_barcode_uuids=False
     updated = update_barcoded_sample_metadata(
         work_dir, datastore_json, self.SUBREADS, barcodes,
         use_barcode_uuids=False)
     validate_barcoded_datastore_files(
         self, self.SUBREADS, updated, use_barcode_uuids=False)

     # test that it doesn't break with no collection metadata
     stripped = SubreadSet(self.SUBREADS)
     stripped.metadata.collections = None
     stripped_xml = tempfile.NamedTemporaryFile(
         suffix=".subreadset.xml").name
     stripped.write(stripped_xml)
     split_ds = split_barcoded_dataset(stripped_xml)
     split_ds.write_json(datastore_json)
     work_dir = tempfile.mkdtemp()
     updated = update_barcoded_sample_metadata(
         work_dir, datastore_json, self.SUBREADS, barcodes)
     validate_barcoded_datastore_files(
         self, self.SUBREADS, updated,
         have_collection_metadata=False,
         number_of_expected_collections=0)
Пример #32
0
 def test_integration_simple(self):
     """Run ``consolidate_reads_bam`` on "ccs-sequel" and expect ``reads.bam``."""
     ccs_xml = pbtestdata.get_file("ccs-sequel")
     cmd = ["python3", "-m", "pbcoretools.tasks.consolidate_reads_bam"]
     cmd.append(ccs_xml)
     self._check_call(cmd)
     # The output path is relative, i.e. looked up in the current directory.
     assert op.isfile("reads.bam")
    def test_make_filter_stats_report_sts_xml(self):
        """
        Test the content of the filter report generated from an sts.xml
        """
        sts_xml = pbtestdata.get_file("stats-xml")
        rpt = make_filter_report(sts_xml, self.get_output_dir())
        d = json.loads(rpt.to_json())
        self._compare_attribute_values(
            report_d=d,
            expected_d={
                Constants.A_NBASES: 1672335649,
                Constants.A_NREADS: 394658,
                Constants.A_READ_N50: 7750,
                Constants.A_READ_LENGTH: 4237,
            },
        )
        # The read-length plot must have been written to the output dir.
        self.assertTrue(os.path.exists(os.path.join(self.get_output_dir(), "readLenDist0.png")))
        # self.assertTrue(os.path.exists(os.path.join(
        #    self.get_output_dir(),
        #    'readQualDist0.png')))

        # these are from a raw STS file
        self.assertEqual(len(rpt._dataset_uuids), 0, "Incorrect report datasets uuids")
        # Function-call form: valid on Python 3 and, with a single
        # argument, prints the same text on Python 2.
        print(pformat(rpt.to_dict()))
        validate_report_complete(self, rpt)
 def test_provenance_record_ordering(self):
     """Check the child-tag order of dataset metadata after a round trip.

     A parent dataset is added to the "subreads-sequel" SubreadSet, which
     is then written out and reloaded in strict mode; the metadata
     children must appear in the expected order, Provenance included.
     """
     import pbtestdata
     subreads = SubreadSet(pbtestdata.get_file("subreads-sequel"), strict=True)
     subreads.metadata.addParentDataSet(
         uuid.uuid4(),
         subreads.datasetType,
         createdBy="AnalysisJob",
         timeStampedName="")
     round_trip_xml = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     subreads.write(round_trip_xml)

     reloaded = SubreadSet(round_trip_xml, strict=True)
     expected_tags = ['TotalLength', 'NumRecords', 'Provenance',
                      'Collections', 'SummaryStats']
     found_tags = [child['tag'] for child in reloaded.metadata.record['children']]
     self.assertEqual(found_tags, expected_tags)
 def setUp(self):
     """Write one of two ZMW chunks of "subreads-bam" as the test input.

     The chunk selected by ``CHUNK_INDEX`` is written to
     ``self.INPUT_FILES[0]``; a slice of its first ZMW range is kept in
     ``self.zmw_range``.
     """
     bam_path = pbtestdata.get_file("subreads-bam")
     chunks = SubreadSet(bam_path, strict=True).split(
         zmws=True, chunks=2, targetSize=2)
     assert len(chunks) == 2
     selected = chunks[CHUNK_INDEX]
     # elements 1 and 2 of the first range entry
     self.zmw_range = selected.zmwRanges[0][1:3]
     logging.info("zmwRanges[CHUNK_INDEX] = {r}".format(
         r=str(selected.zmwRanges)))
     target_xml = self.INPUT_FILES[0]
     logging.info("SubreadSet = {f}".format(f=target_xml))
     selected.write(target_xml)
 def test_ccs_barcodes_table(self):
     """Check the second table of the report built from "ccs-barcoded".

     The first four columns are compared exactly; the fifth column holds
     floats and is compared to four decimal places.
     """
     ccs_reads = ConsensusReadSet(pbtestdata.get_file("ccs-barcoded"))
     report = to_report(ccs_reads, tempfile.mkdtemp())
     columns = report.tables[1].columns

     exact_values = [col.values for col in columns[0:4]]
     self.assertEqual(
         exact_values,
         [["lbc1", "lbc3"], [1, 1], [1958, 1954], [1958, 1954]])

     float_values = columns[4].values
     self.assertAlmostEqual(float_values[0], 0.9724, places=4)
     self.assertAlmostEqual(float_values[1], 0.9926, places=4)
    def test_exit_code_0(self):
        """
        Like a cram test. Assert that summarize_coverage exits with 0 even
        though the region size is 0. See bug 25079.
        """
        from pbcore.util.Process import backticks
        import tempfile
        ref = pbtestdata.get_file("lambda-fasta")
        tiny_reads = pbtestdata.get_file("aligned-xml")
        out = os.path.join(tempfile.mkdtemp(suffix="summ_cov"), 'gff')
        cmd = 'summarize_coverage --region_size=0 --num_regions=500 {a} {r} {g}'.format(
            a=tiny_reads, r=ref, g=out)

        o, c, m = backticks(cmd)
        log.info(cmd)
        # Compare by value: `is not 0` tests object identity, which only
        # works by accident of small-int caching.
        if c != 0:
            log.error(m)
            log.error(o)
            print(m)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(0, c)
        self.assertTrue(os.path.exists(out))
 def test_adapter_exit_code_0(self):
     """Run ``adapter_xml`` on "subreads-sequel" and expect exit code 0.

     The three values returned by ``backticks`` are echoed to stdout under
     the labels "o", "c" and "m"; "c" is the value compared against 0.
     """
     subreads_xml = pbtestdata.get_file("subreads-sequel")
     cmd = "adapter_xml {c} {r}".format(r="foo.json", c=subreads_xml)
     o, c, m = backticks(cmd)
     # Function-call form of print: valid on Python 3 and, with a single
     # argument, prints the same text on Python 2.
     print("COMMAND: {c}".format(c=cmd))
     log.info(cmd)
     print("o: {o}".format(o=o))
     print("c: {c}".format(c=c))
     print("m: {m}".format(m=m))
     # Compare by value: `is not 0` tests object identity.
     if c != 0:
         log.error(m)
         log.error(o)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(0, c)
 def test_loading_exit_code_0(self):
     """Run ``loading_xml`` on the "stats-xml" file and expect exit code 0.

     The three values returned by ``backticks`` are echoed to stdout under
     the labels "o", "c" and "m"; "c" is the value compared against 0.
     """
     sts_xml = pbtestdata.get_file("stats-xml")
     cmd = "loading_xml {c} {r}".format(r="foo.json", c=sts_xml)
     o, c, m = backticks(cmd)
     # Function-call form of print: valid on Python 3 and, with a single
     # argument, prints the same text on Python 2.
     print("COMMAND: {c}".format(c=cmd))
     log.info(cmd)
     print("o: {o}".format(o=o))
     print("c: {c}".format(c=c))
     print("m: {m}".format(m=m))
     # Compare by value: `is not 0` tests object identity.
     if c != 0:
         log.error(m)
         log.error(o)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(0, c)
 def test_merge_biosamples(self):
     """Check how biosample metadata is merged across SubreadSets.

     Covers two different biosamples, the same biosample given twice, and
     the same biosample carrying different barcode names.
     """
     import pbtestdata
     alice_xml = pbtestdata.get_file("subreads-biosample-1")
     bob_xml = pbtestdata.get_file("subreads-biosample-2")

     def sample_names(dataset):
         # names of all biosamples, in metadata order
         return [bs.name for bs in dataset.metadata.bioSamples]

     # Case 1: two biosamples
     merged = SubreadSet(alice_xml, bob_xml)
     self.assertEqual(sample_names(merged), ["Alice", "Bob"])

     # Case 2: same biosample in both files
     merged = SubreadSet(alice_xml, alice_xml)
     self.assertEqual(sample_names(merged), ["Alice"])
     self.assertEqual(len(merged.metadata.bioSamples[0].DNABarcodes), 1)

     # Case 3: same biosample, different barcodes
     renamed = SubreadSet(alice_xml)
     renamed.metadata.bioSamples[0].DNABarcodes[0].name = "F7--R7"
     renamed_xml = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     renamed.write(renamed_xml)
     merged = SubreadSet(alice_xml, renamed_xml)
     self.assertEqual(sample_names(merged), ["Alice"])
     barcode_names = [
         bc.name for bc in merged.metadata.bioSamples[0].DNABarcodes]
     self.assertEqual(barcode_names, ["F1--R1", "F7--R7"])
    def setUpClass(cls):
        """Build the CCS mapping-stats report once and write it to JSON.

        Stores the output directory, the input XML path, the report JSON
        path and the report object on the class, then logs the report
        content when it is a ``Report`` instance.
        """
        # NOTE(review): any @classmethod decorator would sit above this
        # line, outside the visible snippet -- confirm in the full file.
        cls.output_dir = tempfile.mkdtemp(suffix="_mapping_stats")
        cls.aligned_reads_xml = pbtestdata.get_file("rsii-ccs-aligned")
        # delete=False: only the path of the closed file is needed.
        with tempfile.NamedTemporaryFile(
                delete=False, suffix="mapping_report.json") as handle:
            cls.report_json = handle.name
        cls.report = mapping_stats_ccs.to_report(cls.aligned_reads_xml,
                                                 cls.output_dir)
        cls.report.write_json(cls.report_json)

        if not isinstance(cls.report, Report):
            return
        log.info(pprint.pformat(cls.report.to_dict()))
        for table in cls.report.tables:
            log.info(str(table))
 def test_filter_exit_code_0(self):
     """Run ``filter_stats_xml`` on "subreads-sequel" and expect exit code 0.

     The three values returned by ``backticks`` are echoed to stdout under
     the labels "o", "c" and "m"; "c" is the value compared against 0.
     """
     # The unused temporary directory and getcwd() lookup were dropped;
     # the command writes to the relative path "foo.json".
     sts_xml = pbtestdata.get_file("subreads-sequel")
     cmd = "filter_stats_xml {c} {r}".format(r="foo.json", c=sts_xml)
     o, c, m = backticks(cmd)
     # Function-call form of print: valid on Python 3 and, with a single
     # argument, prints the same text on Python 2.
     print("COMMAND: {c}".format(c=cmd))
     log.info(cmd)
     print("o: {o}".format(o=o))
     print("c: {c}".format(c=c))
     print("m: {m}".format(m=m))
     # Compare by value: `is not 0` tests object identity.
     if c != 0:
         log.error(m)
         log.error(o)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(0, c)
    def test_make_filter_stats_report_dataset(self):
        """
        Test the content of the filter report generated from a dataset
        """
        out_dir = self.get_output_dir()
        subreads_xml = pbtestdata.get_file("subreads-sequel")
        report = make_filter_report(subreads_xml, out_dir)
        expected_attributes = {
            Constants.A_NBASES: 1672335649,
            Constants.A_NREADS: 394658,
            Constants.A_READ_N50: 7750,
            Constants.A_READ_LENGTH: 4237,
        }
        self._compare_attribute_values(
            report_d=json.loads(report.to_json()),
            expected_d=expected_attributes,
        )

        # the read-length plot must exist in the output directory
        plot_path = os.path.join(self.get_output_dir(), "readLenDist0.png")
        self.assertTrue(os.path.exists(plot_path))
def _get_bax2bam_inputs():
    """Return the input file list for the bax2bam tests.

    A small hack to compute the class-level inputs without going through
    setupclass, which should not be called when the test is skipped.

    Nat: we want to test that this behaves properly when multiple movies
    are supplied as input, so an HdfSubreadSet is made on the fly from
    two bax files in testdata.
    """
    if not HAVE_DATA_AND_BAX2BAM:
        # Assume the test data isn't found and the test won't be run
        return ["/path/to/this-test-should-be-skipped.txt"]

    hdf_subread_xml = tempfile.NamedTemporaryFile(suffix=".hdfsubreadset.xml").name
    siv_bax = SIV_DATA_DIR + "/SA3-RS/lambda/2372215/0007_tiny/Analysis_Results/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.bax.h5"
    local_bax = pbtestdata.get_file("rsii-bax-h5")

    ds = HdfSubreadSet(siv_bax, local_bax)
    ds.name = "lambda_rsii"
    # The two files must come from two different movies.
    movie_names = {reader.movieName for reader in ds.resourceReaders()}
    assert len(movie_names) == 2
    ds.write(hdf_subread_xml)
    return [hdf_subread_xml]
 def test_consensus_read_set_ref(self):
     """Check the ``consensusReadSetRef`` UUID of the first collection in "ccs-sequel"."""
     import pbtestdata
     ccs_reads = ConsensusReadSet(pbtestdata.get_file("ccs-sequel"), strict=True)
     first_collection = ccs_reads.metadata.collections[0]
     ref_uuid = first_collection.consensusReadSetRef.uuid
     self.assertEqual(ref_uuid, "5416f525-d3c7-496b-ba8c-18d7ec1b4499")
 def _generate_chunk_output_file(self, i=None):
     """Return a copy of the "ccs-bam-aligned" test file as mock chunk output.

     ``i`` is accepted but not used.
     """
     mock_source = pbtestdata.get_file("ccs-bam-aligned")
     return self._copy_mock_output_file(mock_source)
 def _generate_chunk_output_file(self, i=None):
     """Return a copy of the "subreads-bam" test file as mock chunk output.

     ``i`` is accepted but not used.
     """
     mock_source = pbtestdata.get_file("subreads-bam")
     return self._copy_mock_output_file(mock_source)
 def setUp(self):
     """Point the test at the barcode set and barcoded SubreadSet; CCS mode off."""
     fetch = pbtestdata.get_file
     self.barcodes = fetch("barcodeset")
     self.subreads = fetch("barcoded-subreadset")
     self.ccs = False
Пример #49
0
import shutil
import os.path as op
import os

from pbcore.io import openDataFile, openDataSet, BamReader

import pbtestdata

from pbcoretools import bamSieve

# Local fixtures live in the "data" directory that sits next to the
# directory containing this module.
DATA_DIR = op.join(op.dirname(op.dirname(__file__)), "data")

# BAM / SubreadSet XML pairs shipped in DATA_DIR.
SUBREADS1, DS1, SUBREADS2, DS2 = (
    op.join(DATA_DIR, file_name) for file_name in (
        "tst_1_subreads.bam",
        "tst_1.subreadset.xml",
        "tst_3_subreads.bam",
        "tst_3.subreadset.xml",
    )
)

# Inputs provided by the pbtestdata package.
SUBREADS3 = pbtestdata.get_file("subreads-bam")
SUBREADS4 = pbtestdata.get_file("aligned-bam")
CCS = pbtestdata.get_file("ccs-bam")
BARCODED = pbtestdata.get_file("barcoded-subreads-bam")
BARCODED_DS = pbtestdata.get_file("barcoded-subreadset")

class TestBamSieve(unittest.TestCase):

    def test_whitelist(self):
        ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
        WHITELIST = set([24962, 32901, 30983])

        def _run_with_whitelist(wl):
            rc = bamSieve.filter_reads(
                input_bam=SUBREADS3,
                output_bam=ofn,
 def setUp(self):
     """Resolve the alignment, summary GFF and reference test files.

     ``selected_reference`` starts out as ``None``.
     """
     fetch = pbtestdata.get_file
     self.aln_path = fetch("aligned-xml")
     self.gff_path = fetch("alignment-summary-gff")
     self.ref_path = fetch("lambda-fasta")
     self.selected_reference = None
 def getAlignmentSet(self):
     """Return the path of the "aligned-bam" test file."""
     aligned_bam = pbtestdata.get_file("aligned-bam")
     return aligned_bam