示例#1
0
 def test_error(self):
     """Check that each of these ``filter_reads`` calls raises ``UserError``."""
     bam_out = tempfile.NamedTemporaryFile(suffix=".bam").name
     # One entry per call; each is the extra keyword arguments passed
     # alongside input_bam=DS1 and the BAM output path.
     rejected_kwargs = [
         # whitelist together with blacklist
         dict(whitelist={5, 6, 7, 8}, blacklist={1, 2, 3, 4}),
         # whitelist together with percentage
         dict(whitelist={5, 6, 7, 8}, percentage=50),
         # percentage of 500
         dict(percentage=500),
         # percentage together with count
         dict(percentage=50, count=1),
     ]
     for bad_kwargs in rejected_kwargs:
         with pytest.raises(UserError):
             bamsieve.filter_reads(input_bam=DS1,
                                   output_bam=bam_out,
                                   **bad_kwargs)
     # dataset output, but BAM input
     xml_out = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     with pytest.raises(UserError):
         bamsieve.filter_reads(input_bam=SUBREADS2,
                               output_bam=xml_out,
                               percentage=50)
示例#2
0
    def test_subreadset_scraps(self):
        """Filter BARCODED_DS into a SubreadSet XML and check both BAM files.

        ``filter_reads`` is run three times against the same output path:
        with a one-ZMW whitelist, with ``count=1``/``seed=1``, and with a
        one-ZMW blacklist.  Every run must return 0.  After the first two
        runs the subreads and scraps BAMs of the output dataset must hold
        records from ZMW 74056024 only.
        """
        ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        expected_zmw = 74056024

        def _verify():
            # Inspect the first external resource of the output dataset and
            # read both of the BAM files it points to.
            with SubreadSet(ofn, strict=False) as ds_out:
                ext_res = ds_out.externalResources[0]
                assert ext_res.bam.endswith(".subreads.bam")
                assert ext_res.scraps.endswith(".scraps.bam")
                for bam_file in (ext_res.bam, ext_res.scraps):
                    with BamReader(bam_file) as bam:
                        zmws = {rec.HoleNumber for rec in bam}
                        assert len(zmws) == 1
                        assert expected_zmw in zmws

        rc = bamsieve.filter_reads(input_bam=BARCODED_DS,
                                   output_bam=ofn,
                                   whitelist=[expected_zmw])
        assert rc == 0
        _verify()

        rc = bamsieve.filter_reads(input_bam=BARCODED_DS,
                                   output_bam=ofn,
                                   count=1,
                                   seed=1)
        # The return code of this run was previously left unchecked.
        assert rc == 0
        _verify()

        rc = bamsieve.filter_reads(input_bam=BARCODED_DS,
                                   output_bam=ofn,
                                   blacklist=[28901719])
        assert rc == 0
示例#3
0
def split_barcoded_dataset(file_name, ext=".subreadset.xml"):
    """Split a barcoded dataset into per-barcode datasets.

    For barcode indices 0 and 2 (labelled ``lbc1--lbc1`` and
    ``lbc3--lbc3``), ``filter_reads`` is run with ``use_barcodes=True`` and
    a one-element whitelist, writing ``lima_output.<label><ext>`` into a
    fresh temporary directory.

    Args:
        file_name: path of the input dataset.
        ext: file extension appended to each output file name.

    Returns:
        A ``DataStore`` with one ``DataStoreFile`` per output file.  Each
        uses the input's dataset type and source id
        ``barcoding.tasks.lima-0``.
    """
    from pbcoretools.bamsieve import filter_reads
    # Only the dataset type is needed from the input, so read it and close
    # the dataset immediately instead of leaving it open.
    with openDataSet(file_name) as ds_in:
        dataset_type = ds_in.datasetType
    ds_dir = tempfile.mkdtemp()
    ds_files = []
    for bc, label in ((0, "lbc1--lbc1"), (2, "lbc3--lbc3")):
        ds_tmp = op.join(ds_dir, "lima_output.{l}{e}".format(l=label, e=ext))
        filter_reads(input_bam=file_name,
                     output_bam=ds_tmp,
                     whitelist=[bc],
                     use_barcodes=True)
        ds_files.append(
            DataStoreFile(uuid.uuid4(), "barcoding.tasks.lima-0",
                          dataset_type, ds_tmp))
    return DataStore(ds_files)
示例#4
0
    def test_whitelist(self):
        """Filter SUBREADS3 by a ZMW whitelist given in several forms.

        The same three hole numbers are passed as a set, as a
        comma-separated string, and as the path of a text file with one
        number per line.  Each run must return 0 and produce exactly those
        hole numbers.  A final run passes SUBREADS4 as the whitelist and
        expects 117 output records.
        """
        ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
        WHITELIST = set([24962, 32901, 30983])

        def _run_with_whitelist(wl):
            rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                       output_bam=ofn,
                                       whitelist=wl)
            assert rc == 0
            with BamReader(ofn) as bam_out:
                have_zmws = {rec.HoleNumber for rec in bam_out}
                assert have_zmws == WHITELIST

        zmw_strings = [str(x) for x in WHITELIST]
        _run_with_whitelist(WHITELIST)
        _run_with_whitelist(",".join(zmw_strings))
        tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
        with open(tmp_wl, "w") as wl_out:
            wl_out.write("\n".join(zmw_strings))
        _run_with_whitelist(tmp_wl)
        # now with a BAM file as whitelist
        rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                   output_bam=ofn,
                                   whitelist=SUBREADS4)
        # The return code of this run was previously left unchecked.
        assert rc == 0
        with BamReader(ofn) as bam_out:
            assert 117 == sum(1 for _ in bam_out)
示例#5
0
 def _run_with_whitelist(wl):
     """Filter SUBREADS3 with whitelist ``wl`` and check the output ZMWs.

     The run must return 0 and the hole numbers read back from ``ofn``
     must equal ``WHITELIST``.
     """
     status = bamsieve.filter_reads(input_bam=SUBREADS3,
                                    output_bam=ofn,
                                    whitelist=wl)
     assert status == 0
     with BamReader(ofn) as reader:
         observed = {record.HoleNumber for record in reader}
     assert observed == WHITELIST
示例#6
0
 def _run_with_blacklist(bl):
     """Filter SUBREADS2 with blacklist ``bl`` and check the output ZMWs.

     The run must return 0 and the only hole number read back from ``ofn``
     must be 9.
     """
     status = bamsieve.filter_reads(input_bam=SUBREADS2,
                                    output_bam=ofn,
                                    blacklist=bl)
     assert status == 0
     with BamReader(ofn) as reader:
         remaining = {record.HoleNumber for record in reader}
     assert remaining == {9}
示例#7
0
 def test_anonymize(self):
     """Compare a plain and an ``anonymize=True`` run on the same whitelist.

     Both runs filter SUBREADS3 to ZMW 24962.  Records paired in file
     order must have equal ``qName`` values and different ``peer.seq``
     values.
     """
     plain_bam = tempfile.NamedTemporaryFile(suffix=".bam").name
     anon_bam = tempfile.NamedTemporaryFile(suffix=".bam").name
     status = bamsieve.filter_reads(input_bam=SUBREADS3,
                                    output_bam=plain_bam,
                                    whitelist={24962})
     assert status == 0
     status = bamsieve.filter_reads(input_bam=SUBREADS3,
                                    output_bam=anon_bam,
                                    whitelist={24962},
                                    anonymize=True)
     assert status == 0
     with openDataFile(plain_bam) as plain, openDataFile(anon_bam) as anon:
         for plain_rec, anon_rec in zip(plain, anon):
             assert plain_rec.qName == anon_rec.qName
             assert plain_rec.peer.seq != anon_rec.peer.seq
示例#8
0
 def _run_with_blacklist(bl):
     """Filter SUBREADS3 by subread blacklist ``bl`` and check the output.

     The run must return 0.  No record name in ``ofn`` may appear in
     ``BLACKLIST``, and the file must hold 114 records.
     """
     rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                output_bam=ofn,
                                blacklist=bl,
                                use_subreads=True)
     assert rc == 0
     with BamReader(ofn) as bam_out:
         # Read the file once: both the name check and the record count are
         # derived from this list, so the count no longer depends on the
         # reader starting over when iterated a second time.
         qnames = [rec.qName for rec in bam_out]
     assert set(qnames) & BLACKLIST == set()
     assert len(qnames) == 114
示例#9
0
 def test_count(self):
     """Run with ``count=1`` and a fixed seed; expect one distinct ZMW."""
     out_path = tempfile.NamedTemporaryFile(suffix=".bam").name
     status = bamsieve.filter_reads(input_bam=SUBREADS3,
                                    output_bam=out_path,
                                    count=1,
                                    seed=12345)
     assert status == 0
     with BamReader(out_path) as reader:
         hole_numbers = {record.HoleNumber for record in reader}
     assert len(hole_numbers) == 1
示例#10
0
 def test_barcodes(self):
     """Filter BARCODED with ``use_barcodes=True`` and whitelist ``[0]``.

     The run must return 0 and the output must hold records from ZMW
     74056024 only.
     """
     ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
     rc = bamsieve.filter_reads(input_bam=BARCODED,
                                output_bam=ofn,
                                whitelist=[0],
                                use_barcodes=True)
     # Previously rc was assigned but never checked.
     assert rc == 0
     with BamReader(ofn) as bam_out:
         zmws = {rec.HoleNumber for rec in bam_out}
         assert len(zmws) == 1
         assert 74056024 in zmws
示例#11
0
 def test_count_overflow(self):
     """Run with ``count=100000`` and check the warning and output.

     The run must return 0, exactly one warning must be recorded, and the
     output must contain 48 distinct hole numbers.
     """
     out_path = tempfile.NamedTemporaryFile(suffix=".bam").name
     with warnings.catch_warnings(record=True) as caught:
         status = bamsieve.filter_reads(input_bam=SUBREADS3,
                                        output_bam=out_path,
                                        count=100000,
                                        seed=12345)
         assert status == 0
         assert len(caught) == 1
         with BamReader(out_path) as reader:
             hole_numbers = {record.HoleNumber for record in reader}
             assert len(hole_numbers) == 48
示例#12
0
 def test_dataset_relative_paths(self):
     """Write a dataset with ``relative=True`` and read it after a move.

     Every file in the output directory whose name starts with the output
     file's first dot-separated component is moved to a new temporary
     directory.  The relocated dataset must still yield hole number 8 only.
     """
     xml_path = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     src_dir = op.dirname(xml_path)
     xml_name = op.basename(xml_path)
     prefix = xml_name.split(".")[0]
     status = bamsieve.filter_reads(input_bam=DS2,
                                    output_bam=xml_path,
                                    whitelist="8",
                                    relative=True)
     assert status == 0
     # move everything to another directory and make sure paths still work
     dest_dir = tempfile.mkdtemp()
     for entry in os.listdir(src_dir):
         if not entry.startswith(prefix):
             continue
         shutil.move(op.join(src_dir, entry), op.join(dest_dir, entry))
     moved_xml = op.join(dest_dir, xml_name)
     with SubreadSet(moved_xml, strict=False) as moved_ds:
         observed = {record.HoleNumber for record in moved_ds}
         assert observed == {8}
示例#13
0
 def test_sts_xml(self):
     """Check the ``sts`` resource and UUID of a filtered SubreadSet.

     SUBREADS_STS is filtered with ``count=1`` and
     ``keep_original_uuid=True``.  The output must keep the input's UUID.
     Its subreads BAM resource must reference an existing ``sts`` file
     whose path differs from that of the input's first external resource.
     """
     ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     rc = bamsieve.filter_reads(input_bam=SUBREADS_STS,
                                output_bam=ofn,
                                count=1,
                                seed=12345,
                                keep_original_uuid=True)
     assert rc == 0
     with SubreadSet(ofn, strict=True) as ds:
         with SubreadSet(SUBREADS_STS) as ds_in:
             assert ds_in.uuid == ds.uuid
             # Capture the input's sts path while the input dataset is
             # still open; it was previously read after this block exited.
             input_sts = ds_in.externalResources[0].sts
         for er in ds.externalResources:
             if er.metaType == FileTypes.BAM_SUB.file_type_id:
                 assert er.sts is not None
                 assert os.path.exists(er.sts)
                 assert er.sts != input_sts
                 break
         else:
             self.fail("Can't find subreads BAM")
示例#14
0
 def test_dataset_io(self):
     """Filter DS2 into a SubreadSet XML and check metadata and content.

     The output must have a UUID different from the input's, a name equal
     to the input's name plus `` (bamsieve)``, and the same tags.  It must
     yield hole number 8 only, both in place and after the XML file alone
     is copied into another directory.
     """
     xml_path = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     status = bamsieve.filter_reads(input_bam=DS2,
                                    output_bam=xml_path,
                                    whitelist="8")
     assert status == 0
     with SubreadSet(xml_path, strict=False) as filtered:
         with SubreadSet(DS2) as source:
             assert source.uuid != filtered.uuid
             assert filtered.name == source.name + " (bamsieve)"
             assert filtered.tags == source.tags
         observed = {record.HoleNumber for record in filtered}
         assert observed == {8}
     # make sure paths are absolute
     copy_dir = tempfile.mkdtemp()
     copied_xml = op.join(copy_dir, op.basename(xml_path))
     shutil.copyfile(xml_path, copied_xml)
     with SubreadSet(copied_xml, strict=False) as copied:
         observed = {record.HoleNumber for record in copied}
         assert observed == {8}
示例#15
0
    def test_subreads_whitelist(self):
        """Filter SUBREADS3 by a subread-name whitelist given in several forms.

        Three subread names are passed as a set, as a comma-separated
        string, and as the path of a text file with one name per line.
        Each run must return 0 and produce exactly those names, from hole
        numbers 1650 and 7957.  A final run uses the BAM written by the
        previous run as the whitelist; it must return 0 and yield the same
        set of subread names.
        """
        ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
        ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
        WHITELIST = set([
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/1920_2155',
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9554_9634',
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/2200_3298'
        ])
        ZMWS = set([1650, 7957])

        def _run_with_whitelist(wl):
            rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                       output_bam=ofn,
                                       whitelist=wl,
                                       use_subreads=True)
            assert rc == 0
            with BamReader(ofn) as bam_out:
                # Read the file once and derive both checks from it, rather
                # than iterating the reader a second time.
                records = [(rec.HoleNumber, rec.qName) for rec in bam_out]
            assert {zmw for zmw, _ in records} == ZMWS
            assert {qname for _, qname in records} == WHITELIST

        name_strings = [str(x) for x in WHITELIST]
        _run_with_whitelist(WHITELIST)
        _run_with_whitelist(",".join(name_strings))
        tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
        with open(tmp_wl, "w") as wl_out:
            wl_out.write("\n".join(name_strings))
        _run_with_whitelist(tmp_wl)
        # now with a BAM file as whitelist
        rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                   output_bam=ofn2,
                                   use_subreads=True,
                                   whitelist=ofn)
        # The return code of this run was previously left unchecked.
        assert rc == 0

        with BamReader(ofn) as bam_out:
            subreads = {x.qName for x in bam_out}
        with BamReader(ofn2) as bam_out:
            subreads2 = {x.qName for x in bam_out}
        assert subreads == subreads2
示例#16
0
    def test_subreads_blacklist(self):
        """Filter SUBREADS3 by a subread-name blacklist given in several forms.

        Three subread names are passed as a set, as a comma-separated
        string, and as the path of a text file with one name per line.
        Each run must return 0, leave 114 records, and contain none of the
        blacklisted names.  The BAM from the last run is then used as the
        blacklist itself: through ``filter_reads``, through the ``bamsieve``
        command line, and through the command line with that BAM wrapped in
        a SubreadSet XML.  Each of these must yield exactly the three
        originally blacklisted names.
        """
        ofn = tempfile.NamedTemporaryFile(suffix=".bam").name
        ofn2 = tempfile.NamedTemporaryFile(suffix=".bam").name
        BLACKLIST = set([
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/1920_2155',
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9554_9634',
            'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/1650/2200_3298'
        ])

        def _run_with_blacklist(bl):
            rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                       output_bam=ofn,
                                       blacklist=bl,
                                       use_subreads=True)
            assert rc == 0
            with BamReader(ofn) as bam_out:
                # Read the file once: the name check and the record count
                # both come from this list, so the count no longer depends
                # on the reader starting over on a second iteration.
                qnames = [rec.qName for rec in bam_out]
            assert set(qnames) & BLACKLIST == set()
            assert len(qnames) == 114

        name_strings = [str(x) for x in BLACKLIST]
        _run_with_blacklist(BLACKLIST)
        _run_with_blacklist(",".join(name_strings))
        tmp_wl = tempfile.NamedTemporaryFile(suffix=".txt").name
        with open(tmp_wl, "w") as wl_out:
            wl_out.write("\n".join(name_strings))
        _run_with_blacklist(tmp_wl)

        # now with the BAM file we just made as blacklist
        EXPECTED_OUT = BLACKLIST
        rc = bamsieve.filter_reads(input_bam=SUBREADS3,
                                   output_bam=ofn2,
                                   use_subreads=True,
                                   blacklist=ofn)
        # The return code of this run was previously left unchecked.
        assert rc == 0

        with BamReader(ofn) as bam_out:
            subreads = {x.qName for x in bam_out}
        with BamReader(ofn2) as bam_out:
            subreads2 = {x.qName for x in bam_out}
        assert subreads & subreads2 == set()
        assert subreads2 == EXPECTED_OUT

        # now an integration test, because this is used in Cromwell workflow
        ofn3 = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
        args = ["bamsieve", "--subreads", "--blacklist", ofn, SUBREADS3, ofn3]
        # check_call raises CalledProcessError on a non-zero exit status, so
        # its return value needs no separate check.
        subprocess.check_call(args)
        with BamReader(ofn3) as bam_out:
            subreads3 = {x.qName for x in bam_out}
            assert subreads & subreads3 == set()
            assert subreads3 == EXPECTED_OUT
        # and again, with a dataset as input
        ds_tmp = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        with SubreadSet(ofn) as ds:
            ds.write(ds_tmp)
        ofn4 = tempfile.NamedTemporaryFile(suffix=".subreads.bam").name
        args = [
            "bamsieve", "--subreads", "--blacklist", ds_tmp, SUBREADS3, ofn4
        ]
        subprocess.check_call(args)
        with BamReader(ofn4) as bam_out:
            subreads4 = {x.qName for x in bam_out}
            assert subreads & subreads4 == set()
            assert subreads4 == EXPECTED_OUT
def _make_filtered(ds_file):
    """Run ``filter_reads`` on ``ds_file`` with blacklist ``{49050}``.

    The output goes to a freshly generated ``.alignmentset.xml`` temporary
    path, which is returned.  The return code of ``filter_reads`` is not
    checked.
    """
    filtered_path = tempfile.NamedTemporaryFile(
        suffix=".alignmentset.xml").name
    sieve_kwargs = dict(input_bam=ds_file,
                        output_bam=filtered_path,
                        blacklist={49050})
    bamsieve.filter_reads(**sieve_kwargs)
    return filtered_path