Example #1
 def test_contigset_build(self):
     ds1 = ContigSet(data.getXml(3), skipMissing=True)
     assert type(ds1).__name__ == 'ContigSet'
     assert type(ds1._metadata).__name__ == 'ContigSetMetadata'
     ds2 = ContigSet(data.getXml(3), skipMissing=True)
     assert type(ds2).__name__ == 'ContigSet'
     assert type(ds2._metadata).__name__ == 'ContigSetMetadata'
Example #2
 def test_copy(self):
     ds1 = DataSet(data.getXml())
     ds2 = ds1.copy()
     self.assertFalse(ds1 == ds2)
     self.assertFalse(ds1.uuid == ds2.uuid)
     self.assertFalse(ds1 is ds2)
     self.assertTrue(ds1.name == ds2.name)
     self.assertTrue(ds1.externalResources == ds2.externalResources)
     # The name and UniqueId are different:
     self.assertFalse(ds1.objMetadata == ds2.objMetadata)
     self.assertTrue(ds1.filters == ds2.filters)
     self.assertTrue(ds1.subdatasets == ds2.subdatasets)
     self.assertTrue(len(ds1.subdatasets) == 2)
     self.assertTrue(len(ds2.subdatasets) == 2)
     assert not reduce(lambda x, y: x or y, [
         ds1d is ds2d for ds1d in ds1.subdatasets
         for ds2d in ds2.subdatasets
     ])
     # TODO: once simulated files are indexable, turn on strict:
     ds1 = SubreadSet(data.getXml(no=10), strict=False)
     self.assertEquals(type(ds1.metadata).__name__, 'SubreadSetMetadata')
     ds2 = ds1.copy()
     self.assertEquals(type(ds2.metadata).__name__, 'SubreadSetMetadata')
     # Let's try casting
     ds1 = DataSet(data.getBam())
     self.assertEquals(type(ds1).__name__, 'DataSet')
     ds1 = ds1.copy(asType='SubreadSet')
     self.assertEquals(type(ds1).__name__, 'SubreadSet')
     # Let's do some illicit casting
     with self.assertRaises(TypeError):
         ds1 = ds1.copy(asType='ReferenceSet')
     # Let's try not having to cast
     ds1 = SubreadSet(data.getBam())
     self.assertEquals(type(ds1).__name__, 'SubreadSet')
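The copy-and-cast pattern exercised above is how a generic DataSet is promoted to a concrete type. A minimal sketch outside the unittest harness, assuming `bam_path` points at a subread BAM file (the helper name and argument are placeholders):

from pbcore.io import DataSet, SubreadSet

def copy_as_subreadset(bam_path):
    """Open a generic DataSet and return an independent SubreadSet copy."""
    generic = DataSet(bam_path)
    # copy() returns a new object with its own UUID; asType performs the cast
    sset = generic.copy(asType='SubreadSet')
    assert isinstance(sset, SubreadSet)
    assert sset.uuid != generic.uuid
    return sset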
Example #3
 def test_getitem(self):
     types = [AlignmentSet(data.getXml(8)),
              ReferenceSet(data.getXml(9)),
              SubreadSet(data.getXml(10)),
             ]
     for ds in types:
         self.assertTrue(ds[0])
Example #4
    def test_subreadset_metadata_element_name(self):
        # without touching the element:
        sset = SubreadSet(data.getXml(9))
        log.debug(data.getXml(9))
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml")
        log.debug(fn.name)
        sset.write(fn.name)
        f = ET.parse(fn.name)
        assert len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'SubreadSetMetadata')) == 0
        assert len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'DataSetMetadata')) == 1
        fn.close()

        # with touching the element:
        sset = SubreadSet(data.getXml(9))
        sset.metadata.description = 'foo'
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml")
        sset.write(fn.name, validate=False)
        f = ET.parse(fn.name)
        assert len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'SubreadSetMetadata')) == 0
        assert len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'DataSetMetadata')) == 1
        fn.close()
Example #5
 def test_contigset_build(self):
     ds1 = ContigSet(data.getXml(3), skipMissing=True)
     self.assertEquals(type(ds1).__name__, 'ContigSet')
     self.assertEquals(type(ds1._metadata).__name__, 'ContigSetMetadata')
     ds2 = ContigSet(data.getXml(3), skipMissing=True)
     self.assertEquals(type(ds2).__name__, 'ContigSet')
     self.assertEquals(type(ds2._metadata).__name__, 'ContigSetMetadata')
Example #6
    def test_alignment_reference(self):
        rs1 = ReferenceSet(data.getXml(9))
        fasta_res = rs1.externalResources[0]
        fasta_file = urlparse(fasta_res.resourceId).path

        ds1 = AlignmentSet(data.getXml(8),
            referenceFastaFname=rs1)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)

        ds1 = AlignmentSet(data.getXml(8),
            referenceFastaFname=fasta_file)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)

        ds1 = AlignmentSet(data.getXml(8))
        ds1.addReference(fasta_file)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)
Example #7
 def test_contigset_build(self):
     ds1 = ContigSet(data.getXml(3), skipMissing=True)
     self.assertEquals(type(ds1).__name__, 'ContigSet')
     self.assertEquals(type(ds1._metadata).__name__, 'ContigSetMetadata')
     ds2 = ContigSet(data.getXml(3), skipMissing=True)
     self.assertEquals(type(ds2).__name__, 'ContigSet')
     self.assertEquals(type(ds2._metadata).__name__, 'ContigSetMetadata')
Example #8
 def test_split(self):
     ds1 = DataSet(data.getXml())
     self.assertTrue(ds1.numExternalResources > 1)
     dss = ds1.split()
     self.assertTrue(len(dss) == ds1.numExternalResources)
     dss = ds1.split(chunks=1)
     self.assertTrue(len(dss) == 1)
     dss = ds1.split(chunks=2, ignoreSubDatasets=True)
     self.assertTrue(len(dss) == 2)
     self.assertFalse(dss[0].uuid == dss[1].uuid)
     self.assertTrue(dss[0].name == dss[1].name)
     # Let's try merging and splitting on subdatasets
     ds1 = DataSet(data.getXml(8))
     self.assertEquals(ds1.totalLength, 123588)
     ds1tl = ds1.totalLength
     ds2 = DataSet(data.getXml(11))
     self.assertEquals(ds2.totalLength, 117086)
     ds2tl = ds2.totalLength
     dss = ds1 + ds2
     self.assertTrue(dss.totalLength == (ds1tl + ds2tl))
     ds1, ds2 = sorted(dss.split(2),
                       key=lambda x: x.totalLength,
                       reverse=True)
     self.assertTrue(ds1.totalLength == ds1tl)
     self.assertTrue(ds2.totalLength == ds2tl)
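Merging with `+` and re-splitting, as above, is symmetric: the merged set's totalLength is the sum of its parts and each chunk is a complete dataset. A minimal sketch, assuming `xml_a` and `xml_b` are paths to two compatible AlignmentSet XMLs:

from pbcore.io import AlignmentSet

def merge_and_rechunk(xml_a, xml_b, chunks=2):
    """Merge two compatible AlignmentSets and split the result into chunks."""
    merged = AlignmentSet(xml_a) + AlignmentSet(xml_b)
    parts = merged.split(chunks=chunks, ignoreSubDatasets=True)
    # every chunk is a full dataset with its own UUID
    assert len(parts) == chunks
    return parts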
Example #9
 def test_ccsread_build(self):
     ds1 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
     self.assertEquals(type(ds1).__name__, 'ConsensusReadSet')
     self.assertEquals(type(ds1._metadata).__name__, 'SubreadSetMetadata')
     ds2 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
     self.assertEquals(type(ds2).__name__, 'ConsensusReadSet')
     self.assertEquals(type(ds2._metadata).__name__, 'SubreadSetMetadata')
Example #10
    def test_alignment_reference(self):
        rs1 = ReferenceSet(data.getXml(9))
        fasta_res = rs1.externalResources[0]
        fasta_file = urlparse(fasta_res.resourceId).path

        ds1 = AlignmentSet(data.getXml(8), referenceFastaFname=rs1)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)

        ds1 = AlignmentSet(data.getXml(8), referenceFastaFname=fasta_file)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)

        ds1 = AlignmentSet(data.getXml(8))
        ds1.addReference(fasta_file)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)
Example #11
 def test_ccsread_build(self):
     ds1 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
     self.assertEquals(type(ds1).__name__, 'ConsensusReadSet')
     self.assertEquals(type(ds1._metadata).__name__, 'SubreadSetMetadata')
     ds2 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
     self.assertEquals(type(ds2).__name__, 'ConsensusReadSet')
     self.assertEquals(type(ds2._metadata).__name__, 'SubreadSetMetadata')
Example #12
 def test_subreadset_split_metadata_element_name(self):
     fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     log.debug(fn)
     sset = SubreadSet(data.getXml(10), data.getXml(13))
     chunks = sset.split(chunks=5, zmws=False, ignoreSubDatasets=True)
     self.assertEqual(len(chunks), 2)
     chunks[0].write(fn)
Example #13
    def test_subreadset_metadata_element_name(self):
        # without touching the element:
        sset = SubreadSet(data.getXml(10))
        log.debug(data.getXml(10))
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        log.debug(fn)
        sset.write(fn)
        f = ET.parse(fn)
        self.assertEqual(len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'SubreadSetMetadata')),
            0)
        self.assertEqual(len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'DataSetMetadata')),
            1)

        # with touching the element:
        sset = SubreadSet(data.getXml(10))
        sset.metadata.description = 'foo'
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        sset.write(fn, validate=False)
        f = ET.parse(fn)
        self.assertEqual(len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'SubreadSetMetadata')),
            0)
        self.assertEqual(len(f.getroot().findall(
            '{http://pacificbiosciences.com/PacBioDatasets.xsd}'
            'DataSetMetadata')),
            1)
Example #14
 def test_ccsread_build(self):
     ds1 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
     assert type(ds1).__name__ == 'ConsensusReadSet'
     assert type(ds1._metadata).__name__ == 'SubreadSetMetadata'
     ds2 = ConsensusReadSet(data.getXml(2), strict=False, skipMissing=True)
     assert type(ds2).__name__ == 'ConsensusReadSet'
     assert type(ds2._metadata).__name__ == 'SubreadSetMetadata'
Example #15
    def test_len(self):
        # AlignmentSet
        aln = AlignmentSet(data.getXml(8), strict=True)
        self.assertEqual(len(aln), 92)
        self.assertEqual(aln._length, (92, 123588))
        self.assertEqual(aln.totalLength, 123588)
        self.assertEqual(aln.numRecords, 92)
        aln.totalLength = -1
        aln.numRecords = -1
        self.assertEqual(aln.totalLength, -1)
        self.assertEqual(aln.numRecords, -1)
        aln.updateCounts()
        self.assertEqual(aln.totalLength, 123588)
        self.assertEqual(aln.numRecords, 92)
        self.assertEqual(sum(1 for _ in aln), 92)
        self.assertEqual(sum(len(rec) for rec in aln), 123588)

        # AlignmentSet with filters
        aln = AlignmentSet(data.getXml(15), strict=True)
        self.assertEqual(len(aln), 40)
        self.assertEqual(aln._length, (40, 52023))
        self.assertEqual(aln.totalLength, 52023)
        self.assertEqual(aln.numRecords, 40)
        aln.totalLength = -1
        aln.numRecords = -1
        self.assertEqual(aln.totalLength, -1)
        self.assertEqual(aln.numRecords, -1)
        aln.updateCounts()
        self.assertEqual(aln.totalLength, 52023)
        self.assertEqual(aln.numRecords, 40)

        # SubreadSet
        sset = SubreadSet(data.getXml(10), strict=True)
        self.assertEqual(len(sset), 92)
        self.assertEqual(sset._length, (92, 124093))
        self.assertEqual(sset.totalLength, 124093)
        self.assertEqual(sset.numRecords, 92)
        sset.totalLength = -1
        sset.numRecords = -1
        self.assertEqual(sset.totalLength, -1)
        self.assertEqual(sset.numRecords, -1)
        sset.updateCounts()
        self.assertEqual(sset.totalLength, 124093)
        self.assertEqual(sset.numRecords, 92)
        self.assertEqual(sum(1 for _ in sset), 92)
        self.assertEqual(sum(len(rec) for rec in sset), 124093)

        # ReferenceSet
        sset = ReferenceSet(data.getXml(9), strict=True)
        self.assertEqual(len(sset), 59)
        self.assertEqual(sset.totalLength, 85774)
        self.assertEqual(sset.numRecords, 59)
        sset.totalLength = -1
        sset.numRecords = -1
        self.assertEqual(sset.totalLength, -1)
        self.assertEqual(sset.numRecords, -1)
        sset.updateCounts()
        self.assertEqual(sset.totalLength, 85774)
        self.assertEqual(sset.numRecords, 59)
Example #16
    def test_len(self):
        # AlignmentSet
        aln = AlignmentSet(data.getXml(7), strict=True)
        assert len(aln) == 92
        assert aln._length == (92, 123588)
        assert aln.totalLength == 123588
        assert aln.numRecords == 92
        aln.totalLength = -1
        aln.numRecords = -1
        assert aln.totalLength == -1
        assert aln.numRecords == -1
        aln.updateCounts()
        assert aln.totalLength == 123588
        assert aln.numRecords == 92
        assert sum(1 for _ in aln) == 92
        assert sum(len(rec) for rec in aln) == 123588

        # AlignmentSet with filters
        aln = AlignmentSet(data.getXml(14), strict=True)
        assert len(aln) == 40
        assert aln._length == (40, 52023)
        assert aln.totalLength == 52023
        assert aln.numRecords == 40
        aln.totalLength = -1
        aln.numRecords = -1
        assert aln.totalLength == -1
        assert aln.numRecords == -1
        aln.updateCounts()
        assert aln.totalLength == 52023
        assert aln.numRecords == 40

        # SubreadSet
        sset = SubreadSet(data.getXml(9), strict=True)
        assert len(sset) == 92
        assert sset._length == (92, 124093)
        assert sset.totalLength == 124093
        assert sset.numRecords == 92
        sset.totalLength = -1
        sset.numRecords = -1
        assert sset.totalLength == -1
        assert sset.numRecords == -1
        sset.updateCounts()
        assert sset.totalLength == 124093
        assert sset.numRecords == 92
        assert sum(1 for _ in sset) == 92
        assert sum(len(rec) for rec in sset) == 124093

        # ReferenceSet
        sset = ReferenceSet(data.getXml(8), strict=True)
        assert len(sset) == 59
        assert sset.totalLength == 85774
        assert sset.numRecords == 59
        sset.totalLength = -1
        sset.numRecords = -1
        assert sset.totalLength == -1
        assert sset.numRecords == -1
        sset.updateCounts()
        assert sset.totalLength == 85774
        assert sset.numRecords == 59
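numRecords and totalLength are cached attributes; updateCounts() recomputes them from the underlying indices, which is why the tests above can clobber them with -1 and recover the real values. A short sketch, assuming `sset_xml` is a SubreadSet XML with indexed resources:

from pbcore.io import SubreadSet

def refresh_counts(sset_xml):
    """Recompute cached numRecords/totalLength from the underlying indices."""
    sset = SubreadSet(sset_xml)
    sset.updateCounts()
    # once refreshed, len() and numRecords agree
    assert len(sset) == sset.numRecords
    return sset.numRecords, sset.totalLength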
Example #17
 def test_getitem(self):
     types = [
         AlignmentSet(data.getXml(7)),
         ReferenceSet(data.getXml(8)),
         SubreadSet(data.getXml(9)),
     ]
     for ds in types:
         assert ds[0]
Example #18
 def test_subreadset_split_metadata_element_name(self):
     fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
     log.debug(fn)
     sset = SubreadSet(data.getXml(10),
                       data.getXml(13))
     chunks = sset.split(chunks=5, zmws=False, ignoreSubDatasets=True)
     self.assertEqual(len(chunks), 2)
     chunks[0].write(fn)
Example #19
 def test_copyTo_cli_absolute_dir(self):
     # to a directory:
     # absolute:
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     fn = os.path.join(outdir, os.path.split(data.getXml(7))[1])
     cmd = "dataset copyto {i} {o}".format(i=data.getXml(7), o=outdir)
     self._run_cmd_with_output(cmd, fn)
     sset = AlignmentSet(fn, strict=True)
     assert not _is_relative(fn)
Example #20
 def test_contigset_build(self):
     ds1 = ContigSet(data.getXml(3))
     self.assertEquals(type(ds1).__name__, 'ContigSet')
     self.assertEquals(type(ds1._metadata).__name__, 'ContigSetMetadata')
     ds2 = ContigSet(data.getXml(3))
     self.assertEquals(type(ds2).__name__, 'ContigSet')
     self.assertEquals(type(ds2._metadata).__name__, 'ContigSetMetadata')
     for contigmd in ds2.metadata.contigs:
         self.assertEquals(type(contigmd).__name__, 'ContigMetadata')
Example #21
 def test_contigset_build(self):
     ds1 = ContigSet(data.getXml(3))
     self.assertEquals(type(ds1).__name__, 'ContigSet')
     self.assertEquals(type(ds1._metadata).__name__, 'ContigSetMetadata')
     ds2 = ContigSet(data.getXml(3))
     self.assertEquals(type(ds2).__name__, 'ContigSet')
     self.assertEquals(type(ds2._metadata).__name__, 'ContigSetMetadata')
     for contigmd in ds2.metadata.contigs:
         self.assertEquals(type(contigmd).__name__, 'ContigMetadata')
Example #22
    def test_file_arg(self):
        fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
        log.debug(fn)
        sset = SubreadSet(data.getXml(9))
        assert len(sset) == 92
        size = 10
        qn = [r.qName for r in sset[:size]]
        with open(fn, 'w') as ofh:
            for q in qn:
                ofh.write(q)
                ofh.write('\n')
        good_qn = [('=', fn)]
        sset.filters.addRequirement(qname=good_qn)
        assert size == sum(1 for _ in sset)
        assert size == len(sset)
        og = set(qn)
        for r in sset:
            og.discard(r.qName)
        assert len(og) == 0

        fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
        log.debug(fn)
        sset = SubreadSet(data.getXml(9))
        assert len(sset) == 92
        size = 10
        qn = [r.qName for r in sset[:size]]
        with open(fn, 'w') as ofh:
            for q in qn:
                ofh.write(q)
                ofh.write('\n')
        good_qn = [('=', fn)]
        sset.filters.addRequirement(qname_file=good_qn)
        assert size == sum(1 for _ in sset)
        assert size == len(sset)
        og = set(qn)
        for r in sset:
            og.discard(r.qName)
        assert len(og) == 0

        fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt").name
        log.debug(fn)
        sset = SubreadSet(data.getXml(9))
        assert len(sset) == 92
        size = 4
        hn = [r for r in sorted(list(set(sset.index.holeNumber)))[:size]]
        with open(fn, 'w') as ofh:
            for h in hn:
                ofh.write(str(h))
                ofh.write('\n')
        good_hn = [('=', fn)]
        sset.filters.addRequirement(zm=good_hn)
        assert size == len(set(sset.index.holeNumber))
        og = set(hn)
        for r in sset:
            og.discard(r.holeNumber)
        assert len(og) == 0
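As the test shows, a qname (or zm) requirement whose value is a file name is treated as a whitelist read from that file, one entry per line. A sketch of the same pattern, assuming `sset_xml` is a SubreadSet XML and `names` an iterable of read names to keep:

import tempfile
from pbcore.io import SubreadSet

def filter_by_qname_whitelist(sset_xml, names):
    """Restrict a SubreadSet to an explicit whitelist of read names."""
    fn = tempfile.NamedTemporaryFile(suffix="filterVals.txt", delete=False).name
    with open(fn, 'w') as ofh:
        for name in names:
            ofh.write(name + '\n')
    sset = SubreadSet(sset_xml)
    # a file name as the requirement value is read as one name per line
    sset.filters.addRequirement(qname=[('=', fn)])
    return sset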
Example #23
 def test_create_cli(self):
     log.debug("Absolute")
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     cmd = "dataset create --type AlignmentSet {o} {i1} {i2}".format(
         o=os.path.join(outdir, 'pbalchemysim.alignmentset.xml'),
         i1=data.getXml(7),
         i2=data.getXml(10))
     self._check_cmd(cmd)
     assert os.path.exists(
         os.path.join(outdir, os.path.basename(data.getXml(11))))
Example #24
 def test_create_cli_relative(self):
     log.debug("Relative")
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     ofn = self._get_mock_alignment_set_out(outdir)
     cmd = ("dataset create --relative --type AlignmentSet "
            "{o} {i1} {i2}".format(o=ofn,
                                   i1=data.getXml(7),
                                   i2=data.getXml(10)))
     self._check_cmd(cmd)
     assert os.path.exists(ofn)
Example #25
 def test_create_cli_automatic_type(self):
     log.debug("No type specified")
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     ofn = self._get_mock_alignment_set_out(outdir)
     cmd = "dataset create {o} {i1} {i2}".format(o=ofn,
                                                 i1=data.getXml(7),
                                                 i2=data.getXml(10))
     self._run_cmd_with_output(cmd, ofn)
     aset = AlignmentSet(ofn)
     shutil.rmtree(outdir)
Example #26
 def test_file_factory(self):
     # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
     types = [AlignmentSet(data.getXml(8)),
              ReferenceSet(data.getXml(9)),
              SubreadSet(data.getXml(10)),
              #ConsensusAlignmentSet(data.getXml(20)),
              HdfSubreadSet(data.getXml(19))]
     for ds in types:
         mystery = openDataFile(ds.toExternalFiles()[0])
         self.assertEqual(type(mystery), type(ds))
Example #27
 def test_file_factory(self):
     # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
     types = [
         AlignmentSet(data.getXml(7)),
         ReferenceSet(data.getXml(8)),
         SubreadSet(data.getXml(9))
     ]
     for ds in types:
         mystery = openDataFile(ds.toExternalFiles()[0])
         assert type(mystery) == type(ds)
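openDataFile inspects a raw data file (BAM, FASTA, ...) and hands back the matching dataset class, which is what the factory tests above rely on. A minimal sketch, assuming `path` is one of the external files used in these examples:

from pbcore.io import openDataFile

def open_by_content(path):
    """Let pbcore choose the dataset class from the file itself."""
    ds = openDataFile(path)
    return type(ds).__name__, ds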
Example #28
 def test_checkFilterMatch(self):
     # different resourceIds, compatible filters:
     ds1 = DataSet(data.getXml(no=8))
     ds2 = DataSet(data.getXml(no=11))
     #self.assertTrue(ds1._checkFilterMatch(ds2.filters))
     self.assertTrue(ds1.filters.testCompatibility(ds2.filters))
     # different resourceIds, incompatible filters:
     ds3 = DataSet(data.getXml(no=11))
     ds3.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
     #self.assertFalse(ds1._checkFilterMatch(ds3.filters))
     self.assertFalse(ds1.filters.testCompatibility(ds3.filters))
Example #29
 def test_create_cli_generate_indices_2(self):
     log.debug("Generate existing indices no type specified")
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     ofn = self._get_mock_alignment_set_out(outdir)
     cmd = ("dataset create "
            "--generateIndices {o} {i1} {i2}").format(o=ofn,
                                                      i1=data.getXml(7),
                                                      i2=data.getXml(10))
     self._run_cmd_with_output(cmd, ofn)
     aset = AlignmentSet(ofn, strict=True)
     shutil.rmtree(outdir)
Example #30
 def test_checkFilterMatch(self):
     # different resourceIds, compatible filters:
     ds1 = DataSet(data.getXml(no=8))
     ds2 = DataSet(data.getXml(no=11))
     #self.assertTrue(ds1._checkFilterMatch(ds2.filters))
     self.assertTrue(ds1.filters.testCompatibility(ds2.filters))
     # different resourceIds, incompatible filters:
     ds3 = DataSet(data.getXml(no=11))
     ds3.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
     #self.assertFalse(ds1._checkFilterMatch(ds3.filters))
     self.assertFalse(ds1.filters.testCompatibility(ds3.filters))
Example #31
 def test_file_factory(self):
     # TODO: add ConsensusReadSet, cmp.h5 alignmentSet
     types = [
         AlignmentSet(data.getXml(8)),
         ReferenceSet(data.getXml(9)),
         SubreadSet(data.getXml(10)),
         #ConsensusAlignmentSet(data.getXml(20)),
         HdfSubreadSet(data.getXml(19))
     ]
     for ds in types:
         mystery = openDataFile(ds.toExternalFiles()[0])
         self.assertEqual(type(mystery), type(ds))
Example #32
 def test_subreadset_consolidate(self):
     log.debug("Test through API")
     aln = SubreadSet(data.getXml(10), data.getXml(13))
     self.assertEqual(len(aln.toExternalFiles()), 2)
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     outfn = os.path.join(outdir, 'merged.bam')
     aln.consolidate(outfn)
     self.assertTrue(os.path.exists(outfn))
     self.assertEqual(len(aln.toExternalFiles()), 1)
     nonCons = SubreadSet(data.getXml(10), data.getXml(13))
     self.assertEqual(len(nonCons.toExternalFiles()), 2)
     for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
         self.assertEqual(read1, read2)
     self.assertEqual(len(aln), len(nonCons))
Example #33
 def test_subreadset_consolidate(self):
     log.debug("Test through API")
     aln = SubreadSet(data.getXml(10), data.getXml(13))
     self.assertEqual(len(aln.toExternalFiles()), 2)
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     outfn = os.path.join(outdir, 'merged.bam')
     aln.consolidate(outfn)
     self.assertTrue(os.path.exists(outfn))
     self.assertEqual(len(aln.toExternalFiles()), 1)
     nonCons = SubreadSet(data.getXml(10), data.getXml(13))
     self.assertEqual(len(nonCons.toExternalFiles()), 2)
     for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
         self.assertEqual(read1, read2)
     self.assertEqual(len(aln), len(nonCons))
Example #34
    def test_nested_external_resources(self):
        log.debug("Testing nested externalResources in AlignmentSets")
        aln = AlignmentSet(data.getXml(0), skipMissing=True)
        self.assertTrue(aln.externalResources[0].pbi)
        self.assertTrue(aln.externalResources[0].reference)
        self.assertEqual(
            aln.externalResources[0].externalResources[0].metaType,
            'PacBio.ReferenceFile.ReferenceFastaFile')
        self.assertEqual(aln.externalResources[0].scraps, None)

        log.debug("Testing nested externalResources in SubreadSets")
        subs = SubreadSet(data.getXml(5), skipMissing=True)
        self.assertTrue(subs.externalResources[0].scraps)
        self.assertEqual(
            subs.externalResources[0].externalResources[0].metaType,
            'PacBio.SubreadFile.ScrapsBamFile')
        self.assertEqual(subs.externalResources[0].reference, None)

        log.debug("Testing added nested externalResoruces to SubreadSet")
        subs = SubreadSet(data.getXml(10))
        self.assertFalse(subs.externalResources[0].scraps)
        subs.externalResources[0].scraps = 'fake.fasta'
        self.assertTrue(subs.externalResources[0].scraps)
        self.assertEqual(
            subs.externalResources[0].externalResources[0].metaType,
            'PacBio.SubreadFile.ScrapsBamFile')
        subs.externalResources[0].barcodes = 'bc.fasta'
        self.assertTrue(subs.externalResources[0].barcodes)
        self.assertEqual(
            subs.externalResources[0].externalResources[1].metaType,
            "PacBio.DataSet.BarcodeSet")

        subs.externalResources[0].adapters = 'foo.adapters.fasta'
        self.assertEqual(subs.externalResources[0].adapters,
                         'foo.adapters.fasta')
        self.assertEqual(
            subs.externalResources[0].externalResources[2].metaType,
            "PacBio.SubreadFile.AdapterFastaFile")

        log.debug("Testing adding nested externalResources to AlignmetnSet "
                  "manually")
        aln = AlignmentSet(data.getXml(8))
        self.assertTrue(aln.externalResources[0].bai)
        self.assertTrue(aln.externalResources[0].pbi)
        self.assertFalse(aln.externalResources[0].reference)
        aln.externalResources[0].reference = 'fake.fasta'
        self.assertTrue(aln.externalResources[0].reference)
        self.assertEqual(
            aln.externalResources[0].externalResources[0].metaType,
            'PacBio.ReferenceFile.ReferenceFastaFile')
Example #35
    def test_nested_external_resources(self):
        log.debug("Testing nested externalResources in AlignmentSets")
        aln = AlignmentSet(data.getXml(0), skipMissing=True)
        self.assertTrue(aln.externalResources[0].pbi)
        self.assertTrue(aln.externalResources[0].reference)
        self.assertEqual(
            aln.externalResources[0].externalResources[0].metaType,
            'PacBio.ReferenceFile.ReferenceFastaFile')
        self.assertEqual(aln.externalResources[0].scraps, None)

        log.debug("Testing nested externalResources in SubreadSets")
        subs = SubreadSet(data.getXml(5), skipMissing=True)
        self.assertTrue(subs.externalResources[0].scraps)
        self.assertEqual(
            subs.externalResources[0].externalResources[0].metaType,
            'PacBio.SubreadFile.ScrapsBamFile')
        self.assertEqual(subs.externalResources[0].reference, None)

        log.debug("Testing added nested externalResoruces to SubreadSet")
        subs = SubreadSet(data.getXml(10))
        self.assertFalse(subs.externalResources[0].scraps)
        subs.externalResources[0].scraps = 'fake.fasta'
        self.assertTrue(subs.externalResources[0].scraps)
        self.assertEqual(
            subs.externalResources[0].externalResources[0].metaType,
            'PacBio.SubreadFile.ScrapsBamFile')
        subs.externalResources[0].barcodes = 'bc.fasta'
        self.assertTrue(subs.externalResources[0].barcodes)
        self.assertEqual(
            subs.externalResources[0].externalResources[1].metaType,
            "PacBio.DataSet.BarcodeSet")

        subs.externalResources[0].adapters = 'foo.adapters.fasta'
        self.assertEqual(subs.externalResources[0].adapters,
                         'foo.adapters.fasta')
        self.assertEqual(
            subs.externalResources[0].externalResources[2].metaType,
            "PacBio.SubreadFile.AdapterFastaFile")

        log.debug("Testing adding nested externalResources to AlignmetnSet "
                  "manually")
        aln = AlignmentSet(data.getXml(8))
        self.assertTrue(aln.externalResources[0].bai)
        self.assertTrue(aln.externalResources[0].pbi)
        self.assertFalse(aln.externalResources[0].reference)
        aln.externalResources[0].reference = 'fake.fasta'
        self.assertTrue(aln.externalResources[0].reference)
        self.assertEqual(
            aln.externalResources[0].externalResources[0].metaType,
            'PacBio.ReferenceFile.ReferenceFastaFile')
Example #36
 def test_incorrect_len_getitem(self):
     types = [AlignmentSet(data.getXml(8)),
              ReferenceSet(data.getXml(9)),
              SubreadSet(data.getXml(10)),
              HdfSubreadSet(data.getXml(19))]
     fn = tempfile.NamedTemporaryFile(suffix=".xml").name
     for ds in types:
         explen = -2
         with openDataFile(ds.toExternalFiles()[0]) as mystery:
             # try to avoid crashes...
             explen = len(mystery)
             mystery.numRecords = 1000000000
             mystery.write(fn)
         with openDataFile(fn) as mystery:
             self.assertEqual(len(list(mystery)), explen)
Example #37
 def test_subread_build(self):
     ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
     ds2 = SubreadSet(data.getXml(no=5), skipMissing=True)
     assert type(ds1).__name__ == 'SubreadSet'
     assert ds1._metadata.__class__.__name__ == 'SubreadSetMetadata'
     assert type(ds1._metadata).__name__ == 'SubreadSetMetadata'
     assert type(ds1.metadata).__name__ == 'SubreadSetMetadata'
     assert len(ds1.metadata.collections) == 1
     assert len(ds2.metadata.collections) == 1
     ds3 = ds1 + ds2
     assert len(ds3.metadata.collections) == 2
     ds4 = SubreadSet(data.getSubreadSet(), skipMissing=True)
     assert type(ds4).__name__ == 'SubreadSet'
     assert type(ds4._metadata).__name__ == 'SubreadSetMetadata'
     assert len(ds4.metadata.collections) == 1
Example #38
 def test_filter_cli(self):
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     outfn = os.path.join(outdir, "filtered8.xml")
     log.debug(outfn)
     cmd = "dataset filter {i} {o} {f}".format(i=data.getXml(7),
                                               o=outfn,
                                               f="rname=E.faecalis.1")
     self._run_cmd_with_output(cmd, outfn)
     aln = AlignmentSet(data.getXml(7))
     aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
     aln.updateCounts()
     dset = AlignmentSet(outfn)
     assert str(aln.filters) == str(dset.filters)
     assert aln.totalLength == dset.totalLength
     assert aln.numRecords == dset.numRecords
Example #39
 def test_factory_function(self):
     bam = data.getBam()
     aln = data.getXml(8)
     ref = data.getXml(9)
     sub = data.getXml(10)
     inTypes = [bam, aln, ref, sub]
     expTypes = [DataSet, AlignmentSet, ReferenceSet, SubreadSet]
     for infn, exp in zip(inTypes, expTypes):
         # TODO enable this for all when simulated subread files can be
         # pbi'd
         if exp in [DataSet, ReferenceSet, AlignmentSet]:
             ds = openDataSet(infn, strict=True)
         else:
             ds = openDataSet(infn)
         self.assertEqual(type(ds), exp)
Example #40
 def test_factory_function(self):
     bam = data.getBam()
     aln = data.getXml(8)
     ref = data.getXml(9)
     sub = data.getXml(10)
     inTypes = [bam, aln, ref, sub]
     expTypes = [DataSet, AlignmentSet, ReferenceSet, SubreadSet]
     for infn, exp in zip(inTypes, expTypes):
         # TODO enable this for all when simulated subread files can be
         # pbi'd
         if exp in [DataSet, ReferenceSet, AlignmentSet]:
             ds = openDataSet(infn, strict=True)
         else:
             ds = openDataSet(infn)
         self.assertEqual(type(ds), exp)
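openDataSet is the complementary factory: it accepts either a dataset XML or a raw data file and returns the most specific DataSet subclass it can, with strict=True additionally requiring proper indices (the reason the test above skips it for the simulated SubreadSet). A sketch, assuming `paths` is a list of such inputs:

from pbcore.io import openDataSet

def open_all(paths, strict=False):
    """Open each XML or raw file as the most specific DataSet subclass."""
    opened = []
    for path in paths:
        ds = openDataSet(path, strict=strict)
        opened.append((type(ds).__name__, ds))
    return opened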
Example #41
    def test_loadmetadata_from_dataset_create_cli(self):
        fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
        fn2 = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
        log.debug(fn)

        aln = AlignmentSet(data.getXml(8))
        aln.metadata.collections = None
        aln.copyTo(fn)
        aln.close()
        del aln
        self.assertTrue(os.path.exists(fn))

        aln = AlignmentSet(fn)
        self.assertFalse(aln.metadata.collections)

        cmd = "dataset create --metadata {m} {o} {i}".format(
            o=fn2,
            i=fn,
            m=("/pbi/dept/secondary/siv/testdata/"
               "SA3-Sequel/lambda/roche_SAT/"
               "m54013_151205_032353.subreadset.xml"))
        log.debug(cmd)
        o, r, m = backticks(cmd)
        self.assertEqual(r, 0, m)
        aln = AlignmentSet(fn2)
        self.assertTrue(aln.metadata.collections)
Example #42
    def test_referenceset_contigs(self):
        names = [
            'A.baumannii.1', 'A.odontolyticus.1', 'B.cereus.1', 'B.cereus.2',
            'B.cereus.4', 'B.cereus.6', 'B.vulgatus.1', 'B.vulgatus.2',
            'B.vulgatus.3', 'B.vulgatus.4', 'B.vulgatus.5', 'C.beijerinckii.1',
            'C.beijerinckii.2', 'C.beijerinckii.3', 'C.beijerinckii.4',
            'C.beijerinckii.5', 'C.beijerinckii.6', 'C.beijerinckii.7',
            'C.beijerinckii.8', 'C.beijerinckii.9', 'C.beijerinckii.10',
            'C.beijerinckii.11', 'C.beijerinckii.12', 'C.beijerinckii.13',
            'C.beijerinckii.14', 'D.radiodurans.1', 'D.radiodurans.2',
            'E.faecalis.1', 'E.faecalis.2', 'E.coli.1', 'E.coli.2', 'E.coli.4',
            'E.coli.5', 'E.coli.6', 'E.coli.7', 'H.pylori.1', 'L.gasseri.1',
            'L.monocytogenes.1', 'L.monocytogenes.2', 'L.monocytogenes.3',
            'L.monocytogenes.5', 'N.meningitidis.1', 'P.acnes.1',
            'P.aeruginosa.1', 'P.aeruginosa.2', 'R.sphaeroides.1',
            'R.sphaeroides.3', 'S.aureus.1', 'S.aureus.4', 'S.aureus.5',
            'S.epidermidis.1', 'S.epidermidis.2', 'S.epidermidis.3',
            'S.epidermidis.4', 'S.epidermidis.5', 'S.agalactiae.1',
            'S.mutans.1', 'S.mutans.2', 'S.pneumoniae.1']
        seqlens = [1458, 1462, 1472, 1473, 1472, 1472, 1449, 1449, 1449, 1449,
                   1449, 1433, 1433, 1433, 1433, 1433, 1433, 1433, 1433, 1433,
                   1433, 1433, 1433, 1433, 1433, 1423, 1423, 1482, 1482, 1463,
                   1463, 1463, 1463, 1463, 1463, 1424, 1494, 1471, 1471, 1471,
                   1471, 1462, 1446, 1457, 1457, 1386, 1388, 1473, 1473, 1473,
                   1472, 1472, 1472, 1472, 1472, 1470, 1478, 1478, 1467]
        ds = ReferenceSet(data.getXml(9))
        log.debug([contig.id for contig in ds])
        for contig, name, seqlen in zip(ds.contigs, names, seqlens):
            self.assertEqual(contig.id, name)
            self.assertEqual(len(contig.sequence), seqlen)

        for name in names:
            self.assertTrue(ds.get_contig(name))
Example #43
    def test_len_h5(self):
        # HdfSubreadSet
        # len means something else in bax/bas land. These numbers may actually
        # be correct...
        sset = HdfSubreadSet(data.getXml(17), strict=True)
        self.assertEqual(len(sset), 9)
        self.assertEqual(sset._length, (9, 128093))
        self.assertEqual(sset.totalLength, 128093)
        self.assertEqual(sset.numRecords, 9)
        sset.totalLength = -1
        sset.numRecords = -1
        self.assertEqual(sset.totalLength, -1)
        self.assertEqual(sset.numRecords, -1)
        sset.updateCounts()
        self.assertEqual(sset.totalLength, 128093)
        self.assertEqual(sset.numRecords, 9)

        # AlignmentSet with cmp.h5
        aln = AlignmentSet(upstreamData.getBamAndCmpH5()[1], strict=True)
        self.assertEqual(len(aln), 112)
        self.assertEqual(aln._length, (112, 59970))
        self.assertEqual(aln.totalLength, 59970)
        self.assertEqual(aln.numRecords, 112)
        aln.totalLength = -1
        aln.numRecords = -1
        self.assertEqual(aln.totalLength, -1)
        self.assertEqual(aln.numRecords, -1)
        aln.updateCounts()
        self.assertEqual(aln.totalLength, 59970)
        self.assertEqual(aln.numRecords, 112)
Example #44
    def test_reads_in_reference(self):
        ds = DataSet(data.getBam())
        refNames = ds.refNames

        # See test_ref_names for why this is expected:
        rn = refNames[15]
        reads = ds.readsInReference(rn)
        self.assertEqual(len(list(reads)), 11)

        ds2 = DataSet(data.getBam(0))
        reads = ds2.readsInReference("E.faecalis.1")
        self.assertEqual(len(list(reads)), 20)

        reads = ds2.readsInReference("E.faecalis.2")
        self.assertEqual(len(list(reads)), 3)

        ds2 = DataSet(data.getXml(8))
        reads = ds2.readsInReference("E.faecalis.1")
        self.assertEqual(len(list(reads)), 20)

        ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])

        # Because of the filter!
        reads = ds2.readsInReference("E.faecalis.2")
        self.assertEqual(len(list(reads)), 0)
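readsInReference iterates only the records mapped to the named reference and respects any active filters, which is why the filtered set above yields zero reads for E.faecalis.2. A minimal sketch, assuming `aln_xml` is an AlignmentSet XML and `ref_name` a reference it contains:

from pbcore.io import AlignmentSet

def count_reads_on_reference(aln_xml, ref_name):
    """Count aligned records mapped to a single reference contig."""
    aln = AlignmentSet(aln_xml)
    return sum(1 for _ in aln.readsInReference(ref_name))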
Example #45
 def test_subread_build(self):
     ds1 = SubreadSet(data.getXml(no=5), skipMissing=True)
     ds2 = SubreadSet(data.getXml(no=5), skipMissing=True)
     self.assertEquals(type(ds1).__name__, 'SubreadSet')
     self.assertEquals(ds1._metadata.__class__.__name__,
                       'SubreadSetMetadata')
     self.assertEquals(type(ds1._metadata).__name__, 'SubreadSetMetadata')
     self.assertEquals(type(ds1.metadata).__name__, 'SubreadSetMetadata')
     self.assertEquals(len(ds1.metadata.collections), 1)
     self.assertEquals(len(ds2.metadata.collections), 1)
     ds3 = ds1 + ds2
     self.assertEquals(len(ds3.metadata.collections), 2)
     ds4 = SubreadSet(data.getSubreadSet(), skipMissing=True)
     self.assertEquals(type(ds4).__name__, 'SubreadSet')
     self.assertEquals(type(ds4._metadata).__name__, 'SubreadSetMetadata')
     self.assertEquals(len(ds4.metadata.collections), 1)
Example #46
    def test_qname_filter_scaling(self):
        # unaligned bam
        bam0 = ("/pbi/dept/secondary/siv/testdata/"
                "SA3-DS/ecoli/2590956/0003/"
                "Analysis_Results/m140913_222218_42240_c10069"
                "9952400000001823139203261564_s1_p0.all.subreadset.xml")
        bam1 = ("/pbi/dept/secondary/siv/testdata/"
                "SA3-DS/ecoli/2590953/0001/"
                "Analysis_Results/m140913_005018_42139_c10071"
                "3652400000001823152404301534_s1_p0.all.subreadset.xml")
        sset = SubreadSet(bam0, bam1)
        self.assertEqual(len(sset), 178570)
        size = 10
        qn = [r.qName for r in sset[:size]]
        good_qn = [('=', name) for name in qn]
        sset.filters.addRequirement(qname=good_qn)
        self.assertEqual(size, sum(1 for _ in sset))
        self.assertEqual(size, len(sset))


        sset = SubreadSet(data.getXml(10))
        self.assertEqual(len(sset), 92)
        size = 10
        qn = [r.qName for r in sset[:size]]
        good_qn = [('=', name) for name in qn]
        sset.filters.addRequirement(qname=good_qn)
        self.assertEqual(size, sum(1 for _ in sset))
        self.assertEqual(size, len(sset))
Example #47
    def test_add_double_bound_filters(self):
        ds1 = AlignmentSet(data.getXml(8))
        ds1.filters.addRequirement(rq=[('>', '0.85'), ('<', '0.99')])
        self.assertEquals(str(ds1.filters), '( rq > 0.85 ) OR ( rq < 0.99 )')

        ds1 = AlignmentSet(data.getXml(8))
        self.assertEquals(str(ds1.filters), '')
        ds1.filters.addFilter(rq=[('>', '0.85'), ('<', '0.99')])
        self.assertEquals(str(ds1.filters), '( rq > 0.85 AND rq < 0.99 )')

        ds1.filters.addFilter(length=[('>', '1000')])
        self.assertEquals(str(ds1.filters),
                          '( rq > 0.85 AND rq < 0.99 ) OR ( length > 1000 )')

        ds1.filters.removeFilter(0)
        self.assertEquals(str(ds1.filters), '( length > 1000 )')
Example #48
    def test_contigset_consolidate_int_names(self):
        #build set to merge
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")

        inFas = os.path.join(outdir, 'infile.fasta')
        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
        outFas2 = os.path.join(outdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        backticks('cp {i} {o}'.format(
                      i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
                      o=inFas))
        rs1 = ContigSet(inFas)

        double = 'B.cereus.1'
        exp_double = rs1.get_contig(double)

        # todo: modify the names first:
        with FastaWriter(outFas1) as writer:
            writer.writeRecord('5141', exp_double.sequence)
        with FastaWriter(outFas2) as writer:
            writer.writeRecord('5142', exp_double.sequence)

        exp_double_seqs = [exp_double.sequence, exp_double.sequence]
        exp_names = ['5141', '5142']

        obs_file = ContigSet(outFas1, outFas2)
        log.debug(obs_file.toExternalFiles())
        obs_file.consolidate()
        log.debug(obs_file.toExternalFiles())

        # open obs and compare to exp
        for name, seq in zip(exp_names, exp_double_seqs):
            self.assertEqual(obs_file.get_contig(name).sequence[:], seq)
Example #49
 def test_pbmerge_indexing(self):
     log.debug("Test through API")
     aln = AlignmentSet(data.getXml(12))
     self.assertEqual(len(aln.toExternalFiles()), 2)
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     outfn = os.path.join(outdir, 'merged.bam')
     log.info(outfn)
     consolidateXml(aln, outfn, cleanup=False)
     self.assertTrue(os.path.exists(outfn))
     self.assertTrue(os.path.exists(outfn + '.pbi'))
     cons = AlignmentSet(outfn)
     self.assertEqual(len(aln), len(cons))
     orig_stats = os.stat(outfn + '.pbi')
     cons.externalResources[0].pbi = None
     self.assertEqual(None, cons.externalResources[0].pbi)
     # test is too quick, stat times might be within the same second
     time.sleep(1)
     cons.induceIndices()
     self.assertEqual(outfn + '.pbi', cons.externalResources[0].pbi)
     self.assertEqual(orig_stats, os.stat(cons.externalResources[0].pbi))
     cons.externalResources[0].pbi = None
     self.assertEqual(None, cons.externalResources[0].pbi)
     # test is too quick, stat times might be within the same second
     time.sleep(1)
     cons.induceIndices(force=True)
     self.assertNotEqual(orig_stats, os.stat(cons.externalResources[0].pbi))
Example #50
    def test_de_novo(self):
        ofn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        log.info(ofn)
        ss = SubreadSet(data.getXml(10))
        col = CollectionMetadata()
        self.assertFalse(ss.metadata.collections)

        ss.metadata.collections.append(col)
        self.assertTrue(ss.metadata.collections)

        col.cellIndex = 1
        self.assertTrue(ss.metadata.collections[0].cellIndex, 1)

        col.instrumentName = "foo"
        self.assertTrue(ss.metadata.collections[0].instrumentName, "foo")

        col.context = 'bar'
        self.assertTrue(ss.metadata.collections[0].context, "bar")

        ss.metadata.collections[0].runDetails.name = 'foo'
        self.assertEqual('foo', ss.metadata.collections[0].runDetails.name)

        ss.metadata.collections[0].wellSample.name = 'bar'
        self.assertEqual('bar', ss.metadata.collections[0].wellSample.name)

        ss.metadata.collections[0].wellSample.wellName = 'baz'
        self.assertEqual('baz', ss.metadata.collections[0].wellSample.wellName)

        ss.metadata.collections[0].wellSample.concentration = 'baz'
        self.assertEqual('baz',
                         ss.metadata.collections[0].wellSample.concentration)
        ss.write(ofn, validate=False)
Example #51
    def test_newUuid_random_cli(self):
        fn_orig = data.getXml(8)
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        fn = os.path.join(outdir, 'fn.alignmentset.xml')
        fn2 = os.path.join(outdir, 'fn2.alignmentset.xml')
        with AlignmentSet(fn_orig) as aln:
            aln.copyTo(fn)
            shutil.copy(fn, fn2)

        pre_uuid = AlignmentSet(fn).uuid
        pre_uuid2 = AlignmentSet(fn2).uuid
        self.assertEqual(pre_uuid, pre_uuid2)

        cmd = "dataset newuuid --random {d}".format(d=fn)
        log.debug(cmd)
        o, r, m = backticks(cmd)
        self.assertEqual(r, 0)
        self.assertTrue(os.path.exists(fn))

        cmd = "dataset newuuid --random {d}".format(d=fn2)
        log.debug(cmd)
        o, r, m = backticks(cmd)
        self.assertEqual(r, 0)
        self.assertTrue(os.path.exists(fn2))

        post_uuid = AlignmentSet(fn).uuid
        post_uuid2 = AlignmentSet(fn2).uuid
        self.assertNotEqual(pre_uuid, post_uuid)
        self.assertNotEqual(pre_uuid2, post_uuid2)
        # RANDOM, THEREFORE THESE ARE NOT EQUAL:
        self.assertNotEqual(post_uuid, post_uuid2)
Example #52
    def test_split_by_contigs_presplit(self):
        # Consumes too much memory for Jenkins

        # Test to make sure the result of a split by contigs has an appropriate
        # number of records (make sure filters are appropriately aggressive)
        ds2 = DataSet(data.getXml(15))
        bams = ds2.externalResources.resourceIds
        self.assertEqual(len(bams), 2)
        refwindows = ds2.refWindows
        self.assertEqual(refwindows, [(0, 0, 224992)])
        res1 = openIndexedAlignmentFile(bams[0][7:])
        res2 = openIndexedAlignmentFile(bams[1][7:])

        def count(iterable):
            count = 0
            for _ in iterable:
                count += 1
            return count

        self.assertEqual(count(res1.readsInRange(*refwindows[0])), 1409)
        self.assertEqual(count(res2.readsInRange(*refwindows[0])), 1375)
        self.assertEqual(count(ds2.readsInRange(*refwindows[0])), 2784)
        self.assertEqual(count(ds2.records), 2784)
        ds2.disableFilters()
        self.assertEqual(count(ds2.records), 53552)
        self.assertEqual(ds2.countRecords(), 53552)
Example #53
    def test_subset_filter(self):
        ds2 = AlignmentSet(data.getXml(7))
        assert len(ds2) == 92
        modvalue = 8

        # manually:
        hns = ds2.index.holeNumber
        assert np.count_nonzero(hns % modvalue == 0) == 26

        # dset filters:
        ds2.filters.addRequirement(zm=[('=', '0', modvalue)])
        assert len(ds2) == 26

        # written:
        filtstr = '( Uint32Cast(zm) % 8 = 0 )'
        assert str(ds2.filters) == filtstr

        filtxmlstr = ('<pbbase:Property Hash="Uint32Cast" Modulo="8" '
                      'Name="zm" Operator="=" Value="0"/>')
        fn = tempfile.NamedTemporaryFile(suffix="alignmentset.xml").name
        ds2.write(fn)
        with open(fn, 'r') as ifh:
            found = False
            for line in ifh:
                if filtxmlstr in line:
                    found = True
        assert found
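The three-element requirement tuple used above is (operator, value, modulo); it is what produces the `Uint32Cast(zm) % 8 = 0` filter string and survives a round trip through write(). A sketch that keeps every n-th ZMW, assuming `aln_xml` is an AlignmentSet XML:

from pbcore.io import AlignmentSet

def keep_every_nth_zmw(aln_xml, n=8):
    """Subsample a dataset by ZMW hole number using a modulo filter."""
    ds = AlignmentSet(aln_xml)
    # (operator, value, modulo): keep records with holeNumber % n == 0
    ds.filters.addRequirement(zm=[('=', '0', n)])
    ds.updateCounts()
    return ds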
Example #54
    def test_loadmetadata_from_dataset_create_cli(self):
        fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
        fn2 = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
        log.debug(fn)

        aln = AlignmentSet(data.getXml(8))
        aln.metadata.collections = None
        aln.copyTo(fn)
        aln.close()
        del aln
        self.assertTrue(os.path.exists(fn))

        aln = AlignmentSet(fn)
        self.assertFalse(aln.metadata.collections)

        cmd = "dataset create --metadata {m} {o} {i}".format(
            o=fn2,
            i=fn,
            m=("/pbi/dept/secondary/siv/testdata/"
               "SA3-Sequel/lambda/roche_SAT/"
               "m54013_151205_032353.subreadset.xml"))
        log.debug(cmd)
        o, r, m = backticks(cmd)
        self.assertEqual(r, 0, m)
        aln = AlignmentSet(fn2)
        self.assertTrue(aln.metadata.collections)
Example #55
 def test_filter(self):
     ds2 = DataSet(data.getXml(8))
     ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
     self.assertEqual(len(list(ds2.records)), 20)
     ds2.disableFilters()
     self.assertEqual(len(list(ds2.records)), 92)
     ds2.enableFilters()
     self.assertEqual(len(list(ds2.records)), 20)
Example #56
    def test_contigset_consolidate(self):
        #build set to merge
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")

        inFas = os.path.join(outdir, 'infile.fasta')
        outFas1 = os.path.join(outdir, 'tempfile1.fasta')
        outFas2 = os.path.join(outdir, 'tempfile2.fasta')

        # copy fasta reference to hide fai and ensure FastaReader is used
        backticks('cp {i} {o}'.format(
                      i=ReferenceSet(data.getXml(9)).toExternalFiles()[0],
                      o=inFas))
        rs1 = ContigSet(inFas)

        singletons = ['A.baumannii.1', 'A.odontolyticus.1']
        double = 'B.cereus.1'
        reader = rs1.resourceReaders()[0]
        exp_double = rs1.get_contig(double)
        exp_singles = [rs1.get_contig(name) for name in singletons]

        # todo: modify the names first:
        with FastaWriter(outFas1) as writer:
            writer.writeRecord(exp_singles[0])
            writer.writeRecord(exp_double.name + '_10_20', exp_double.sequence)
        with FastaWriter(outFas2) as writer:
            writer.writeRecord(exp_double.name + '_0_10',
                               exp_double.sequence + 'ATCGATCGATCG')
            writer.writeRecord(exp_singles[1])

        exp_double_seq = ''.join([exp_double.sequence,
                                  'ATCGATCGATCG',
                                  exp_double.sequence])
        exp_single_seqs = [rec.sequence for rec in exp_singles]

        acc_file = ContigSet(outFas1, outFas2)
        acc_file.induceIndices()
        log.debug(acc_file.toExternalFiles())
        self.assertEqual(len(acc_file), 4)
        self.assertEqual(len(list(acc_file)), 4)
        acc_file.consolidate()
        log.debug(acc_file.toExternalFiles())

        # open acc and compare to exp
        for name, seq in zip(singletons, exp_single_seqs):
            self.assertEqual(acc_file.get_contig(name).sequence[:], seq)
        self.assertEqual(acc_file.get_contig(double).sequence[:],
                         exp_double_seq)

        self.assertEqual(len(acc_file._openReaders), 1)
        self.assertEqual(len(acc_file.index), 3)
        self.assertEqual(len(acc_file._indexMap), 3)
        self.assertEqual(len(acc_file), 3)
        self.assertEqual(len(list(acc_file)), 3)

        # test merge:
        acc1 = ContigSet(outFas1)
        acc2 = ContigSet(outFas2)
        acc3 = acc1 + acc2
Example #57
 def test_contigset_len(self):
     ref = ReferenceSet(data.getXml(9))
     exp_n_contigs = len(ref)
     refs = ref.split(10)
     self.assertEqual(len(refs), 10)
     obs_n_contigs = 0
     for r in refs:
         obs_n_contigs += len(r)
     self.assertEqual(obs_n_contigs, exp_n_contigs)
Example #58
 def test_addFilters(self):
     ds1 = DataSet()
     filt = Filters()
     filt.addRequirement(rq=[('>', '0.85')])
     ds1.addFilters(filt)
     self.assertEquals(str(ds1.filters), '( rq > 0.85 )')
     # Or added from a source XML
     ds2 = DataSet(data.getXml(16))
     self.assertTrue(str(ds2.filters).startswith(
         '( rname = E.faecalis'))
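addFilters merges an externally built Filters object into a dataset; the same requirement can also be added directly on the dataset's own filters, which avoids importing Filters separately. A sketch of the in-place form, assuming `aln_xml` is an AlignmentSet XML:

from pbcore.io import AlignmentSet

def require_min_rq(aln_xml, min_rq='0.85'):
    """Keep only records whose read quality exceeds min_rq."""
    aln = AlignmentSet(aln_xml)
    aln.filters.addRequirement(rq=[('>', min_rq)])
    aln.updateCounts()
    return aln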
Example #59
    def test_add_double_bound_filters(self):
        ds1 = AlignmentSet(data.getXml(8))
        ds1.filters.addRequirement(rq=[('>', '0.85'),
                                       ('<', '0.99')])
        self.assertEquals(str(ds1.filters), '( rq > 0.85 ) OR ( rq < 0.99 )')

        ds1 = AlignmentSet(data.getXml(8))
        self.assertEquals(str(ds1.filters), '')
        ds1.filters.addFilter(rq=[('>', '0.85'),
                                  ('<', '0.99')])
        self.assertEquals(str(ds1.filters), '( rq > 0.85 AND rq < 0.99 )')

        ds1.filters.addFilter(length=[('>', '1000')])
        self.assertEquals(str(ds1.filters),
                          '( rq > 0.85 AND rq < 0.99 ) OR ( length > 1000 )')

        ds1.filters.removeFilter(0)
        self.assertEquals(str(ds1.filters),
                          '( length > 1000 )')
Example #60
 def test_merge(self):
     # xmls with different resourceIds: success
     ds1 = DataSet(data.getXml(no=8))
     ds2 = DataSet(data.getXml(no=11))
     ds3 = ds1 + ds2
     expected = ds1.numExternalResources + ds2.numExternalResources
     self.assertTrue(ds3.numExternalResources == expected)
     # xmls with different resourceIds but conflicting filters:
     # failure to merge
     ds2 = DataSet(data.getXml(no=11))
     ds2.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
     ds3 = ds1 + ds2
     self.assertEqual(ds3, None)
     # xmls with same resourceIds: ignores new inputs
     ds1 = DataSet(data.getXml(no=8))
     ds2 = DataSet(data.getXml(no=8))
     ds3 = ds1 + ds2
     expected = ds1.numExternalResources
     self.assertTrue(ds3.numExternalResources == expected)
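As this last example shows, `+` returns None when the operands carry conflicting filters, so callers should check the result before using it. A minimal sketch, assuming `xml_a` and `xml_b` are dataset XML paths:

from pbcore.io import DataSet

def try_merge(xml_a, xml_b):
    """Merge two datasets, or raise if their filters are incompatible."""
    merged = DataSet(xml_a) + DataSet(xml_b)
    if merged is None:
        raise ValueError("datasets could not be merged (conflicting filters)")
    return merged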