def test_alignment_reference(self):
        """aln.reference() resolves whether the reference is supplied as a
        ReferenceSet object, as a FASTA path, or attached after construction."""
        rs1 = ReferenceSet(data.getXml(9))
        fasta_res = rs1.externalResources[0]
        # extract the plain filesystem path from the resource URI
        fasta_file = urlparse(fasta_res.resourceId).path

        # reference supplied as a ReferenceSet object
        ds1 = AlignmentSet(data.getXml(8),
            referenceFastaFname=rs1)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)

        # reference supplied as a FASTA file path
        ds1 = AlignmentSet(data.getXml(8),
            referenceFastaFname=fasta_file)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)

        # reference attached after construction via addReference()
        ds1 = AlignmentSet(data.getXml(8))
        ds1.addReference(fasta_file)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)
def _make_alignmentset(file_name=None):
    """Write pbcore's sample BAM out as an AlignmentSet XML and return its path.

    If *file_name* is None, a temporary .alignmentset.xml path is generated.
    """
    bam_path = pbcore.data.getBamAndCmpH5()[0]
    dataset = AlignmentSet(bam_path)
    if file_name is None:
        file_name = tempfile.NamedTemporaryFile(
            suffix=".alignmentset.xml").name
    dataset.write(file_name)
    return file_name
    def test_loadMetadata(self):
        """loadMetadata populates collections from a run metadata XML;
        loading a non-metadata file raises InvalidDataSetIOError."""
        # AlignmentSet starts without collection metadata
        aln = AlignmentSet(data.getXml(no=8))
        self.assertFalse(aln.metadata.collections)
        aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                         'SA3-Sequel/lambda/roche_SAT/'
                         'm54013_151205_032353.run.metadata.xml')
        self.assertTrue(aln.metadata.collections)
        # SubreadSet: strip collections, reload them, and confirm the result
        # writes out as valid XML equal to the original metadata
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        self.assertFalse(sset.metadata.collections)
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.run.metadata.xml')
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        sset.write(fn)
        validateFile(fn)
        validateFile(sset_fn)
        self.assertEqual(sset.metadata, orig_metadata)

        # load the wrong thing...
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        self.assertFalse(sset.metadata.collections)
        with self.assertRaises(InvalidDataSetIOError):
            sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                              'SA3-Sequel/lambda/roche_SAT/'
                              'm54013_151205_032353.sts.xml')
    def test_len_h5(self):
        """len/totalLength/numRecords for HDF5-backed sets, and updateCounts
        restoring the cached counts after they are manually clobbered."""
        # HdfSubreadSet
        # len means something else in bax/bas land. These numbers may actually
        # be correct...
        sset = HdfSubreadSet(data.getXml(17), strict=True)
        self.assertEqual(len(sset), 9)
        self.assertEqual(sset._length, (9, 128093))
        self.assertEqual(sset.totalLength, 128093)
        self.assertEqual(sset.numRecords, 9)
        # clobber the cached counts, then let updateCounts recompute them
        sset.totalLength = -1
        sset.numRecords = -1
        self.assertEqual(sset.totalLength, -1)
        self.assertEqual(sset.numRecords, -1)
        sset.updateCounts()
        self.assertEqual(sset.totalLength, 128093)
        self.assertEqual(sset.numRecords, 9)

        # AlignmentSet with cmp.h5
        aln = AlignmentSet(upstreamData.getBamAndCmpH5()[1], strict=True)
        self.assertEqual(len(aln), 112)
        self.assertEqual(aln._length, (112, 59970))
        self.assertEqual(aln.totalLength, 59970)
        self.assertEqual(aln.numRecords, 112)
        # same clobber-and-recompute cycle for the cmp.h5-backed set
        aln.totalLength = -1
        aln.numRecords = -1
        self.assertEqual(aln.totalLength, -1)
        self.assertEqual(aln.numRecords, -1)
        aln.updateCounts()
        self.assertEqual(aln.totalLength, 59970)
        self.assertEqual(aln.numRecords, 112)
 def test_refWindows(self):
     """refWindows reflect both filters generated by contig splitting and
     manually added rname/tStart/tEnd requirements."""
     ds = AlignmentSet(data.getBam())
     dss = ds.split(chunks=2, contigs=True)
     self.assertEqual(len(dss), 2)
     log.debug(dss[0].filters)
     log.debug(dss[1].filters)
     # one of the two chunks must carry the E.faecalis.2 contig filter
     self.assertTrue(
         '( rname = E.faecalis.2 '
         in str(dss[0].filters)
         or
         '( rname = E.faecalis.2 '
         in str(dss[1].filters))
     ds = AlignmentSet(data.getBam())
     # two OR'd requirement groups -> two windows on the same contig
     ds.filters.addRequirement(rname=[('=', 'E.faecalis.2'),
                                      ('=', 'E.faecalis.2')],
                               tStart=[('<', '99'),
                                       ('<', '299')],
                               tEnd=[('>', '0'),
                                     ('>', '100')])
     self.assertEqual(str(ds.filters),
                      '( rname = E.faecalis.2 AND tstart '
                      '< 99 AND tend > 0 ) OR ( rname = '
                      'E.faecalis.2 AND tstart < 299 AND tend > 100 )')
     self.assertEqual(ds.refWindows, [('E.faecalis.2', 0, 99),
                                      ('E.faecalis.2', 100, 299)])
Exemplo n.º 6
0
    def test_newUuid_random_cli(self):
        """`dataset newuuid --random` gives each of two identical copies a
        distinct fresh UUID."""
        src_xml = data.getXml(7)
        out_dir = tempfile.mkdtemp(suffix="dataset-unittest")
        copies = [os.path.join(out_dir, name)
                  for name in ('fn.alignmentset.xml', 'fn2.alignmentset.xml')]
        with AlignmentSet(src_xml) as aln:
            aln.copyTo(copies[0])
            shutil.copy(copies[0], copies[1])

        # both copies start from the same dataset, hence the same UUID
        before = [AlignmentSet(path).uuid for path in copies]
        assert before[0] == before[1]

        for path in copies:
            cmd = "dataset newuuid --random {d}".format(d=path)
            self._run_cmd_with_output(cmd, path)

        after = [AlignmentSet(path).uuid for path in copies]
        assert before[0] != after[0]
        assert before[1] != after[1]
        # RANDOM, THEREFORE THESE ARE NOT EQUAL:
        assert after[0] != after[1]
Exemplo n.º 7
0
def mainCmpH5(options):
    """Render alignments for each requested reference window to stdout.

    Everything is driven by the parsed ``options`` namespace: the input
    alignment file, an optional FOFN to attach, an optional reference FASTA,
    explicit row numbers (overrides window-based read selection), window
    geometry, and display flags.
    """
    alnReader = AlignmentSet(options.inputCmpH5,
                             referenceFastaFname=options.referenceFilename)
    if options.fofn is not None:
        alnReader.attach(options.fofn)

    if options.referenceFilename:
        referenceTable = loadReferences(options.referenceFilename, alnReader)
    else:
        referenceTable = None

    for refWindow in options.referenceWindows:
        refId = refWindow.refId
        # look up the contig record once and reuse it (was two lookups)
        refInfo = alnReader.referenceInfo(refId)
        refName = refInfo.FullName
        refLength = refInfo.Length
        refWindow = refWindow._replace(refId=refId)
        # clamp/adjust the window to the display width
        refWindow = makeDisplayWindow(refLength, options.width, refWindow)

        if options.rowNumbers is not None:  # fixed: was `!= None`
            alns = alnReader[options.rowNumbers]
        else:
            alns = readsInWindow(alnReader, refWindow, options.depth,
                                 minMapQV=options.minMapQV,
                                 strategy=options.sorting)

        # Python 3 print() calls (original used Python-2-only print statements)
        print(windowToGffString(Window(refName, refWindow.start,
                                       refWindow.end)))

        if options.oneAtATime:
            formatIndividualAlignments(alnReader, refWindow, alns)
        else:
            formatWindow(alnReader, refWindow, alns,
                         referenceTable, options.aligned, options.color,
                         options.realign, options.consensus)
        print()
Exemplo n.º 8
0
    def test_loadMetadata(self):
        """loadMetadata populates collections from a run metadata XML;
        loading a non-metadata file raises InvalidDataSetIOError."""
        # AlignmentSet starts without collection metadata
        aln = AlignmentSet(data.getXml(7))
        assert not aln.metadata.collections
        aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                         'SA3-Sequel/lambda/roche_SAT/'
                         'm54013_151205_032353.run.metadata.xml')
        assert aln.metadata.collections
        # SubreadSet: strip collections, reload them, and confirm the result
        # writes out as valid XML equal to the original metadata
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        assert not sset.metadata.collections
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.run.metadata.xml')
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        sset.write(fn)
        validateFile(fn)
        validateFile(sset_fn)
        assert sset.metadata == orig_metadata

        # load the wrong thing...
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        assert not sset.metadata.collections
        with pytest.raises(InvalidDataSetIOError):
            sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                              'SA3-Sequel/lambda/roche_SAT/'
                              'm54013_151205_032353.sts.xml')
Exemplo n.º 9
0
    def test_subset_filter(self):
        """A modulo ZMW filter subsets records and round-trips through XML."""
        dset = AlignmentSet(data.getXml(7))
        assert len(dset) == 92
        modvalue = 8

        # the same subset computed directly from the pbi index
        hole_numbers = dset.index.holeNumber
        assert np.count_nonzero(hole_numbers % modvalue == 0) == 26

        # and via a dataset filter
        dset.filters.addRequirement(zm=[('=', '0', modvalue)])
        assert len(dset) == 26

        # string form of the filter
        assert str(dset.filters) == '( Uint32Cast(zm) % 8 = 0 )'

        # serialized (XML) form of the filter
        expected_xml = ('<pbbase:Property Hash="Uint32Cast" Modulo="8" '
                        'Name="zm" Operator="=" Value="0"/>')
        out_fn = tempfile.NamedTemporaryFile(suffix="alignmentset.xml").name
        dset.write(out_fn)
        with open(out_fn, 'r') as reader:
            assert any(expected_xml in line for line in reader)
Exemplo n.º 10
0
    def test_newUuid_random_cli(self):
        """`dataset newuuid --random` gives each of two identical copies a
        distinct fresh UUID."""
        fn_orig = data.getXml(8)
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        fn = os.path.join(outdir, 'fn.alignmentset.xml')
        fn2 = os.path.join(outdir, 'fn2.alignmentset.xml')
        with AlignmentSet(fn_orig) as aln:
            aln.copyTo(fn)
            shutil.copy(fn, fn2)

        # both copies start from the same dataset, hence the same UUID
        pre_uuid = AlignmentSet(fn).uuid
        pre_uuid2 = AlignmentSet(fn2).uuid
        self.assertEqual(pre_uuid, pre_uuid2)

        cmd = "dataset newuuid --random {d}".format(d=fn)
        log.debug(cmd)
        o, r, m = backticks(cmd)
        self.assertEqual(r, 0)
        self.assertTrue(os.path.exists(fn))

        cmd = "dataset newuuid --random {d}".format(d=fn2)
        log.debug(cmd)
        o, r, m = backticks(cmd)
        self.assertEqual(r, 0)
        self.assertTrue(os.path.exists(fn2))

        post_uuid = AlignmentSet(fn).uuid
        post_uuid2 = AlignmentSet(fn2).uuid
        self.assertNotEqual(pre_uuid, post_uuid)
        self.assertNotEqual(pre_uuid2, post_uuid2)
        # RANDOM, THEREFORE THESE ARE NOT EQUAL:
        self.assertNotEqual(post_uuid, post_uuid2)
Exemplo n.º 11
0
 def _readAlignmentInput(self):
     """
     Open the AlignmentSet named by the module-level ``options.inputFilename``
     and store the reader as self._inAlnFile.
     """
     self._inAlnFile = AlignmentSet(options.inputFilename)
Exemplo n.º 12
0
    def test_loadmetadata_from_dataset_create_cli(self):
        """`dataset create --metadata` merges subreadset metadata (collections)
        into the newly created dataset XML."""
        fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
        fn2 = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
        log.debug(fn)

        # write a copy that has had its collection metadata stripped
        aln = AlignmentSet(data.getXml(8))
        aln.metadata.collections = None
        aln.copyTo(fn)
        aln.close()
        del aln
        self.assertTrue(os.path.exists(fn))

        aln = AlignmentSet(fn)
        self.assertFalse(aln.metadata.collections)

        cmd = "dataset create --metadata {m} {o} {i}".format(
            o=fn2,
            i=fn,
            m=("/pbi/dept/secondary/siv/testdata/"
               "SA3-Sequel/lambda/roche_SAT/"
               "m54013_151205_032353.subreadset.xml"))
        log.debug(cmd)
        o, r, m = backticks(cmd)
        self.assertEqual(r, 0, m)
        # the created dataset has picked up the donor's collections
        aln = AlignmentSet(fn2)
        self.assertTrue(aln.metadata.collections)
    def test_len(self):
        """len/_length/totalLength/numRecords agree across dataset types, and
        updateCounts restores the cached counts after manual clobbering."""
        # AlignmentSet
        aln = AlignmentSet(data.getXml(8), strict=True)
        self.assertEqual(len(aln), 92)
        self.assertEqual(aln._length, (92, 123588))
        self.assertEqual(aln.totalLength, 123588)
        self.assertEqual(aln.numRecords, 92)
        # clobber the cached counts, then let updateCounts recompute them
        aln.totalLength = -1
        aln.numRecords = -1
        self.assertEqual(aln.totalLength, -1)
        self.assertEqual(aln.numRecords, -1)
        aln.updateCounts()
        self.assertEqual(aln.totalLength, 123588)
        self.assertEqual(aln.numRecords, 92)
        # counts match an actual iteration over the records
        self.assertEqual(sum(1 for _ in aln), 92)
        self.assertEqual(sum(len(rec) for rec in aln), 123588)

        # AlignmentSet with filters
        aln = AlignmentSet(data.getXml(15), strict=True)
        self.assertEqual(len(aln), 40)
        self.assertEqual(aln._length, (40, 52023))
        self.assertEqual(aln.totalLength, 52023)
        self.assertEqual(aln.numRecords, 40)
        aln.totalLength = -1
        aln.numRecords = -1
        self.assertEqual(aln.totalLength, -1)
        self.assertEqual(aln.numRecords, -1)
        aln.updateCounts()
        self.assertEqual(aln.totalLength, 52023)
        self.assertEqual(aln.numRecords, 40)

        # SubreadSet
        sset = SubreadSet(data.getXml(10), strict=True)
        self.assertEqual(len(sset), 92)
        self.assertEqual(sset._length, (92, 124093))
        self.assertEqual(sset.totalLength, 124093)
        self.assertEqual(sset.numRecords, 92)
        sset.totalLength = -1
        sset.numRecords = -1
        self.assertEqual(sset.totalLength, -1)
        self.assertEqual(sset.numRecords, -1)
        sset.updateCounts()
        self.assertEqual(sset.totalLength, 124093)
        self.assertEqual(sset.numRecords, 92)
        self.assertEqual(sum(1 for _ in sset), 92)
        self.assertEqual(sum(len(rec) for rec in sset), 124093)

        # ReferenceSet
        sset = ReferenceSet(data.getXml(9), strict=True)
        self.assertEqual(len(sset), 59)
        self.assertEqual(sset.totalLength, 85774)
        self.assertEqual(sset.numRecords, 59)
        sset.totalLength = -1
        sset.numRecords = -1
        self.assertEqual(sset.totalLength, -1)
        self.assertEqual(sset.numRecords, -1)
        sset.updateCounts()
        self.assertEqual(sset.totalLength, 85774)
        self.assertEqual(sset.numRecords, 59)
Exemplo n.º 14
0
    def test_readGroupTable(self):
        """Each of the three BAM resources contributes one read group; the
        merged dataset table holds all three."""
        aln = AlignmentSet(data.getBam(0), data.getBam(1), data.getBam(2))
        for reader in aln.resourceReaders():
            self.assertEqual(len(reader.readGroupTable), 1)
        self.assertEqual(len(aln.readGroupTable), 3)
Exemplo n.º 15
0
    def test_readGroupTable(self):
        """Each of the three BAM resources contributes one read group; the
        merged dataset table holds all three."""
        aln = AlignmentSet(data.getBam(0), data.getBam(1), data.getBam(2))
        readers = aln.resourceReaders()

        self.assertEqual(len(readers[0].readGroupTable), 1)
        self.assertEqual(len(readers[1].readGroupTable), 1)
        self.assertEqual(len(readers[2].readGroupTable), 1)
        self.assertEqual(len(aln.readGroupTable), 3)
Exemplo n.º 16
0
 def test_filter(self):
     """disableFilters/enableFilters toggle an rname filter's effect on the
     visible record count."""
     dset = AlignmentSet(data.getXml(8))
     dset.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
     self.assertEqual(sum(1 for _ in dset.records), 20)
     dset.disableFilters()
     self.assertEqual(sum(1 for _ in dset.records), 92)
     dset.enableFilters()
     self.assertEqual(sum(1 for _ in dset.records), 20)
Exemplo n.º 17
0
 def test_absolutize_cli(self):
     """`dataset absolutize` rewrites a relative-path dataset XML in place to
     use absolute paths."""
     xml_out = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
     dset = AlignmentSet(data.getXml(7))
     dset.copyTo(xml_out, relative=True)
     assert _is_relative(xml_out)
     self._run_cmd_with_output(
         "dataset absolutize {d}".format(d=xml_out), xml_out)
     assert not _is_relative(xml_out)
Exemplo n.º 18
0
 def test_referenceInfo(self):
     """Spot-check the reader's referenceInfoTable size and one contig's repr."""
     dset = AlignmentSet(data.getBam(0))
     reader = dset.resourceReaders()[0]
     self.assertEqual(len(reader.referenceInfoTable), 59)
     expected = ("(27, 27, 'E.faecalis.1', 'E.faecalis.1', 1482, "
                 "'a1a59c267ac1341e5a12bce7a7d37bcb', 0L, 0L)")
     self.assertEqual(str(reader.referenceInfo('E.faecalis.1')), expected)
Exemplo n.º 19
0
 def test_referenceInfo(self):
     """Spot-check the reader's referenceInfoTable size and one contig's repr."""
     aln = AlignmentSet(data.getBam(0))
     readers = aln.resourceReaders()
     self.assertEqual(len(readers[0].referenceInfoTable), 59)
     # full record repr, including MD5 and (py2 long) offset fields
     self.assertEqual(
         str(readers[0].referenceInfo('E.faecalis.1')),
         "(27, 27, 'E.faecalis.1', 'E.faecalis.1', 1482, "
         "'a1a59c267ac1341e5a12bce7a7d37bcb', 0L, 0L)")
Exemplo n.º 20
0
    def test_alignment_reference(self):
        """aln.reference() and the recorded reference path resolve whether the
        reference is supplied as a ReferenceSet, a FASTA path, attached after
        construction, or given alongside a FOFN input."""
        # removed unused local `rfn` (was assigned data.getXml(9), never read)
        rs1 = ReferenceSet(data.getXml(9))
        fasta_res = rs1.externalResources[0]
        # extract the plain filesystem path from the resource URI
        fasta_file = urlparse(fasta_res.resourceId).path

        # reference supplied as a ReferenceSet object
        ds1 = AlignmentSet(data.getXml(8),
                           referenceFastaFname=rs1)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)
        self.assertEqual(ds1.externalResources[0].reference, fasta_file)
        self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
                         fasta_file)

        # reference supplied as a FASTA file path
        ds1 = AlignmentSet(data.getXml(8),
                           referenceFastaFname=fasta_file)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)
        self.assertEqual(ds1.externalResources[0].reference, fasta_file)
        self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
                         fasta_file)

        # reference attached after construction via addReference()
        ds1 = AlignmentSet(data.getXml(8))
        ds1.addReference(fasta_file)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)
        self.assertEqual(ds1.externalResources[0].reference, fasta_file)
        self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
                         fasta_file)

        # FOFN input listing two dataset XMLs, with a FASTA reference
        fofn_out = tempfile.NamedTemporaryFile(suffix=".fofn").name
        log.debug(fofn_out)
        with open(fofn_out, 'w') as f:
            f.write(data.getXml(8))
            f.write('\n')
            f.write(data.getXml(11))
            f.write('\n')
        ds1 = AlignmentSet(fofn_out,
                           referenceFastaFname=fasta_file)
        aln_ref = None
        for aln in ds1:
            aln_ref = aln.reference()
            break
        self.assertTrue(aln_ref is not None)
        self.assertEqual(ds1.externalResources[0].reference, fasta_file)
        self.assertEqual(ds1.resourceReaders()[0].referenceFasta.filename,
                         fasta_file)
    def test_split_by_contigs_with_split(self):
        """Splitting by contigs yields refWindows covering each reference;
        when chunks exceed the contig count, windows are subdivided."""
        # test to make sure the refWindows work when chunks == # refs
        ds3 = AlignmentSet(data.getBam())
        dss = ds3.split(contigs=True)
        self.assertEqual(len(dss), 12)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        # not all references have something mapped to them, refWindows doesn't
        # care...
        self.assertNotEqual(refWindows, sorted(ds3.refWindows))
        random_few = [('C.beijerinckii.13', 0, 1433),
                      ('B.vulgatus.4', 0, 1449),
                      ('E.faecalis.1', 0, 1482)]
        for reference in random_few:
            # assertIn replaces the original manual found-flag scan
            self.assertIn(reference, refWindows)
        old_refWindows = refWindows

        # one chunk must reproduce the full window list
        dss = ds3.split(contigs=True, chunks=1)
        self.assertEqual(len(dss), 1)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        self.assertEqual(refWindows, old_refWindows)

        # 2x the contig count: each contig's window is halved
        dss = ds3.split(contigs=True, chunks=24)
        self.assertEqual(len(dss), 24)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        random_few = [('E.faecalis.2', 0, 741),
                      ('E.faecalis.2', 741, 1482)]
        for ref in random_few:
            if ref not in refWindows:
                log.debug(ref)
            self.assertIn(ref, refWindows)

        # 3x the contig count: windows split in thirds
        dss = ds3.split(contigs=True, chunks=36)
        self.assertEqual(len(dss), 36)
        refWindows = sorted(reduce(lambda x, y: x + y,
                                   [ds.refWindows for ds in dss]))
        random_few = [('E.faecalis.2', 0, 494),
                      ('E.faecalis.2', 494, 988),
                      ('E.faecalis.2', 988, 1482)]
        for ref in random_few:
            self.assertIn(ref, refWindows)
Exemplo n.º 22
0
    def test_referenceInfoTable(self):
        """The merged referenceInfoTable deduplicates contigs shared between
        BAM resources."""
        aln = AlignmentSet(data.getBam(0), data.getBam(1), data.getBam(2))
        first, second, third = aln.resourceReaders()

        self.assertEqual(len(first.referenceInfoTable), 1)
        self.assertEqual(len(second.referenceInfoTable), 59)
        self.assertEqual(len(third.referenceInfoTable), 1)
        # resources 0 and 2 map to the same contig...
        self.assertEqual(first.referenceInfoTable.Name,
                         third.referenceInfoTable.Name)
        # ...so the merged table contains 59 + 1 unique entries
        self.assertEqual(len(aln.referenceInfoTable), 60)
Exemplo n.º 23
0
    def test_referenceInfoTable(self):
        """The merged referenceInfoTable deduplicates contigs shared between
        BAM resources (59 + 1 shared = 60 unique entries)."""
        aln = AlignmentSet(data.getBam(0), data.getBam(1), data.getBam(2))
        readers = aln.resourceReaders()

        self.assertEqual(len(readers[0].referenceInfoTable), 1)
        self.assertEqual(len(readers[1].referenceInfoTable), 59)
        self.assertEqual(len(readers[2].referenceInfoTable), 1)
        # resources 0 and 2 map to the same contig
        self.assertEqual(readers[0].referenceInfoTable.Name,
                         readers[2].referenceInfoTable.Name)
        self.assertEqual(len(aln.referenceInfoTable), 60)
Exemplo n.º 24
0
 def test_pbmerge_indexing(self):
     """Consolidating two BAM resources produces an indexed merged BAM, and
     induceIndices only regenerates the .pbi when forced."""
     log.debug("Test through API")
     aln = AlignmentSet(data.getXml(12))
     self.assertEqual(len(aln.toExternalFiles()), 2)
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     outfn = os.path.join(outdir, 'merged.bam')
     log.info(outfn)
     consolidateXml(aln, outfn, cleanup=False)
     self.assertTrue(os.path.exists(outfn))
     self.assertTrue(os.path.exists(outfn + '.pbi'))
     cons = AlignmentSet(outfn)
     self.assertEqual(len(aln), len(cons))
     orig_stats = os.stat(outfn + '.pbi')
     # drop the pbi reference; a plain induceIndices should find the existing
     # index file rather than rebuild it
     cons.externalResources[0].pbi = None
     self.assertEqual(None, cons.externalResources[0].pbi)
     # test is too quick, stat times might be within the same second
     time.sleep(1)
     cons.induceIndices()
     self.assertEqual(outfn + '.pbi', cons.externalResources[0].pbi)
     self.assertEqual(orig_stats, os.stat(cons.externalResources[0].pbi))
     cons.externalResources[0].pbi = None
     self.assertEqual(None, cons.externalResources[0].pbi)
     # test is too quick, stat times might be within the same second
     time.sleep(1)
     # force=True must actually rewrite the index (stat changes)
     cons.induceIndices(force=True)
     self.assertNotEqual(orig_stats, os.stat(cons.externalResources[0].pbi))
Exemplo n.º 25
0
    def test_alignmentset_partial_consolidate(self):
        """consolidate(numFiles=2) reduces 3 BAM resources to 2 infix-named
        files without changing record content; also exercises the CLI."""
        testFile = ("/pbi/dept/secondary/siv/testdata/SA3-DS/"
                    "lambda/2372215/0007_tiny/Alignment_"
                    "Results/m150404_101626_42267_c10080"
                    "7920800000001823174110291514_s1_p0."
                    "all.alignmentset.xml")
        aln = AlignmentSet(testFile)
        nonCons = AlignmentSet(testFile)
        assert len(aln.toExternalFiles()) == 3
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn, numFiles=2)
        # partial consolidation writes infix-numbered files, not outfn itself
        assert not os.path.exists(outfn)
        assert os.path.exists(_infixFname(outfn, "0"))
        assert os.path.exists(_infixFname(outfn, "1"))
        assert len(aln.toExternalFiles()) == 2
        assert len(nonCons.toExternalFiles()) == 3
        # record-for-record, consolidation must not alter the data
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            assert read1 == read2
        assert len(aln) == len(nonCons)

        log.debug("Test cli")
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        datafile = os.path.join(outdir, "merged.bam")
        xmlfile = os.path.join(outdir, "merged.xml")
        cmd = "dataset consolidate --numFiles 2 {i} {d} {x}".format(
            i=testFile, d=datafile, x=xmlfile)
        log.debug(cmd)
        subprocess.check_call(cmd.split())
Exemplo n.º 26
0
 def test_pbmerge_indexing(self):
     """Consolidating two BAM resources produces an indexed merged BAM, and
     induceIndices only regenerates the .pbi when forced."""
     log.debug("Test through API")
     aln = AlignmentSet(data.getXml(11))
     assert len(aln.toExternalFiles()) == 2
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     outfn = os.path.join(outdir, 'merged.bam')
     log.info(outfn)
     consolidateXml(aln, outfn, cleanup=False)
     assert os.path.exists(outfn)
     assert os.path.exists(outfn + '.pbi')
     cons = AlignmentSet(outfn)
     assert len(aln) == len(cons)
     orig_stats = os.stat(outfn + '.pbi')
     # drop the pbi reference; a plain induceIndices should find the existing
     # index file rather than rebuild it
     cons.externalResources[0].pbi = None
     assert cons.externalResources[0].pbi is None
     # test is too quick, stat times might be within the same second
     time.sleep(1)
     cons.induceIndices()
     assert outfn + '.pbi' == cons.externalResources[0].pbi
     assert orig_stats == os.stat(cons.externalResources[0].pbi)
     cons.externalResources[0].pbi = None
     assert cons.externalResources[0].pbi is None
     # test is too quick, stat times might be within the same second
     time.sleep(1)
     # force=True must actually rewrite the index (stat changes)
     cons.induceIndices(force=True)
     assert orig_stats != os.stat(cons.externalResources[0].pbi)
Exemplo n.º 27
0
    def test_alignmentset_partial_consolidate(self):
        """consolidate(numFiles=2) reduces 3 BAM resources to 2 infix-named
        files without changing record content; also exercises the CLI."""
        testFile = ("/pbi/dept/secondary/siv/testdata/SA3-DS/"
                    "lambda/2372215/0007_tiny/Alignment_"
                    "Results/m150404_101626_42267_c10080"
                    "7920800000001823174110291514_s1_p0."
                    "all.alignmentset.xml")
        aln = AlignmentSet(testFile)
        nonCons = AlignmentSet(testFile)
        self.assertEqual(len(aln.toExternalFiles()), 3)
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn, numFiles=2)
        # partial consolidation writes infix-numbered files, not outfn itself
        self.assertFalse(os.path.exists(outfn))
        self.assertTrue(os.path.exists(_infixFname(outfn, "0")))
        self.assertTrue(os.path.exists(_infixFname(outfn, "1")))
        self.assertEqual(len(aln.toExternalFiles()), 2)
        self.assertEqual(len(nonCons.toExternalFiles()), 3)
        # record-for-record, consolidation must not alter the data
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            self.assertEqual(read1, read2)
        self.assertEqual(len(aln), len(nonCons))

        log.debug("Test cli")
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        datafile = os.path.join(outdir, "merged.bam")
        xmlfile = os.path.join(outdir, "merged.xml")
        cmd = "dataset consolidate --numFiles 2 {i} {d} {x}".format(i=testFile,
                                                                    d=datafile,
                                                                    x=xmlfile)
        log.debug(cmd)
        o, r, m = backticks(cmd)
        self.assertEqual(r, 0)
Exemplo n.º 28
0
    def test_updateCounts(self):
        """updateCounts must reflect both the raw totals and active filters."""
        log.info("Testing updateCounts without filters")
        aln = AlignmentSet(data.getBam(0))
        readers = aln.resourceReaders()

        # expected totals straight from the underlying readers, cross-checking
        # each record against its pbi index entry along the way
        expLen = 0
        for reader in readers:
            for record in reader:
                expLen += record.readLength
                self.assertEqual(
                    record.aStart, record.bam.pbi[record.rowNumber]['aStart'])
                self.assertEqual(
                    record.aEnd, record.bam.pbi[record.rowNumber]['aEnd'])

        expNum = 0
        for reader in readers:
            expNum += len(reader)

        accLen = aln.metadata.totalLength
        accNum = aln.metadata.numRecords

        self.assertEqual(expLen, accLen)
        self.assertEqual(expNum, accNum)

        log.info("Testing whether filters are respected")
        aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
        aln.updateCounts()
        accLen = aln.metadata.totalLength
        accNum = aln.metadata.numRecords

        def count(gen):
            # simple consuming counter for a generator
            count = 0
            for _ in gen:
                count += 1
            return count

        # expected filtered totals from reads overlapping the contig's full
        # window. (A dead recomputation of expLen over all readers was removed
        # here: its result was immediately overwritten below.)
        bfile = openIndexedAlignmentFile(data.getBam(0))
        rWin = (bfile.referenceInfo('E.faecalis.1').ID,
                0,
                bfile.referenceInfo('E.faecalis.1').Length)
        expNum = count(bfile.readsInRange(*rWin))
        expLen = 0
        for read in bfile.readsInRange(*rWin):
            expLen += read.readLength

        self.assertEqual(expLen, accLen)
        self.assertEqual(expNum, accNum)
Exemplo n.º 29
0
 def test_relativize_cli(self):
     """`dataset relativize` converts absolute resource paths in a dataset
     XML to relative ones, in place."""
     xml_fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
     AlignmentSet(data.getXml(8)).copyTo(xml_fn)
     self.assertFalse(_is_relative(xml_fn))
     cmd = "dataset relativize {d}".format(d=xml_fn)
     log.debug(cmd)
     _, status, _ = backticks(cmd)
     self.assertEqual(status, 0)
     self.assertTrue(os.path.exists(xml_fn))
     self.assertTrue(_is_relative(xml_fn))
Exemplo n.º 30
0
 def test_relativize_cli(self):
     """`dataset relativize` converts absolute resource paths in a dataset
     XML to relative ones, in place."""
     fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
     aln = AlignmentSet(data.getXml(8))
     aln.copyTo(fn)
     # a plain copy keeps absolute paths
     self.assertFalse(_is_relative(fn))
     cmd = "dataset relativize {d}".format(d=fn)
     log.debug(cmd)
     o, r, m = backticks(cmd)
     self.assertEqual(r, 0)
     self.assertTrue(os.path.exists(fn))
     self.assertTrue(_is_relative(fn))
Exemplo n.º 31
0
 def test_absolutize_cli_3(self):
     """`dataset absolutize --outdir` writes an absolutized copy into the
     output directory while leaving the input file relative."""
     src_fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
     out_dir = tempfile.mkdtemp(suffix="dataset-unittest")
     dst_fn = os.path.join(out_dir, os.path.split(src_fn)[1])
     dset = AlignmentSet(data.getXml(7))
     dset.copyTo(src_fn, relative=True)
     assert _is_relative(src_fn)
     self._run_cmd_with_output(
         "dataset absolutize {d} --outdir {o}".format(d=src_fn, o=out_dir),
         src_fn)
     assert os.path.exists(dst_fn)
     assert _is_relative(src_fn)
     assert not _is_relative(dst_fn)
Exemplo n.º 32
0
    def test_updateCounts(self):
        """updateCounts must reflect both the raw totals and active filters."""
        log.info("Testing updateCounts without filters")
        aln = AlignmentSet(data.getBam(0))
        readers = aln.resourceReaders()

        # expected totals straight from the underlying readers, cross-checking
        # each record against its pbi index entry along the way
        expLen = 0
        for reader in readers:
            for record in reader:
                expLen += record.readLength
                self.assertEqual(record.aStart,
                                 record.bam.pbi[record.rowNumber]['aStart'])
                self.assertEqual(record.aEnd,
                                 record.bam.pbi[record.rowNumber]['aEnd'])

        expNum = 0
        for reader in readers:
            expNum += len(reader)

        accLen = aln.metadata.totalLength
        accNum = aln.metadata.numRecords

        self.assertEqual(expLen, accLen)
        self.assertEqual(expNum, accNum)

        log.info("Testing whether filters are respected")
        aln.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
        aln.updateCounts()
        accLen = aln.metadata.totalLength
        accNum = aln.metadata.numRecords

        def count(gen):
            # simple consuming counter for a generator
            count = 0
            for _ in gen:
                count += 1
            return count

        # NOTE(review): this recomputation of expLen appears to be dead code —
        # expLen is reset to 0 below before the filtered count; consider
        # removing this loop.
        expLen = 0
        for reader in readers:
            for record in reader:
                expLen += record.readLength

        # expected filtered totals from reads overlapping the contig's window
        bfile = openIndexedAlignmentFile(data.getBam(0))
        rWin = (bfile.referenceInfo('E.faecalis.1').ID, 0,
                bfile.referenceInfo('E.faecalis.1').Length)
        reads = bfile.readsInRange(*rWin)
        expNum = count(reads)
        expLen = 0
        reads = bfile.readsInRange(*rWin)
        for read in reads:
            expLen += read.readLength

        self.assertEqual(expLen, accLen)
        self.assertEqual(expNum, accNum)
Exemplo n.º 33
0
 def test_newUuid_cli(self):
     """`dataset newuuid` should rewrite the dataset XML in place with a
     fresh UniqueId."""
     xml_fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
     AlignmentSet(data.getXml(8)).copyTo(xml_fn)
     uuid_before = AlignmentSet(xml_fn).uuid
     cmd = "dataset newuuid {d}".format(d=xml_fn)
     log.debug(cmd)
     o, r, m = backticks(cmd)
     uuid_after = AlignmentSet(xml_fn).uuid
     self.assertEqual(r, 0)
     self.assertTrue(os.path.exists(xml_fn))
     self.assertNotEqual(uuid_before, uuid_after)
Exemplo n.º 34
0
 def loadSharedAlignmentSet(self, alignmentFilename):
     """
     Open the input AlignmentSet once in the master process so its indices
     can be shared with the slaves, and cache the reference info table.
     Also used to pass to ReferenceUtils for setting up the ipdModel object.
     """
     logging.info("Reading AlignmentSet: %s" % alignmentFilename)
     logging.info("           reference: %s" % self.args.reference)
     refFasta = self.args.reference
     self.alignments = AlignmentSet(alignmentFilename,
                                    referenceFastaFname=refFasta)
     # XXX this should ensure that the file(s) get opened, including any
     # .pbi indices - but need to confirm this
     self.refInfo = self.alignments.referenceInfoTable
Exemplo n.º 35
0
    def test_nested_external_resources(self):
        """Verify nested ExternalResources: index/reference/scraps accessors,
        their MetaTypes, and that scraps/barcodes/adapters resources can be
        attached after construction (appended in assignment order)."""
        log.debug("Testing nested externalResources in AlignmentSets")
        aln = AlignmentSet(data.getXml(0), skipMissing=True)
        self.assertTrue(aln.externalResources[0].pbi)
        self.assertTrue(aln.externalResources[0].reference)
        self.assertEqual(
            aln.externalResources[0].externalResources[0].metaType,
            'PacBio.ReferenceFile.ReferenceFastaFile')
        # An AlignmentSet resource has no scraps file
        self.assertEqual(aln.externalResources[0].scraps, None)

        log.debug("Testing nested externalResources in SubreadSets")
        subs = SubreadSet(data.getXml(5), skipMissing=True)
        self.assertTrue(subs.externalResources[0].scraps)
        self.assertEqual(
            subs.externalResources[0].externalResources[0].metaType,
            'PacBio.SubreadFile.ScrapsBamFile')
        # A SubreadSet resource has no reference file
        self.assertEqual(subs.externalResources[0].reference, None)

        log.debug("Testing added nested externalResoruces to SubreadSet")
        subs = SubreadSet(data.getXml(10))
        self.assertFalse(subs.externalResources[0].scraps)
        # Assigning these attributes appends nested resources; the indices
        # 0/1/2 below reflect the assignment order.
        subs.externalResources[0].scraps = 'fake.fasta'
        self.assertTrue(subs.externalResources[0].scraps)
        self.assertEqual(
            subs.externalResources[0].externalResources[0].metaType,
            'PacBio.SubreadFile.ScrapsBamFile')
        subs.externalResources[0].barcodes = 'bc.fasta'
        self.assertTrue(subs.externalResources[0].barcodes)
        self.assertEqual(
            subs.externalResources[0].externalResources[1].metaType,
            "PacBio.DataSet.BarcodeSet")

        subs.externalResources[0].adapters = 'foo.adapters.fasta'
        self.assertEqual(subs.externalResources[0].adapters,
                         'foo.adapters.fasta')
        self.assertEqual(
            subs.externalResources[0].externalResources[2].metaType,
            "PacBio.SubreadFile.AdapterFastaFile")

        log.debug("Testing adding nested externalResources to AlignmetnSet "
                  "manually")
        aln = AlignmentSet(data.getXml(8))
        self.assertTrue(aln.externalResources[0].bai)
        self.assertTrue(aln.externalResources[0].pbi)
        self.assertFalse(aln.externalResources[0].reference)
        aln.externalResources[0].reference = 'fake.fasta'
        self.assertTrue(aln.externalResources[0].reference)
        self.assertEqual(
            aln.externalResources[0].externalResources[0].metaType,
            'PacBio.ReferenceFile.ReferenceFastaFile')
Exemplo n.º 36
0
    def _run(self):
        """Worker main loop: open the alignment inputs, then pull chunk
        descriptors off the work queue until a None sentinel arrives,
        pushing (chunkId, result) pairs onto the results queue."""
        logging.info("Worker %s (PID=%d) started running" %
                     (self.name, self.pid))
        if self._sharedAlignmentSet is not None:
            # XXX this will create an entirely new AlignmentSet object, but
            # keeping any indices already loaded into memory
            self.caseAlignments = _reopen(self._sharedAlignmentSet)
            # `self._sharedAlignmentSet.close()
            # Drop our reference so the shared copy can be reclaimed.
            self._sharedAlignmentSet = None
        else:
            warnings.warn("Shared AlignmentSet not used")
            self.caseAlignments = AlignmentSet(self.options.infile,
                                               referenceFastaFname=self.options.reference)

        self.controlAlignments = None
        # NOTE(review): `not self.options.control is None` reads better as
        # `self.options.control is not None` (same behavior).
        if not self.options.control is None:
            self.controlAlignments = AlignmentSet(self.options.control,
                                                  referenceFastaFname=self.options.reference)

        # NOTE(review): seeding only when randomSeed is None looks inverted --
        # presumably a provided seed should be the one applied; confirm intent.
        if self.options.randomSeed is None:
            np.random.seed(42)
        self.onStart()

        while True:
            if self.isTerminated():
                break

            chunkDesc = self._workQueue.get()
            if chunkDesc is None:
                # Sentinel indicating end of input.  Place a sentinel
                # on the results queue and end this worker process.
                self._resultsQueue.put(None)
                self._workQueue.task_done()
                break
            else:
                (chunkId, datum) = chunkDesc
                logging.info("Got chunk: (%s, %s) -- Process: %s" %
                             (chunkId, str(datum), current_process()))
                result = self.onChunk(
                    datum)  # pylint: disable=assignment-from-none

                logging.debug("Process %s: putting result." %
                              current_process())
                self._resultsQueue.put((chunkId, result))
                self._workQueue.task_done()

        self.onFinish()

        logging.info("Process %s (PID=%d) done; exiting." %
                     (self.name, self.pid))
Exemplo n.º 37
0
    def test_alignment_reference(self):
        """An AlignmentSet should resolve its reference whether it is passed
        as a ReferenceSet, as a fasta path, or attached via addReference()."""
        refset = ReferenceSet(data.getXml(9))
        fasta_path = urlparse(refset.externalResources[0].resourceId).path

        def first_ref(dset):
            # Return the reference sequence of the first alignment, if any.
            for rec in dset:
                return rec.reference()
            return None

        self.assertTrue(
            first_ref(AlignmentSet(data.getXml(8),
                                   referenceFastaFname=refset)) is not None)

        self.assertTrue(
            first_ref(AlignmentSet(data.getXml(8),
                                   referenceFastaFname=fasta_path))
            is not None)

        dset = AlignmentSet(data.getXml(8))
        dset.addReference(fasta_path)
        self.assertTrue(first_ref(dset) is not None)
Exemplo n.º 38
0
def mainGff(options):
    """Render alignment pileups around each variant record in a GFF file.

    For every GFF record, compute a display window around the variant, fetch
    reads in that window, and print a formatted text view (plus an optional
    per-variant CSV for the pulse recognizer).  Python 2 only (uses print
    statements).
    """
    reader = GffReader(options.inputGff)
    alnsFname, referenceFname = extractCmpH5AndReferenceFromGff(reader)
    # Allow overriding
    alnsFname = options.inputCmpH5 or alnsFname
    referenceFname = options.referenceFilename or referenceFname

    assert os.path.isfile(alnsFname)
    assert os.path.isfile(referenceFname)

    alnReader = AlignmentSet(alnsFname, referenceFastaFname=referenceFname)

    if options.fofn is not None:
        alnReader.attach(options.fofn)

    referenceTable = loadReferences(referenceFname, alnReader)

    for i, gffRecord in enumerate(reader):
        referenceSeq = gffRecord.get("reference", "-")
        variantSeq   = gffRecord.get("variantSeq", "-")
        variantConfidence = gffRecord.confidence
        variantSummary = "(%s > %s)" % (referenceSeq, variantSeq)
        print gffRecord.type, gffRecord.seqid, gffRecord.start, gffRecord.end, \
            variantSummary, variantConfidence
        refId = gffRecord.seqid
        refLength = alnReader.referenceInfo(gffRecord.seqid).Length
        # Pad the display window by 10 bp on each side of the variant.
        refWindow = makeDisplayWindow(refLength, options.width,
                                       Window(refId,
                                              gffRecord.start-10,
                                              gffRecord.end+10))
        if "rows" in gffRecord.attributes:
            # Explicit row list in the GFF record overrides the window search.
            alns = alnReader[map(int, gffRecord.rows.split(","))]
        else:
            alns = readsInWindow(alnReader, refWindow, options.depth,
                                 minMapQV=options.minMapQV, strategy=options.sorting)
        formatWindow(alnReader, refWindow, alns, referenceTable,
                     aligned=(gffRecord.type != "insertion"),
                     consensus=options.consensus,
                     useColor=options.color,
                     doRealign=options.realign)

        if options.pulseRecognizer:
            # CSV output for pulse recognizer
            print
            csvFname = "variant-" + str(i) +  ".csv"
            dumpVariantCsv(csvFname, alnReader, alns, gffRecord)
            formatVariantCsvLink(csvFname)

        print
Exemplo n.º 39
0
 def test_filter_cli(self):
     """`dataset filter` output should match an equivalent in-API filter."""
     out_dir = tempfile.mkdtemp(suffix="dataset-unittest")
     out_xml = os.path.join(out_dir, "filtered8.xml")
     log.debug(out_xml)
     cmd = "dataset filter {i} {o} {f}".format(i=data.getXml(7),
                                               o=out_xml,
                                               f="rname=E.faecalis.1")
     self._run_cmd_with_output(cmd, out_xml)
     # Build the reference result through the API.
     expected = AlignmentSet(data.getXml(7))
     expected.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
     expected.updateCounts()
     actual = AlignmentSet(out_xml)
     assert str(expected.filters) == str(actual.filters)
     assert expected.totalLength == actual.totalLength
     assert expected.numRecords == actual.numRecords
Exemplo n.º 40
0
def main():
    """Scatter-plot ZMW hole positions for one or more datasets.

    Decodes each hole number into (x, y) chip coordinates, optionally
    subsamples to `subsampleto` points, and writes an HTML plot to `output`.
    """
    datasets, dtype, subsampleto, title, output = parseArgs()
    d = []
    for dset in datasets:
        if dtype == 'AlignmentSet':
            f = AlignmentSet(dset)
        elif dtype == 'SubreadSet':
            f = SubreadSet(dset)
        else:
            raise ValueError('invalid dataSetType')
        # holeNumber encodes chip coordinates as x * UINTMAX16 + y.
        x = f.index['holeNumber'] / UINTMAX16
        y = f.index['holeNumber'] - x * UINTMAX16
        # NOTE(review): random.sample(zip(...)) requires a sequence; this is
        # Python-2 style and would need list(zip(...)) under Python 3.
        if len(f) > subsampleto:
            x, y = zip(*random.sample(zip(x, y), subsampleto))
        h = Scatter(x=x,
                    y=y,
                    mode='markers',
                    marker=dict(size=5, opacity=0.2),
                    showlegend=False)
        d.append(h)
    layout = Layout(title=title,
                    height=600,
                    width=600,
                    xaxis=dict(title='X', range=[0, 1500]),
                    yaxis=dict(title='Y', range=[0, 1500]))
    fig = Figure(data=d, layout=layout)
    plot(fig, show_link=False, auto_open=False, filename=output)
Exemplo n.º 41
0
def find_discordant_mappings(file_name, max_subread_distance=25000):
    """
    Verify that aligned subreads from the same polymerase read are concordant.
    Written as a generator to facilitate interactive use.

    Yields a human-readable message for each subread whose mapped start is
    more than `max_subread_distance` bp away from the first mapping seen for
    the same ZMW (movie name + hole number).

    :param file_name: path to an AlignmentSet (XML or BAM)
    :param max_subread_distance: maximum allowed distance (bp) between
        mapped starts of subreads from the same ZMW
    """
    # First mapping seen for each (movie, hole); later subreads from the
    # same ZMW are compared against it.  (An unused counter `n` was removed.)
    mapping_dict = {}
    with AlignmentSet(file_name) as ds:
        for alignment in ds:
            read_id = (alignment.movieName, alignment.HoleNumber)
            reference_name = alignment.referenceInfo.FullName
            reference_pos = int(alignment.tStart)  # Comes as a uint
            if read_id not in mapping_dict:
                mapping_dict[read_id] = (reference_name, reference_pos,
                                         alignment.qName)
            else:
                # Subreads from one ZMW must map to the same reference.
                assert reference_name == mapping_dict[read_id][0]
                delta = mapping_dict[read_id][1] - reference_pos
                msg = "non-concordant mappings for {a} and {b}: " +\
                      "delta={d} (= |{t} - {u}|)"
                if abs(delta) > max_subread_distance:
                    yield msg.format(a=mapping_dict[read_id][2],
                                     b=alignment.qName,
                                     d=delta,
                                     t=mapping_dict[read_id][1],
                                     u=alignment.tStart)
Exemplo n.º 42
0
def _get_reads_info(aligned_reads_file):
    """
    Extract per-cell hole numbers and the instrument name from an
    AlignmentSet.

    :param aligned_reads_file: (str) path to aligned_reads[.xml,.bam]
    :return: tuple (reads_by_cell, instrument): a dict mapping cell name to
        the set of hole numbers observed, and the instrument name (None when
        no reads were found)
    """
    inst = None
    reads_by_cell = defaultdict(set)
    with AlignmentSet(aligned_reads_file) as ds:
        for bamfile in ds.resourceReaders():
            if ds.isIndexed:
                # Fast path: read hole numbers and read-group ids straight
                # from the pbi index without materializing records.
                logging.info("Indexed file - will use fast loop.")
                for (hole, rgId) in zip(bamfile.holeNumber, bamfile.qId):
                    movie_name = bamfile.readGroupInfo(rgId).MovieName
                    cell = movie_to_cell(movie_name)
                    if inst is None:
                        # Instrument is derived once, from the first cell.
                        inst = _cell_2_inst(cell)
                    reads_by_cell[cell].add(hole)
            else:
                # Slow path: iterate full alignment records.
                for aln in bamfile:
                    hole = aln.HoleNumber
                    movie_name = aln.movieName
                    cell = movie_to_cell(movie_name)
                    if inst is None:
                        inst = _cell_2_inst(cell)
                    reads_by_cell[cell].add(hole)
    return reads_by_cell, inst
Exemplo n.º 43
0
def make_sat_report(aligned_reads_file, mapping_stats_report, variants_report,
                    report, output_dir):
    """
    Entry to report: assemble the SAT report from the three inputs and write
    it as JSON.

    :param aligned_reads_file: (str) path to aligned_reads.xml
    :param mapping_stats_report: (str) path to mapping stats json report
    :param variants_report: (str) path to variants report
    :param report: (str) file name of the output JSON report
    :param output_dir: (str) directory the report file is written into
    """
    _validate_inputs([('aligned_reads_file', aligned_reads_file),
                      ('mapping_stats_report', mapping_stats_report),
                      ('variants_report', variants_report)])

    # Gather the attribute values from each source.
    d_map = _get_mapping_stats_data(mapping_stats_report)
    reads, inst = _get_reads_info(aligned_reads_file)
    d_bam = _get_read_hole_data(reads, inst)
    d_var = _get_variants_data(variants_report)
    # Opened only to record the dataset UUID in the report.
    ds = AlignmentSet(aligned_reads_file)

    rpt = Report(meta_rpt.id, dataset_uuids=(ds.uuid, ))
    rpt.add_attribute(
        Attribute(Constants.A_INSTRUMENT, d_bam[Constants.A_INSTRUMENT]))
    rpt.add_attribute(
        Attribute(Constants.A_COVERAGE, d_var[Constants.A_COVERAGE]))
    rpt.add_attribute(
        Attribute(Constants.A_CONCORDANCE, d_var[Constants.A_CONCORDANCE]))
    rpt.add_attribute(
        Attribute(Constants.A_READLENGTH, d_map[Constants.A_READLENGTH]))
    rpt.add_attribute(Attribute(Constants.A_READS, d_bam[Constants.A_READS]))
    rpt = meta_rpt.apply_view(rpt)
    rpt.write_json(os.path.join(output_dir, report))
Exemplo n.º 44
0
 def _readCmpH5Input(self):
     """
     Open the cmp.h5 / AlignmentSet named by the global options and store
     the reader as self._inCmpH5.
     """
     self._inCmpH5 = AlignmentSet(options.inputFilename)
Exemplo n.º 45
0
 def _readAlignmentInput(self):
     """
     Open the AlignmentSet named by the global options and store the
     reader as self._inAlnFile.
     """
     self._inAlnFile = AlignmentSet(options.inputFilename)
Exemplo n.º 46
0
    def test_add_double_bound_filters(self):
        """Exercise addRequirement vs addFilter with two bounds on one field,
        plus removeFilter.  Uses assertEqual (assertEquals is a deprecated
        alias, removed in Python 3.12)."""
        ds1 = AlignmentSet(data.getXml(8))
        # addRequirement ORs the two requirements together.
        ds1.filters.addRequirement(rq=[('>', '0.85'), ('<', '0.99')])
        self.assertEqual(str(ds1.filters), '( rq > 0.85 ) OR ( rq < 0.99 )')

        ds1 = AlignmentSet(data.getXml(8))
        self.assertEqual(str(ds1.filters), '')
        # addFilter ANDs the requirements within a single filter.
        ds1.filters.addFilter(rq=[('>', '0.85'), ('<', '0.99')])
        self.assertEqual(str(ds1.filters), '( rq > 0.85 AND rq < 0.99 )')

        # A second filter is ORed with the first.
        ds1.filters.addFilter(length=[('>', '1000')])
        self.assertEqual(str(ds1.filters),
                         '( rq > 0.85 AND rq < 0.99 ) OR ( length > 1000 )')

        ds1.filters.removeFilter(0)
        self.assertEqual(str(ds1.filters), '( length > 1000 )')
    def test_uuid(self):
        """newUuid() must change the UUID; a write/read round trip must
        preserve it."""
        ds = AlignmentSet()
        previous = ds.uuid
        _ = ds.newUuid()
        self.assertNotEqual(previous, ds.uuid)

        aln = AlignmentSet(data.getXml(no=8))
        expected_uuid = aln.uuid
        outXml = os.path.join(
            tempfile.mkdtemp(suffix="dataset-doctest"), 'tempfile.xml')
        aln.write(outXml)
        aln = AlignmentSet(outXml)
        self.assertEqual(aln.uuid, expected_uuid)
Exemplo n.º 48
0
    def test_uuid(self):
        """newUuid() changes the UUID; a write/read round trip keeps it."""
        ds = AlignmentSet()
        stale = ds.uuid
        _ = ds.newUuid()
        assert stale != ds.uuid

        aln = AlignmentSet(data.getXml(7))
        expected = aln.uuid
        out_dir = tempfile.mkdtemp(suffix="dataset-doctest")
        out_xml = os.path.join(out_dir, 'tempfile.xml')
        aln.write(out_xml)
        assert AlignmentSet(out_xml).uuid == expected
 def test_pbmerge_indexing(self):
     """Consolidate a two-file AlignmentSet into one BAM and check that
     induceIndices() reuses an existing .pbi unless force=True."""
     log.debug("Test through API")
     aln = AlignmentSet(data.getXml(12))
     self.assertEqual(len(aln.toExternalFiles()), 2)
     outdir = tempfile.mkdtemp(suffix="dataset-unittest")
     outfn = os.path.join(outdir, 'merged.bam')
     log.info(outfn)
     consolidateXml(aln, outfn, cleanup=False)
     self.assertTrue(os.path.exists(outfn))
     self.assertTrue(os.path.exists(outfn + '.pbi'))
     cons = AlignmentSet(outfn)
     self.assertEqual(len(aln), len(cons))
     # Remember the index's stat record to detect regeneration below.
     orig_stats = os.stat(outfn + '.pbi')
     cons.externalResources[0].pbi = None
     self.assertEqual(None, cons.externalResources[0].pbi)
     # test is too quick, stat times might be within the same second
     time.sleep(1)
     # Without force, the existing index should be picked up unchanged.
     cons.induceIndices()
     self.assertEqual(outfn + '.pbi', cons.externalResources[0].pbi)
     self.assertEqual(orig_stats, os.stat(cons.externalResources[0].pbi))
     cons.externalResources[0].pbi = None
     self.assertEqual(None, cons.externalResources[0].pbi)
     # test is too quick, stat times might be within the same second
     time.sleep(1)
     # With force=True the index must be rebuilt (different stat record).
     cons.induceIndices(force=True)
     self.assertNotEqual(orig_stats, os.stat(cons.externalResources[0].pbi))
Exemplo n.º 50
0
    def test_loadmetadata_from_dataset_create_cli(self):
        """`dataset create --metadata` should copy collection metadata from
        the given subreadset into the newly created dataset XML."""
        fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
        fn2 = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
        log.debug(fn)

        # Start from a copy that is guaranteed to have no collections.
        aln = AlignmentSet(data.getXml(8))
        aln.metadata.collections = None
        aln.copyTo(fn)
        aln.close()
        del aln
        self.assertTrue(os.path.exists(fn))

        aln = AlignmentSet(fn)
        self.assertFalse(aln.metadata.collections)

        cmd = "dataset create --metadata {m} {o} {i}".format(
            o=fn2,
            i=fn,
            m=("/pbi/dept/secondary/siv/testdata/"
               "SA3-Sequel/lambda/roche_SAT/"
               "m54013_151205_032353.subreadset.xml"))
        log.debug(cmd)
        o, r, m = backticks(cmd)
        self.assertEqual(r, 0, m)
        # The created dataset should have picked up the collections.
        aln = AlignmentSet(fn2)
        self.assertTrue(aln.metadata.collections)
Exemplo n.º 51
0
    def test_alignmentset_partial_consolidate(self):
        """Consolidating a 3-file AlignmentSet with numFiles=2 should produce
        two infix-named BAMs, preserve all records, and also work via the
        CLI.  Requires internal test data under /mnt/secondary-siv."""
        testFile = ("/mnt/secondary-siv/testdata/SA3-DS/"
                    "lambda/2372215/0007_tiny/Alignment_"
                    "Results/m150404_101626_42267_c10080"
                    "7920800000001823174110291514_s1_p0."
                    "all.alignmentset.xml")
        aln = AlignmentSet(testFile)
        # Keep an unconsolidated copy for record-level comparison below.
        nonCons= AlignmentSet(testFile)
        self.assertEqual(len(aln.toExternalFiles()), 3)
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, 'merged.bam')
        aln.consolidate(outfn, numFiles=2)
        # With numFiles=2 the bare name is not used; infixed names are.
        self.assertFalse(os.path.exists(outfn))
        self.assertTrue(os.path.exists(_infixFname(outfn, "0")))
        self.assertTrue(os.path.exists(_infixFname(outfn, "1")))
        self.assertEqual(len(aln.toExternalFiles()), 2)
        self.assertEqual(len(nonCons.toExternalFiles()), 3)
        # Records must survive consolidation unchanged.
        for read1, read2 in zip(sorted(list(aln)), sorted(list(nonCons))):
            self.assertEqual(read1, read2)
        self.assertEqual(len(aln), len(nonCons))

        log.debug("Test cli")
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        datafile = os.path.join(outdir, "merged.bam")
        xmlfile = os.path.join(outdir, "merged.xml")
        cmd = "dataset.py consolidate --numFiles 2 {i} {d} {x}".format(
            i=testFile, d=datafile, x=xmlfile)
        log.debug(cmd)
        o, r, m = backticks(cmd)
        self.assertEqual(r, 0)
Exemplo n.º 52
0
def to_chunked_alignmentset_files(alignmentset_path, reference_path,
                                  max_total_nchunks, chunk_key, dir_name,
                                  base_name, ext):
    """
    Split an AlignmentSet into at most `max_total_nchunks` contig-based
    chunks, write each chunk XML into `dir_name`, and yield a PipelineChunk
    per chunk.

    :param alignmentset_path: path to the input AlignmentSet XML
    :param reference_path: path to the ReferenceSet XML; validated here and
        attached to every chunk under '$chunk.reference_id'
    :param chunk_key: key under which each chunk file path is registered
    :param dir_name: output directory for the chunk XML files
    :param base_name: file-name stem for the chunks
    :param ext: file-name extension for the chunks
    """
    dset = AlignmentSet(alignmentset_path, strict=True)
    dset_chunks = dset.split(contigs=True, maxChunks=max_total_nchunks,
                             breakContigs=True)

    # sanity checking: strict=True raises if the reference set is invalid
    # (the returned object itself is not otherwise used).
    ReferenceSet(reference_path, strict=True)

    d = {}
    # NOTE: the loop variable previously shadowed `dset`; renamed.
    for i, chunk_ds in enumerate(dset_chunks):
        chunk_id = '_'.join([base_name, str(i)])
        chunk_name = '.'.join([chunk_id, ext])
        chunk_path = os.path.join(dir_name, chunk_name)
        chunk_ds.write(chunk_path)
        d[chunk_key] = os.path.abspath(chunk_path)
        d['$chunk.reference_id'] = reference_path
        c = PipelineChunk(chunk_id, **d)
        yield c
    def test_split_references(self):
        """split_references should partition records by reference, capping
        the number of output sets at the number of distinct references.
        Requires internal test data under /pbi/dept."""
        test_file_1 = ('/pbi/dept/secondary/siv/testdata/SA3-RS/lambda/'
                       '2372215/0007_tiny/Alignment_Results/m150404_1016'
                       '26_42267_c100807920800000001823174110291514_s1_p'
                       '0.1.aligned.bam')
        test_file_2 = ('/pbi/dept/secondary/siv/testdata/SA3-Sequel/ecoli/'
                       '315/3150204/r54049_20160508_152025/1_A01/Alignment'
                       '_Results/m54049_160508_155917.alignmentset.xml')
        test_file_3 = ('/pbi/dept/secondary/siv/testdata/SA3-RS/ecoli/'
                       'tiny-multimovie/Alignment_Results/'
                       'combined.alignmentset.xml')
        NREC1 = len(AlignmentSet(test_file_1))
        NREC2 = len(AlignmentSet(test_file_2))
        NREC3 = len(AlignmentSet(test_file_3))
        NREC = NREC1 + NREC2 + NREC3
        self.assertNotEqual(NREC1, 0)
        self.assertNotEqual(NREC2, 0)
        self.assertNotEqual(NREC3, 0)
        self.assertNotEqual(NREC, 0)
        ds1 = AlignmentSet(test_file_1, test_file_2, test_file_3)
        # used to get total:
        #self.assertEqual(sum(1 for _ in ds1), N_RECORDS)
        self.assertEqual(len(ds1), NREC)
        # Asking for one chunk returns everything in a single set.
        dss = ds1.split_references(1)
        self.assertEqual(len(dss), 1)
        self.assertEqual(sum([len(ds_) for ds_ in dss]), NREC)
        # Splitting must not mutate the original set or add filters.
        self.assertEqual(len(ds1), NREC)
        self.assertFalse(ds1.filters)

        # Asking for more chunks than references caps at the number of
        # distinct references (two here: E. coli and lambda).
        dss = ds1.split_references(12)
        self.assertEqual(len(dss), 2)
        self.assertEqual(sum([len(ds_) for ds_ in dss]),
                         NREC)
        # Each resulting set covers exactly one reference.
        self.assertEqual(len(set(dss[0].index.tId)), 1)
        self.assertEqual(len(set(dss[-1].index.tId)), 1)
        self.assertEqual(
            dss[0].tid2rname[list(set(dss[0].index.tId))[0]],
            'ecoliK12_pbi_March2013')
        self.assertEqual(len(dss[0]), NREC2 + NREC3)
        self.assertEqual(
            dss[-1].tid2rname[list(set(dss[-1].index.tId))[0]],
            'lambda_NEB3011')
        self.assertEqual(len(dss[-1]), NREC1)
Exemplo n.º 54
0
 def test_filter_cli(self):
     """CLI `dataset filter` should agree with the equivalent API filter."""
     out_dir = tempfile.mkdtemp(suffix="dataset-unittest")
     out_xml = os.path.join(out_dir, "filtered8.xml")
     log.debug(out_xml)
     cmd = "dataset filter {i} {o} {f}".format(
         i=data.getXml(8),
         o=out_xml,
         f="rname=E.faecalis.1")
     log.debug(cmd)
     o, r, m = backticks(cmd)
     if r != 0:
         # Surface the CLI error output before failing.
         log.debug(m)
     self.assertEqual(r, 0)
     self.assertTrue(os.path.exists(out_xml))
     # Build the reference result through the API.
     expected = AlignmentSet(data.getXml(8))
     expected.filters.addRequirement(rname=[('=', 'E.faecalis.1')])
     expected.updateCounts()
     filtered = AlignmentSet(out_xml)
     self.assertEqual(str(expected.filters), str(filtered.filters))
     self.assertEqual(expected.totalLength, filtered.totalLength)
     self.assertEqual(expected.numRecords, filtered.numRecords)
Exemplo n.º 55
0
 def loadSharedAlignmentSet(self, cmpH5Filename):
     """
     Open the input AlignmentSet once so its indices can be shared with
     the slaves, and cache the reference info table.  Also used to pass to
     ReferenceUtils for setting up the ipdModel object.
     """
     logging.info("Reading AlignmentSet: %s" % cmpH5Filename)
     logging.info("           reference: %s" % self.args.reference)
     referenceFname = self.args.reference
     self.alignments = AlignmentSet(cmpH5Filename,
                                    referenceFastaFname=referenceFname)
     # XXX this should ensure that the file(s) get opened, including any
     # .pbi indices - but need to confirm this
     self.refInfo = self.alignments.referenceInfoTable
Exemplo n.º 56
0
    def test_referenceInfoTableMerging(self):
        """After merging resources, reference IDs should be renumbered
        contiguously and the (full) names should be the deduplicated union
        of the per-reader tables."""
        log.info("Testing refIds, etc. after merging")
        ds = DataSet(data.getXml(17))
        also_lambda = ds.toExternalFiles()[0]
        aln = AlignmentSet(data.getBam(0), data.getBam(0), also_lambda)
        readers = aln.resourceReaders()

        # IDs are expected to be 0..n-1 after merging.
        # NOTE(review): comparing a list against range(...) only passes on
        # Python 2, where range returns a list.
        ids = sorted([i for _, i in aln.refInfo('ID')])
        self.assertEqual(range(len(ids)), ids)

        # Short names: union over readers, deduplicated.
        accNames = aln.refNames
        expNames = reduce(np.append,
                          [reader.referenceInfoTable['Name']
                           for reader in readers])
        expNames = np.unique(expNames)
        self.assertEqual(sorted(expNames), sorted(accNames))

        # Full names: same check against the FullName column.
        accNames = aln.fullRefNames
        expNames = reduce(np.append,
                          [reader.referenceInfoTable['FullName']
                           for reader in readers])
        expNames = np.unique(expNames)
        self.assertEqual(sorted(expNames), sorted(accNames))
Exemplo n.º 57
0
    def test_loadMetadata(self):
        """loadMetadata should populate collection metadata from a
        run.metadata.xml, restore metadata equal to the original after a
        round trip, and reject a non-metadata file with
        InvalidDataSetIOError.  Requires internal test data."""
        aln = AlignmentSet(data.getXml(no=8))
        self.assertFalse(aln.metadata.collections)
        aln.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                         'SA3-Sequel/lambda/roche_SAT/'
                         'm54013_151205_032353.run.metadata.xml')
        self.assertTrue(aln.metadata.collections)
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        # Clear collections, then reload them from the run metadata.
        sset.metadata.collections = None
        self.assertFalse(sset.metadata.collections)
        sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                          'SA3-Sequel/lambda/roche_SAT/'
                          'm54013_151205_032353.run.metadata.xml')
        # (an unused `stack = zip(...)` temporary was removed here)
        fn = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
        sset.write(fn)
        validateFile(fn)
        validateFile(sset_fn)
        # Reloading must restore metadata identical to the original.
        self.assertEqual(sset.metadata, orig_metadata)


        # load the wrong thing...
        sset_fn = ('/pbi/dept/secondary/siv/testdata/'
                   'SA3-Sequel/lambda/roche_SAT/'
                   'm54013_151205_032353.subreadset.xml')
        sset = SubreadSet(sset_fn)
        orig_metadata = copy.deepcopy(sset.metadata)
        sset.metadata.collections = None
        self.assertFalse(sset.metadata.collections)
        with self.assertRaises(InvalidDataSetIOError):
            sset.loadMetadata('/pbi/dept/secondary/siv/testdata/'
                              'SA3-Sequel/lambda/roche_SAT/'
                              'm54013_151205_032353.sts.xml')
Exemplo n.º 58
0
    def test_absolutize_cli(self):
        """Exercise `dataset absolutize` three ways: in place, with
        --outdir pointing at a file name, and with --outdir pointing at a
        directory."""
        # Case 1: absolutize in place.
        fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
        aln = AlignmentSet(data.getXml(8))
        aln.copyTo(fn, relative=True)
        self.assertTrue(_is_relative(fn))
        cmd = "dataset absolutize {d}".format(d=fn)
        log.debug(cmd)
        o, r, m = backticks(cmd)
        self.assertEqual(r, 0)
        self.assertTrue(os.path.exists(fn))
        self.assertFalse(_is_relative(fn))

        # Case 2: --outdir given a file path; original stays relative.
        fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
        outfn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
        aln = AlignmentSet(data.getXml(8))
        aln.copyTo(fn, relative=True)
        self.assertTrue(_is_relative(fn))
        cmd = "dataset absolutize {d} --outdir {o}".format(d=fn, o=outfn)
        log.debug(cmd)
        o, r, m = backticks(cmd)
        self.assertEqual(r, 0)
        self.assertTrue(os.path.exists(fn))
        self.assertTrue(os.path.exists(outfn))
        self.assertTrue(_is_relative(fn))
        self.assertFalse(_is_relative(outfn))

        # Case 3: --outdir given a directory; output keeps the input's
        # base name inside that directory.
        fn = tempfile.NamedTemporaryFile(suffix=".alignmentset.xml").name
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfn = os.path.join(outdir, os.path.split(fn)[1])
        aln = AlignmentSet(data.getXml(8))
        aln.copyTo(fn, relative=True)
        self.assertTrue(_is_relative(fn))
        cmd = "dataset absolutize {d} --outdir {o}".format(d=fn, o=outdir)
        log.debug(cmd)
        o, r, m = backticks(cmd)
        self.assertEqual(r, 0)
        self.assertTrue(os.path.exists(fn))
        self.assertTrue(os.path.exists(outfn))
        self.assertTrue(_is_relative(fn))
        self.assertFalse(_is_relative(outfn))
    def test_uuid(self):
        """Generating a new UUID changes it; writing to XML and reading the
        file back preserves it."""
        ds = AlignmentSet()
        before = ds.uuid
        _ = ds.newUuid()
        self.assertNotEqual(before, ds.uuid)

        aln = AlignmentSet(data.getXml(no=8))
        saved_uuid = aln.uuid
        outdir = tempfile.mkdtemp(suffix="dataset-doctest")
        outXml = os.path.join(outdir, 'tempfile.xml')
        aln.write(outXml)
        reloaded = AlignmentSet(outXml)
        self.assertEqual(reloaded.uuid, saved_uuid)
Exemplo n.º 60
0
    def test_write(self):
        outdir = tempfile.mkdtemp(suffix="dataset-unittest")
        outfile = os.path.join(outdir, 'tempfile.xml')
        ds1 = AlignmentSet(data.getBam())
        ds1.write(outfile)
        log.debug('Validated file: {f}'.format(f=outfile))
        validateFile(outfile)
        ds2 = AlignmentSet(outfile)
        self.assertTrue(ds1 == ds2)

        # Should fail when strict:
        ds3 = AlignmentSet(data.getBam())
        ds3.write(outfile)