def testReadName(self):
    """Check ``readName`` on the forward and reverse alignment fixtures."""
    fwdExpected = "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9681_9727"
    revExpected = "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9561_9619"

    EQ(fwdExpected, self.fwdAln.readName)
    EQ(revExpected, self.revAln.readName)
def testBaxAttaching(self):
    """Check ZMW accessors before and after attaching the BAX file."""
    fwd = self.fwdAln

    # Before attaching, both ZMW accessors must raise ValueError.
    for accessor in ("zmw", "zmwRead"):
        with assert_raises(ValueError):
            getattr(fwd, accessor)

    # Now attach; names and reprs of the ZMW objects become available.
    self.f.attach(self.BAX_FILE)
    expectedReadName = 'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9681_9727'
    expectedZmwName = 'm140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957'
    expectedZmwRepr = '<Zmw: m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957>'
    expectedZmwReadRepr = '<ZmwRead: m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7957/9681_9727>'
    EQ(expectedReadName, fwd.readName)
    EQ(expectedZmwName, fwd.zmwName)
    EQ(expectedZmwRepr, repr(fwd.zmw))
    EQ(expectedZmwReadRepr, repr(fwd.zmwRead))

    # For every alignment, the unaligned native-orientation read must equal
    # the basecalls of its ZMW read.
    for aln in self.alns:
        nativeRead = aln.read(aligned=False, orientation="native")
        EQ(nativeRead, aln.zmwRead.basecalls())
def testClippedAlignments(self):
    """Clip the forward alignment to reference windows and check its columns.

    Alignments are compared as lists of
    ``(referencePosition, referenceBase, readBase)`` columns.
    """
    def columns(aln):
        # Materialize the zip: on Python 3 zip() returns an iterator, which
        # can be neither sliced nor compared equal to a list.
        return list(zip(aln.referencePositions(), aln.reference(), aln.read()))

    # Columns 308-316 of the forward alignment cover a gappy stretch
    # (one read gap and one reference gap).
    a = self.fwdAln
    EQ([(980, 'C', 'C'), (981, 'C', 'C'), (982, 'T', 'T'), (983, 'A', '-'),
        (984, 'C', 'C'), (985, '-', 'G'), (985, 'T', 'T'), (986, 'T', 'T')],
       columns(a)[308:316])

    ac1 = a.clippedTo(983, 985)
    EQ(983, ac1.referenceStart)
    EQ(985, ac1.referenceEnd)
    EQ([(983, 'A', '-'), (984, 'C', 'C')], columns(ac1))

    ac2 = a.clippedTo(982, 986)
    EQ(982, ac2.referenceStart)
    EQ(986, ac2.referenceEnd)
    EQ([(982, 'T', 'T'), (983, 'A', '-'), (984, 'C', 'C'), (985, '-', 'G'),
        (985, 'T', 'T')],
       columns(ac2))

    ac3 = a.clippedTo(984, 985)
    EQ(984, ac3.referenceStart)
    EQ(985, ac3.referenceEnd)
    EQ([(984, 'C', 'C')], columns(ac3))
def test_readPositions(self):
    """Check ``readPositions()`` against ``read()`` in both orientations.

    Each comparison pairs aligned read characters with their read
    positions. The zip is materialized with ``list`` so the comparison
    against a list literal also holds where ``zip`` returns an iterator.
    """
    fwdHit = self._inCmpH5[26]
    revHit = self.hit0

    fwdExpected = [('A', 44), ('A', 45), ('C', 46), ('T', 47), ('G', 48),
                   ('G', 49), ('T', 50), ('-', 51), ('-', 51), ('C', 51)]

    # Native orientation on a fwd strand read
    EQ(fwdExpected,
       list(zip(fwdHit.read()[:10], fwdHit.readPositions()[:10])))

    # Genomic orientation on a fwd strand read (same expected columns)
    EQ(fwdExpected,
       list(zip(fwdHit.read(orientation="genomic")[:10],
                fwdHit.readPositions(orientation="genomic")[:10])))

    # Native orientation on a rev. strand read (last ten columns)
    EQ([('T', 295), ('C', 296), ('C', 297), ('G', 298), ('-', 299),
        ('C', 299), ('G', 300), ('C', 301), ('C', 302), ('C', 303)],
       list(zip(revHit.read()[-10:], revHit.readPositions()[-10:])))

    # Genomic orientation on a rev. strand read (first ten columns)
    EQ([('G', 303), ('G', 302), ('G', 301), ('C', 300), ('G', 299),
        ('-', 298), ('C', 298), ('G', 297), ('G', 296), ('A', 295)],
       list(zip(revHit.read(orientation="genomic")[:10],
                revHit.readPositions(orientation="genomic")[:10])))
def testErrorCounts(self):
    """Compare transcript character counts with the alignment's counters."""
    # Transcript code -> name of the alignment attribute holding its count.
    codeToAttr = (("M", "nM"), ("R", "nMM"), ("I", "nIns"), ("D", "nDel"))
    for aln in (self.fwdAln, self.revAln):
        tally = Counter(aln.transcript())
        for code, attrName in codeToAttr:
            EQ(tally[code], getattr(aln, attrName))
def testUnalignedReference(self):
    """Check the unaligned reference sequence in native and genomic orientation.

    The forward alignment is expected to give the same sequence in both
    orientations; the reverse alignment's genomic sequence is expected to
    be ``RC`` of its native one.
    """
    fwd, rev = self.fwdAln, self.revAln

    fwdNativeRef = "GCCGCGCTGGATGAACTGATACCGGGGTTGCTGAGTGAATATATCGAACAGTCAGGTTAACAGGCTGCGGCATTTTGTCCGCGCCGGGCTTCGCTCACTGTTCAGGCCGGAGCCACAGACCGCCGTTGAATGGGCGGATGCTAATTACTATCTCCCGAAAGAATCCGCATACCAGGAAGGGCGCTGGGAAACACTGCCCTTTCAGCGGGCCATCATGAATGCGATGGGCAGCGACTACATCCGTGAGGTGAATGTGGTGAAGTCTGCCCGTGTCGGTTATTCCAAAATGCTGCTGGGTGTTTATGCCTACTTTATAGAGCATAAGCAGCGCAACACCCTTATCTGGTTGCC"
    EQ(fwdNativeRef, fwd.reference(aligned=False))
    EQ(fwdNativeRef, fwd.reference(aligned=False, orientation="genomic"))

    revNativeRef = "TAGCCACCGGATATCCCACAGGTGAGCCGTGTAGTTGAAGGTTTTTACGTCAGATTCTTTTGGGATTGGCTTGGGTTTATTTCTGGTGCGTTTCGTTGGAAGGTATTTGCAGTTTTCGCAGATTATGTCGGTGATACTTCGTCGCTGTCTCGCCACACGTCCTCCTTTTCCTGCGGTAGTGGTAACACCCC"
    EQ(revNativeRef, rev.reference(aligned=False))
    EQ(RC(revNativeRef), rev.reference(aligned=False, orientation="genomic"))
def testReadName(self):
    """Check ``readName`` on the forward and reverse alignment fixtures."""
    fwdExpected = "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/1_344"
    revExpected = "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/51534/1_200"

    EQ(fwdExpected, self.fwdAln.readName)
    EQ(revExpected, self.revAln.readName)
def test_reads_in_range_bounds(self):
    """Check ``readsInRange`` hit counts at several window boundaries."""
    reader = self._inCmpH5
    inRange = reader.readsInRange

    # The window (0, 1) on reference 1 yields two hits, all starting at 0.
    EQ(len(inRange(1, 0, 1)), 2)
    startsAtZero = [hit.tStart == 0 for hit in inRange(1, 0, 1)]
    EQ(all(startsAtZero), True)

    # Extending the window end from 1051 to 1052 picks up exactly one hit.
    EQ(len(inRange(1, 1000, 1051)), 0)
    EQ(len(inRange(1, 1000, 1052)), 1)

    # A huge window returns as many hits as the reader holds.
    EQ(len(inRange(1, 0, 1e20)), len(reader))
def test_referencePositions(self):
    """Check ``referencePositions()`` against ``reference()`` in both orientations.

    Each comparison pairs aligned reference characters with their reference
    positions. The zip is materialized with ``list`` so the comparison
    against a list literal also holds where ``zip`` returns an iterator.
    """
    fwdHit = self._inCmpH5[26]
    revHit = self._inCmpH5[0]

    fwdExpected = [('G', 7466), ('A', 7467), ('A', 7468), ('G', 7469),
                   ('-', 7470), ('C', 7470), ('T', 7471), ('-', 7472),
                   ('G', 7472), ('C', 7473)]

    # Native orientation on a fwd strand read
    EQ(fwdExpected,
       list(zip(fwdHit.reference()[25:35],
                fwdHit.referencePositions()[25:35])))

    # Genomic orientation on a fwd strand read (same expected columns)
    EQ(fwdExpected,
       list(zip(fwdHit.reference(orientation="genomic")[25:35],
                fwdHit.referencePositions(orientation="genomic")[25:35])))

    # Native orientation on a rev. strand read (last ten columns)
    EQ([('T', 8), ('C', 7), ('-', 6), ('G', 6), ('C', 5),
        ('C', 4), ('G', 3), ('C', 2), ('C', 1), ('C', 0)],
       list(zip(revHit.reference()[-10:],
                revHit.referencePositions()[-10:])))

    # Genomic orientation on a rev. strand read (first ten columns)
    EQ([('G', 0), ('G', 1), ('G', 2), ('C', 3), ('G', 4),
        ('G', 5), ('C', 6), ('-', 7), ('G', 7), ('A', 8)],
       list(zip(revHit.reference(orientation="genomic")[:10],
                revHit.referencePositions(orientation="genomic")[:10])))
def testReadGroupTable(self):
    """Check dtype, chemistry and movie name of the forward read group info."""
    readGroup = self.fwdAln.readGroupInfo

    expectedDtype = [
        ('ID', '<i4'),
        ('MovieName', 'O'),
        ('ReadType', 'O'),
        ('SequencingChemistry', 'O'),
        ('FrameRate', '<f8'),
    ]
    EQ(expectedDtype, readGroup.dtype)
    EQ("P6-C4", readGroup.SequencingChemistry)
    EQ("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0",
       readGroup.MovieName)
def testTranscript(self):
    """Check the full alignment transcript of both alignment fixtures."""
    fwdTranscript = 'MMMMMMRMDMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMDMMMDMMMMMMMMMMMMMMRMMMMMMMMMMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMDMMMMMMMMDMIMMMMMMMMMMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMM'
    EQ(fwdTranscript, self.fwdAln.transcript())

    revTranscript = "MMMMMMMMMMMMMMMIMMMMMMMMMMIMMMMMMMIMMMMMDMMMIMMMMIMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM"
    EQ(revTranscript, self.revAln.transcript())
def testUnalignedRead(self):
    """Check the unaligned read sequence in native and genomic orientation.

    The forward alignment is expected to give the same sequence in both
    orientations; the reverse alignment's genomic sequence is expected to
    be ``RC`` of its native one.
    """
    fwd, rev = self.fwdAln, self.revAln

    fwdNativeRead = "TACGGTCATCATCTGACACTACAGACTCTGGCATCGCTGTGAAGAC"
    EQ(fwdNativeRead, fwd.read(aligned=False))
    EQ(fwdNativeRead, fwd.read(aligned=False, orientation="genomic"))

    revNativeRead = "CTTGTGAAAATGCTGAATTCTGCGTCGCTTCACCAGCGATGCCAAGTCTGTAGTGTCA"
    EQ(revNativeRead, rev.read(aligned=False))
    EQ(RC(revNativeRead), rev.read(aligned=False, orientation="genomic"))
def test_load_updated_mapping(self):
    """Check that barcode mappings follow ``SMRT_CHEMISTRY_BUNDLE_DIR``.

    With the variable pointing at the directory of the test mapping XML,
    the key ``("1", "2", "3.4")`` must map to ``"FOUND"``; with the
    variable unset the key must be absent.

    The environment is restored in a ``finally`` block so that a failing
    assertion cannot leak the variable into other tests, and a value that
    was set before the test is put back afterwards.
    """
    import os
    from os.path import dirname
    from pbcore.chemistry.chemistry import _loadBarcodeMappings

    envVar = "SMRT_CHEMISTRY_BUNDLE_DIR"
    key = ("1", "2", "3.4")
    # Remember (and remove) any pre-existing value so it can be restored.
    previous = os.environ.pop(envVar, None)
    try:
        os.environ[envVar] = dirname(data.getMappingXml())
        mappings = _loadBarcodeMappings()
        EQ(mappings.get(key, None), "FOUND")

        del os.environ[envVar]
        mappings = _loadBarcodeMappings()
        EQ(mappings.get(key, None), None)
    finally:
        os.environ.pop(envVar, None)
        if previous is not None:
            os.environ[envVar] = previous
def testHoleNumbers(self):
    """Check per-hole alignment counts from the records and from the index."""
    perRecord = Counter(aln.holeNumber for aln in self.f)  # from records
    perIndex = Counter(self.f.holeNumber)                  # from index

    expectedCounts = Counter({
        37134: 14, 6251: 10, 32861: 8, 14743: 4,
        35858: 3, 39571: 3, 13473: 3, 32560: 3,
        46835: 3, 47698: 3, 16996: 3, 30983: 2,
        38025: 2, 36363: 2, 7957: 2, 49050: 2,
        23454: 2, 49194: 2, 24494: 2, 20211: 2,
        50621: 2, 12736: 2, 19915: 2, 6469: 2,
        31174: 2, 32328: 2, 42827: 2, 7247: 2,
        50257: 2, 2771: 2, 1650: 2, 45203: 2,
        24962: 1, 32901: 1, 36628: 1, 26262: 1,
        15641: 1, 19360: 1, 42165: 1, 44356: 1,
        51534: 1, 29843: 1, 38754: 1, 52206: 1,
        49521: 1, 7670: 1, 54396: 1, 19837: 1,
    })

    EQ(expectedCounts, perRecord)
    EQ(expectedCounts, perIndex)
def testReadsByName(self):
    """Check that three spellings of a ZMW query return the same two reads.

    The query is given with a trailing ``/*``, with no suffix, and with a
    trailing ``/``.
    """
    queries = (
        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/2771/*",
        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/2771",
        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/2771/",
    )
    # Run every lookup first, then check the results in the same order.
    resultSets = [self.f.readsByName(query) for query in queries]

    expectedReadNames = [
        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/2771/8741_8874",
        "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/2771/8942_9480",
    ]
    for reads in resultSets:
        EQ(expectedReadNames, [r.readName for r in reads])
def test_cigar(self):
    """Check the CIGAR string of hit 0 in default and genomic orientation."""
    hit = self._inCmpH5[0]

    defaultCigar = (
        "6M2D12M1I10M1D21M1I2M2I7M1I2M1I10M1I4M1I11M1D1I4M1I1M1I36M1I4M2I1M"
        "1D2M1I9M1I15M1I9M1I4M9D9M1I2M1D16M1I20M1D4M1D8M3I12M1I2M2I7M1I4M1I"
        "1M1D4M1I6M1I1M1D5M"
    )
    EQ(defaultCigar, hit.cigar())

    genomicCigar = (
        "5M1D1M1I6M1I4M1D1M1I4M1I7M2I2M1I12M3I8M1D4M1D20M1I16M1D2M1I9M9D4M1"
        "I9M1I15M1I9M1I2M1D1M2I4M1I36M1I1M1I4M1I1D11M1I4M1I10M1I2M1I7M2I2M1"
        "I21M1D10M1I12M2D6M"
    )
    EQ(genomicCigar, self._inCmpH5[0].cigar(orientation="genomic"))
def test_diploid_variantsFromAlignment():
    """Check ``variantsFromAlignment`` on haploid and ambiguity-code reads.

    Every case compares the reference ``"GATTACA"`` in window
    ``(0, 10, 17)`` with a different read string.
    """
    refWin = (0, 10, 17)
    refSeq = "GATTACA"

    def called(readSeq):
        # Variants reported for ``readSeq`` against the fixed reference.
        return variantsFromAlignment(refWin, refSeq, readSeq)

    # Identical sequences: no variants.
    EQ([], called("GATTACA"))

    # Substitutions, single and adjacent.
    EQ([Variant(0, 13, 14, "T", "G")], called("GATGACA"))
    EQ([Variant(0, 12, 14, "TT", "GG")], called("GAGGACA"))

    # Read containing an "N" between two substituted positions.
    EQ([Variant(0, 12, 13, "T", "G"), Variant(0, 14, 15, "A", "G")],
       called("GAGNGCA"))

    # Read shorter / longer than the reference by one base.
    EQ([Variant(0, 15, 16, "C", "")], called("GATTAA"))
    EQ([Variant(0, 12, 12, "", "T")], called("GATTTACA"))

    # Reads containing "W": expected variants carry a second read allele.
    EQ([Variant(0, 13, 14, "T", "A", "T")], called("GATWACA"))
    EQ([Variant(0, 12, 13, "T", "A", "T"), Variant(0, 13, 14, "T", "A", "T")],
       called("GAWWACA"))
def test_retrieve_read_group_properties(self):
    """Round-trip ``SAM_IN`` through a BAM file and check read group fields.

    The SAM text is written to disk, converted to BAM with pysam, and read
    back with ``BamReader``; every record must report chemistry "P6-C4",
    and the movie names must come back in the expected order.

    Both files live in a private temporary directory that is removed when
    the test finishes. The earlier ``NamedTemporaryFile(...).name`` pattern
    dropped its handle immediately, leaving only a reusable path, and never
    cleaned up the files written to it afterwards.
    """
    import os.path
    import shutil

    tmpDir = tempfile.mkdtemp()
    try:
        f1 = os.path.join(tmpDir, "input.sam")
        f2 = os.path.join(tmpDir, "output.bam")
        with open(f1, "w") as f:
            f.write(self.SAM_IN)

        # Copy every SAM record into a BAM that shares the SAM header.
        with pysam.AlignmentFile(f1) as sam_in:
            with pysam.AlignmentFile(f2, 'wb', template=sam_in) as bam_out:
                for aln in sam_in:
                    bam_out.write(aln)

        movie_names = []
        with BamReader(f2) as bam_in:
            for aln in bam_in:
                EQ(aln.sequencingChemistry, "P6-C4")
                movie_names.append(aln.movieName)
        EQ(movie_names,
           ['movie1',
            'm140906_231018_42161_c100676332550000001823129611271486_s1_p0'])
    finally:
        shutil.rmtree(tmpDir, ignore_errors=True)
def test_empty_bam_reads_in_range(self):
    """Check that ``readsInRange`` on the empty aligned BAM returns nothing."""
    emptyBamPath = data.getEmptyAlignedBam()
    with IndexedBamReader(emptyBamPath) as reader:
        hitIndices = reader.readsInRange(
            "lambda_NEB3011", 0, 50000, justIndices=True)
        EQ(len(hitIndices), 0)
def testNoCallBasesInReference1(self):
    """Check variant calls for an alignment whose reference contains an ``N``.

    The expected output has no variant for the ``N`` column itself; the
    substitution after it carries ``refPrev="N"``.
    """
    alignment = PairwiseAlignment("GATTNGATT", "GAGGATATT")
    refWindow = (1, 1000, 2000)
    found = utils.variantsFromAlignment(alignment, refWindow)

    expected = [
        Variant(1, 1002, 1004, "TT", "GG", refPrev="A", readPrev="A"),
        Variant(1, 1005, 1006, "G", "T", refPrev="N", readPrev="A"),
    ]
    EQ(expected, found)
def testTwoSubstitutions(self):
    """Check that two separated substitutions yield two separate variants."""
    alignment = PairwiseAlignment("GATTACA", "GAGTAGA")
    refWindow = (1, 1000, 2000)
    found = utils.variantsFromAlignment(alignment, refWindow)

    expected = [
        Variant(1, 1002, 1003, "T", "G", refPrev="A", readPrev="A"),
        Variant(1, 1005, 1006, "C", "G", refPrev="A", readPrev="A"),
    ]
    EQ(expected, found)
def test_algorithm_selection():
    """Check the algorithm ``bestAlgorithm_`` picks for chemistry lists."""
    cases = (
        ("quiver", ["P6-C4"]),
        ("quiver", ["P6-C4", "P5-C3"]),
        ("arrow", ["S/P1-C1/beta"]),
        ("arrow", ["P6-C4", "S/P1-C1/beta"]),
        (None, ["P6-C4", "unknown"]),
        ("arrow", ["S/P1-C1"]),
        ("arrow", ["P6-C4", "S/P1-C1.1"]),
        # (Arrow presumably has no training for P5, but it will tell us that.)
        ("arrow", ["P5-C3", "S/P1-C1.1"]),
    )
    for expectedAlgorithm, chemistries in cases:
        EQ(expectedAlgorithm, bestAlgorithm_(chemistries))
def testClippingsVsBaxData(self):
    """Compare clipped alignments' reads with basecalls from the BAX file.

    For both alignment fixtures, every start in ``[tStart, tEnd]`` is
    paired with each end from ``start + 1`` up to (excluding)
    ``min(tEnd, start + 10)``.
    """
    self.f.attach(self.BAX_FILE)
    for aln in (self.fwdAln, self.revAln):
        for clipStart in xrange(aln.tStart, aln.tEnd + 1):
            endLimit = min(aln.tEnd, clipStart + 10)
            for clipEnd in xrange(clipStart + 1, endLimit):
                clipped = aln.clippedTo(clipStart, clipEnd)
                EQ(clipped.zmwRead.basecalls(),
                   clipped.read(aligned=False, orientation="native"))
def testReadsInRange(self):
    """Check ``readsInRange`` against a brute-force overlap scan.

    The reference "lambda_NEB3011" is walked in 1000-base windows from 0
    to 50000; for each window the names returned by the reader must equal
    the names of the alignments in ``self.alns`` that overlap it.
    """
    refName = "lambda_NEB3011"
    windowLen = 1000
    for windowStart in xrange(0, 50000, windowLen):
        windowEnd = windowStart + windowLen
        expectedNames = {
            aln.readName
            for aln in self.alns
            if (aln.referenceName == "lambda_NEB3011" and
                aln.overlapsReferenceRange(windowStart, windowEnd))
        }
        observedNames = {
            aln.readName
            for aln in self.f.readsInRange(refName, windowStart, windowEnd)
        }
        EQ(expectedNames, observedNames)
def test_alignment_identity(self):
    """
    The reader-level 'identity' values of IndexedBamReader must match the
    per-record 'identity' values collected by iterating the same reader.
    """
    bamPath = data.getBamAndCmpH5()[0]
    with IndexedBamReader(bamPath) as reader:
        fromReader = reader.identity
        fromRecords = np.array([record.identity for record in reader])
        EQ((fromRecords == fromReader).all(), True)
def testIpd(self):
    """Check that 'Ipd' is among the available pulse features."""
    available = self.bam.pulseFeaturesAvailable()
    expectedFeatures = frozenset((
        'Ipd',
        'DeletionTag',
        'MergeQV',
        'SubstitutionQV',
        'InsertionQV',
        'DeletionQV',
    ))
    EQ(available, expectedFeatures)

    # The IPD accessor is called but its result is not inspected.
    self.bamRead0.IPD(aligned=False, orientation="native")
def test_alignment_identity_unindexed(self):
    """
    Check that the value of the 'identity' property is the same whether
    or not the .pbi index was used to calculate it.

    The BAM is copied on its own into a private temporary directory and
    opened there with the plain BamReader. The directory is removed when
    the test finishes; the earlier ``NamedTemporaryFile(...).name``
    pattern dropped its handle immediately and left the copy behind.
    """
    import os.path

    fn1 = data.getBamAndCmpH5()[0]
    tmpDir = tempfile.mkdtemp()
    try:
        # Only the BAM itself is copied, not any file alongside it.
        fn2 = os.path.join(tmpDir, "noindex.bam")
        shutil.copyfile(fn1, fn2)
        with IndexedBamReader(fn1) as bam_pbi:
            with BamReader(fn2) as bam_noindex:
                i1 = np.array([rec.identity for rec in bam_pbi])
                i2 = np.array([rec.identity for rec in bam_noindex])
                EQ((i2 == i1).all(), True)
    finally:
        shutil.rmtree(tmpDir, ignore_errors=True)
def testVariantsFromAlignment4(self):
    """Check an insertion in the middle of the alignment, with confidences.

    The gap is in the first sequence at column 2; the expected variant
    takes its confidence from that column of the QV list.
    """
    alignment = PairwiseAlignment("GA-TACA", "GATTACA")
    columnQvs = [0, 0, 1, 0, 0, 0, 0]
    found = utils.variantsFromAlignment(alignment, (1, 1000, 2000), columnQvs)

    expected = [
        Variant(1, 1002, 1002, "", "T", confidence=1,
                refPrev="A", readPrev="A"),
    ]
    EQ(expected, found)
def testVariantsFromAlignment5(self):
    """Check an insertion in the first alignment column, with confidences.

    With nothing preceding the gap, the expected variant sits at the
    window start and has "N" for both ``refPrev`` and ``readPrev``.
    """
    alignment = PairwiseAlignment("-ATTACA", "GATTACA")
    columnQvs = [1, 0, 0, 0, 0, 0, 0]
    found = utils.variantsFromAlignment(alignment, (1, 1000, 2000), columnQvs)

    expected = [
        Variant(1, 1000, 1000, "", "G", confidence=1,
                refPrev="N", readPrev="N"),
    ]
    EQ(expected, found)
def testVariantsFromAlignment6(self):
    """Check an insertion in the last alignment column, with confidences.

    The expected variant is placed after the six preceding bases and
    carries "C" for both ``refPrev`` and ``readPrev``.
    """
    alignment = PairwiseAlignment("GATTAC-", "GATTACA")
    columnQvs = [0, 0, 0, 0, 0, 0, 1]
    found = utils.variantsFromAlignment(alignment, (1, 1000, 2000), columnQvs)

    expected = [
        Variant(1, 1006, 1006, "", "A", confidence=1,
                refPrev="C", readPrev="C"),
    ]
    EQ(expected, found)