def testAccessByPosition(self):
    # Positional access on the indexed reader: record 0 must repr as the
    # EGFR exon 2 entry, and a leading/trailing slice must equal the list
    # built from the corresponding single-index lookups.
    reader = IndexedFastaReader(self.FASTAPATH)

    first_record = reader[0]
    assert "<IndexedFastaRecord: ref000001|EGFR_Exon_2>" == repr(first_record)

    leading_pair = reader[:2]
    assert [reader[0], reader[1]] == leading_pair

    trailing_pair = reader[-2:]
    assert [reader[-2], reader[-1]] == trailing_pair
def testAccessByPosition(self):
    # Positional access on the indexed reader: record 0 must repr as the
    # EGFR exon 2 entry, and a leading/trailing slice must equal the list
    # built from the corresponding single-index lookups.
    reader = IndexedFastaReader(self.fastaPath)

    first_record = reader[0]
    assert_equal("<IndexedFastaRecord: ref000001|EGFR_Exon_2>",
                 repr(first_record))

    leading_pair = reader[:2]
    assert_equal([reader[0], reader[1]], leading_pair)

    trailing_pair = reader[-2:]
    assert_equal([reader[-2], reader[-1]], trailing_pair)
def testAccessById(self):
    # Look a record up by its identifier string and verify the header, id,
    # comment, and the complete sequence it exposes.
    reader = IndexedFastaReader(self.FASTAPATH)
    record = reader["ref000021|EGFR_Exon_22"]

    assert "ref000021|EGFR_Exon_22\tMetadataTest" == record.header
    assert "ref000021|EGFR_Exon_22" == record.id
    assert "MetadataTest" == record.comment

    expected_sequence = (
        "CACTGCCTCATCTCTCACCATCCCAAGGTGCCTATCAAGTGGATGGCATTGGAATCAATT"
        "TTACACAGAATCTATACCCACCAGAGTGATGTCTGGAGCTACGGTGAGTCATAATCCTGA"
        "TGCTAATGAGTTTGTACTGAGGCCAAGCTGG")
    assert expected_sequence == record.sequence[:]
def testAccessById(self):
    # Look a record up by its identifier string and verify the header, id,
    # comment, and the complete sequence it exposes.
    reader = IndexedFastaReader(self.fastaPath)
    record = reader["ref000021|EGFR_Exon_22"]

    assert_equal("ref000021|EGFR_Exon_22\tMetadataTest", record.header)
    assert_equal("ref000021|EGFR_Exon_22", record.id)
    assert_equal("MetadataTest", record.comment)

    expected_sequence = (
        "CACTGCCTCATCTCTCACCATCCCAAGGTGCCTATCAAGTGGATGGCATTGGAATCAATT"
        "TTACACAGAATCTATACCCACCAGAGTGATGTCTGGAGCTACGGTGAGTCATAATCCTGA"
        "TGCTAATGAGTTTGTACTGAGGCCAAGCTGG")
    assert_equal(expected_sequence, record.sequence[:])
def testSlice(self):
    # Slice and single-position indexing on a record's sequence object,
    # using both non-negative and negative offsets.
    reader = IndexedFastaReader(self.FASTAPATH)
    seq = reader["ref000021|EGFR_Exon_22"].sequence

    # Range slices from each end of the sequence.
    assert "CACTGCCTCA" == seq[0:10]
    assert "GCCAAGCTGG" == seq[-10:]

    # Single-base lookups, checked in the same order as before.
    for expected_base, position in (("G", -1), ("T", -3), ("C", 0), ("A", 1)):
        assert expected_base == seq[position]
def testSlice(self):
    # Slice and single-position indexing on a record's sequence object,
    # using both non-negative and negative offsets.
    reader = IndexedFastaReader(self.fastaPath)
    seq = reader["ref000021|EGFR_Exon_22"].sequence

    # Range slices from each end of the sequence.
    assert_equal("CACTGCCTCA", seq[0:10])
    assert_equal("GCCAAGCTGG", seq[-10:])

    # Single-base lookups, checked in the same order as before.
    for expected_base, position in (("G", -1), ("T", -3), ("C", 0), ("A", 1)):
        assert_equal(expected_base, seq[position])
def test_dosLineEndingsFasta(self):
    # The streaming and the indexed reader must agree, record for record,
    # on the DOS-formatted FASTA fixture.
    streaming_reader = FastaReader(data.getDosFormattedFasta())
    streamed = list(streaming_reader)

    indexed_reader = IndexedFastaReader(data.getDosFormattedFasta())
    indexed = list(indexed_reader)

    assert_equal(len(streamed), len(indexed))
    for streamed_rec, indexed_rec in zip(streamed, indexed):
        assert_equal(streamed_rec.header, indexed_rec.header)
        # The indexed sequence is sliced in full before comparing.
        assert_equal(streamed_rec.sequence, indexed_rec.sequence[:])
def test_dosLineEndingsFasta(self):
    # The streaming and the indexed reader must agree, record for record,
    # on the DOS-formatted FASTA fixture.
    streaming_reader = FastaReader(data.getDosFormattedFasta())
    streamed = list(streaming_reader)

    indexed_reader = IndexedFastaReader(data.getDosFormattedFasta())
    indexed = list(indexed_reader)

    assert len(streamed) == len(indexed)
    for streamed_rec, indexed_rec in zip(streamed, indexed):
        assert streamed_rec.header == indexed_rec.header
        # The indexed sequence is sliced in full before comparing.
        assert streamed_rec.sequence == indexed_rec.sequence[:]
def testIteration(self):
    # Iterating the indexed reader must yield the same 48 records, in the
    # same order, as the streaming reader over the same file.
    indexed_reader = IndexedFastaReader(self.fastaPath)
    streaming_reader = FastaReader(self.fastaPath)

    indexed_contigs = list(indexed_reader)
    streamed_contigs = list(streaming_reader)

    assert_equal(len(streamed_contigs), len(indexed_contigs))
    assert_equal(48, len(indexed_contigs))

    for indexed_rec, streamed_rec in zip(indexed_contigs, streamed_contigs):
        assert_equal(streamed_rec.header, indexed_rec.header)
        assert_equal(streamed_rec.sequence, indexed_rec.sequence[:])

    # A second pass over the indexed reader must produce every record
    # again: unlike FastaReader, its iteration is repeatable.
    second_pass = list(indexed_reader)
    assert_equal(48, len(second_pass))
def testIteration(self):
    # Iterating the indexed reader must yield the same 48 records, in the
    # same order, as the streaming reader over the same file.
    indexed_reader = IndexedFastaReader(self.FASTAPATH)
    streaming_reader = FastaReader(self.FASTAPATH)

    indexed_contigs = list(indexed_reader)
    streamed_contigs = list(streaming_reader)

    assert len(streamed_contigs) == len(indexed_contigs)
    assert 48 == len(indexed_contigs)

    for indexed_rec, streamed_rec in zip(indexed_contigs, streamed_contigs):
        assert streamed_rec.header == indexed_rec.header
        assert streamed_rec.sequence == indexed_rec.sequence[:]

    # A second pass over the indexed reader must produce every record
    # again: unlike FastaReader, its iteration is repeatable.
    second_pass = list(indexed_reader)
    assert 48 == len(second_pass)
def SummarizeData(indexedFasta, windows, adps):
    """Build one summary row per ZMW that has adapter results.

    Args:
        indexedFasta: Passed straight to ``IndexedFastaReader``.
        windows: Mapping iterated with ``iteritems()``; keys are hole
            numbers and values are 5-tuples ``(_, tid, s, e, target)``
            where ``tid`` indexes the reader and ``s``/``e`` are the start
            and end coordinates of the window on that contig.
        adps: Indexable by hole number, yielding the 4-tuple
            ``(leftTc6, leftAlt, rightTc6, rightAlt)``.

    Returns:
        A sorted list of 25-field tuples, one per hole number for which
        ``adps`` supplied a usable entry.
    """
    summaries = []
    fa = IndexedFastaReader(indexedFasta)
    for hn, (_, tid, s, e, target) in windows.iteritems():
        # First skip ZMWs with no adp results, i.e. with <= 1 adp.
        # Deliberately best-effort (missing or malformed entries are
        # skipped), but no longer a bare ``except:`` so KeyboardInterrupt
        # and SystemExit still propagate.
        try:
            leftTc6, leftAlt, rightTc6, rightAlt = adps[hn]
        except Exception:
            continue

        chrm = fa[tid]
        seq = chrm.sequence

        # Every window start below is clamped at 0: a negative slice bound
        # would otherwise wrap around to the end of the contig and return
        # the wrong (usually empty) region for windows near position 0.

        # Search for restriction sites near the ends
        fiveP = seq[max(s - 5, 0):s + 6]
        threeP = seq[max(e - 5, 0):e + 6]
        fiveEco = HasEcoR1(fiveP)
        threeEco = HasEcoR1(threeP)

        # Search for restriction sites contained within
        inside = seq[s + 6:max(e - 5, 0)]
        insideEco = HasEcoR1(inside)

        # Count and summarize any PolyA/T regions
        region = seq[s:e]
        AT = LargestAsAndTs(region)
        maxAT = 0 if len(AT) == 0 else max(AT)

        # Check for Guide RNA matches; the "In" windows at the 5' end and
        # the "Out" window at the 3' end are reverse-complemented.
        OutFiveP = seq[max(s - 33, 0):s + 10]
        InFiveP = FastaRecord(
            "tmp", seq[max(s - 10, 0):s + 33]).reverseComplement().sequence
        InThreeP = seq[max(e - 33, 0):e + 10]
        OutThreeP = FastaRecord(
            "tmp", seq[max(e - 10, 0):e + 33]).reverseComplement().sequence
        k1, s1, a1 = ScoreCas9SiteSides(OutFiveP, InFiveP)
        k2, s2, a2 = ScoreCas9SiteSides(OutThreeP, InThreeP)

        # Summary columns: a side counts as "has a site" when either the
        # EcoR1 check returned "T" or the Cas9 score key is not "N/A".
        hasPolyA = "T" if maxAT > 0 else "F"
        hasLeft = "T" if (fiveEco == "T" or k1 != "N/A") else "F"
        hasRight = "T" if (threeEco == "T" or k2 != "N/A") else "F"

        summaries.append(
            (hn, tid, s, e, e - s, target,
             len(AT), maxAT, sum(AT),
             leftTc6, rightTc6, leftAlt, rightAlt,
             fiveEco, insideEco, threeEco,
             k1, s1, a1, k2, s2, a2,
             hasPolyA, hasLeft, hasRight))
    return sorted(summaries)
def SummarizeRestrictionData(indexedFasta, windows, adps):
    """Summarize the data for each ZMW, and their left and right sides.

    Args:
        indexedFasta: Passed straight to ``IndexedFastaReader``.
        windows: Iterable of 5-tuples ``(hn, tid, s, e, target)`` where
            ``tid`` indexes the reader and ``s``/``e`` are the start and
            end coordinates of the alignment on that contig.
        adps: Indexable by hole number; only used to decide whether the
            ZMW has adapter results at all.

    Returns:
        A sorted list of ``(hn, tid, s, e, target, left, right)`` tuples,
        where ``left``/``right`` are the ``SearchSequence`` results for the
        5' edge and the reverse-complemented 3' edge.
    """
    results = []
    fa = IndexedFastaReader(indexedFasta)
    for hn, tid, s, e, target in windows:
        # First skip ZMWs with no adp results, i.e. with <= 1 adp.
        # Only the lookup matters, so the value is not bound. Not a bare
        # ``except:`` so KeyboardInterrupt/SystemExit still propagate.
        try:
            adps[hn]
        except Exception:
            continue

        chrm = fa[tid]

        # Search for restriction sites near the alignment edges. Window
        # starts are clamped at 0: a negative slice bound would wrap to
        # the end of the contig and miss sites close to position 0.
        fiveP = chrm.sequence[max(s - 5, 0):s + 6]
        threeP = chrm.sequence[max(e - 5, 0):e + 6]
        # The 3' window is searched on the opposite strand.
        threeP_rc = threeP.translate(COMPLEMENT)[::-1]

        left = SearchSequence(fiveP)
        right = SearchSequence(threeP_rc)
        results.append((hn, tid, s, e, target, left, right))
    return sorted(results)
def test_readWeirdFastaIndex(self):
    # The "weird" fixture must load as exactly one record with the
    # expected header and full sequence.
    reader = IndexedFastaReader(data.getWeird())
    records = list(reader)

    assert_equal(1, len(records))

    only_record = records[0]
    assert_equal("chr1", only_record.header)
    assert_equal("acgtacgtacgtact", only_record.sequence[:])
def test_readWeirdFastaIndex(self):
    # The "weird" fixture must load as exactly one record with the
    # expected header and full sequence.
    reader = IndexedFastaReader(data.getWeird())
    records = list(reader)

    assert 1 == len(records)

    only_record = records[0]
    assert "chr1" == only_record.header
    assert "acgtacgtacgtact" == only_record.sequence[:]
if len(currOvl) > 1000: ovls.append(currOvl) currOvl = [(hn, s, e, t)] currE = e retval[ch] = ovls return retval # Second, tabulate the number of usable reads/ZMWs windows = ReadGenomeWindowsFromPBI([inputFile], TARGETS) adps = ReadAdaptersFromScraps(scrapsBam) #byChrom = SortWindowsByChromosome( windows ) #ovls = FindOverlaps( byChrom ) print "HoleNumber,Chromosome,Start,End,Target,PolyAAdp,PolyARegion,MaxPolyARegion,TotalPolyARegion,LeftEcoR1,LeftBamH1,RightEcoRI,RightBamH1,LeftRna,LeftRnaSide,LeftRnaAcc,RightRna,RightRnaSide,RightRna,HasPolyA,HasLeft,HasRight" fa = IndexedFastaReader(indexedFasta) for hn, tid, s, e, target in windows: # First skip ZMWs with no adp results, i.e. with <= 1 adp try: polyA = adps[hn] except: continue chrm = fa[tid] # Search for restriction sites near fiveP = chrm.sequence[s - 5:s + 6] threeP = chrm.sequence[e - 5:e + 6] fiveEco, fiveBam = HasEcoR1(fiveP), HasBamH1(fiveP) threeEco, threeBam = HasEcoR1(threeP), HasBamH1(threeP)