예제 #1
0
 def testAccessByPosition(self):
     """Fetch records by integer position and by slice from an indexed FASTA."""
     reader = IndexedFastaReader(self.FASTAPATH)

     # A single integer index yields one record with the expected repr.
     head = reader[0]
     expectedRepr = "<IndexedFastaRecord: ref000001|EGFR_Exon_2>"
     assert expectedRepr == repr(head)

     # A leading slice must match the records fetched one at a time.
     leading = reader[:2]
     assert [reader[0], reader[1]] == leading

     # A trailing slice must match the records at positions -2 and -1.
     trailing = reader[-2:]
     assert [reader[-2], reader[-1]] == trailing
 def testAccessByPosition(self):
     """Check integer indexing and slicing of an IndexedFastaReader."""
     indexed = IndexedFastaReader(self.fastaPath)

     # Position 0 is checked through its repr.
     record = indexed[0]
     assert_equal("<IndexedFastaRecord: ref000001|EGFR_Exon_2>",
                  repr(record))

     # Slices from the front and from the back are compared against the
     # same records fetched individually.
     frontPair = indexed[:2]
     assert_equal([indexed[0], indexed[1]], frontPair)
     backPair = indexed[-2:]
     assert_equal([indexed[-2], indexed[-1]], backPair)
예제 #3
0
 def testAccessById(self):
     """Look a record up by id and check its header, id, comment, sequence."""
     reader = IndexedFastaReader(self.FASTAPATH)
     record = reader["ref000021|EGFR_Exon_22"]

     # The header is the id and the comment joined by a tab.
     assert "ref000021|EGFR_Exon_22\tMetadataTest" == record.header
     assert "ref000021|EGFR_Exon_22" == record.id
     assert "MetadataTest" == record.comment

     # Full-range slice of the sequence, compared against the literal bases.
     expectedBases = (
         "CACTGCCTCATCTCTCACCATCCCAAGGTGCCTATCAAGTGGATGGCATTGGAATCAATT"
         "TTACACAGAATCTATACCCACCAGAGTGATGTCTGGAGCTACGGTGAGTCATAATCCTGA"
         "TGCTAATGAGTTTGTACTGAGGCCAAGCTGG")
     assert expectedBases == record.sequence[:]
 def testAccessById(self):
     """Fetch one record by its id string and verify each of its fields."""
     indexed = IndexedFastaReader(self.fastaPath)
     entry = indexed["ref000021|EGFR_Exon_22"]

     # Header, id and comment are each compared against literal strings.
     assert_equal("ref000021|EGFR_Exon_22\tMetadataTest", entry.header)
     assert_equal("ref000021|EGFR_Exon_22", entry.id)
     assert_equal("MetadataTest", entry.comment)

     # The sequence is sliced over its full range before comparison.
     wantedSequence = (
         "CACTGCCTCATCTCTCACCATCCCAAGGTGCCTATCAAGTGGATGGCATTGGAATCAATT"
         "TTACACAGAATCTATACCCACCAGAGTGATGTCTGGAGCTACGGTGAGTCATAATCCTGA"
         "TGCTAATGAGTTTGTACTGAGGCCAAGCTGG")
     assert_equal(wantedSequence, entry.sequence[:])
예제 #5
0
 def testSlice(self):
     """Slice and index the sequence of a record fetched by id."""
     reader = IndexedFastaReader(self.FASTAPATH)
     record = reader["ref000021|EGFR_Exon_22"]
     bases = record.sequence

     # Range slices taken from the front and from the back.
     assert "CACTGCCTCA" == bases[0:10]
     assert "GCCAAGCTGG" == bases[-10:]

     # Single positions, checked in the same order as listed.
     for expected, position in (("G", -1), ("T", -3), ("C", 0), ("A", 1)):
         assert expected == bases[position]
예제 #6
0
 def testSlice(self):
     """Exercise range slices and single-position lookups on a sequence."""
     indexed = IndexedFastaReader(self.fastaPath)
     entry = indexed["ref000021|EGFR_Exon_22"]
     seq = entry.sequence

     # Ten bases from each end of the sequence.
     assert_equal("CACTGCCTCA", seq[0:10])
     assert_equal("GCCAAGCTGG", seq[-10:])

     # Individual positions: two negative, then two non-negative.
     singleBaseChecks = (("G", -1), ("T", -3), ("C", 0), ("A", 1))
     for wanted, offset in singleBaseChecks:
         assert_equal(wanted, seq[offset])
    def test_dosLineEndingsFasta(self):
        """FastaReader and IndexedFastaReader must agree on the DOS fixture."""
        # Read the fixture from data.getDosFormattedFasta() with each reader.
        plainReader = FastaReader(data.getDosFormattedFasta())
        plainRecords = list(plainReader)

        indexedReader = IndexedFastaReader(data.getDosFormattedFasta())
        indexedRecords = list(indexedReader)

        # Same record count, then pairwise-equal headers and sequences.
        assert_equal(len(plainRecords), len(indexedRecords))
        for plain, indexed in zip(plainRecords, indexedRecords):
            assert_equal(plain.header, indexed.header)
            assert_equal(plain.sequence, indexed.sequence[:])
예제 #8
0
    def test_dosLineEndingsFasta(self):
        """Compare both FASTA readers on the fixture from getDosFormattedFasta."""
        sequentialReader = FastaReader(data.getDosFormattedFasta())
        sequentialEntries = list(sequentialReader)

        indexedReader = IndexedFastaReader(data.getDosFormattedFasta())
        indexedEntries = list(indexedReader)

        # Both readers must yield the same number of entries ...
        assert len(sequentialEntries) == len(indexedEntries)

        # ... and each pair of entries must carry the same header and bases.
        for sequential, indexed in zip(sequentialEntries, indexedEntries):
            assert sequential.header == indexed.header
            assert sequential.sequence == indexed.sequence[:]
    def testIteration(self):
        """Iterating the indexed reader must match FastaReader, repeatably."""
        indexedReader = IndexedFastaReader(self.fastaPath)
        plainReader = FastaReader(self.fastaPath)
        indexedContigs = list(indexedReader)
        plainContigs = list(plainReader)

        # Both readers yield the same 48 contigs.
        assert_equal(len(plainContigs), len(indexedContigs))
        assert_equal(48, len(indexedContigs))
        for indexedContig, plainContig in zip(indexedContigs, plainContigs):
            assert_equal(plainContig.header, indexedContig.header)
            assert_equal(plainContig.sequence, indexedContig.sequence[:])

        # Unlike FastaReader, IndexedFastaReader iteration is repeatable.
        secondPass = list(indexedReader)
        assert_equal(48, len(secondPass))
예제 #10
0
    def testIteration(self):
        """Check that indexed iteration agrees with FastaReader and can repeat."""
        indexed = IndexedFastaReader(self.FASTAPATH)
        sequential = FastaReader(self.FASTAPATH)
        fromIndexed = list(indexed)
        fromSequential = list(sequential)

        # Equal counts, and that count is 48.
        assert len(fromSequential) == len(fromIndexed)
        assert 48 == len(fromIndexed)

        # Headers and full sequences agree record by record.
        for idxRecord, seqRecord in zip(fromIndexed, fromSequential):
            assert seqRecord.header == idxRecord.header
            assert seqRecord.sequence == idxRecord.sequence[:]

        # Unlike FastaReader, IndexedFastaReader iteration is repeatable.
        again = list(indexed)
        assert 48 == len(again)
예제 #11
0
def SummarizeData(indexedFasta, windows, adps):
    """Summarize every ZMW window that has adapter results.

    For each window, the reference sequence around the window's start and
    end is searched for EcoR1 sites and scored for guide RNA matches, and
    the window itself is scanned for PolyA/T regions.

    Args:
        indexedFasta: Passed to IndexedFastaReader to open the reference.
        windows: Mapping of hole number -> (_, tid, s, e, target), where
            tid is used to look up the reference record and s/e are the
            window start/end coordinates on it.
        adps: Container indexed by hole number whose entries unpack to
            (leftTc6, leftAlt, rightTc6, rightAlt).

    Returns:
        A sorted list of tuples, one per summarized window:
        (hn, tid, s, e, e - s, target, len(AT), maxAT, sum(AT), leftTc6,
        rightTc6, leftAlt, rightAlt, fiveEco, insideEco, threeEco, k1, s1,
        a1, k2, s2, a2, hasPolyA, hasLeft, hasRight).
    """
    summaries = []

    fa = IndexedFastaReader(indexedFasta)
    # .items() (not .iteritems()) so the loop works on Python 2 and 3 alike.
    for hn, (_, tid, s, e, target) in windows.items():
        # First skip ZMWs with no adp results, i.e. with <= 1 adp.
        # Only a failed lookup or an entry that does not unpack to four
        # values is treated as "no results"; anything else propagates.
        try:
            leftTc6, leftAlt, rightTc6, rightAlt = adps[hn]
        except (LookupError, ValueError, TypeError):
            continue

        seq = fa[tid].sequence

        # Every slice bound computed by subtraction is clamped at 0: a
        # negative bound would otherwise be interpreted relative to the end
        # of the sequence instead of being cut off at its start.

        # Search for restriction sites near the ends
        fiveP = seq[max(s - 5, 0):s + 6]
        threeP = seq[max(e - 5, 0):e + 6]
        fiveEco = HasEcoR1(fiveP)
        threeEco = HasEcoR1(threeP)

        # Search for restriction sites contained within
        inside = seq[s + 6:max(e - 5, 0)]
        insideEco = HasEcoR1(inside)

        # Count and summarize any PolyA/T regions
        region = seq[s:e]
        AT = LargestAsAndTs(region)
        maxAT = 0 if len(AT) == 0 else max(AT)

        # Check for Guide RNA matches
        OutFiveP = seq[max(s - 33, 0):s + 10]
        InFiveP = FastaRecord(
            "tmp", seq[max(s - 10, 0):s + 33]).reverseComplement().sequence
        InThreeP = seq[max(e - 33, 0):e + 10]
        OutThreeP = FastaRecord(
            "tmp", seq[max(e - 10, 0):e + 33]).reverseComplement().sequence
        k1, s1, a1 = ScoreCas9SiteSides(OutFiveP, InFiveP)
        k2, s2, a2 = ScoreCas9SiteSides(OutThreeP, InThreeP)

        # Summary columns
        hasPolyA = "T" if maxAT > 0 else "F"
        hasLeft = "T" if (fiveEco == "T" or k1 != "N/A") else "F"
        hasRight = "T" if (threeEco == "T" or k2 != "N/A") else "F"

        summaries.append(
            (hn, tid, s, e, e - s, target, len(AT), maxAT, sum(AT), leftTc6,
             rightTc6, leftAlt, rightAlt, fiveEco, insideEco, threeEco, k1, s1,
             a1, k2, s2, a2, hasPolyA, hasLeft, hasRight))

    return sorted(summaries)
def SummarizeRestrictionData(indexedFasta, windows, adps):
    """Summarize the data for each ZMW, and their left and right sides.

    Args:
        indexedFasta: Passed to IndexedFastaReader to open the reference.
        windows: Iterable of (hn, tid, s, e, target) tuples, where tid is
            used to look up the reference record and s/e are the window
            start/end coordinates on it.
        adps: Container indexed by hole number; windows whose hole number
            cannot be looked up in it are skipped.

    Returns:
        A sorted list of (hn, tid, s, e, target, left, right) tuples, where
        left/right are the SearchSequence results for the sequence around
        the window start and for the reverse complement of the sequence
        around the window end.
    """
    results = []

    fa = IndexedFastaReader(indexedFasta)
    for hn, tid, s, e, target in windows:
        # First skip ZMWs with no adp results, i.e. with <= 1 adp.
        # The looked-up value itself is not needed, only its presence.
        try:
            adps[hn]
        except LookupError:
            continue
        seq = fa[tid].sequence

        # Search for restriction sites near the alignment edges.  The lower
        # bounds are clamped at 0 so that a window close to the start of the
        # reference does not produce a negative index, which would be
        # interpreted relative to the end of the sequence.
        fiveP = seq[max(s - 5, 0):s + 6]
        threeP = seq[max(e - 5, 0):e + 6]
        # Reverse-complement the 3' side before searching it.
        threeP_rc = threeP.translate(COMPLEMENT)[::-1]
        left = SearchSequence(fiveP)
        right = SearchSequence(threeP_rc)

        results.append((hn, tid, s, e, target, left, right))

    return sorted(results)
 def test_readWeirdFastaIndex(self):
     """The fixture from data.getWeird() must load as one chr1 record."""
     reader = IndexedFastaReader(data.getWeird())
     records = list(reader)

     # Exactly one record is expected.
     assert_equal(1, len(records))

     # Its header and full sequence are compared against literals.
     only = records[0]
     assert_equal("chr1", only.header)
     assert_equal("acgtacgtacgtact", only.sequence[:])
예제 #14
0
 def test_readWeirdFastaIndex(self):
     """Read the data.getWeird() fixture and verify its single record."""
     indexed = IndexedFastaReader(data.getWeird())
     loaded = list(indexed)

     # One record, named chr1, with the expected lowercase bases.
     assert 1 == len(loaded)
     first = loaded[0]
     assert "chr1" == first.header
     assert "acgtacgtacgtact" == first.sequence[:]
예제 #15
0
                if len(currOvl) > 1000:
                    ovls.append(currOvl)
                currOvl = [(hn, s, e, t)]
                currE = e
        retval[ch] = ovls
    return retval


# Second, tabulate the number of usable reads/ZMWs
# Load the genome windows for the input file and the adapter results from
# the scraps BAM; both helpers are defined outside this excerpt.
windows = ReadGenomeWindowsFromPBI([inputFile], TARGETS)
adps = ReadAdaptersFromScraps(scrapsBam)
#byChrom = SortWindowsByChromosome( windows )
#ovls = FindOverlaps( byChrom )

# Emit the CSV header row before any per-window output.
# NOTE(review): the column name "RightRna" appears twice in this header; the
# second occurrence was presumably meant to be "RightRnaAcc" (mirroring
# "LeftRnaAcc") -- confirm against the columns actually printed below.
print "HoleNumber,Chromosome,Start,End,Target,PolyAAdp,PolyARegion,MaxPolyARegion,TotalPolyARegion,LeftEcoR1,LeftBamH1,RightEcoRI,RightBamH1,LeftRna,LeftRnaSide,LeftRnaAcc,RightRna,RightRnaSide,RightRna,HasPolyA,HasLeft,HasRight"
# Open the reference once; records are looked up per window by tid below.
fa = IndexedFastaReader(indexedFasta)
for hn, tid, s, e, target in windows:
    # First skip ZMWs with no adp results, i.e. with <= 1 adp
    try:
        polyA = adps[hn]
    except:
        continue

    chrm = fa[tid]

    # Search for restriction sites near
    fiveP = chrm.sequence[s - 5:s + 6]
    threeP = chrm.sequence[e - 5:e + 6]
    fiveEco, fiveBam = HasEcoR1(fiveP), HasBamH1(fiveP)
    threeEco, threeBam = HasEcoR1(threeP), HasBamH1(threeP)