def setup(self):
    self.header = "chr1|blah|blah\tblah blah"
    self.rc_header = "chr1|blah|blah\tblah blah [revcomp]"
    self.id = "chr1|blah|blah"
    self.comment = "blah blah"
    self.sequence = "GATTACA" * 20
    self.rc_sequence = "TGTAATC" * 20
    self.length = 140
    self.expected__str__ = (
        ">chr1|blah|blah\tblah blah\n"
        "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATT\n"
        "ACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAG\n"
        "ATTACAGATTACAGATTACA")
    self.rc1_expected__str__ = (
        ">chr1|blah|blah\tblah blah [revcomp]\n"
        "TGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTA\n"
        "ATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCT\n"
        "GTAATCTGTAATCTGTAATC")
    self.rc2_expected__str__ = (
        ">chr1|blah|blah\tblah blah\n"
        "TGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTA\n"
        "ATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCT\n"
        "GTAATCTGTAATCTGTAATC")
    self.record = FastaRecord(self.header, self.sequence)
    self.rc1_record = self.record.reverseComplement()
    self.rc2_record = self.record.reverseComplement(True)
def _create_chimera_pair(locus, sequences):
    """
    Create chimeras for a single locus
    """
    assert len(sequences) == 2
    first_5p, first_3p = _split_sequence(sequences[0])
    second_5p, second_3p = _split_sequence(sequences[1])
    A_chimera = FastaRecord(name='Locus%s_ChimeraA' % locus,
                            sequence=first_5p + second_3p)
    B_chimera = FastaRecord(name='Locus%s_ChimeraB' % locus,
                            sequence=second_5p + first_3p)
    return [A_chimera, B_chimera]
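
# Hedged usage sketch for _create_chimera_pair above. `_split_sequence` is not shown
# in this snippet, so a stand-in that splits a FastaRecord's sequence at its midpoint
# is assumed here purely for illustration; the locus number and sequences are invented.
def _split_sequence(record):
    midpoint = len(record.sequence) // 2
    return record.sequence[:midpoint], record.sequence[midpoint:]

pair = [FastaRecord('Locus1_AlleleA', 'AAAATTTT'),
        FastaRecord('Locus1_AlleleB', 'GGGGCCCC')]
chimeras = _create_chimera_pair(1, pair)
# With the assumed midpoint split:
#   chimeras[0] is Locus1_ChimeraA with sequence 'AAAACCCC'
#   chimeras[1] is Locus1_ChimeraB with sequence 'GGGGTTTT'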
def SummarizeData(indexedFasta, windows, adps):
    summaries = []
    fa = IndexedFastaReader(indexedFasta)
    for hn, (_, tid, s, e, target) in windows.iteritems():
        # First skip ZMWs with no adp results, i.e. with <= 1 adp
        try:
            leftTc6, leftAlt, rightTc6, rightAlt = adps[hn]
        except KeyError:
            continue
        chrm = fa[tid]
        # Search for restriction sites near the ends
        fiveP = chrm.sequence[max(s - 5, 0):s + 6]
        threeP = chrm.sequence[e - 5:e + 6]
        fiveEco = HasEcoR1(fiveP)
        threeEco = HasEcoR1(threeP)
        # Search for restriction sites contained within
        inside = chrm.sequence[s + 6:e - 5]
        insideEco = HasEcoR1(inside)
        # Count and summarize any PolyA/T regions
        region = chrm.sequence[s:e]
        AT = LargestAsAndTs(region)
        maxAT = 0 if len(AT) == 0 else max(AT)
        # Check for Guide RNA matches
        OutFiveP = chrm.sequence[max(s - 33, 0):s + 10]
        InFiveP = FastaRecord(
            "tmp", chrm.sequence[max(s - 10, 0):s + 33]).reverseComplement().sequence
        InThreeP = chrm.sequence[e - 33:e + 10]
        OutThreeP = FastaRecord(
            "tmp", chrm.sequence[e - 10:e + 33]).reverseComplement().sequence
        k1, s1, a1 = ScoreCas9SiteSides(OutFiveP, InFiveP)
        k2, s2, a2 = ScoreCas9SiteSides(OutThreeP, InThreeP)
        # Summary columns
        hasPolyA = "T" if maxAT > 0 else "F"
        hasLeft = "T" if (fiveEco == "T" or k1 != "N/A") else "F"
        hasRight = "T" if (threeEco == "T" or k2 != "N/A") else "F"
        summaries.append(
            (hn, tid, s, e, e - s, target, len(AT), maxAT, sum(AT),
             leftTc6, rightTc6, leftAlt, rightAlt,
             fiveEco, insideEco, threeEco,
             k1, s1, a1, k2, s2, a2,
             hasPolyA, hasLeft, hasRight))
    return sorted(summaries)
def Write(self):
    """Clean up the sequences and write out one genomic Fasta per exon"""
    sets = []
    writers = []
    for allele, seq in self._dict.iteritems():
        exons = seq.split("|")
        while len(writers) < len(exons):
            fasta = "{0}_exon{1}.fasta".format(self._locus, len(writers) + 1)
            writers.append(FastaWriter(fasta))
            sets.append(set())
        for i, exon in enumerate(exons):
            exon = re.sub("[.|*]", "", exon)
            if len(exon) == 0:
                continue
            if exon in sets[i]:
                continue
            record = FastaRecord(allele, exon)
            writers[i].writeRecord(record)
            sets[i].add(exon)
def write_temporary_fasta(record_list):
    temp_fasta = tempfile.NamedTemporaryFile(suffix=".fasta", delete=False)
    with FastaWriter(temp_fasta.name) as handle:
        for record in record_list:
            rec = FastaRecord(record.name, record.sequence)
            handle.writeRecord(rec)
    return temp_fasta
def parse_primer_sequences(primers_str):
    """
    Return a list of primer FastaRecords if primers_str only contains valid primers.
    Otherwise raise a ValueError.
    """
    if isinstance(primers_str, str) or isinstance(primers_str, unicode):
        primer_fasta_records = []
        primers_str = str(primers_str)
        if '>' not in primers_str:
            raise ValueError("Invalid primer header, could not find leading '>'.")
        for str_index, primer_str in enumerate(primers_str.split('>')[1:]):
            lines = [line.strip().translate(None, '\'\" ') for line in primer_str.split('\n')]
            lines = [line for line in lines if len(line) > 0]  # remove empty lines
            if len(lines) < 2:
                raise ValueError("Primer %s must have a sequence." % lines[0])
            primer_name = lines[0]
            primer_sequence = ''.join(lines[1:])
            primer_index = int(str_index / 2)
            primer_strand = 'F' if str_index % 2 == 0 else 'R'
            expected_primer_name = "{s}{i}".format(s=primer_strand, i=primer_index)
            if primer_name != expected_primer_name:
                raise ValueError("Primers should be placed in order F0, R0, F1, R1...")
            for base in primer_sequence:
                if base.upper() not in ('A', 'T', 'G', 'C'):
                    raise ValueError("Primer sequence %s must only contain ATGC" % primer_sequence)
            primer_fasta_records.append(FastaRecord(header=primer_name, sequence=primer_sequence))
        return primer_fasta_records
    raise ValueError("Input primers_str %s must be either str or unicode" % type(primers_str))
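
# Minimal usage sketch for parse_primer_sequences above, assuming it is in scope;
# primer names must follow the F0, R0, F1, R1... convention it enforces, and the
# sequences shown here are invented examples.
example_primers = (">F0\n"
                   "AAGCAGTGGTATCAACGCAGAGTAC\n"
                   ">R0\n"
                   "GTACTCTGCGTTGATACCACTGCTT\n")
for primer in parse_primer_sequences(example_primers):
    print primer.header, primer.sequence  # expected: 'F0 ...' then 'R0 ...'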
def gconFunc(tp):
    # called by multiprocessing, so the arguments arrive as a single tuple
    rootDir, barcode = tp
    bcdir = "/".join((rootDir, barcode))

    ## call gcon
    logging.info("In gconFunc for: %s" % barcode)
    cmd = "gcon.py r --min_cov 3 %s/subreads.fasta %s/seed_read.fasta -d %s" % \
        (bcdir, bcdir, bcdir)
    subprocess.call(cmd, shell=True)

    ## check to see if the file is empty
    r = FastaReader("%s/g_consensus.fa" % bcdir)
    if not list(r)[0].sequence:
        return None

    ## check to see if we are going to run quiver
    if not runner.args.noQuiver:
        # setup the blasr / sam / quiver stuff.
        logging.info("Setup regions file, now running blasr through quiver.")
        cmd = ('blasr %s %s/g_consensus.fa -nproc 1 -sam -regionTable %s/region.fofn -out ' +
               '%s/aligned_reads.sam') % (runner.args.inputFofn, bcdir, bcdir, bcdir)
        logging.debug(cmd)
        subprocess.call(cmd, shell=True)

        cmd = 'samtoh5 %s/aligned_reads.sam %s/g_consensus.fa %s/aligned_reads.cmp.h5' % \
            (bcdir, bcdir, bcdir)
        logging.debug(cmd)
        subprocess.call(cmd, shell=True)

        cmd = ('loadPulses %s %s/aligned_reads.cmp.h5 -byread -metrics ' +
               'QualityValue,InsertionQV,MergeQV,DeletionQV,DeletionTag,SubstitutionTag,' +
               'SubstitutionQV') % (runner.args.inputFofn, bcdir)
        logging.debug(cmd)
        subprocess.call(cmd, shell=True)

        cmd = 'cmph5tools.py sort --inPlace %s/aligned_reads.cmp.h5' % bcdir
        logging.debug(cmd)
        subprocess.call(cmd, shell=True)

        cmd = ('quiver -vv --algorithm quiver -p P4-C2.AllQVsMergingByChannelModel '
               '%s/aligned_reads.cmp.h5 --outputFilename %s/q_consensus.fasta ' +
               '--referenceFilename %s/g_consensus.fa') % (bcdir, bcdir, bcdir)
        logging.debug(cmd)
        subprocess.call(cmd, shell=True)
        cFilename = 'q_consensus.fasta'
    else:
        cFilename = 'g_consensus.fa'

    ## append results to output file.
    bcCons = "%s/%s/%s" % (rootDir, barcode, cFilename)
    if os.path.exists(bcCons):
        return FastaRecord(barcode, list(FastaReader(bcCons))[0].sequence)
    else:
        return None
def Write(self):
    """Clean-up the sequences and write out a Genomic Fasta"""
    filename = "{0}_genomic.fasta".format(self._locus)
    with FastaWriter(filename) as handle:
        for allele, seq in self._dict.iteritems():
            # Remove inserts, exon/intron boundaries, and trimmed regions
            seq = re.sub("[.|*]", "", seq)
            record = FastaRecord(allele, seq)
            handle.writeRecord(record)
def setup(self):
    self.name = "chr1|blah|blah"
    self.sequence = "GATTACA" * 20
    self.expected__str__ = \
        ">chr1|blah|blah\n" \
        "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATT\n" \
        "ACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAG\n" \
        "ATTACAGATTACAGATTACA"
    self.record = FastaRecord(self.name, self.sequence)
def write_temp_fasta(fastq_file):
    """
    Write a temporary Fasta file from a Fastq
    """
    temp = tempfile.NamedTemporaryFile(suffix='.fasta', delete=False)
    with FastaWriter(temp.name) as handle:
        for record in FastqReader(fastq_file):
            temp_record = FastaRecord(record.name, record.sequence)
            handle.writeRecord(temp_record)
    return temp
def _fastq_to_fasta(fastq_path, fasta_path):
    """Convert a fastq file to fasta file"""
    with FastqReader(fastq_path) as r:
        with FastaWriter(fasta_path) as w:
            for fastq_record in r:
                fasta_record = FastaRecord(fastq_record.name, fastq_record.sequence)
                w.writeRecord(fasta_record)
    log.info("Completed converting {q} to {f}".format(q=fastq_path, f=fasta_path))
    return 0
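
# Minimal usage sketch for _fastq_to_fasta above (hypothetical file paths); it assumes
# a module-level logger, e.g. log = logging.getLogger(__name__), is already configured.
_fastq_to_fasta("movie.ccs.fastq", "movie.ccs.fasta")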
def rename_record(record):
    new_name = '|'.join(record.name.strip().split('|')[:-1])
    if isinstance(record, FastaRecord):
        return FastaRecord(new_name, record.sequence)
    elif isinstance(record, FastqRecord):
        return FastqRecord(new_name, record.sequence, record.quality)
    else:
        msg = "Object must be a valid Fasta or Fastq Record"
        log.error(msg)
        raise TypeError(msg)
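
# Hedged example for rename_record above: it drops the final '|'-delimited field of
# the record name; the record used here is invented.
rec = FastaRecord('chr1|region|quiver', 'GATTACA')
renamed = rename_record(rec)
# renamed.name == 'chr1|region', renamed.sequence == 'GATTACA'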
def merge_fasta_sequences(sequences, positions):
    merged = []
    for i, part_list in enumerate(positions):
        name = "Merged%s_NumReads100" % (i + 1)
        sequence = ''
        for part in part_list:
            source = part['name']
            start = part['start']
            end = part['end']
            sequence += sequences[source].sequence[start:end]
        record = FastaRecord(name=name, sequence=sequence)
        merged.append(record)
    return merged
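
# Hedged sketch of the inputs merge_fasta_sequences above appears to expect:
# `sequences` as a dict of source name -> FastaRecord and `positions` as a list of
# part lists, each part a dict with 'name', 'start', and 'end'. All values are invented.
sequences = {'readA': FastaRecord('readA', 'AAAACCCC'),
             'readB': FastaRecord('readB', 'GGGGTTTT')}
positions = [[{'name': 'readA', 'start': 0, 'end': 4},
              {'name': 'readB', 'start': 4, 'end': 8}]]
merged = merge_fasta_sequences(sequences, positions)
# merged[0] is named 'Merged1_NumReads100' with sequence 'AAAATTTT'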
def rename_imgt_fasta(input_file, output_file):
    with FastaWriter(output_file) as handle:
        for record in FastaReader(input_file):
            # Check that this is an IMGT-formatted FASTA record
            assert record.header.startswith('HLA:')
            # Extract the header and replace spaces with underscores
            new_header = record.header.strip().replace(' ', '_')
            # Create a new record with the same sequence and the type
            # in place of its id.
            new_record = FastaRecord(new_header, record.sequence)
            handle.writeRecord(new_record)
def _createUnrolledReference(refIdx, reference, adp):
    refName = "Reference{0}".format(refIdx)
    seq = ""
    idx = 0
    adpPos = []
    while len(seq) < MIN_TEMPLATE_SIZE:
        if idx % 2 == 0:
            seq += reference.reverseComplement().sequence.upper()
        else:
            seq += reference.sequence.upper()
        adpPos.append((refName, len(seq), len(seq) + len(adp.sequence), 0))
        seq += adp.sequence.upper()
        idx += 1
    return FastaRecord(refName, seq), adpPos
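
# Hedged usage sketch for _createUnrolledReference above. MIN_TEMPLATE_SIZE is a
# module-level constant not shown in the snippet, so a small value is assumed here
# for illustration; the reference and adapter records are invented.
MIN_TEMPLATE_SIZE = 100  # assumed value, for illustration only
reference = FastaRecord('insert', 'GATTACA' * 5)
adapter = FastaRecord('adapter', 'AACCGGTTAACCGGTT')
unrolled, adapterPositions = _createUnrolledReference(1, reference, adapter)
# unrolled.header == 'Reference1'; adapterPositions holds one
# (refName, adapterStart, adapterEnd, 0) tuple per adapter copy inserted.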
def _createForwardUnrolledReference(refIdx, reference, adps):
    refName = "Reference{0}_Forward".format(refIdx)
    seq = ""
    idx = 0
    adpPos = []
    while len(seq) < MIN_TEMPLATE_SIZE:
        if idx == 0:
            seq += reference.sequence.upper()
        else:
            seq += reference.reverseComplement().sequence.upper()
        adpPos.append(
            (refName, len(seq), len(seq) + len(adps[idx].sequence), idx))
        seq += adps[idx].sequence.upper()
        idx += 1
        if idx >= len(adps):
            idx = 0
    return FastaRecord(refName, seq), adpPos
def main(parser):
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    ccsFofn = (l.strip('\n') for l in args.ccs_fofn)

    # Get the read names that are not barcoded
    no_barcode = set()
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode.add('%s/%d' % (bcH5.movieName, row[0]))

    if args.fasta:
        outh = FastaWriter(outfile)
    else:
        outh = FastqWriter(outfile)

    for ccsFile in ccsFofn:
        ccsH5 = BasH5Reader(ccsFile)
        for ccsRead in ccsH5.ccsReads():
            if ccsRead.zmw.zmwName in no_barcode:
                basecalls = ccsRead.basecalls()
                if len(basecalls) >= args.minMaxInsertLength:
                    if args.fasta:
                        outh.writeRecord(
                            FastaRecord(ccsRead.zmw.zmwName, ccsRead.basecalls()))
                    else:
                        outh.writeRecord(
                            FastqRecord(ccsRead.zmw.zmwName, ccsRead.basecalls(),
                                        ccsRead.QualityValue()))
    outh.close()
def main(parser):
    args = parser.parse_args()

    # Get outfile name
    if args.outFile is None:
        outfile = 'nobarcode.fasta' if args.fasta else 'nobarcode.fastq'
    else:
        outfile = args.outFile

    # Input files
    barcodeFofn = (l.strip('\n') for l in args.barcode_fofn)
    baxFofn = (l.strip('\n') for l in args.bax_fofn)

    # Get the read names that are not barcoded
    no_barcode = defaultdict(set)
    for barcodeFile in barcodeFofn:
        bcH5 = BarcodeH5Reader(barcodeFile)
        for row in bcH5.bestDS:
            if row[3] / row[1] < args.minAvgBarcodeScore:
                no_barcode[bcH5.movieName].add(row[0])

    if args.fasta:
        outh = FastaWriter(outfile)
    else:
        outh = FastqWriter(outfile)

    for baxFile in baxFofn:
        baxH5 = BasH5Reader(baxFile)
        for holeNum in baxH5.sequencingZmws:
            if holeNum in no_barcode[baxH5.movieName]:
                zmw = baxH5[holeNum]
                if len(zmw.subreads) and max(len(sr.basecalls())
                                             for sr in zmw.subreads) >= args.minMaxInsertLength:
                    for subread in zmw.subreads:
                        if len(subread.basecalls()) >= args.minSubreadLength:
                            if args.fasta:
                                outh.writeRecord(
                                    FastaRecord(subread.readName, subread.basecalls()))
                            else:
                                outh.writeRecord(
                                    FastqRecord(subread.readName, subread.basecalls(),
                                                subread.QualityValue()))
    outh.close()
#! /usr/bin/env python
import sys

from pbcore.io import FastaReader, FastaWriter, FastaRecord

seqs = set()
with FastaWriter(sys.stdout) as handle:
    for rec in FastaReader(sys.argv[1]):
        # Strip gaps from the sequence
        seq = rec.sequence.replace('-', '').replace('.', '')
        # If we've seen the ungapped sequence before, skip
        if seq in seqs:
            continue
        # Otherwise add the sequence and write the record
        else:
            new_rec = FastaRecord(rec.name, seq)
            seqs.add(seq)
            handle.writeRecord(new_rec)
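
# Example invocation of the de-duplication script above (hypothetical file names);
# it reads one Fasta from argv[1] and writes unique, ungapped records to stdout:
#
#   python dedup_gapped_fasta.py aligned_alleles.fasta > unique_alleles.fasta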
class TestFastaRecord(object):

    def setup(self):
        self.header = "chr1|blah|blah\tblah blah"
        self.rc_header = "chr1|blah|blah\tblah blah [revcomp]"
        self.id = "chr1|blah|blah"
        self.comment = "blah blah"
        self.sequence = "GATTACA" * 20
        self.rc_sequence = "TGTAATC" * 20
        self.length = 140
        self.expected__str__ = (
            ">chr1|blah|blah\tblah blah\n"
            "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATT\n"
            "ACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAG\n"
            "ATTACAGATTACAGATTACA")
        self.rc1_expected__str__ = (
            ">chr1|blah|blah\tblah blah [revcomp]\n"
            "TGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTA\n"
            "ATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCT\n"
            "GTAATCTGTAATCTGTAATC")
        self.rc2_expected__str__ = (
            ">chr1|blah|blah\tblah blah\n"
            "TGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTA\n"
            "ATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCT\n"
            "GTAATCTGTAATCTGTAATC")
        self.record = FastaRecord(self.header, self.sequence)
        self.rc1_record = self.record.reverseComplement()
        self.rc2_record = self.record.reverseComplement(True)

    def test__init__(self):
        assert_equal(self.header, self.record.header)
        assert_equal(self.sequence, self.record.sequence)
        assert_equal(self.id, self.record.id)
        assert_equal(self.comment, self.record.comment)

    def test__str__(self):
        assert_equal(self.expected__str__, str(self.record))

    def test_fromString(self):
        recordFromString = FastaRecord.fromString(self.expected__str__)
        assert_equal(self.header, recordFromString.header)
        assert_equal(self.sequence, recordFromString.sequence)

    def test_reverse_complement1(self):
        assert_equal(self.rc1_record.header, self.rc_header)
        assert_equal(self.rc1_record.sequence, self.rc_sequence)
        assert_equal(self.rc1_expected__str__, str(self.rc1_record))

    def test_reverse_complement2(self):
        assert_equal(self.rc2_record.header, self.header)
        assert_equal(self.rc2_record.sequence, self.rc_sequence)
        assert_equal(self.rc2_expected__str__, str(self.rc2_record))

    def test_len(self):
        assert_equal(self.length, len(self.record))
        assert_equal(self.length, len(self.rc1_record))
        assert_equal(self.length, len(self.rc2_record))

    def test_eq(self):
        header = 'r1'
        seq = 'ACGT'
        r1 = FastaRecord(header, seq)
        r2 = FastaRecord(header, seq)
        assert_true(r1 == r2)

    def test_not_equal(self):
        r1 = FastaRecord('r1', 'ACGT')
        r2 = FastaRecord('r2', 'ACGT')
        r3 = FastaRecord('r1', 'ACGT')
        assert_true(r1 != r2)
        assert_false(r1 != r3)
def test_fromString(self):
    recordFromString = FastaRecord.fromString(self.expected__str__)
    assert_equal(self.header, recordFromString.header)
    assert_equal(self.sequence, recordFromString.sequence)
def callConsensus():
    def makeReadAndReads(zmwsForBC):
        ccsData = filter(lambda x: x, [zmw.ccsRead for _, _, zmw in zmwsForBC if zmw])
        srData = reduce(lambda x, y: x + y,
                        [zmw.subreads for zmw, _, _ in zmwsForBC if zmw], [])
        if not srData and not ccsData:
            return (None, None)

        def getSeedRead(reads, lq=80, uq=90, sLambda=lambda x: -x.zmw.readScore):
            lens = map(len, reads)
            candidateRange = (n.percentile(lens, lq), n.percentile(lens, uq))
            pfReads = [
                read for read, l in zip(reads, lens)
                if l >= candidateRange[0] and l <= candidateRange[1]
            ]
            pfReads.sort(key=sLambda)
            return pfReads[0] if len(pfReads) else None

        if ccsData:
            ## all CCS reads should be the *same* length for an
            ## amplicon. Let's take the middle ones
            seedRead = getSeedRead(ccsData, lq=30, uq=70,
                                   sLambda=lambda x: -x.zmw.numPasses)
            if not seedRead:
                seedRead = getSeedRead(srData)
                logging.info("Unable to use a CCS read for the seed read.")
            else:
                logging.info("Using a CCS read for the seed read.")
        else:
            logging.info("Using a raw read for the seed read")
            seedRead = getSeedRead(srData)

        return (seedRead, srData)

    # check to make sure that you have the necessary dependencies,
    # i.e., hgap script, blasr, etc.
    try:
        import pbtools.pbdagcon
    except ImportError:
        raise ImportError(
            "Unable to find dependency `pbdagcon` - please install.")

    # retrieve ZMWs by barcode
    if runner.args.barcode:
        zmwsForBCs = getZmwsForBarcodes(runner.args.barcode)
    else:
        zmwsForBCs = getZmwsForBarcodes()

    # subsample
    zmwsForBCs = {k: subsampleReads(v) for k, v in zmwsForBCs.items()}

    logging.info("unfiltered average zmws per barcode: %g" %
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # filter ZMWs
    zmwsForBCs = filterZmws(zmwsForBCs)

    logging.info("filtered average zmws per barcode: %g" %
                 n.round(n.mean(map(len, zmwsForBCs.values()))))

    # now choose the best subread to seed the assembly
    if runner.args.ccsFofn:
        # XXX: This part depends on the filenames of the ccs and input
        # fofns; this is essentially a workaround to the fact that this
        # part isn't part of the API
        ccsReaders = {
            movieNameFromFile(l): BasH5Reader(l)
            for l in open(runner.args.ccsFofn).read().splitlines()
        }

        # fill in the CCS spot.
        for k, v in zmwsForBCs.items():
            l = []
            for zmw, lZmw in v:
                r = ccsReaders[movieNameFromFile(zmw.baxH5.file.filename)]
                l.append((zmw, lZmw, r[zmw.holeNumber]))
            zmwsForBCs[k] = l
    else:
        # add none to the CCS spot.
        zmwsForBCs = {
            k: [(zmw, lZmw, None) for zmw, lZmw in v]
            for k, v in zmwsForBCs.iteritems()
        }

    readAndReads = {k: makeReadAndReads(v) for k, v in zmwsForBCs.items()}

    # remove barcodes that don't have a seed read and a set of usable reads.
    readAndReads = {k: v for k, v in readAndReads.items() if v[0] and v[1]}

    # generate FASTA files
    outDir = runner.args.outDir
    for barcode, reads in readAndReads.items():
        bcdir = '/'.join((outDir, barcode))
        if not os.path.exists(bcdir):
            os.makedirs(bcdir)

        # emit the seeds to separate files
        with FastaWriter("%s/seed_read.fasta" % bcdir) as w:
            w.writeRecord(FastaRecord(reads[0].readName, reads[0].basecalls()))

        subreads = reads[1]

        # emit the subreads to a single file
        with FastaWriter("%s/subreads.fasta" % bcdir) as w:
            for r in subreads:
                w.writeRecord(FastaRecord(r.readName, r.basecalls()))

        # construct the region file by subsetting the ZMWs that you
        # are interested in.
        nfofn = []
        for inFof, in zipFofns(runner.args.inputFofn):
            bh5 = BaxH5Reader(inFof)
            reg = bh5.file['/PulseData/Regions']
            inMovie = filter(lambda z: z.baxH5.movieName == bh5.movieName, subreads)
            holes = n.in1d(reg[:, 0], n.array([a.holeNumber for a in inMovie]))
            if any(holes):
                nreg = reg[holes, :]
            else:
                nreg = n.empty(shape=(0, reg.shape[1]), dtype='int32')

            fname = "%s/%s.rgn.h5" % (bcdir, movieNameFromFile(inFof))
            nfile = h5.File(fname, 'w')
            ndset = nfile.create_dataset('/PulseData/Regions', data=nreg,
                                         maxshape=(None, None))
            copyAttributes(reg, ndset)
            nfile.close()
            nfofn.append(fname)

        ofile = open('%s/region.fofn' % bcdir, 'w')
        ofile.writelines("\n".join(nfofn))
        ofile.close()

    ## call gcon
    outDirs = [(outDir, k) for k in readAndReads.keys()]
    if runner.args.nProcs == 1:
        outFasta = filter(lambda z: z, map(gconFunc, outDirs))
    else:
        pool = Pool(runner.args.nProcs)
        outFasta = filter(lambda z: z, pool.map(gconFunc, outDirs))

    ## write the results
    with FastaWriter('/'.join((outDir, "consensus.fa"))) as w:
        for r in outFasta:
            w.writeRecord(r)

    ## optionally cleanup
    if not runner.args.keepTmpDir:
        for barcode, reads in readAndReads.items():
            bcdir = '/'.join((outDir, barcode))
            shutil.rmtree(bcdir)
def setup_class(cls):
    cls.record = FastaRecord(cls.HEADER, cls.SEQUENCE)
    cls.rc1_record = cls.record.reverseComplement()
    cls.rc2_record = cls.record.reverseComplement(True)
def test_fromString(self):
    recordFromString = FastaRecord.fromString(self.EXPECTED__STR__)
    assert self.HEADER == recordFromString.header
    assert self.SEQUENCE == recordFromString.sequence
def test_not_equal(self):
    r1 = FastaRecord('r1', 'ACGT')
    r2 = FastaRecord('r2', 'ACGT')
    r3 = FastaRecord('r1', 'ACGT')
    assert r1 != r2
    assert not r1 != r3
chrm = fa[tid]

# Search for restriction sites near
fiveP = chrm.sequence[s - 5:s + 6]
threeP = chrm.sequence[e - 5:e + 6]
fiveEco, fiveBam = HasEcoR1(fiveP), HasBamH1(fiveP)
threeEco, threeBam = HasEcoR1(threeP), HasBamH1(threeP)

# Count and summarize any PolyA/T regions
region = chrm.sequence[s:e]
AT = LargestAsAndTs(region)
maxAT = 0 if len(AT) == 0 else max(AT)

# Check for Guide RNA matches
OutFiveP = chrm.sequence[s - 33:s + 10]
InFiveP = FastaRecord(
    "tmp", chrm.sequence[s - 10:s + 33]).reverseComplement().sequence
InThreeP = chrm.sequence[e - 33:e + 10]
OutThreeP = FastaRecord(
    "tmp", chrm.sequence[e - 10:e + 33]).reverseComplement().sequence
k1, s1, a1 = ScoreCas9SiteSides(OutFiveP, InFiveP)
k2, s2, a2 = ScoreCas9SiteSides(OutThreeP, InThreeP)

# Summary columns
hasPolyA = "T" if (polyA == "T" or maxAT > 0) else "F"
hasLeft = "T" if (fiveEco == "T" or fiveBam == "T" or k1 != "N/A") else "F"
hasRight = "T" if (threeEco == "T" or threeBam == "T" or k2 != "N/A") else "F"

print "{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10},{11},{12},{13},{14},{15},{16},{17},{18},{19},{20},{21}".format(
    hn, tid, s, e, target, polyA, len(AT), maxAT, sum(AT),
    fiveEco, fiveBam, threeEco, threeBam,
    k1, s1, a1, k2, s2, a2,
    hasPolyA, hasLeft, hasRight)
def test_not_equal(self):
    r1 = FastaRecord('r1', 'ACGT')
    r2 = FastaRecord('r2', 'ACGT')
    r3 = FastaRecord('r1', 'ACGT')
    assert_true(r1 != r2)
    assert_false(r1 != r3)
def test_eq(self):
    name = 'r1'
    seq = 'ACGT'
    r1 = FastaRecord(name, seq)
    r2 = FastaRecord(name, seq)
    assert_true(r1 == r2)
def test_fromString(self):
    recordFromString = FastaRecord.fromString(self.expected__str__)
    assert_equal(self.name, recordFromString.name)
    assert_equal(self.sequence, recordFromString.sequence)
def test_eq(self):
    header = 'r1'
    seq = 'ACGT'
    r1 = FastaRecord(header, seq)
    r2 = FastaRecord(header, seq)
    assert_true(r1 == r2)
class TestFastaRecord:

    def setup(self):
        self.header = "chr1|blah|blah\tblah blah"
        self.rc_header = "chr1|blah|blah\tblah blah [revcomp]"
        self.id = "chr1|blah|blah"
        self.comment = "blah blah"
        self.sequence = "GATTACA" * 20
        self.rc_sequence = "TGTAATC" * 20
        self.length = 140
        self.expected__str__ = (
            ">chr1|blah|blah\tblah blah\n"
            "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATT\n"
            "ACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAG\n"
            "ATTACAGATTACAGATTACA")
        self.rc1_expected__str__ = (
            ">chr1|blah|blah\tblah blah [revcomp]\n"
            "TGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTA\n"
            "ATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCT\n"
            "GTAATCTGTAATCTGTAATC")
        self.rc2_expected__str__ = (
            ">chr1|blah|blah\tblah blah\n"
            "TGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTA\n"
            "ATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCTGTAATCT\n"
            "GTAATCTGTAATCTGTAATC")
        self.record = FastaRecord(self.header, self.sequence)
        self.rc1_record = self.record.reverseComplement()
        self.rc2_record = self.record.reverseComplement(True)

    def test__init__(self):
        assert_equal(self.header, self.record.header)
        assert_equal(self.sequence, self.record.sequence)
        assert_equal(self.id, self.record.id)
        assert_equal(self.comment, self.record.comment)

    def test__str__(self):
        assert_equal(self.expected__str__, str(self.record))

    def test_fromString(self):
        recordFromString = FastaRecord.fromString(self.expected__str__)
        assert_equal(self.header, recordFromString.header)
        assert_equal(self.sequence, recordFromString.sequence)

    def test_reverse_complement1(self):
        assert_equal(self.rc1_record.header, self.rc_header)
        assert_equal(self.rc1_record.sequence, self.rc_sequence)
        assert_equal(self.rc1_expected__str__, str(self.rc1_record))

    def test_reverse_complement2(self):
        assert_equal(self.rc2_record.header, self.header)
        assert_equal(self.rc2_record.sequence, self.rc_sequence)
        assert_equal(self.rc2_expected__str__, str(self.rc2_record))

    def test_len(self):
        assert_equal(self.length, len(self.record))
        assert_equal(self.length, len(self.rc1_record))
        assert_equal(self.length, len(self.rc2_record))

    def test_eq(self):
        header = 'r1'
        seq = 'ACGT'
        r1 = FastaRecord(header, seq)
        r2 = FastaRecord(header, seq)
        assert_true(r1 == r2)

    def test_not_equal(self):
        r1 = FastaRecord('r1', 'ACGT')
        r2 = FastaRecord('r2', 'ACGT')
        r3 = FastaRecord('r1', 'ACGT')
        assert_true(r1 != r2)
        assert_false(r1 != r3)
def _to_fasta_record(header, seq):
    return FastaRecord(header, seq)