def test(dna, AA, codeline, iAA, fAA, cds, strand):
    firstBreak = codeline.find(',')
    secondBreak = codeline.find(',', firstBreak + 1)
    thirdBreak = codeline.find(',', secondBreak + 1)
    fourthBreak = codeline.rfind(',')
    codonStartSite = int(codeline[firstBreak + 1:secondBreak])
    initialCodon = codeline[secondBreak + 1:thirdBreak]
    codonEndSite = int(codeline[thirdBreak + 1:fourthBreak])
    finalCodon = codeline[fourthBreak + 1:]
    TranslatableInitialCodon = initialCodon
    TranslatableFinalCodon = finalCodon
    if strand == -1:
        TranslatableInitialCodon = reverse_complement(initialCodon)
        TranslatableFinalCodon = reverse_complement(finalCodon)
    # TEST CASES
    if AA != cds.qualifiers['translation'][0]:
        # protein seqs match up
        print "AA seqs not equal"
        return False
    elif dna[codonStartSite - 1:codonEndSite] != initialCodon:
        # codon being modified is where it's supposed to be
        print dna[codonStartSite - 1:codonEndSite] + '!=' + initialCodon + " : so codeline doesn't match up"
        return False
    elif translate(TranslatableInitialCodon) != iAA:
        # starting codon is what it's supposed to be
        return False
    elif translate(TranslatableFinalCodon) != fAA:
        # final codon is what it's supposed to be
        return False
    else:
        return True
def flip_record(toks):
    vcf_ref = toks[3]
    vcf_alts = toks[4].split(",")
    fix_ref = reverse_complement(vcf_ref)
    fix_alts = ",".join([reverse_complement(x) for x in vcf_alts])
    fix_line = toks[0:3] + [fix_ref, fix_alts] + toks[5:]
    fix_line = "\t".join(fix_line)
    return fix_line + "\n"
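# A minimal usage sketch for flip_record above (an assumption, not part of the
# original source): `toks` is the list of tab-separated fields of one VCF data
# line (CHROM, POS, ID, REF, ALT, ...); the example line below is invented.
from Bio.Seq import reverse_complement

example_line = "chr1\t100\t.\tACG\tA,AT\t50\tPASS\t.\n"
flipped = flip_record(example_line.rstrip("\n").split("\t"))
print(flipped, end="")  # REF "ACG" -> "CGT"; ALTs "A,AT" -> "T,AT"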
def set_mirsseq(self, mirdict, famdict):
    self.mirseq = mirdict.get(self.mirid, "undef")
    if self.mirseq != "undef":
        self.seed = reverse_complement(self.mirseq[1:7])
        return 0
    else:
        self.seed = reverse_complement(famdict[self.mirid][1:7])
        self.longseed = famdict[self.mirid]
        return 1
def _c_to_g(self):
    t = self._map_cds_to_genome()
    if self.c.strand == '-1':
        ref = reverse_complement(self.c.ref)
        alt = reverse_complement(self.c.alt)
    else:
        ref = self.c.ref
        alt = self.c.alt
    self.g = G(t['seq_region_name'], t['start'], t['end'], ref, alt, self.edit_type)
def __init__(self, name, seq, seed_start, seed_stop):
    self.name = name
    self.seq = seq
    self.seed_start = seed_start
    self.seed_stop = seed_stop
    self.expression = 0
    self.seed = seq[seed_start:seed_stop]
    self.match = get_seed_match(seq, seed_start, seed_stop)
    self.m27 = get_seed_match(seq, 1, 7)
    self.m38 = get_seed_match(seq, 2, 8)
    self.m8 = reverse_complement(seq[7])
    self.m9 = reverse_complement(seq[8])
    self.first = 'A'
def getBgCounts_FAST_ACCURATE_BIGMEMORY(seqs, order=3, include_revComp=True):
    from collections import Counter
    d = {}
    for ord in range(order + 1):  ## get counts for 1,2,3,...order
        print ord
        ord1 = ord + 1
        all_combos = generateAllKmers(ord1)
        for ss in all_combos:
            d[ss] = 0
        for i in xrange(len(seqs)):
            seq = None
            if type(seqs) == np.ndarray:
                seq = seqs[i]
            elif type(seqs) == dict:
                seq = str(seqs.values()[i].seq)
            seq_rev = reverse_complement(seq) if include_revComp else ''
            if ord == 0:
                c1 = Counter([seq[i] for i in xrange(len(seq) - ord)])  ## xrange makes it ~10% faster and uses less memory
                if include_revComp:
                    c2 = Counter([seq_rev[i] for i in xrange(len(seq_rev) - ord)])
            else:
                c1 = Counter([seq[i:(i + ord1)] for i in xrange(len(seq) - ord)])
                if include_revComp:
                    c2 = Counter([seq_rev[i:(i + ord1)] for i in xrange(len(seq_rev) - ord)])
            for ss in all_combos:
                d[ss] += c1[ss] + (c2[ss] if include_revComp else 0)
    return d
def find_downstream_start(name, transcript, current_start, strand):
    """Call helper functions to find the next ATG start site.

    Takes the transcript, the current start codon position and the
    strand coding direction."""
    if strand == "+":
        outstr = ("Looking for ATG after %d in seq: %s" % (current_start, name))
        logger.info(outstr)
        if transcript[current_start:current_start + 3] != "ATG":
            outstr = ("WARNING - existing annotation for "
                      + " %s does not start ATG" % name)
            logger.info(outstr)
        return find_positive_next_ATG(transcript, current_start, strand)
    elif strand == "-":
        outstr = ("Looking for CAT (i.e. ATG rev-comp) before "
                  + " %d in sequence %s" % (current_start, name))
        logger.info(outstr)
        new = find_positive_next_ATG(reverse_complement(transcript),
                                     len(transcript) - current_start, "+")
        if new is None:
            # No start codon found
            return None
        return len(transcript) - new
    else:
        raise ValueError("Bad strand value %r" % strand)
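# find_positive_next_ATG is called above but not defined in this snippet. A
# minimal sketch of such a helper, under the assumption that it scans the
# forward strand in-frame from `current_start` and returns the 0-based index
# of the next ATG, or None if there is none (the real helper may differ).
def find_positive_next_ATG(transcript, current_start, strand):
    assert strand == "+"
    for pos in range(current_start, len(transcript) - 2, 3):
        if transcript[pos:pos + 3] == "ATG":
            return pos
    return None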
def get_feature_nuc(f, parent_seq):
    """Extract SeqFeature sequence from parent sequence (as Seq)."""
    if f.sub_features:
        if f.location_operator != "join":
            raise ValueError(f.location_operator)
        if f.strand == -1 and set(f_sub.strand for f_sub in f.sub_features) == set([-1]):
            # This is nasty... maybe we can tweak how the GenBank parser works?
            # It is important we do not double reverse-complement!
            parts = [parent_seq[f_sub.location.nofuzzy_start:f_sub.location.nofuzzy_end]
                     for f_sub in f.sub_features]
        else:
            # This copes with mixed strand features:
            parts = [get_feature_nuc(f_sub, parent_seq) for f_sub in f.sub_features]
        f_seq = parts[0]
        for part in parts[1:]:
            f_seq += part
    else:
        f_seq = parent_seq[f.location.nofuzzy_start:f.location.nofuzzy_end]
    if f.strand == -1:
        # TODO - MutableSeq?
        try:
            f_seq = f_seq.reverse_complement()
        except AttributeError:
            assert isinstance(f_seq, str)
            f_seq = reverse_complement(f_seq)
    return f_seq
def multitargeting(self):
    self.mirids = set()
    for inter in self.interactions:
        self.mirids.update(inter.mirid.split(","))
    self.seeds = set([reverse_complement(x.mirseq[1:7]) for x in self.interactions])
    self.s2m = defaultdict(set)
    for inter in self.interactions:
        self.s2m[reverse_complement(inter.mirseq[1:7])].add(tuple(inter.mirid))
    self.onehit = len(self.mirids) == 1 and self.interactions[0].indreads == 1
    self.onemir = len(self.mirids) == 1
    self.onefam = False
    for k, v in self.s2m.iteritems():
        if len(v) > 1 and not bool(set(list(v)[0]).intersection(v)):
            self.onefam = True
            break
    self.diffam = len(self.seeds) > 1
def writeSTF():
    global difference, seqRecordToCheck, seqRecordToCheckComplement, \
        variation, featureName, featureSeq, seqLength, m
    difference = len(record.seq) % 3
    seqRecordToCheck = str(record.seq)
    if difference != 0:
        seqRecordToCheck = str(record.seq)[:-difference]
    else:
        seqRecordToCheck = str(record.seq)
    seqRecordToCheckComplement = str(reverse_complement(seqRecordToCheck))

    # Reading Frames
    firstReadingFrame = translate(seqRecordToCheck)
    secondReadingFrame = translate(seqRecordToCheck[1::] + seqRecordToCheck[0])
    thirdReadingFrame = translate(seqRecordToCheck[2::] + seqRecordToCheck[0:2])

    # Reading Frames (reverse complement)
    firstReadingFrameComplement = translate(seqRecordToCheckComplement)
    secondReadingFrameComplement = translate(seqRecordToCheckComplement[1::] + seqRecordToCheckComplement[0])
    thirdReadingFrameComplement = translate(seqRecordToCheckComplement[2::] + seqRecordToCheckComplement[0:2])

    for variation in featureStatistic_container[feature]:
        featureName = variation.note
        featureSeq = str(variation.seq)
        featureLength = len(variation.seq)
        seqLength = len(seqRecordToCheck)

        firstReadingFrameCircular = firstReadingFrame + firstReadingFrame[0:featureLength - 1]
        secondReadingFrameCircular = secondReadingFrame + secondReadingFrame[0:featureLength - 1]
        thirdReadingFrameCircular = thirdReadingFrame + thirdReadingFrame[0:featureLength - 1]
        firstReadingFrameComplementCircular = firstReadingFrameComplement + firstReadingFrameComplement[0:featureLength - 1]
        secondReadingFrameComplementCircular = secondReadingFrameComplement + secondReadingFrameComplement[0:featureLength - 1]
        thirdReadingFrameComplementCircular = thirdReadingFrameComplement + thirdReadingFrameComplement[0:featureLength - 1]

        # Find matches
        firstFrameMatchesCircular = re.finditer(featureSeq, firstReadingFrameCircular)
        secondFrameMatchesCircular = re.finditer(featureSeq, secondReadingFrameCircular)
        thirdFrameMatchesCircular = re.finditer(featureSeq, thirdReadingFrameCircular)
        firstFrameComplementMatchesCircular = re.finditer(featureSeq, firstReadingFrameComplementCircular)
        secondFrameComplementMatchesCircular = re.finditer(featureSeq, secondReadingFrameComplementCircular)
        thirdFrameComplementMatchesCircular = re.finditer(featureSeq, thirdReadingFrameComplementCircular)

        for m in firstFrameMatchesCircular:
            addFeatureSTF()
        for m in secondFrameMatchesCircular:
            addFeatureSTF()
        for m in thirdFrameMatchesCircular:
            addFeatureSTF()
        for m in firstFrameComplementMatchesCircular:
            addFeatureComplSTF()
        for m in secondFrameComplementMatchesCircular:
            addFeatureComplSTF()
        for m in thirdFrameComplementMatchesCircular:
            addFeatureComplSTF()
def align(request):
    names = request.POST.getlist('sequences[]')
    directions = request.POST.getlist('directions[]')
    fasta_string = ""
    aligned_sequences = {}
    try:
        for (name, direction) in zip(names, directions):
            seq = getdropboxsequence(request, name)
            fasta_string += ">" + name + "\n"
            print("direction" + str(direction))
            if direction == "1":
                print("forward")
                fasta_string += str(seq.seq) + "\n"
            else:
                print("reverse")
                fasta_string += reverse_complement(str(seq.seq)) + "\n"
        cmd = Popen(['muscle'], stdout=PIPE, stdin=PIPE)
        stdout_data, stderr_data = cmd.communicate(input=fasta_string.encode("utf-8"))
        align = AlignIO.read(io.StringIO(stdout_data.decode('utf-8')), "fasta")
        for record in align:
            aligned_sequences[record.id] = str(record.seq)
    except Exception as e:
        print(str(e), file=sys.stderr)
        raise e
    return HttpResponse(json.dumps(aligned_sequences), content_type='application/json')
def get_spliced_seq(self, strand=None):
    if not self.seq:
        return None
    seq = ''.join(self.get_exon_seqs())
    if strand and self.strand != strand:
        seq = reverse_complement(seq)
    return seq
def misprime_check(self):
    u = UnaFolder(
        t=self.tm(),
        safety=self.construct.settings.ss_safety,
        mg_salt=self.construct.settings.mg_salt,
        na_salt=self.construct.settings.na_salt,
    )
    if self.stick.top:
        target = str(self.stick.cfragment.sequence())
    else:
        target = str(reverse_complement(Seq(self.stick.cfragment.sequence())))
    if u.mis_prime(target, str(self.seq())):
        self.warning.all().filter(type="mp").delete()
        for warning in u.warnings:
            w = Warning.objects.create(
                primer=self,
                type="mp",
                text="Potential mis-priming "
                + (str(warning[1]) + " bp from " if warning[1] > 0 else " of ")
                + "3' end of primer at bp " + str(warning[2])
                + ", length " + str(warning[0])
                + ", energy " + str(warning[3]),
            )
def test_mixed_strand_dna_join(self):
    """Feature on DNA (join, mixed strand)."""
    s = Seq("AAAAACCCCCTTTTTGGGGG", generic_dna)
    f1 = SeqFeature(FeatureLocation(5, 10), strand=+1)
    f2 = SeqFeature(FeatureLocation(12, 15), strand=-1)
    f = make_join_feature([f1, f2])
    self.check(s, f, "CCCCC" + reverse_complement("TTT"),
               "join(6..10,complement(13..15))")
def test_simple_dna_join_after(self):
    """Feature on DNA (join, strand -1, after position)."""
    s = Seq("AAAAACCCCCTTTTTGGGGG", generic_dna)
    f1 = SeqFeature(FeatureLocation(5, 10), strand=-1)
    f2 = SeqFeature(FeatureLocation(12, AfterPosition(15)), strand=-1)
    f = make_join_feature([f1, f2])
    self.check(s, f, reverse_complement("CCCCC" + "TTT"),
               "complement(join(6..10,13..>15))")
def __getitem__(self, index):
    if isinstance(index, slice) and index.start is not None \
            and index.stop is not None and index.start > index.stop:
        index = slice(index.stop, index.start, index.step)
        retval = super().__getitem__(index)
        retval.seq = reverse_complement(retval.seq)
        return retval
    else:
        return super().__getitem__(index)
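# A minimal usage sketch for the reversed-slice __getitem__ above, assuming it
# lives on a SeqRecord subclass (RevSliceRecord is a hypothetical name, not
# from the source): slicing with start > stop returns the reverse complement
# of the forward slice taken between the swapped bounds.
from Bio.Seq import Seq, reverse_complement
from Bio.SeqRecord import SeqRecord

class RevSliceRecord(SeqRecord):
    def __getitem__(self, index):
        if isinstance(index, slice) and index.start is not None \
                and index.stop is not None and index.start > index.stop:
            index = slice(index.stop, index.start, index.step)
            retval = super().__getitem__(index)
            retval.seq = reverse_complement(retval.seq)
            return retval
        return super().__getitem__(index)

rec = RevSliceRecord(Seq("AAAAACCCCCTTTTT"), id="demo")
print(rec[10:5].seq)  # reverse complement of rec[5:10].seq, i.e. "GGGGG"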
def test_mixed_strand_dna_multi_join(self):
    """Feature on DNA (multi-join, mixed strand)."""
    s = Seq("AAAAACCCCCTTTTTGGGGG", generic_dna)
    f1 = SeqFeature(FeatureLocation(5, 10), strand=+1)
    f2 = SeqFeature(FeatureLocation(12, 15), strand=-1)
    f3 = SeqFeature(FeatureLocation(BeforePosition(0), 5), strand=+1)
    f = make_join_feature([f1, f2, f3])
    self.check(s, f, "CCCCC" + reverse_complement("TTT") + "AAAAA",
               "join(6..10,complement(13..15),<1..5)")
def test_mixed_strand_dna_join(self):
    """Extract feature from DNA (join, mixed strand)."""
    s = Seq("AAAAACCCCCTTTTTGGGGG", generic_dna)
    f1 = SeqFeature(FeatureLocation(5, 10), strand=+1)
    f2 = SeqFeature(FeatureLocation(12, 15), strand=-1)
    f = make_join_feature([f1, f2])
    self.assertEqual(_insdc_feature_location_string(f),
                     "join(6..10,complement(13..15))")
    self.check(s, f, "CCCCC" + reverse_complement("TTT"))
def test_simple_dna_join_after(self):
    """Extract feature from DNA (join, strand -1, after position)."""
    s = Seq("AAAAACCCCCTTTTTGGGGG", generic_dna)
    f1 = SeqFeature(FeatureLocation(5, 10), strand=-1)
    f2 = SeqFeature(FeatureLocation(12, AfterPosition(15)), strand=-1)
    f = make_join_feature([f1, f2])
    self.assertEqual(_insdc_feature_location_string(f),
                     "complement(join(6..10,13..>15))")
    self.check(s, f, reverse_complement("CCCCC" + "TTT"))
def __str__(self):
    global cached_pairs, read_group_ids
    if self.ref_rc:
        flag = 0x10  # maps to reverse strand
        read_seq = reverse_complement(self.read_seq)
        read_qual = self.read_qual[::-1]
    else:
        flag = 0
        read_seq = self.read_seq
        read_qual = self.read_qual
    mate_ref_name = "*"
    mate_ref_pos = 0
    if not self.template_name:
        assert self.read_name
        self.template_name = self.read_name
    if self.is_paired():
        flag += 1  # paired
        if self.first_in_pair:
            flag += 0x40  # forward partner
        else:
            flag += 0x80  # reverse partner
        try:
            mate = self.get_partner()
        except KeyError:
            # Paired but no partner in ACE file
            flag += 0x08  # mate unmapped
        else:
            mate_ref_name = mate.contig_name
            mate_ref_pos = mate.ref_pos
            if mate_ref_name == self.contig_name:
                # Since MIRA seems happy and both on same contig,
                flag += 0x02  # properly aligned
    assert not self.tags
    read_seq_unpadded = read_seq.replace("*", "")
    read_qual_unpadded = "".join(q for (l, q) in zip(read_seq, read_qual) if l != "*")
    cigar = self.cigar
    assert "M" not in cigar, cigar
    if "D" not in cigar:
        # Sum of lengths of the M/I/S/=/X operations should match the sequence length.
        # By construction there are no M entries in our CIGAR string.
        # TODO - Improve this check to consider D in CIGAR?
        if len(read_seq_unpadded) != sum(
                int(x) for x in
                cigar.replace("I", "=").replace("S", "=").replace("X", "=").split("=")
                if x):
            raise ValueError("%s vs %i for %s" % (cigar, len(read_seq_unpadded), read_seq))
    assert len(read_seq_unpadded) == len(read_qual_unpadded)
    line = "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\t%s" % \
        (self.template_name, flag, self.contig_name, self.ref_pos,
         self.map_qual, cigar, mate_ref_name, mate_ref_pos,
         self.insert_size, read_seq_unpadded, read_qual_unpadded)
    assert self.seq_tech
    line += "\tRG:Z:%s" % read_group_ids[(self.seq_tech, self.strain)]
    for tag in self.tags:
        assert not tag.startswith("RG:"), tag
        line += "\t" + tag
    return line
def get_exon_seqs(self):
    exons = []
    for i in range(self.blockCount):
        # splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1])
        exons.append(self.seq[self.blockStarts[i]:self.blockStarts[i] + self.blockSizes[i]])
    if self.strand == '-':
        # reverse complement
        exons.reverse()
        for i, s in enumerate(exons):
            exons[i] = reverse_complement(s)
    return exons
def extract(self, parent_sequence):
    """Extract feature sequence from the supplied parent sequence.

    The parent_sequence can be a Seq like object or a string, and will
    generally return an object of the same type. The exception to this is
    a MutableSeq as the parent sequence will return a Seq object.

    This should cope with complex locations including complements, joins
    and fuzzy positions. Even mixed strand features should work! This
    also covers features on protein sequences (e.g. domains), although
    here reverse strand features are not permitted.

    >>> from Bio.Seq import Seq
    >>> from Bio.Alphabet import generic_protein
    >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
    >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
    >>> f = SeqFeature(FeatureLocation(8,15), type="domain")
    >>> f.extract(seq)
    Seq('VALIVIC', ProteinAlphabet())

    Note - currently only sub-features of type "join" are supported.
    """
    if isinstance(parent_sequence, MutableSeq):
        # This avoids complications with reverse complements
        # (the MutableSeq reverse complement acts in situ)
        parent_sequence = parent_sequence.toseq()
    if self.sub_features:
        if self.location_operator != "join":
            raise ValueError(self.location_operator)
        if self.strand == -1:
            # This is a special case given how the GenBank parser works.
            # Must avoid doing the reverse complement twice.
            parts = []
            for f_sub in self.sub_features:
                assert f_sub.strand == -1
                parts.append(parent_sequence[f_sub.location.nofuzzy_start:
                                             f_sub.location.nofuzzy_end])
        else:
            # This copes with mixed strand features:
            parts = [f_sub.extract(parent_sequence)
                     for f_sub in self.sub_features]
        # We use addition rather than a join to avoid alphabet issues:
        f_seq = parts[0]
        for part in parts[1:]:
            f_seq += part
    else:
        f_seq = parent_sequence[self.location.nofuzzy_start:
                                self.location.nofuzzy_end]
    if self.strand == -1:
        # TODO - MutableSeq?
        try:
            f_seq = f_seq.reverse_complement()
        except AttributeError:
            assert isinstance(f_seq, str)
            f_seq = reverse_complement(f_seq)
    return f_seq
def gff_to_fasta(gff, genome, min_length, Max_length, outfile, upstream, into_TSS):
    """Take in a gff file. Get the seq defined by the gff coords.

    If coding in the negative direction, the reverse complement is
    generated. Minimum and maximum length filters are applied to remove
    seqs shorter than, for example, 3 nt (which can't be real) and longer
    than, e.g., 25 kb (which will be false positives and not informative
    in downstream analysis).
    """
    print("Indexing the genome")
    min_length = int(min_length)
    genome_database = index_genome_file(genome)
    print("Now iterating through the GFF. Assume it is sorted")
    f_out = open(outfile, "w")
    bind_out = outfile.split(".fa")[0] + "_%dnt_upstream_%d_into_TSS.fasta" % (upstream, into_TSS)
    bind_out_fa = open(bind_out, "w")
    upstream = int(upstream)
    with open(gff, "r") as f_handle:
        for line in f_handle:
            line = check_line(line)
            if not line:
                continue
            scaff, source, feature, start, stop, score, \
                direction, frame, gene_info = split_line(line)
            seq_record = genome_database[scaff]
            if direction == "+":
                UTR = seq_record.seq[start:stop]
                bind_seq = seq_record.seq[(start - upstream):(start + into_TSS)]
            if direction == "-":
                UTR = reverse_complement(seq_record.seq[start:stop])
                bind_seq = reverse_complement(seq_record.seq[(stop - into_TSS):(stop + upstream)])
            outstr = ">%s\n%s\n" % (gene_info, UTR)
            bind_str = ">%s_%d_upstream_TSS\n%s\n" % (gene_info, upstream, bind_seq)
            if len(UTR) > min_length and len(UTR) < Max_length:
                f_out.write(outstr)
            if "NNNN" in bind_seq:
                continue  # we don't want NNNs
            if len(bind_seq) >= upstream:
                bind_out_fa.write(bind_str)
    f_out.close()
    bind_out_fa.close()
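# check_line and split_line are called above but not defined in this snippet.
# A minimal sketch of what they could look like (assumptions, not the original
# helpers): check_line drops comment/blank lines, split_line unpacks the nine
# GFF columns with start/stop converted to ints, as the caller's slicing and
# arithmetic require.
def check_line(line):
    line = line.strip()
    if not line or line.startswith("#"):
        return None
    return line

def split_line(line):
    scaff, source, feature, start, stop, score, direction, frame, gene_info = \
        line.split("\t")[:9]
    return scaff, source, feature, int(start), int(stop), score, direction, \
        frame, gene_info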
def extractReadSequences(self, genoGenChr, lInsertsChr, barcode, lenRead):
    """
    >>> i = TestGbs()
    >>> genoGenChr = SeqRecord(Seq("AATTTAGGGA"), id="chr1")
    >>> i.extractReadSequences(genoGenChr, [[3,9]], "C", 3)
    [['CTT', 'CCC']]
    """
    return [[str(barcode)
             + str(genoGenChr.seq[(i[0] - 1):(i[0] - 1 + lenRead - len(barcode))]),
             reverse_complement(str(genoGenChr.seq[(i[1] - lenRead):i[1]]))]
            for i in lInsertsChr]
def get_exon_seqs(self):
    if not self.seq:
        return None
    exons = []
    for i in range(self.blockCount):
        exons.append(self.seq[self.blockStarts[i]:self.blockStarts[i] + self.blockSizes[i]])
    if self.strand == '-':
        # reverse complement
        exons.reverse()
        for i, s in enumerate(exons):
            exons[i] = reverse_complement(s)
    return exons
def seq(self):
    s = Seq(self.cfragment.fragment.sequence)
    if self.cfragment.direction == 'r':
        end = self.cfragment.end() - self.start()
        start = end - self.length
    else:
        start = self.start()
        end = self.end()
    s = s[start:end]
    if self.top ^ (self.cfragment.direction == 'r'):
        s = reverse_complement(s)
    return s
def six_frame_translations(seq, genetic_code=1):
    """Formatted string showing the 6 frame translations and GC content.

    Nice looking 6 frame translation with GC content - code from xbbtools
    similar to DNA Striders six-frame translation

    e.g.
    from Bio.SeqUtils import six_frame_translations
    print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
    """
    from Bio.Seq import reverse_complement, translate
    anti = reverse_complement(seq)
    comp = anti[::-1]
    length = len(seq)
    frames = {}
    for i in range(0, 3):
        frames[i + 1] = translate(seq[i:], genetic_code)
        frames[-(i + 1)] = reverse(translate(anti[i:], genetic_code))

    # create header
    if length > 20:
        short = "%s ... %s" % (seq[:10], seq[-10:])
    else:
        short = seq
    # TODO? Remove the date as this would spoil any unit test...
    date = time.strftime("%y %b %d, %X", time.localtime(time.time()))
    header = "GC_Frame: %s, " % date
    for nt in ["a", "t", "g", "c"]:
        header += "%s:%d " % (nt, seq.count(nt.upper()))
    header += "\nSequence: %s, %d nt, %0.2f %%GC\n\n\n" % (short.lower(), length, GC(seq))
    res = header

    for i in range(0, length, 60):
        subseq = seq[i:i + 60]
        csubseq = comp[i:i + 60]
        p = i / 3
        res = res + "%d/%d\n" % (i + 1, i / 3 + 1)
        res = res + "  " + "  ".join(map(None, frames[3][p:p + 20])) + "\n"
        res = res + " " + "  ".join(map(None, frames[2][p:p + 20])) + "\n"
        res = res + "  ".join(map(None, frames[1][p:p + 20])) + "\n"
        # seq
        res = res + subseq.lower() + "%5d %%\n" % int(GC(subseq))
        res = res + csubseq.lower() + "\n"
        # - frames
        res = res + "  ".join(map(None, frames[-2][p:p + 20])) + " \n"
        res = res + " " + "  ".join(map(None, frames[-1][p:p + 20])) + "\n"
        res = res + "  " + "  ".join(map(None, frames[-3][p:p + 20])) + "\n\n"
    return res
def extract(self, parent_sequence):
    """Extract feature sequence from the supplied parent sequence."""
    if isinstance(parent_sequence, MutableSeq):
        # This avoids complications with reverse complements
        # (the MutableSeq reverse complement acts in situ)
        parent_sequence = parent_sequence.toseq()
    f_seq = parent_sequence[self.nofuzzy_start:self.nofuzzy_end]
    if self.strand == -1:
        try:
            f_seq = f_seq.reverse_complement()
        except AttributeError:
            assert isinstance(f_seq, str)
            f_seq = reverse_complement(f_seq)
    return f_seq
def misprime_check(self):
    u = UnaFolder(t=self.tm(),
                  safety=self.construct.settings.ss_safety,
                  mg_salt=self.construct.settings.mg_salt,
                  na_salt=self.construct.settings.na_salt)
    if self.stick.top:
        target = str(self.stick.cfragment.sequence())
    else:
        target = str(reverse_complement(Seq(self.stick.cfragment.sequence())))
    if u.mis_prime(target, str(self.seq())):
        self.warning.all().filter(type='mp').delete()
        for warning in u.warnings:
            w = Warning.objects.create(
                primer=self,
                type='mp',
                text='Potential mis-priming '
                     + (str(warning[1]) + ' bp from ' if warning[1] > 0 else ' of ')
                     + '3\' end of primer at bp ' + str(warning[2])
                     + ', length ' + str(warning[0])
                     + ', energy ' + str(warning[3]),
            )
def __init__(self, arws):
    if len(set([x.qname for x in arws])) != 1:
        # The original comprehension iterated `for x.qname in ar.wrappers`,
        # which is invalid; the intent is to list the query names given.
        raise ChimeraException(
            'Chimera cannot be made from aligned reads with different identifiers\n'
            'Following are given:\n%s\n' % "\n".join(["\t%s" % x.qname for x in arws]))
    self.arws = arws
    self.control = any([x.rname.split("_")[0] == "random" for x in self.arws])
    self.gap = arws[1].qstart - arws[0].qend
    self.AS = sum([x.AS for x in arws])
    self.coordinates = []
    if arws[0].aligned_read.is_reverse:
        self.gap_seq = reverse_complement(arws[0].aligned_read.query_sequence)[self.arws[1].qstart:self.arws[0].qend]
    else:
        self.gap_seq = arws[0].aligned_read.query_sequence[self.arws[1].qstart:self.arws[0].qend]
def align_call(elem):
    """
    Call the aligner software.
    :param elem: (record, adapter) pair
    :return: [rId, qId, strand, result] for the better-scoring strand
    """
    record = elem[0]
    adapter = elem[1]
    lEle = []
    dRc = {}
    dEle2Int = {}
    dInt2Ele = {}
    nMatch = 2
    nMismatch = 1
    nOpen = 1
    nExt = -2
    nFlag = 0
    # if not args.sMatrix:
    lEle = ['A', 'C', 'G', 'T', 'N']
    for i, ele in enumerate(lEle):
        dEle2Int[ele] = i
        dEle2Int[ele.lower()] = i
        dInt2Ele[i] = ele
    nEleNum = len(lEle)
    lScore = [0 for i in range(nEleNum ** 2)]
    for i in range(nEleNum - 1):
        for j in range(nEleNum - 1):
            if lEle[i] == lEle[j]:
                lScore[i * nEleNum + j] = nMatch
            else:
                lScore[i * nEleNum + j] = -nMismatch
    mat = (len(lScore) * ct.c_int8)()
    mat[:] = lScore
    ssw = ssw_lib.CSsw("./")
    sQSeq = record.seq
    sQId = record.id
    if len(sQSeq) > 30:
        nMaskLen = len(sQSeq) // 2
    else:
        nMaskLen = 15
    outputAlign = []
    qNum = to_int(sQSeq, lEle, dEle2Int)
    qProfile = ssw.ssw_init(qNum, ct.c_int32(len(sQSeq)), mat, len(lEle), 2)
    sQRcSeq = reverse_complement(sQSeq)
    qRcNum = to_int(sQRcSeq, lEle, dEle2Int)
    qRcProfile = ssw.ssw_init(qRcNum, ct.c_int32(len(sQSeq)), mat, len(lEle), 2)
    sRSeq = adapter.seq
    sRId = adapter.id
    rNum = to_int(sRSeq, lEle, dEle2Int)
    res = align_one(ssw, qProfile, rNum, len(sRSeq), nOpen, nExt, nFlag, nMaskLen)
    resRc = align_one(ssw, qRcProfile, rNum, len(sRSeq), nOpen, nExt, nFlag, nMaskLen)
    strand = 0
    if res[0] == resRc[0]:
        pass  # tie: the original had a bare `next` here, which was a no-op
    if res[0] > resRc[0]:
        strand = 0
        outputAlign = [sRId, sQId, strand, res]
    elif res[0] < resRc[0]:
        res = resRc
        strand = 1
        outputAlign = [sRId, sQId, strand, res]
    ssw.init_destroy(qProfile)
    ssw.init_destroy(qRcProfile)
    return outputAlign
def handle_bulk_manual_swaps(genome_record, input_file, mg1655_genome_record):
    """Method that allows handling bulk swaps.

    NOTE: This method is pretty much hard-coded to work with derivatives
    of MG1655.
    """
    # After considering all the various options for finding the exact position
    # to change, I've decided to go with parsing the 'AGR ID', which has the
    # name of the gene as well as the codon position. Let's see how it goes.
    # Eventually we'll need a way to get from gene name to feature in the
    # MDS42 genome. One problem that arises when matching gene names is we
    # have gene name synonyms. Thus we can use the MG1655 record which
    # has a lot more synonyms recorded.

    # Create a bi-map linking various gene synonyms of the MG1655 genbank.
    # This allows us to handle more flexible cases.
    # TODO: In general, this synonym-finding functionality could be useful
    # elsewhere. Maybe use Ecocyc or regulondb data for this purpose.
    gene_to_synonym_bimap = {}
    mg1655_cds_features = [feature for feature in mg1655_genome_record.features
                           if feature.type in set(['CDS', 'gene'])]
    for feature in mg1655_cds_features:
        if not feature.type in ['CDS', 'gene']:
            continue
        maybe_gene = get_feature_gene(feature)
        if not maybe_gene:
            continue
        if 'gene_synonym' in feature.qualifiers:
            # Build a list containing all synonyms which will serve as the
            # value of the bimap.
            synonym_list = [maybe_gene]
            # Check each to see if it can be split.
            for synonym_phrase in feature.qualifiers['gene_synonym']:
                split_phrase = synonym_phrase.split(';')
                for synonym in split_phrase:
                    clean_synonym = synonym.strip()
                    if len(clean_synonym) > 0:
                        synonym_list.append(clean_synonym)
            for synonym in synonym_list:
                gene_to_synonym_bimap[synonym] = synonym_list

    # Create a map from gene name to CDS feature for that gene in genome_record.
    gene_to_feature_map = {}
    cds_features = [feature for feature in genome_record.features
                    if feature.type == 'CDS']
    for feature in cds_features:
        maybe_gene = get_feature_gene(feature)
        if not maybe_gene:
            continue
        # Always add the feature for the gene. It's possible one of the
        # synonyms was added earlier, so we override it.
        gene_to_feature_map[maybe_gene] = feature
        # Get all synonyms and build up the map.
        if 'gene_synonym' in feature.qualifiers:
            synonym_set = set(
                [maybe_gene] +
                feature.qualifiers['gene_synonym'] +
                gene_to_synonym_bimap.get(maybe_gene, []))
            for synonym in synonym_set:
                # Don't override if it's already there. We want actual genes
                # to get precedence over less reliable synonyms.
                if not synonym in gene_to_feature_map:
                    gene_to_feature_map[synonym] = feature

    # And use the bimap to pick up any missing synonym connections.
    for gene, synonym_list in gene_to_synonym_bimap.iteritems():
        if gene in gene_to_feature_map:
            continue
        for synonym in synonym_list:
            if synonym in gene_to_feature_map:
                feature = gene_to_feature_map[synonym]
                gene_to_feature_map[gene] = feature
                break

    # Now iterate through the manual fixes and make the changes.
    with open(input_file) as input_fh:
        reader = csv.DictReader(input_fh, delimiter='\t')
        for manual_fix in reader:
            clean_id = manual_fix['AGR ID'].strip()
            if clean_id in KNOWN_PROBLEM_IDS:
                continue
            parsed_id = re.match(
                r'(?P<gene>[a-zA-Z]+)_[a-zA-Z]+_(?P<mutation_start>[0-9]+)?.*',
                clean_id)
            # prfB is the only known weird case, so check that anything weird
            # is indeed prfB and skip it for now.
            if not parsed_id:
                parsed_id = re.match(r'(?P<gene>[a-zA-Z]+)_.*',
                                     manual_fix['AGR ID'])
            gene = parsed_id.group('gene')
            # Make it 0-indexed per the API of the method that does the swap.
            mutation_start = int(parsed_id.group('mutation_start')) - 1
            if not gene in gene_to_feature_map:
                print "%s not in map." % gene
                assert False
            feature = gene_to_feature_map[gene]
            # Parse the old and new sequence data.
            previous_seq = manual_fix['wt Genotype'].upper()
            new_seq = manual_fix['Destination Genotype'].upper()
            if len(new_seq) == 0:
                continue
            assert len(previous_seq) == 3
            assert len(new_seq) == 3
            # Ugh, looks like the data gives changes always in the forward
            # strand. We'll have to reverse it ourselves in the negative
            # strand case.
            if feature.strand == -1:
                mutation_start = len(feature) - mutation_start - len(previous_seq)
                previous_seq = reverse_complement(previous_seq)
                new_seq = reverse_complement(new_seq)
            swap_feature_codon_at_position(genome_record, feature.id,
                                           mutation_start, previous_seq, new_seq)
type=str, help="name of input sequence file") parser.add_argument('-o', '--output', dest='output', type=str, help="output file name for reverse complemented sequences (optional)") args = parser.parse_args() infile = args.seqfile outseq = args.output if args.seqfile: if args.output: output_name = outseq else: output_name = 'reverse_comp_'+infile outfile = open(output_name, 'w') seqfile = open(infile, 'r') for seq in SeqIO.parse(seqfile, 'fasta'): rec_id = str(seq.id) rec_seq = seq.seq rev = str(reverse_complement(rec_seq)) new = SeqRecord(Seq(rev), id=rec_id, description='') SeqIO.write(new, outfile, 'fasta') else: DNAseq = input("enter sequence to reverse complement: ") from Bio.Seq import reverse_complement RCseq = reverse_complement(DNAseq) print('reverse complement is: ',RCseq)
def gff_to_fasta(gff_file, fasta_file, protein_coding=False, qc=None, log=None):
    '''Convert a gff file with the appended FASTA to a protein/all FASTA file.

    gff_file = input gff file
    fasta_file = output file
    output: a protein coding FASTA file OR a nucleotide FASTA file'''
    out_tmp = ''.join(
        random.choice(string.ascii_lowercase + string.ascii_uppercase +
                      string.digits) for _ in range(7)) + "_fasta.fa"
    out = open(out_tmp, "w")
    contigs = {}
    with open(gff_file) as f:
        fasta = False
        for line in f:
            if fasta:
                out.write(line)
                continue
            if line.startswith("##FASTA"):
                fasta = True
                continue
            if line.startswith("#"):
                continue
            toks = line.strip().split()
            if toks[2] != "CDS":
                continue
            name = toks[-1].split("|")[-1]
            if toks[0] not in contigs:
                contigs[toks[0]] = []
            contigs[toks[0]].append({
                "name": name,
                "start": int(toks[3]) - 1,
                "stop": int(toks[4]),
                "strand": toks[6]
            })
    out.close()

    # Not protein coding; in this case we apply QC measures.
    if not protein_coding:
        if qc_fasta(out_tmp, qc, log, name=gff_file):
            # Passed quality control: rename the temp to the final and return.
            os.rename(out_tmp, fasta_file)
            return fasta_file
        else:
            os.remove(out_tmp)  # remove the temp file if it didn't pass QC
            return None

    # Read the contigs and save the final fasta file.
    out = open(fasta_file, "w")
    with open(out_tmp) as handle:
        for values in SimpleFastaParser(handle):
            curr_contig = values[0]
            if curr_contig not in contigs:
                # no CDSs in this contig
                continue
            for cds in contigs[curr_contig]:
                out.write(">" + cds["name"] + "\n")
                seq = values[1][cds["start"]:cds["stop"]]
                if cds["strand"] == "-":
                    seq = reverse_complement(seq)
                out.write(translate(seq) + "\n")
    out.close()
    os.remove(out_tmp)
    return
#! /home/a_filipchyk/soft/home/a_filipchyk/anaconda3/bin/python
'''Outputs the reverse complement for the provided sequences'''
import pybedtools
import sys

from Bio.Seq import reverse_complement

for seq in sys.argv[1:]:
    print(reverse_complement(seq))
def __str__(self):
    global cached_pairs, read_group_ids
    if self.ref_rc:
        flag = 0x10  # maps to reverse strand
        read_seq = reverse_complement(self.read_seq)
        read_qual = self.read_qual[::-1]
    else:
        flag = 0
        read_seq = self.read_seq
        read_qual = self.read_qual
    mate_ref_name = "*"
    mate_ref_pos = 0
    if not self.template_name:
        assert self.read_name
        self.template_name = self.read_name
    if self.is_paired():
        flag += 1  # paired
        if self.first_in_pair:
            flag += 0x40  # forward partner
        else:
            flag += 0x80  # reverse partner
        try:
            mate = self.get_partner()
        except KeyError:
            # Paired but no partner in ACE file
            flag += 0x08  # mate unmapped
        else:
            mate_ref_name = mate.contig_name
            mate_ref_pos = mate.ref_pos
            if mate_ref_name == self.contig_name:
                # Since MIRA seems happy and both on same contig,
                flag += 0x02  # properly aligned
    assert not self.tags
    read_seq_unpadded = read_seq.replace("*", "")
    read_qual_unpadded = "".join(q for (l, q) in zip(read_seq, read_qual) if l != "*")
    cigar = self.cigar
    # assert "M" not in cigar, cigar
    if "D" not in cigar:
        # Sum of lengths of the M/I/S/=/X operations should match the sequence length.
        # By construction there are no M entries in our CIGAR string.
        # TODO - Improve this check to consider D in CIGAR?
        if len(read_seq_unpadded) != sum(
                int(x) for x in
                cigar.replace("I", "=").replace("S", "=").replace(
                    "M", "X").replace("X", "=").split("=")
                if x):
            raise ValueError("%s vs %i for %s" % (cigar, len(read_seq_unpadded), read_seq))
    assert len(read_seq_unpadded) == len(read_qual_unpadded)
    line = "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\t%s" % \
        (self.template_name, flag, self.contig_name, self.ref_pos,
         self.map_qual, cigar, mate_ref_name, mate_ref_pos,
         self.insert_size, read_seq_unpadded, read_qual_unpadded)
    if self.read_group:
        # MIRA v3.9+ assigns this
        if self.read_group not in read_group_ids:
            log("Undeclared read group %r" % self.read_group)
            log(line)
            sys.exit(1)
        line += "\tRG:Z:%s" % self.read_group
    else:
        # We assign this on old MIRA
        assert self.seq_tech
        line += "\tRG:Z:%s" % read_group_ids[(self.seq_tech, self.strain)]
    for tag in self.tags:
        assert not tag.startswith("RG:"), tag
        line += "\t" + tag
    return line
def antiparallel(self, seq):
    return reverse_complement(seq)
s_nucls = defaultdict(int)
e_nucls = defaultdict(int)
# print(starts)
for seqrecord in SeqIO.parse(args.genome, 'fasta'):
    s_plus = starts[(seqrecord.name, '+')]
    # print('bu')
    s_minus = starts[(seqrecord.name, '-')]
    e_plus = ends[(seqrecord.name, '+')]
    e_minus = ends[(seqrecord.name, '-')]
    for s, e, n1, n2 in zip(s_plus[1:], e_plus[1:], seqrecord.seq, seqrecord.seq[1:]):
        nn = n1 + n2
        s_nucls[nn] += s
        e_nucls[nn] += e
    for s, e, n1, n2 in zip(s_minus[1:], e_minus[1:], seqrecord.seq, seqrecord.seq[1:]):
        nn = reverse_complement(n1 + n2)
        s_nucls[nn] += e
        e_nucls[nn] += s


def rnase_plot(ndict, output, title, normed=False):
    xticklabels = ["".join(x) for x in product('ACTG', repeat=2)]
    bars = np.array([ndict[x] for x in xticklabels])
    brange = range(16)
    if normed:
        bars = bars / sum(bars)
        ylabel = 'Fraction'
    else:
        ylabel = 'Counts'
    fig, ax = plt.subplots(figsize=(16, 9))
    ax.bar(brange, bars, 0.5, color='lightblue')
    plt.title(title)
from Bio.Seq import translate
from Bio.Seq import reverse_complement

dna = "TTCCGGTATTTAGAGGATCGGGGCCAGTAATGCGAAGGTGATTGTGCCTTCGCGCAGTTGAATGCGAAAGCATTGTCACCTTATAGGTTTGCGGATCATCAGCTTGATTTGAATACGCTTTGTCCTGCCCCCCTTGATACGATGAAACAGGATTTACTGCTGACAACTCAGAATGGAAAGCAAGGTATGATGCGCTGACTTGGTTAGAACGACCGGATCATCGAGATGGCGGTGAATTATTCAGACGTAACGTCGGGAAGCTTCACTTCCTACTGGCTACGCGATTTAAAACTCACTGCCGGTGTATGACCATATTATACATCGGATATCTGTATCGCTGTTTGTGCCGCGGTTATACGCCACTTGTTGCTAATGGTTATCTACTATTCTGCACAGATAGAGAGCATTTGATGTGGAAATGGGGAAAGGCGCTTTTCTAGCTAGAAAAGCGCCTTTCCCCATTTCCACATGCGGTATTTCTCAATGGAAACTTTCATGTAATCTGTCCCTTGAACAGAGCCGTTCAGTACAGCCCTACTCAAACGCATTTGCTCTGTTCACACCCTAGTGCAACATAAACTAGGGGATGTGAGTTACTCGCCTGTGAACCGAGGTCCTCTTTCATATCTTGGTATTAAAAACTCCCTTATGGCGTGCATCAATGAACTTTGCACTTACGGAAGCTGACTTGAATCCTCCCAGCCCGCATTGTATTGCGTTAGACACAGGTGTCGAATGCTGCAACCATCTAGCCGCTTAAGTCGTACCACCCTGCCGCGCAGGGTTACATATTTACTTATTGTTCTTACTACGATCGCAAGCTCAATTAGCTTCCCTGCATCCAAAGCAAAGGGCTCGGACGAGCAGCAGCCCATGACCA"

initial = []
for start in xrange(0, 3):
    initial.append(translate(dna[start:-(3 - start)]))
    initial.append(translate(reverse_complement(dna)[start:-(3 - start)]))

longest = 0
best = ""
for j in xrange(len(initial)):
    init = initial[j]
    st = len(init)
    for i in range(len(init) - 1, -1, -1):
        if init[i] == '*':
            st = i + 1
            break
    initial[j] = init[:st]

best = ""
for init in initial:
    frames = init.split("*")
    for frame in frames:
        st = -1
        for i in range(len(frame)):
            if frame[i] == 'M':
                st = i
                break
        if st != -1 and len(frame[st:]) > len(best):
            best = frame[st:]
print best
def revcomp(dna_string):
    """Return the reverse complement of a string"""
    return reverse_complement(dna_string)
mylottia = ''
if '|' in item[0]:
    if len(item[0].split('|')) > 6:
        mylottia = item[0].split('|')[3]
    else:
        mylottia = item[0].split('|')[2]
else:
    mylottia = item[0]
# header includes 1. species name 2. assembled contig ID 3. coverage
# 4. percent sequence above a certain number of bp 5. lottia target
# 6. lot start 7. lot end 8. seq start 9. seq end
out.write(">" + ID + '|' + item[1] + '|' + str(item[2]) + '|' + str(item[4])
          + '|' + mylottia + '|' + myblastline[6] + '|' + myblastline[7]
          + '|' + myblastline[8] + '|' + myblastline[9] + '\n')
if int(myblastline[9]) > int(myblastline[8]):
    out.write(item[-2] + '\n')
else:
    out.write(reverse_complement(item[-2]) + '\n')
# if lengthcounter == 0:
#     out.write('\t'.join([info[0], info[1], str(float(covcounter)/len(myseq)), '0', str(float(lengthcounter)/len(myseq))]) + '\n'),
# else:
#     out.write('\t'.join([info[0], info[1], str(float(covcounter)/len(myseq)), str(float(hetcounter)/lengthcounter), str(float(lengthcounter)/len(myseq))]) + '\n'),
out.close()
myfasta.close()
mycov.close()
# myhet.close()
def six_frame_translations(seq, genetic_code=1):
    """Return pretty string showing the 6 frame translations and GC content.

    Nice looking 6 frame translation with GC content - code from xbbtools
    similar to DNA Striders six-frame translation

    >>> from Bio.SeqUtils import six_frame_translations
    >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA"))
    GC_Frame: a:5 t:0 g:8 c:5 
    Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC
    <BLANKLINE>
    <BLANKLINE>
    1/1
      G  H  C  N  G  P  L
     W  P  L  *  W  A  A
    M  A  I  V  M  G  R  *
    auggccauuguaaugggccgcuga   54 %
    uaccgguaacauuacccggcgacu
    A  M  T  I  P  R  Q 
     H  G  N  Y  H  A  A  S
      P  W  Q  L  P  G  S
    <BLANKLINE>
    <BLANKLINE>
    """  # noqa for pep8 W291 trailing whitespace
    from Bio.Seq import reverse_complement, translate
    anti = reverse_complement(seq)
    comp = anti[::-1]
    length = len(seq)
    frames = {}
    for i in range(0, 3):
        fragment_length = 3 * ((length - i) // 3)
        frames[i + 1] = translate(seq[i:i + fragment_length], genetic_code)
        frames[-(i + 1)] = translate(anti[i:i + fragment_length],
                                     genetic_code)[::-1]

    # create header
    if length > 20:
        short = '%s ... %s' % (seq[:10], seq[-10:])
    else:
        short = seq
    header = 'GC_Frame: '
    for nt in ['a', 't', 'g', 'c']:
        header += '%s:%d ' % (nt, seq.count(nt.upper()))
    header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),
                                                           length, GC(seq))
    res = header

    for i in range(0, length, 60):
        subseq = seq[i:i + 60]
        csubseq = comp[i:i + 60]
        p = i // 3
        res += '%d/%d\n' % (i + 1, i / 3 + 1)
        res += '  ' + '  '.join(frames[3][p:p + 20]) + '\n'
        res += ' ' + '  '.join(frames[2][p:p + 20]) + '\n'
        res += '  '.join(frames[1][p:p + 20]) + '\n'
        # seq
        res += subseq.lower() + '%5d %%\n' % int(GC(subseq))
        res += csubseq.lower() + '\n'
        # - frames
        res += '  '.join(frames[-2][p:p + 20]) + ' \n'
        res += ' ' + '  '.join(frames[-1][p:p + 20]) + '\n'
        res += '  ' + '  '.join(frames[-3][p:p + 20]) + '\n\n'
    return res
def revcomp(seq):
    return reverse_complement(seq)
def get_sequence(feature, raw_sequence, record, locusTags, rRNAnum, trans_table):
    # Get the locus tag
    locus_tag = record.features[feature].qualifiers.get('locus_tag')
    # Set the locus tag
    if not locus_tag:
        locus_tag = "no_tag_%s" % rRNAnum
    else:
        locus_tag = locus_tag[0]
    # Check whether the locus has already been used
    if locus_tag in locusTags:
        locus_tag = "%s_%s" % (locus_tag, samelocus + 1)
        quit()
    else:
        locusTags.append(locus_tag)
    # Get the location of the gene
    # Note, the start site is -1 relative to the true location
    location = str(record.features[feature].location)
    # Check whether there are multiple parts
    joincheck = re.search('join', location)
    # If there is a join in the gene
    if joincheck:
        joinCDSstart = 0
        joinCDSend = 0
        geneSeq = ''
        # Locate the splits
        region = location[location.find("{") + 1:location.find("}")]
        splits = re.sub(', ', ',', region)
        splits = re.split(',', splits)
        cdsstrand = record.features[feature].strand
        intronNum = 0
        loc = ''
        # For each intron
        for i in range(0, len(splits)):
            intronNum += 1
            strand = record.features[feature].strand
            # strand = splits[i][splits[i].find("(")+1:splits[i].find(")")]
            locations = re.findall(r'\d+', splits[i])
            cdsStart = int(locations[0])
            cdsEnd = int(locations[1])
            if joinCDSstart == 0:
                joinCDSstart = int(cdsStart)
            else:
                if int(cdsStart) < joinCDSstart:
                    joinCDSstart = int(cdsStart)
            if joinCDSend == 0:
                joinCDSend = int(cdsEnd)
            else:
                # Was `if int(cdsEnd) > cdsEnd`, which compares cdsEnd to
                # itself; the intended comparison is against joinCDSend.
                if int(cdsEnd) > joinCDSend:
                    joinCDSend = int(cdsEnd)
            if intronNum == len(splits):
                loc += "%s..%s" % (cdsStart + 1, cdsEnd)
            else:
                loc += "%s..%s," % (cdsStart + 1, cdsEnd)
            seq = raw_sequence[cdsStart:cdsEnd]
            if strand == -1:
                strandType = 1
                seq = seq.reverse_complement()
                geneSeq += seq
            else:
                strandType = 0
                geneSeq += seq
        cdsStart = joinCDSstart
        cdsEnd = joinCDSend
    # Otherwise
    else:
        # Locate the gene start (note -1 from true start site)
        cdsStart = record.features[feature].location.nofuzzy_start
        # Locate the gene end
        cdsEnd = record.features[feature].location.nofuzzy_end
        # Write the location
        loc = "%s..%s" % (cdsStart + 1, cdsEnd)
        # Get the raw sequence
        seq = raw_sequence[cdsStart:cdsEnd]
        strand = record.features[feature].strand
        if strand == -1:
            strandType = 1
            geneSeq = reverse_complement(seq)
        else:
            strandType = 0
            geneSeq = seq
    return geneSeq, cdsStart, cdsEnd, strand
def frame(self, seq, frame, translation_table=1):
    """Translate DNA sequence in a chosen frame."""
    if frame < 0:
        seq = reverse_complement(seq)
    seq = seq[(abs(frame) - 1):]
    return translate(seq, table=translation_table)
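# A minimal usage sketch for frame() above, assuming it sits on a class (the
# Translator name here is hypothetical) and that frames are numbered 1..3 and
# -1..-3 as implied by the method body: negative frames read off the reverse
# complement.
from Bio.Seq import reverse_complement, translate

class Translator:
    def frame(self, seq, frame, translation_table=1):
        """Translate DNA sequence in a chosen frame."""
        if frame < 0:
            seq = reverse_complement(seq)
        seq = seq[(abs(frame) - 1):]
        return translate(seq, table=translation_table)

t = Translator()
print(t.frame("ATGGCCATTGTAATGGGCCGC", 1))   # MAIVMGR
print(t.frame("ATGGCCATTGTAATGGGCCGC", -1))  # frame 1 of the reverse complement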
def __main__(): parser = argparse.ArgumentParser( description='Generate proBED and proBAM from mz.sqlite') parser.add_argument('mzsqlite', help="mz.sqlite converted from mzIdentML") parser.add_argument( 'genomic_mapping_sqlite', help="genomic_mapping.sqlite with feature_cds_map table") parser.add_argument('-R', '--genomeReference', default='Unknown', help='Genome reference sequence in 2bit format') parser.add_argument('-t', '--twobit', default=None, help='Genome reference sequence in 2bit format') parser.add_argument('-r', '--reads_bam', default=None, help='reads alignment bam path') parser.add_argument('-g', '--gffutils_sqlite', default=None, help='gffutils GTF sqlite DB') parser.add_argument('-B', '--probed', default=None, help='proBed path') parser.add_argument('-s', '--prosam', default=None, help='proSAM path') parser.add_argument('-b', '--probam', default=None, help='proBAM path') parser.add_argument('-l', '--limit', type=int, default=None, help='limit numbers of PSMs for testing') parser.add_argument('-v', '--verbose', action='store_true', help='Verbose') parser.add_argument('-d', '--debug', action='store_true', help='Debug') args = parser.parse_args() def get_sequence(chrom, start, end): if twobit: if chrom in twobit and 0 <= start < end < len(twobit[chrom]): return twobit[chrom][start:end] contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom if contig in twobit and 0 <= start < end < len(twobit[contig]): return twobit[contig][start:end] return '' return None twobit = TwoBitFile(args.twobit) if args.twobit else None samfile = pysam.AlignmentFile(args.reads_bam, "rb") if args.reads_bam else None seqlens = twobit.sequence_sizes() probed = open(args.probed, 'w') if args.probed else sys.stdout gff_cursor = get_connection( args.gffutils_sqlite).cursor() if args.gffutils_sqlite else None map_cursor = get_connection(args.genomic_mapping_sqlite).cursor() mz_cursor = get_connection(args.mzsqlite).cursor() unmapped_accs = set() timings = dict() def add_time(name, elapsed): if name in timings: timings[name] += elapsed else: timings[name] = elapsed XG_TYPES = [ 'N', 'V', 'W', 'J', 'A', 'M', 'C', 'E', 'B', 'O', 'T', 'R', 'I', 'G', 'D', 'U', 'X', '*' ] FT_TYPES = ['CDS', 'five_prime_utr', 'three_prime_utr', 'transcript'] def get_peptide_type(exons): ## XG classify peptide ## N Normal peptide. The peptide sequence is contained in the reference protein sequence. ## V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference. ## W Indel peptide. An insertion or deletion is present as compared to the reference. ## J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference. ## A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference. ## M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference. ## C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic). ## E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension. ## B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference. ## O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference. ## T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation. ## R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference. ## I Intron peptide. 
A peptide that is located in an intronic region of the reference isoform. ## G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion. ## D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy. ## U Unmapped peptide. A peptide that could not be mapped to a reference sequence. ## X Unknown. peptide_type = '*' if gff_cursor: ts = time() etypes = ['*'] * len(exons) efeatures = [None] * len(exons) if args.debug: print('exons:%d\t%s' % (len(exons), etypes), file=sys.stderr) for i, exon in enumerate(exons): (acc, gc, gs, ge, st, cs, ce) = exon fr = cs % 3 if args.debug: print('exon:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (acc, gc, gs, ge, st, cs, ce, fr), file=sys.stderr) ft_params = { "seqid": str(gc).replace('chr', ''), "start": gs, "end": ge, 'strand': st, 'frame': fr, 'ftype': 'CDS' } features = [ f for f in gff_cursor.execute(FEATURE_ANY_QUERY, ft_params) ] efeatures[i] = features for i, exon in enumerate(exons): (acc, gc, gs, ge, st, cs, ce) = exon for f in efeatures[i]: (id, seqid, start, end, featuretype, strand, frame, in_frame) = f if args.debug: print('feat:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (id, seqid, start, end, featuretype, strand, frame, in_frame), file=sys.stderr) if strand == st: if start <= gs and ge <= end: if in_frame: etypes[i] = 'N' break elif XG_TYPES.index('O') < XG_TYPES.index( etypes[i]): etypes[i] = 'O' break else: if XG_TYPES.index('O') < XG_TYPES.index(etypes[i]): etypes[i] = 'O' peptide_type = etypes[i] te = time() add_time('pep_type', te - ts) return peptide_type def classify_exon(exon, exons, features): ## N Normal peptide. The peptide sequence is contained in the reference protein sequence. # 1 exon, contained, in_frame # 2+ exons, contained, in_frame, on_exon_boundary ## V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference. # 1 exon, contained, in_frame, AA_mismatch # 2+ exons, contained, in_frame, on_exon_boundary, AA_mismatch ## W Indel peptide. An insertion or deletion is present as compared to the reference. # 1 exon, contained, in_frame, AA_mismatch # 2+ exons, contained, in_frame, on_exon_boundary or off by 3, AA_mismatch ## J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference. # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons ## A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference. # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons ## M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference. ## C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic). # 1 exon overlaps but not contained ## E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension. ## B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference. # exon overlaps a three_prime_utr ## O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference. # exon contained but not in_frame ## T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation. ## R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference. ## I Intron peptide. A peptide that is located in an intronic region of the reference isoform. 
# exon contained in transcript, not not overlapping any exon ## G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion. # exonis from different seqs, strand, or transcripts ## D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy. ## U Unmapped peptide. A peptide that could not be mapped to a reference sequence. ## X Unknown. return '*' def get_variant_cds(exons, ref_prot, peptide, pep_cds): if ref_prot != peptide and samfile: try: if args.debug: print('name: %s \nref: %s\npep: %s\n' % (scan_name, ref_prot, peptide), file=sys.stderr) ts = time() for exon in exons: (acc, chrom, start, end, strand, c_start, c_end) = exon a_start = c_start / 3 * 3 a_end = c_end / 3 * 3 if ref_prot[a_start:a_end] != peptide[a_start:a_end]: pileup = get_exon_pileup(chrom, start, end) for i, (bi, ai, ao) in enumerate([ (i, i / 3, i % 3) for i in range(c_start, c_end) ]): if ao == 0 or i == 0: if ref_prot[ai] != peptide[ai]: codon = get_pep_codon( pileup, bi - c_start, peptide[ai], ao) if args.debug: print('%d %d %d %s : %s %s %s' % (bi, ai, ao, peptide[ai], str(pep_cds[:bi]), str(codon), str(pep_cds[bi + 3:])), file=sys.stderr) if codon: pep_cds = pep_cds[: bi] + codon + pep_cds[ bi + 3:] te = time() add_time('var_cds', te - ts) except Exception as e: print('name: %s \nref: %s\npep: %s\n%s\n' % (scan_name, ref_prot, peptide, e), file=sys.stderr) return pep_cds def get_mapping(acc, pep_start, pep_end): ts = time() p_start = (pep_start - 1) * 3 p_end = pep_end * 3 map_params = {"acc": acc, "p_start": p_start, "p_end": p_end} if args.debug: print('%s' % map_params, file=sys.stderr) locs = [l for l in map_cursor.execute(MAP_QUERY, map_params)] exons = [] ## ========= pep ## --- continue ## --- trim ## --- copy ## --- trim ## --- break c_end = 0 for i, (acc, chrom, start, end, strand, cds_start, cds_end) in enumerate(locs): if args.debug: print('Prot: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc, chrom, start, end, strand, cds_start, cds_end), file=sys.stderr) c_start = c_end if cds_end < p_start: continue if cds_start >= p_end: break if strand == '+': if cds_start < p_start: start += p_start - cds_start if cds_end > p_end: end -= cds_end - p_end else: if cds_start < p_start: end -= p_start - cds_start if cds_end > p_end: start += cds_end - p_end c_end = c_start + abs(end - start) if args.debug: print('Pep: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc, chrom, start, end, strand, cds_start, cds_end), file=sys.stderr) exons.append([acc, chrom, start, end, strand, c_start, c_end]) te = time() add_time('get_mapping', te - ts) return exons def get_cds(exons): ts = time() seqs = [] for i, (acc, chrom, start, end, strand, cds_start, cds_end) in enumerate(exons): seq = get_sequence(chrom, min(start, end), max(start, end)) if strand == '-': seq = reverse_complement(seq) seqs.append(seq) te = time() add_time('get_cds', te - ts) if args.debug: print('CDS: %s' % str(seqs), file=sys.stderr) return ''.join(seqs) if seqs else '' def genomic_mapping_count(peptide): ts = time() params = {"sequence": peptide} acc_locs = [l for l in mz_cursor.execute(PEPTIDE_ACC_QUERY, params)] te = time() add_time('PEPTIDE_ACC_QUERY', te - ts) if acc_locs: if len(acc_locs) == 1: return 1 locations = set() for i, acc_loc in enumerate(acc_locs): (acc, pep_start, pep_end) = acc_loc if acc in unmapped_accs: continue try: add_time('GENOMIC_POS_QUERY_COUNT', 1) ts = time() p_start = pep_start * 3 p_end = pep_end * 3 params = {"acc": acc, "cds_offset": p_start} (start_chrom, start_pos) = 
map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone()
            params = {"acc": acc, "cds_offset": p_end}
            (end_chrom, end_pos) = map_cursor.execute(
                GENOMIC_POS_QUERY, params).fetchone()
            locations.add('%s:%s-%s:%s' %
                          (start_chrom, start_pos, end_chrom, end_pos))
            te = time()
            add_time('GENOMIC_POS_QUERY', te - ts)
        except:
            unmapped_accs.add(acc)
            if args.debug:
                print('Unmapped: %s' % acc, file=sys.stderr)
    return len(locations)


def spectrum_peptide_count(spectrum_id):
    ts = time()
    params = {"sr_id": spectrum_id}
    pep_count = mz_cursor.execute(
        SPECTRUM_PEPTIDES_QUERY, params).fetchone()[0]
    te = time()
    add_time('SPECTRUM_PEPTIDES_QUERY', te - ts)
    return pep_count


def get_exon_pileup(chrom, chromStart, chromEnd):
    cols = []
    for pileupcolumn in samfile.pileup(chrom, chromStart, chromEnd):
        if chromStart <= pileupcolumn.reference_pos <= chromEnd:
            bases = dict()
            col = {'depth': 0,
                   'cov': pileupcolumn.nsegments,
                   'pos': pileupcolumn.reference_pos,
                   'bases': bases}
            for pileupread in pileupcolumn.pileups:
                if not pileupread.is_del and not pileupread.is_refskip:
                    col['depth'] += 1
                    base = pileupread.alignment.query_sequence[
                        pileupread.query_position]
                    if base not in bases:
                        bases[base] = 1
                    else:
                        bases[base] += 1
            cols.append(col)
    return cols


codon_map = {"TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
             "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
             "TAT": "Y", "TAC": "Y", "TAA": "*", "TAG": "*",
             "TGT": "C", "TGC": "C", "TGA": "*", "TGG": "W",
             "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
             "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
             "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
             "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
             "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
             "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
             "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
             "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
             "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
             "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
             "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
             "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G"}

# aa_codon_map[aa] -> list of codons that code for that amino acid
aa_codon_map = dict()
for c, a in codon_map.items():
    aa_codon_map[a] = [c] if a not in aa_codon_map else aa_codon_map[a] + [c]

# aa_na_map[aa] = {b0: {b1: {b2, ...}}}, a nested per-base lookup of codons
aa_na_map = dict()
for c, a in codon_map.items():
    if a not in aa_na_map:
        aa_na_map[a] = dict()
    d = aa_na_map[a]
    for i in range(3):
        b = c[i]
        if i < 2:
            if b not in d:
                d[b] = dict() if i < 1 else set()
            d = d[b]
        else:
            d.add(b)


def get_pep_codon(pileup, idx, aa, ao):
    try:
        ts = time()
        bases = []
        for i in range(3):
            if i < ao:
                bases.append(list(set([c[i] for c in aa_codon_map[aa]])))
            else:
                # rank the observed bases by count, most frequent first
                bases.append([b for b, cnt in reversed(
                    sorted(pileup[idx + i]['bases'].items(),
                           key=lambda kv: (kv[1], kv[0])))])
        if args.debug:
            print('%s' % bases, file=sys.stderr)
        for b0 in bases[0]:
            if b0 not in aa_na_map[aa]:
                continue
            for b1 in bases[1]:
                if b1 not in aa_na_map[aa][b0]:
                    continue
                for b2 in bases[2]:
                    if b2 in aa_na_map[aa][b0][b1]:
                        return '%s%s%s' % (b0, b1, b2)
        te = time()
        add_time('pep_codon', te - ts)
    except Exception as e:
        print("get_pep_codon: %s %s %s %s" % (aa, ao, idx, pileup),
              file=sys.stderr)
        raise e
    return None


def write_probed(chrom, chromStart, chromEnd, strand,
                 blockCount, blockSizes, blockStarts,
                 spectrum, protacc, peptide, uniqueness, genomeReference,
                 score=1000, psmScore='.', fdr='.', mods='.', charge='.',
                 expMassToCharge='.', calcMassToCharge='.', psmRank='.',
                 datasetID='.', uri='.'):
    probed.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                 (chrom, chromStart, chromEnd, spectrum, score, strand,
                  chromStart, chromEnd, '0', blockCount,
                  ','.join([str(v) for v in blockSizes]),
                  ','.join([str(v) for v in blockStarts]),
                  protacc, peptide, uniqueness, genomeReference, psmScore,
                  fdr, mods, charge, expMassToCharge, calcMassToCharge,
                  psmRank, datasetID, uri))


def get_genomic_location(exons):
    chrom = exons[0][1]
    strand = exons[0][4]
    pos = [exon[2] for exon in exons] + [exon[3] for exon in exons]
    chromStart = min(pos)
    chromEnd = max(pos)
    blockCount = len(exons)
    blockSizes = [abs(exon[3] - exon[2]) for exon in exons]
    blockStarts = [min(exon[2], exon[3]) - chromStart for exon in exons]
    return (chrom, chromStart, chromEnd, strand,
            blockCount, blockSizes, blockStarts)


def get_psm_modifications(peptide_ref):
    mods = []
    ts = time()
    params = {"peptide_ref": peptide_ref}
    pepmods = [m for m in mz_cursor.execute(PEP_MODS_QUERY, params)]
    if pepmods:
        for (location, residue, name, modType, unimod) in pepmods:
            mods.append('%s-%s' % (location, unimod if unimod
                                   else '%s%s' % (name, residue)))
    te = time()
    add_time('PEP_MODS_QUERY', te - ts)
    return ';'.join(mods)


"""
QNAME FLAG RNAME POS CIGAR SEQ
'NH' : 'i', # number of genomic locations to which the peptide sequence maps
'XO' : 'Z', # uniqueness of the peptide mapping
'XL' : 'i', # number of peptides to which the spectrum maps
'XP' : 'Z', # peptide sequence
'YP' : 'Z', # protein accession ID from the original search result
'XF' : 'Z', # reading frame of the peptide (0, 1, 2)
'XI' : 'f', # peptide intensity
'XB' : 'Z', # massdiff; experimental mass; calculated mass. massdiff can be
            # calculated as experimental mass - calculated mass. If any
            # number is unavailable, the value should be left blank
            # (such as 0.01;;).
'XR' : 'Z', # reference peptide sequence
'YB' : 'Z', # preceding amino acids (2 AA, B stands for before)
'YA' : 'Z', # following amino acids (2 AA, A stands for after)
'XS' : 'f', # PSM score
'XQ' : 'f', # PSM FDR (i.e. q-value or 1-PEP)
'XC' : 'i', # peptide charge
'XA' : 'i', # whether the peptide is annotated: 0 yes; 1 partially unknown; 2 totally unknown
'XM' : 'Z', # modifications
'XN' : 'i', # number of missed cleavages in the peptide (XP)
'XT' : 'i', # enzyme specificity
'XE' : 'i', # enzyme used in the experiment
'XG' : 'A', # peptide type
'XU' : 'Z', # URI
"""

psm_cursor = get_connection(args.mzsqlite).cursor()
ts = time()
psms = psm_cursor.execute(PSM_QUERY)
te = time()
add_time('PSM_QUERY', te - ts)
proBAM = ProBAM(species=None, assembly=args.genomeReference,
                seqlens=seqlens, comments=[])
proBED = ProBED(species=None, assembly=args.genomeReference, comments=[])
for i, psm in enumerate(psms):
    probam_dict = PROBAM_DEFAULTS.copy()
    (acc, pep_start, pep_end, aa_pre, aa_post, peptide, spectrum_id,
     spectrum_title, rank, charge, calcmass, exprmass, pepref) = psm
    scan_name = spectrum_title if spectrum_title else spectrum_id
    if args.debug:
        print('\nPSM: %d\t%s' % (i, '\t'.join(
            [str(v) for v in (acc, pep_start, pep_end, peptide, spectrum_id,
                              scan_name, rank, charge, calcmass, exprmass)])),
              file=sys.stderr)
    exons = get_mapping(acc, pep_start, pep_end)
    if args.debug:
        print('%s' % exons, file=sys.stderr)
    if not exons:
        continue
    mods = get_psm_modifications(pepref)
    (chrom, chromStart, chromEnd, strand,
     blockCount, blockSizes, blockStarts) = get_genomic_location(exons)
    ref_cds = get_cds(exons)
    if args.debug:
        print('%s' % ref_cds, file=sys.stderr)
    ref_prot = translate(ref_cds)
    if args.debug:
        print('%s' % ref_prot, file=sys.stderr)
        print('%s' % peptide, file=sys.stderr)
    spectrum_peptides = spectrum_peptide_count(spectrum_id)
    peptide_locations = genomic_mapping_count(peptide)
    if args.debug:
        print('spectrum_peptide_count: %d\tpeptide_location_count: %d' %
              (spectrum_peptides, peptide_locations), file=sys.stderr)
    uniqueness = 'unique' if peptide_locations == 1 else 'not-unique[unknown]'
    ts = time()
    proBEDEntry = ProBEDEntry(chrom, chromStart, chromEnd,
                              '%s_%s' % (acc, scan_name), 1000, strand,
                              blockCount, blockSizes, blockStarts,
                              acc, peptide, uniqueness, args.genomeReference,
                              charge=charge, expMassToCharge=exprmass,
                              calcMassToCharge=calcmass,
                              mods=mods if mods else '.', psmRank=rank)
    proBED.add_entry(proBEDEntry)
    te = time()
    add_time('add_probed', te - ts)
    if len(ref_prot) != len(peptide):
        continue
    ts = time()
    probam_dict['NH'] = peptide_locations
    probam_dict['XO'] = uniqueness
    probam_dict['XL'] = peptide_locations
    probam_dict['XP'] = peptide
    probam_dict['YP'] = acc
    probam_dict['XC'] = charge
    probam_dict['XB'] = '%f;%f;%f' % (exprmass - calcmass, exprmass, calcmass)
    probam_dict['XR'] = ref_prot  # ? dbSequence
    probam_dict['YA'] = aa_post
    probam_dict['YB'] = aa_pre
    probam_dict['XM'] = mods if mods else '*'
    flag = 16 if strand == '-' else 0
    if str(rank) != str(1) and rank != '*' and rank != [] and rank != "":
        flag += 256
    probam_dict['XF'] = ','.join([str(e[2] % 3) for e in exons])
    # check for variation from ref_cds
    pep_cds = get_variant_cds(exons, ref_prot, peptide, ref_cds)
    peptide_type = '*'
    # XG classify peptide
    probam_dict['XG'] = get_peptide_type(exons)
    # probam_dict['MD'] = peptide
    # FIX: the SAM SEQ field is always reported on the forward strand
    seq = pep_cds if strand == '+' else reverse_complement(pep_cds)
    # CIGAR based on the plus strand
    cigar = ''
    if strand == '+':
        blkStarts = blockStarts
        blkSizes = blockSizes
    else:
        blkStarts = [x for x in reversed(blockStarts)]
        blkSizes = [x for x in reversed(blockSizes)]
    for j in range(blockCount):
        if j > 0:
            intron = blkStarts[j] - (blkStarts[j - 1] + blkSizes[j - 1])
            if intron > 0:
                cigar += '%dN' % intron
        cigar += '%dM' % blkSizes[j]
    # Mods TODO
    proBAMEntry = ProBAMEntry(qname=scan_name, flag=flag, rname=chrom,
                              pos=chromStart + 1, cigar=cigar, seq=seq,
                              optional=probam_dict)
    proBAM.add_entry(proBAMEntry)
    te = time()
    add_time('add_probam', te - ts)
    if args.debug:
        print('%s' % probam_dict, file=sys.stderr)
    if args.limit and i >= args.limit:
        break
if args.probed:
    ts = time()
    with open(args.probed, 'w') as fh:
        proBED.write(fh)
    te = time()
    add_time('write_probed', te - ts)
if args.prosam or args.probam:
    samfile = args.prosam if args.prosam else 'temp.sam'
    ts = time()
    with open(samfile, 'w') as fh:
        proBAM.write(fh)
    te = time()
    add_time('write_prosam', te - ts)
    if args.probam:
        ts = time()
        bamfile = args.prosam.replace('.sam', '.bam')
        pysam.view(samfile, '-b', '-o', args.probam, catch_stdout=False)
        te = time()
        add_time('write_probam', te - ts)
        pysam.index(args.probam)
print('\n%s\n' % str(timings), file=sys.stderr)
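# A worked example of the intron-aware CIGAR construction used above, with
# hypothetical exon blocks (none of these coordinates come from real data).
def blocks_to_cigar(block_starts, block_sizes):
    """Build an M/N CIGAR string from plus-strand exon blocks."""
    cigar = ''
    for j in range(len(block_sizes)):
        if j > 0:
            intron = block_starts[j] - (block_starts[j - 1] + block_sizes[j - 1])
            if intron > 0:
                cigar += '%dN' % intron
        cigar += '%dM' % block_sizes[j]
    return cigar

# A peptide whose coding sequence spans two exons split by a 1500 nt intron:
print(blocks_to_cigar([0, 1521], [21, 15]))  # prints 21M1500N15M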
def sequence(self):
    seq = self.fragment.sequence
    if self.direction == 'r':
        seq = str(reverse_complement(Seq(seq)))
    return seq[self.start():self.end()]
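# Hedged sketch of the slicing semantics above: for a reverse-direction
# fragment, start()/end() index into the reverse-complemented strand, so
# they count from the opposite end of the stored sequence.
from Bio.Seq import Seq, reverse_complement

fragment_seq = "ATGCGTAC"  # hypothetical fragment sequence
rc = str(reverse_complement(Seq(fragment_seq)))
print(rc)       # GTACGCAT
print(rc[0:3])  # GTA, i.e. the first three bases of the minus strand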
def filterLongReads(fastq_filename, min_length, max_length, wd, adapter,
                    threads, a):
    """Filter reads by length and, when adapter sequences are given, call
    the alignment and parse the outputs to orient the reads."""
    seq_dict = {}
    first_dict_score = {}
    first_dict_seq = {}
    score_dict = {}
    listA_adapter = []
    final_seq = []
    list_seq_adap = []
    record_dict = {}
    max_score = 0
    if a and not adapter:
        out_filename = wd + fastq_filename + '.longreads.filtered.fasta'
    elif a and adapter:
        out_filename = wd + fastq_filename + '.longreads.filtered.oriented.fasta'
    else:
        out_filename = fastq_filename + '.longreads.filtered.fasta'
    filter_count = 0
    if os.path.isfile(out_filename):
        sys.stdout.write('Filtered reads file already exists: ' +
                         out_filename + ' --- skipping\n')
        return out_filename, 0
    if fastq_filename.endswith('fastq') or fastq_filename.endswith('fq'):
        for record in SeqIO.parse(fastq_filename, "fastq"):
            if int(min_length) < len(str(record.seq)) < int(max_length):
                record.description = ""
                record.name = ""
                record.id = str(filter_count)
                filter_count += 1
                record_dict[record.id] = record
    elif fastq_filename.endswith('fasta') or fastq_filename.endswith('fa'):
        for record in SeqIO.parse(fastq_filename, "fasta"):
            if int(min_length) < len(str(record.seq)) < int(max_length):
                record.description = ""
                record.name = ""
                record.id = str(filter_count)
                filter_count += 1
                record_dict[record.id] = record
    if adapter:
        for adpt in SeqIO.parse(adapter, "fasta"):
            listA_adapter.append(adpt.id)
            list_seq_adap.append(adpt)
    outFile = open(out_filename, 'w')
    filter_count = 0
    if len(listA_adapter) == 1:
        filter_count = 0
        list_command = []
        for key in record_dict:
            for adpter in list_seq_adap:
                list_command.append([record_dict[key], adpter])
        with Pool(processes=int(threads), maxtasksperchild=1000) as p:
            align_resul = p.map(align_call, list_command, chunksize=1)
        for aling_res in align_resul:
            if len(aling_res) == 0:
                continue
            seq_dict[aling_res[1]] = [record_dict[aling_res[1]], aling_res[2]]
            score_dict[aling_res[1]] = aling_res[3]
        numbers = [score_dict[key][0] for key in score_dict]
        value_optimal = float(sum(numbers)) / max(len(numbers), 1)
        # for key in score_dict:
        #     if score_dict[key][0] > max_score:
        #         max_score = score_dict[key][0]
        # value_optimal = max_score - (max_score/20)
        for key in score_dict:
            if score_dict[key][0] > value_optimal and seq_dict[key][1] == 0:
                filter_count += 1
                final_seq.append(seq_dict[key][0])
            elif score_dict[key][0] > value_optimal and seq_dict[key][1] == 1:
                filter_count += 1
                sequenze = reverse_complement(seq_dict[key][0].seq)
                seq_dict[key][0].seq = sequenze
                final_seq.append(seq_dict[key][0])
    elif len(listA_adapter) == 2:
        filter_count = 0
        list_command = []
        for key in record_dict:
            for adpter in list_seq_adap:
                list_command.append([record_dict[key], adpter])
        with Pool(processes=int(threads), maxtasksperchild=1000) as p:
            align_resul = p.map(align_call, list_command, chunksize=1)
        for aling_res in align_resul:
            if len(aling_res) == 0:
                continue
            elif aling_res[1] in first_dict_seq:
                seq_dict[aling_res[1]] = first_dict_seq[aling_res[1]] + \
                    [aling_res[2], aling_res[0]]
                score_dict[aling_res[1]] = first_dict_score[aling_res[1]] + \
                    [aling_res[0], aling_res[3]]
            else:
                first_dict_seq[aling_res[1]] = [record_dict[aling_res[1]],
                                                aling_res[2], aling_res[0]]
                first_dict_score[aling_res[1]] = [aling_res[0], aling_res[3]]
        max_score_first = 0
        max_score_second = 0
        for key in score_dict:
            score = score_dict[key]
            if score[0] == listA_adapter[0] and score[1][0] > max_score_first:
                max_score_first = score[1][0]
            if score[2] == listA_adapter[0] and score[3][0] > max_score_first:
                max_score_first = score[3][0]
            if score[0] == listA_adapter[1] and score[1][0] > max_score_second:
                max_score_second = score[1][0]
            if score[2] == listA_adapter[1] and score[3][0] > max_score_second:
                max_score_second = score[3][0]
        value_optimal_first = max_score_first - (max_score_first / 30)
        value_optimal_second = max_score_second - (max_score_second / 30)
        listReadsOverLimit = []
        for key in score_dict:
            score = score_dict[key]
            if (score[0] == listA_adapter[0] and
                    score[1][0] > value_optimal_first) and \
                    (score[2] == listA_adapter[1] and
                     score[3][0] > value_optimal_second):
                listReadsOverLimit.append(key)
            elif (score[2] == listA_adapter[0] and
                  score[3][0] > value_optimal_first) and \
                    (score[0] == listA_adapter[1] and
                     score[1][0] > value_optimal_second):
                listReadsOverLimit.append(key)
        for key in listReadsOverLimit:
            if seq_dict[key][1] == 1 and seq_dict[key][3] == 0:
                if seq_dict[key][2] == listA_adapter[0] and \
                        seq_dict[key][4] == listA_adapter[1]:
                    final_seq.append(seq_dict[key][0])
                elif seq_dict[key][2] in listA_adapter[1] and \
                        seq_dict[key][4] in listA_adapter[0]:
                    sequenze = reverse_complement(seq_dict[key][0].seq)
                    seq_dict[key][0].seq = sequenze
                    final_seq.append(seq_dict[key][0])
            elif seq_dict[key][1] == 0 and seq_dict[key][3] == 1:
                if seq_dict[key][2] == listA_adapter[0] and \
                        seq_dict[key][4] == listA_adapter[1]:
                    sequenze = reverse_complement(seq_dict[key][0].seq)
                    seq_dict[key][0].seq = sequenze
                    final_seq.append(seq_dict[key][0])
                elif seq_dict[key][2] in listA_adapter[1] and \
                        seq_dict[key][4] in listA_adapter[0]:
                    final_seq.append(seq_dict[key][0])
    elif len(listA_adapter) == 0:
        for key in record_dict:
            if int(min_length) < len(str(record_dict[key].seq)) < int(max_length):
                filter_count += 1
                final_seq.append(record_dict[key])
    SeqIO.write(final_seq, out_filename, "fasta")
    fmtdate = '%H:%M:%S %d-%m'
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write("###FINISHED FILTERING AT:\t" + now +
                     "###\n\n###LOREAN KEPT\t\033[32m" + str(filter_count) +
                     "\033[0m\tREADS AFTER LENGTH FILTERING###\n")
    return out_filename
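# The length test above is a chained comparison, so both bounds apply to
# the read length; a quick sanity check with hypothetical thresholds:
min_length, max_length = 1000, 100000
for length in (500, 5000, 200000):
    print(length, min_length < length < max_length)
# 500 False, 5000 True, 200000 False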
                overlap_counter_m += 1
                if int(v[0]) <= min_coord:
                    min_coord = int(v[0])
            if overlap_counter_m > 0:
                sequence = record.seq.tomutable()
                sequence = sequence[int(coords[1]):int(min_coord)]
                reverse_seq = reverse_complement(sequence)
                if len(sequence) > 100:
                    output.write(">%s=%s=%s=%s=%s=%s=%s\n%s\n" %
                                 (record.id, gene.split("*")[1],
                                  int(coords[1]), int(min_coord),
                                  len(sequence), gene.split("*")[2],
                                  "OVERLAPPED", reverse_seq))
                    print("%s\t%s\t%s\t%s\t%s" %
                          (record.id, gene.split("*")[1], int(coords[1]),
                           int(min_coord), len(sequence)))
                else:
                    n_short_genes += 1
            elif overlap_counter_m == 0:
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser('')
    parser.add_argument('fa', help='name of fasta file',
                        type=argparse.FileType('r'))
    parser.add_argument('seqnames',
                        help='file containing names of sequences to reverse-complement',
                        type=argparse.FileType('r'))
    args = parser.parse_args()
    ids = [line.strip() for line in args.seqnames.readlines()]

    from Bio.SeqIO import parse
    from Bio.Seq import reverse_complement
    import re

    with open(args.fa.name + '.rc', 'w') as fout:
        for fasta in parse(args.fa.name, 'fasta'):
            fout.write('>' + fasta.id + '\n')
            if fasta.id in ids:
                fout.write(re.sub("(.{64})", "\\1\n",
                                  str(reverse_complement(fasta.seq)),
                                  0, re.DOTALL))
            else:
                fout.write(re.sub("(.{64})", "\\1\n", str(fasta.seq),
                                  0, re.DOTALL))
            fout.write('\n')
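# Typical invocation (file names are hypothetical):
#     python revcomp_selected.py reads.fa names.txt
# which writes reads.fa.rc with the listed records reverse-complemented.
# The re.sub() call wraps sequences at 64 columns; textwrap does the same:
import textwrap

seq = "ACGT" * 40  # 160 nt toy sequence
print("\n".join(textwrap.wrap(seq, 64)))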
# The interval [x2 y2] of the read (i.e., the unclipped data, also called
# the 'clear range') aligns with the interval [x1 y1] of the contig.
# If x1 > y1 (the contig positions), then the reverse complement of the
# read is aligned to the contig. For the read positions, x2 is always < y2.
#
# If MIRA is used in mapping mode, the soft trimmed region can contain
# gaps, which must be discarded for getting the CIGAR S operator count.
if x1 > y1:
    current_read.ref_rc = True
    # SAM stores these backwards:
    cigar = make_cigar(padded_con_seq[y1 - 1:x1],
                       reverse_complement(current_read.read_seq[x2 - 1:y2]))
    if x2 > 1:
        clipped = len(current_read.read_seq[:x2 - 1].replace("*", ""))
        cigar += "%iS" % clipped
    if y2 < len(current_read.read_seq):
        clipped = len(current_read.read_seq[y2:].replace("*", ""))
        cigar = "%iS%s" % (clipped, cigar)
else:
    cigar = make_cigar(padded_con_seq[x1 - 1:y1],
                       current_read.read_seq[x2 - 1:y2])
    if x2 > 1:
def worker(conting, start, stop): global parent_pid, dictRefSeq, options pid = os.getpid() tmp_file_name = 'tmp_' + str(parent_pid) + '_' + str(pid) max_depth = options.max_depth if "GL" in conting: max_depth = options.rRNA_max_depth else: max_depth = options.max_depth with pysam.AlignmentFile(options.input, 'rb') as samfile, file(tmp_file_name, 'a') as tmp_file: try: for pileupcolumn in samfile.pileup(conting, start=start, stop=stop, max_depth=max_depth, ignore_orphans=False, ignore_overlaps=False, truncate=True): chr = pileupcolumn.reference_name ref_seq = dictRefSeq.get(pileupcolumn.reference_name) ref_base = ref_seq[pileupcolumn.pos].upper() pos = pileupcolumn.pos + 1 positive_seq = [] positive_C_seg = [] negative_seq = [] negative_C_seg = [] PF_positive_base = {} PF_negative_base = {} PF_positive_qual = {} PF_negative_qual = {} PF_positive_C = {} PF_negative_C = {} for pileupread in pileupcolumn.pileups: if pileupread.query_position is not None and pileupread.alignment.query_qualities[ pileupread.query_position] >= options.qual: not_at_end = True if pileupread.alignment.is_reverse: if options.trim_tail and pileupread.query_position < options.trim_tail: not_at_end = False if not_at_end == True and options.trim_head and pileupread.alignment.query_length - options.trim_head <= pileupread.query_position: not_at_end = False else: if options.trim_tail and pileupread.alignment.query_length - options.trim_tail <= pileupread.query_position: not_at_end = False if not_at_end == True and pileupread.query_position < options.trim_head: not_at_end = False if not_at_end == True: query_name, query_base, query_qual = pileupread.alignment.query_name, pileupread.alignment.query_sequence[ pileupread. query_position], pileupread.alignment.query_qualities[ pileupread.query_position] if pileupread.alignment.get_tag("YG") == "C2T": C_segment = pileupread.alignment.query_sequence.count( 'C') if PF_positive_base.get(query_name) is None: PF_positive_base[query_name] = query_base PF_positive_qual[query_name] = query_qual PF_positive_C[query_name] = C_segment else: lastRead = PF_positive_base.get(query_name) lastCcount = PF_positive_C.get(query_name) if lastRead != query_base: if options.omit: PF_positive_base.pop(query_name) PF_positive_qual.pop(query_name) PF_positive_C.pop(query_name) else: lastQual = PF_positive_qual.get( query_name) if lastQual < query_qual: PF_positive_base[ query_name] = query_base PF_positive_qual[ query_name] = query_qual PF_positive_C[ query_name] = C_segment elif lastRead == query_base: if lastCcount < PF_positive_C.get( query_name): PF_positive_base[ query_name] = query_base PF_positive_qual[ query_name] = query_qual PF_positive_C[ query_name] = C_segment elif pileupread.alignment.get_tag("YG") == "G2A": C_segment = pileupread.alignment.query_sequence.count( 'G') query_base = reverse_complement(query_base) if PF_negative_base.get(query_name) is None: PF_negative_base[query_name] = query_base PF_negative_qual[query_name] = query_qual PF_negative_C[query_name] = C_segment else: lastRead = PF_negative_base.get(query_name) lastCcount = PF_negative_C.get(query_name) if lastRead != query_base: if options.omit: PF_negative_base.pop(query_name) PF_negative_qual.pop(query_name) PF_negative_C.pop(query_name) else: lastQual = PF_negative_qual.get( query_name) if lastQual < query_qual: PF_negative_base[ query_name] = query_base PF_negative_qual[ query_name] = query_qual PF_negative_C[ query_name] = C_segment elif lastRead == query_base: if lastCcount < PF_negative_C.get( query_name): 
PF_negative_base[ query_name] = query_base PF_negative_qual[ query_name] = query_qual PF_negative_C[ query_name] = C_segment list_tmp_positive = PF_positive_base.values() list_tmp_negative = PF_negative_base.values() if len(list_tmp_positive) > 0: positive_seq = PF_positive_base.values() positive_C_seg = [str(i) for i in PF_positive_C.values()] tmp_file.write("\t".join([ chr, str(pos), '+', ref_base, 'Genome', str(pos), ','.join(positive_seq), ','.join(positive_C_seg) ])) tmp_file.write("\n") if len(list_tmp_negative) > 0: negative_seq = PF_negative_base.values() negative_C_seg = [str(i) for i in PF_negative_C.values()] ref_base = reverse_complement(ref_base) tmp_file.write("\t".join([ chr, str(pos), '-', ref_base, 'Genome', str(pos), ','.join(negative_seq), ','.join(negative_C_seg) ])) tmp_file.write("\n") except ValueError: print "Conting [%s] does not exist in @SQ header, pass" % conting
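# Sketch of why single bases from G2A (minus-strand) reads are complemented
# above: for a one-base string, reverse_complement is just the complement,
# which maps the read base back onto the plus-strand reference.
from Bio.Seq import reverse_complement

for base in "ACGT":
    print(base, '->', reverse_complement(base))
# A -> T, C -> G, G -> C, T -> A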
def complement(self, seq):
    # TODO - use Seq methods instead of this hack:
    # reversing the reverse complement yields the plain complement
    return reverse_complement(seq)[::-1]
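# Quick check of the identity the method relies on: reversing the reverse
# complement of a sequence gives the base-wise complement.
from Bio.Seq import reverse_complement

seq = "ATGGCC"
print(reverse_complement(seq))        # GGCCAT
print(reverse_complement(seq)[::-1])  # TACCGG, the plain complement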
def six_frame_translations(seq, genetic_code=1):
    """Return the six frame translations and GC content of a sequence.

    Modified by casesagar (Sagar Saini): unlike the Biopython original,
    which returns one pretty-printed string, this variant returns the
    header plus each component separately.

    >>> (res, f1, f2, f3, seq, comp,
    ...  fc2, fc1, fc3) = six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
    >>> f1
    'MAIVMGR*'
    """
    from Bio.Seq import reverse_complement, translate
    from Bio.SeqUtils import GC  # older Biopython; newer releases offer gc_fraction

    anti = reverse_complement(seq)
    comp = anti[::-1]
    length = len(seq)
    frames = {}
    for i in range(0, 3):
        fragment_length = 3 * ((length - i) // 3)
        frames[i + 1] = translate(seq[i:i + fragment_length], genetic_code)
        # minus-strand frames are reversed so they line up with seq
        frames[-(i + 1)] = translate(anti[i:i + fragment_length],
                                     genetic_code)[::-1]

    # create header
    if length > 20:
        short = "%s ... %s" % (seq[:10], seq[-10:])
    else:
        short = seq
    header = "GC_Frame: "
    for nt in ["a", "t", "g", "c"]:
        header += "%s:%d " % (nt, seq.count(nt.upper()))
    header += "\nSequence: %s, %d nt, %0.2f %%GC\n\n\n" % (
        short.lower(), length, GC(seq))
    res = header

    frame_3 = frames[3]
    frame_2 = frames[2]
    frame_1 = frames[1]
    # minus-strand frames
    framecomp_2 = frames[-2]
    framecomp_1 = frames[-1]
    framecomp_3 = frames[-3]
    return (res, frame_1, frame_2, frame_3, seq, comp,
            framecomp_2, framecomp_1, framecomp_3)
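# Usage sketch for the variant above: the caller assembles the report from
# the returned components (this layout is a simplification, not Biopython's
# original multi-line format).
res, f1, f2, f3, seq, comp, fc2, fc1, fc3 = six_frame_translations(
    "ATGGCCATTGTAATGGGCCGCTGA")
print(res, end="")
print("+1 frame:", f1)  # MAIVMGR*
print("sequence:", seq)
print("complement:", comp)
print("-1 frame:", fc1)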
def format_alignment(self, alignment): """Return a string with a single alignment formatted as one PSL line.""" if not isinstance(alignment, Alignment): raise TypeError("Expected an Alignment object") coordinates = alignment.coordinates if not coordinates.size: # alignment consists of gaps only return "" target, query = alignment.sequences try: qName = query.id except AttributeError: qName = "query" try: query = query.seq except AttributeError: pass try: tName = target.id except AttributeError: tName = "target" try: target = target.seq except AttributeError: pass tSize = len(target) qSize = len(query) # fmt: off dnax = None # set to True for translated DNA aligned to protein, # and to False for DNA/RNA aligned to DNA/RNA # noqa: E114, E116 if coordinates[1, 0] > coordinates[1, -1]: # DNA/RNA mapped to reverse strand of DNA/RNA strand = "-" query = reverse_complement(query, inplace=False) coordinates = coordinates.copy() coordinates[1, :] = qSize - coordinates[1, :] elif coordinates[0, 0] > coordinates[0, -1]: # protein mapped to reverse strand of DNA strand = "-" target = reverse_complement(target, inplace=False) coordinates = coordinates.copy() coordinates[0, :] = tSize - coordinates[0, :] dnax = True else: # mapped to forward strand strand = "+" # fmt: on wildcard = self.wildcard mask = self.mask # variable names follow those in the PSL file format specification matches = 0 misMatches = 0 repMatches = 0 nCount = 0 qNumInsert = 0 qBaseInsert = 0 tNumInsert = 0 tBaseInsert = 0 blockSizes = [] qStarts = [] tStarts = [] tStart, qStart = coordinates[:, 0] for tEnd, qEnd in coordinates[:, 1:].transpose(): if tStart == tEnd: if qStart > 0 and qEnd < qSize: qNumInsert += 1 qBaseInsert += qEnd - qStart qStart = qEnd elif qStart == qEnd: if tStart > 0 and tEnd < tSize: tNumInsert += 1 tBaseInsert += tEnd - tStart tStart = tEnd else: tCount = tEnd - tStart qCount = qEnd - qStart tStarts.append(tStart) qStarts.append(qStart) blockSizes.append(qCount) if tCount == qCount: assert dnax is not True dnax = False else: # translated DNA aligned to protein, typically generated by # blat -t=dnax -q=prot assert tCount == 3 * qCount assert dnax is not False dnax = True tSeq = target[tStart:tEnd] qSeq = query[qStart:qEnd] try: tSeq = bytes(tSeq) except TypeError: # string tSeq = bytes(tSeq, "ASCII") except UndefinedSequenceError: # sequence contents is unknown tSeq = None try: qSeq = bytes(qSeq) except TypeError: # string qSeq = bytes(qSeq, "ASCII") except UndefinedSequenceError: # sequence contents is unknown qSeq = None if tSeq is None or qSeq is None: # contents of at least one sequence is unknown; # count all aligned letters as matches: matches += qCount else: if mask == "lower": for u1, u2, c1 in zip(tSeq.upper(), qSeq.upper(), tSeq): if u1 == wildcard or u2 == wildcard: nCount += 1 elif u1 == u2: if u1 == c1: matches += 1 else: repMatches += 1 else: misMatches += 1 elif mask == "upper": for u1, u2, c1 in zip(tSeq.lower(), qSeq.lower(), tSeq): if u1 == wildcard or u2 == wildcard: nCount += 1 elif u1 == u2: if u1 == c1: matches += 1 else: repMatches += 1 else: misMatches += 1 else: for u1, u2 in zip(tSeq.upper(), qSeq.upper()): if u1 == wildcard or u2 == wildcard: nCount += 1 elif u1 == u2: matches += 1 else: misMatches += 1 tStart = tEnd qStart = qEnd try: matches = alignment.matches except AttributeError: pass try: misMatches = alignment.misMatches except AttributeError: pass try: repMatches = alignment.repMatches except AttributeError: pass try: nCount = alignment.nCount except AttributeError: pass 
tStart = tStarts[0] # start of alignment in target qStart = qStarts[0] # start of alignment in query tEnd = tStarts[-1] + tCount # end of alignment in target qEnd = qStarts[-1] + qCount # end of alignment in query if strand == "-": if dnax is True: tStart, tEnd = tSize - tEnd, tSize - tStart else: qStart, qEnd = qSize - qEnd, qSize - qStart blockCount = len(blockSizes) blockSizes = ",".join(map(str, blockSizes)) + "," qStarts = ",".join(map(str, qStarts)) + "," tStarts = ",".join(map(str, tStarts)) + "," if dnax: strand = "+" + strand words = [ str(matches), str(misMatches), str(repMatches), str(nCount), str(qNumInsert), str(qBaseInsert), str(tNumInsert), str(tBaseInsert), strand, qName, str(qSize), str(qStart), str(qEnd), tName, str(tSize), str(tStart), str(tEnd), str(blockCount), blockSizes, qStarts, tStarts, ] line = "\t".join(words) + "\n" return line
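# Sketch of the PSL minus-strand convention used above: block coordinates
# are stored on the reverse-complemented query, so the alignment span is
# mapped back to the original query with qSize - qEnd / qSize - qStart.
qSize = 100
qStart, qEnd = 70, 95  # span on the reverse-complemented query
qStart, qEnd = qSize - qEnd, qSize - qStart
print(qStart, qEnd)    # 5 30, the span on the original (plus) query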
def build_filter(bloom_filename, linear_refs, circular_refs, kmer, mismatches, inserts, deletions, error_rate=0.01, rc=True): #Using 5e-06 is close to a set for my example, both in run time #(a fraction more) and the number of reads kept (9528 vs 8058 #with sets). simple = set() del_hashes = set() count = 0 t0 = time.time() if linear_refs: for fasta in linear_refs: sys.stderr.write("Hashing linear references in %s\n" % fasta) handle = open(fasta) for upper_seq, raw_read in fasta_iterator(handle): #assert set(upper_seq).issubset("ACGT"), "%s contains %s" \ # % (raw_read.split("\n",1)[0], set(upper_seq).difference("ACGT")) #Note we do the disambiguate call on the fragments rather than #the whole reference to avoid too many levels of recursion. for i in range(0, len(upper_seq) - kmer): for fragment in disambiguate(upper_seq[i:i + kmer]): assert set(fragment).issubset("ACGT"), fragment simple.add(fragment) #bloom.add(fragment, kmer) count += 1 #TODO - Can do this in one go from len(upper_seq) if deletions: for i in range(0, len(upper_seq) - kmer + 1): for fragment in make_deletions(upper_seq[i:i + kmer + 1]): del_hashes.add(fragment) handle.close() if circular_refs: for fasta in circular_refs: sys.stderr.write("Hashing circular references in %s\n" % fasta) handle = open(fasta) for upper_seq, raw_read in fasta_iterator(handle): #assert set(upper_seq).issubset("ACGT"), "%s contains %s" \ # % (raw_read.split("\n",1)[0], set(upper_seq).difference("ACGT")) #Want to consider wrapping round the origin, add k-mer length: upper_seq += upper_seq[:kmer] for i in range(0, len(upper_seq) - kmer): for fragment in disambiguate(upper_seq[i:i + kmer]): assert set(fragment).issubset("ACGT"), fragment simple.add(fragment) #bloom.add(fragment, kmer) count += 1 #TODO - Can do this in one go from len(upper_seq) if deletions: for i in range(0, len(upper_seq) - kmer + 1): for fragment in make_deletions(upper_seq[i:i + kmer + 1]): del_hashes.add(fragment) handle.close() if rc: #Would popping be slow? Should mean less memory at once temp = simple.copy() for fragment in temp: simple.add(reverse_complement(fragment)) del temp if mismatches or inserts or deletions: sys.stderr.write("Have %i unique k-mers before consider fuzzy matches\n" \ % (len(simple))) if deletions: #Do this first to avoid 3 large sets in memory! new = del_hashes del del_hashes new.update(simple) sys.stderr.write("Adding deletions brings this to %i unique k-mers\n" \ % len(new)) else: new = simple.copy() if mismatches: for fragment in simple: for var in make_variants(fragment, mismatches): new.add(var) sys.stderr.write("Adding %i mis-matches per k-mer, have %i unique k-mers\n" \ % (mismatches, len(new))) if inserts: for fragment in simple: for var in make_inserts(fragment): new.add(var) sys.stderr.write("Adding inserts brings this to %i unique k-mers\n" \ % len(new)) simple = new capacity = len(simple) bloom = pydablooms.Dablooms(capacity, error_rate, bloom_filename) for fragment in simple: bloom.add(fragment) bloom.flush() sys.stderr.write( "Set and bloom filter of %i-mers created (%i k-mers considered, %i unique)\n" % (kmer, count, len(simple))) sys.stderr.write( "Using Bloom filter with capacity %i and error rate %r\n" % (capacity, error_rate)) sys.stderr.write("Building filters took %0.1fs\n" % (time.time() - t0)) return simple, bloom
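# The helpers disambiguate(), make_variants(), make_inserts() and
# make_deletions() are defined elsewhere; a plausible make_variants() for
# the mismatch expansion might look like this (an assumption, not the
# original implementation). Duplicate yields are harmless because the
# caller collects results into a set.
def make_variants(fragment, mismatches=1):
    """Yield k-mers within the given number of substitutions of fragment."""
    if mismatches < 1:
        return
    for i in range(len(fragment)):
        for base in "ACGT":
            if base != fragment[i]:
                var = fragment[:i] + base + fragment[i + 1:]
                yield var
                for more in make_variants(var, mismatches - 1):
                    yield more

print(sorted(set(make_variants("AC"))))  # ['AA', 'AG', 'AT', 'CC', 'GC', 'TC']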
print(len(readsDict))
for s in spacersDic:
    for rd in spacerReadsDic[s]:
        seqRead = str(readsDict.get(rd, "None"))
        if seqRead != "None":
            seqSpacer = spacersDic[s]
            seqRead = str(readsDict[rd].seq)
            alignment = pairwise2.align.localms(
                seqRead, seqSpacer, 2, -.1, -3, -2, one_alignment_only=True)
            if alignment[0][2] >= (len(seqSpacer) * 2) - (args.r * 2.1):
                print(alignment[0], len(seqSpacer) * 2)
                print(format_alignment(*alignment[0]))
                readNameDic[s].append(readsDict[rd].id)
                readQualDic[s].append(
                    readsDict[rd].letter_annotations["phred_quality"]
                    [alignment[0][3]:alignment[0][4]])
            # Reverse alignment, read 1:
            alignmentR = pairwise2.align.localms(
                reverse_complement(seqRead), seqSpacer, 2, -.1, -3, -2,
                one_alignment_only=True)
            if alignmentR[0][2] >= (len(seqSpacer) * 2) - (args.r * 2.1):
                print(alignmentR[0], len(seqSpacer) * 2)
                print(format_alignment(*alignmentR[0]))
                readNameDic[s].append(readsDict[rd].id)
                # copy before reversing so the record's own quality list
                # is not mutated in place
                rR = list(readsDict[rd].letter_annotations["phred_quality"])
                rR.reverse()
                readQualDic[s].append(rR[alignmentR[0][3]:alignmentR[0][4]])
readsDict = dict()  # free memory

# Generate out file:
with open(args.out + ".resultQ.file.test.csv", 'wt') as spacers_out_file:
    spacers_out_file.write("id\tNR\tNR100%\tAvQ100%\tstdQ100%\tAveQ100%List\n")
    with open(args.out + ".resultQ.file.test.full_report.txt",
              'wt') as spacers_out_file2:
def _exonic_transcript_effect(self, exon, exon_number, transcript): """Effect of this variant on a Transcript, assuming we already know that this variant overlaps some exon of the transcript. Parameters ---------- exon : pyensembl.Exon Exon which this variant overlaps exon_number : int Index (starting from 1) of the given exon in the transcript's sequence of exons. transcript : pyensembl.Transcript """ genome_ref = self.ref genome_alt = self.alt # clip mutation to only affect the current exon if self.start < exon.start: # if mutation starts before current exon then only look # at nucleotides which overlap the exon assert len(genome_ref) > 0, "Unexpected insertion into intron" n_skip_start = exon.start - self.start genome_ref = genome_ref[n_skip_start:] genome_alt = genome_alt[n_skip_start:] genome_start = exon.start else: genome_start = self.start if self.end > exon.end: # if mutation goes past exon end then only look at nucleotides # which overlap the exon n_skip_end = self.end - exon.end genome_ref = genome_ref[:-n_skip_end] genome_alt = genome_alt[:len(genome_ref)] genome_end = exon.end else: genome_end = self.end transcript_offset = interval_offset_on_transcript( genome_start, genome_end, transcript) if transcript.on_backward_strand: strand_ref = reverse_complement(genome_ref) strand_alt = reverse_complement(genome_alt) else: strand_ref = genome_ref strand_alt = genome_alt expected_ref = str(transcript.sequence[ transcript_offset:transcript_offset + len(strand_ref)]) if strand_ref != expected_ref: raise ValueError( ("Found ref nucleotides '%s' in sequence" " of %s at offset %d (chromosome positions %d:%d)" " but variant %s has '%s'") % ( expected_ref, transcript, transcript_offset, genome_start, genome_end, self, strand_ref)) utr5_length = min(transcript.start_codon_spliced_offsets) # does the variant start inside the 5' UTR? if utr5_length > transcript_offset: # does the variant end after the 5' UTR, within the coding region? if utr5_length < transcript_offset + len(strand_ref): return StartLoss(self, transcript) else: # if variant contained within 5' UTR return FivePrimeUTR(self, transcript) utr3_offset = max(transcript.stop_codon_spliced_offsets) + 1 if transcript_offset >= utr3_offset: return ThreePrimeUTR(self, transcript) exon_start_offset = interval_offset_on_transcript( exon.start, exon.end, transcript) exon_end_offset = exon_start_offset + len(exon) - 1 # Further below we're going to try to predict exonic splice site # modifications, which will take this effect_annotation as their # alternative hypothesis for what happens if splicing doesn't change. # If the mutation doesn't affect an exonic splice site, then # we'll just return this effect. coding_effect_annotation = coding_effect( ref=strand_ref, alt=strand_alt, transcript_offset=transcript_offset, variant=self, transcript=transcript) if changes_exonic_splice_site( transcript=transcript, transcript_ref=strand_ref, transcript_alt=strand_alt, transcript_offset=transcript_offset, exon_start_offset=exon_start_offset, exon_end_offset=exon_end_offset, exon_number=exon_number): return ExonicSpliceSite( variant=self, transcript=transcript, exon=exon, alternate_effect=coding_effect_annotation) return coding_effect_annotation
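# Minimal sketch of the strand normalisation step above: genomic ref/alt
# alleles are reverse-complemented for transcripts on the backward strand
# so they can be compared against the transcript (cDNA) sequence.
from Bio.Seq import reverse_complement  # stand-in for varcode's own helper

genome_ref, genome_alt = "AGT", "A"  # hypothetical deletion on the genome
on_backward_strand = True
if on_backward_strand:
    strand_ref = reverse_complement(genome_ref)
    strand_alt = reverse_complement(genome_alt)
print(strand_ref, strand_alt)  # ACT T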
def antiparallel(self, seq):
    """Return the reverse complementary sequence."""
    return reverse_complement(seq)
def eq(*args, **kwargs):
    '''Compares two or more DNA sequences for equality, i.e. whether they
    represent the same DNA molecule. Comparisons are case insensitive.

    Parameters
    ----------
    args : iterable
        iterable containing sequences;
        args can be strings, Biopython Seq or SeqRecord, Dseqrecord
        or dsDNA objects.
    circular : bool, optional
        Consider all molecules circular or linear
    linear : bool, optional
        Consider all molecules circular or linear

    Returns
    -------
    eq : bool
        Returns True or False

    Notes
    -----
    Compares two or more DNA sequences for equality, i.e. whether they
    represent the same DNA molecule.

    Two linear sequences are considered equal if either:

    * They have the same sequence (case insensitive)
    * One sequence is the reverse complement of the other (case insensitive)

    Two circular sequences are considered equal if they are circular
    permutations, meaning that:

    1. They have the same length, AND
    2. One sequence can be found in the concatenation of the other
       sequence with itself, OR
    3. The reverse complement can be found in the concatenation of the
       other sequence with itself.

    The topology for the comparison can be set by passing the keyword
    linear or circular as True or False.

    If circular or linear is not set, it will be deduced from the topology
    of each sequence for sequences that have a linear or circular attribute
    (like Dseq and Dseqrecord).

    Examples
    --------
    >>> from pydna import eq, Dseqrecord
    >>> eq("aaa","AAA")
    True
    >>> eq("aaa","AAA","TTT")
    True
    >>> eq("aaa","AAA","TTT","tTt")
    True
    >>> eq("aaa","AAA","TTT","tTt", linear=True)
    True
    >>> eq("Taaa","aTaa", linear = True)
    False
    >>> eq("Taaa","aTaa", circular = True)
    True
    >>> a=Dseqrecord("Taaa")
    >>> b=Dseqrecord("aTaa")
    >>> eq(a,b)
    False
    >>> eq(a,b,circular=True)
    True
    >>> a=a.looped()
    >>> b=b.looped()
    >>> eq(a,b)
    True
    >>> eq(a,b,circular=False)
    False
    >>> eq(a,b,linear=True)
    False
    >>> eq(a,b,linear=False)
    True
    >>> eq("ggatcc","GGATCC")
    True
    >>> eq("ggatcca","GGATCCa")
    True
    >>> eq("ggatcca","tGGATCC")
    True
    '''
    from Bio.Seq import reverse_complement
    from Bio.SeqRecord import SeqRecord
    import itertools

    args = list(args)
    for i, arg in enumerate(args):
        if not hasattr(arg, "__iter__") or isinstance(arg, SeqRecord):
            args[i] = (arg,)
    args = list(itertools.chain.from_iterable(args))

    topology = None
    if "linear" in kwargs:
        if kwargs["linear"] == True:
            topology = "linear"
        if kwargs["linear"] == False:
            topology = "circular"
    elif "circular" in kwargs:
        if kwargs["circular"] == True:
            topology = "circular"
        if kwargs["circular"] == False:
            topology = "linear"
    else:
        # topology keyword not set, look for a topology associated with
        # each sequence, otherwise raise an exception
        topology = set([arg.circular if hasattr(arg, "circular") else None
                        for arg in args])
        if len(topology) != 1:
            raise Exception("sequences have different topologies")
        topology = topology.pop()
        if topology in (False, None):
            topology = "linear"
        elif topology == True:
            topology = "circular"

    args = [arg.seq if hasattr(arg, "seq") else arg for arg in args]
    args_string_list = [arg.watson.lower() if hasattr(arg, "watson")
                        else str(arg).lower() for arg in args]

    length = set((len(s) for s in args_string_list))
    if len(length) != 1:
        return False

    same = True
    if topology == "circular":
        # force circular comparison of all given sequences
        for s1, s2 in itertools.combinations(args_string_list, 2):
            if not (s1 in s2 + s2 or reverse_complement(s1) in s2 + s2):
                same = False
    elif topology == "linear":
        # force linear comparison of all given sequences
        for s1, s2 in itertools.combinations(args_string_list, 2):
            if not (s1 == s2 or s1 == reverse_complement(s2)):
                same = False
    return same
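# Sketch of the circular comparison rule in eq(): two circular sequences
# match if one (or its reverse complement) occurs in the other doubled.
from Bio.Seq import reverse_complement

def circular_eq(s1, s2):
    s1, s2 = s1.lower(), s2.lower()
    return (len(s1) == len(s2) and
            (s1 in s2 + s2 or reverse_complement(s1) in s2 + s2))

print(circular_eq("Taaa", "aTaa"))  # True, circular permutations
print(circular_eq("Taaa", "aTTa"))  # False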