示例#1
0
def test(dna,AA,codeline,iAA,fAA,cds,strand):
	"""Check that a codon-swap record is consistent with its CDS.

	``codeline`` is read as comma-separated text: the field after the first
	comma is the codon start site, then the initial codon, then the codon end
	site, and everything after the last comma is the final codon.

	Returns True only when all four checks pass:
	  1. ``AA`` equals ``cds.qualifiers['translation'][0]``;
	  2. ``dna[start-1:end]`` equals the initial codon from ``codeline``;
	  3. the initial codon translates to ``iAA``;
	  4. the final codon translates to ``fAA``.
	When ``strand`` is -1 both codons are reverse-complemented before being
	translated (the comparison against ``dna`` still uses the codon as
	written in ``codeline``).

	Raises ValueError if a site field cannot be converted with ``int``.
	"""
	first_comma = codeline.find(',')
	second_comma = codeline.find(',', first_comma + 1)
	third_comma = codeline.find(',', second_comma + 1)
	last_comma = codeline.rfind(',')

	codon_start = int(codeline[first_comma + 1:second_comma])
	initial_codon = codeline[second_comma + 1:third_comma]
	codon_end = int(codeline[third_comma + 1:last_comma])
	final_codon = codeline[last_comma + 1:]

	if strand == -1:
		coding_initial = reverse_complement(initial_codon)
		coding_final = reverse_complement(final_codon)
	else:
		coding_initial = initial_codon
		coding_final = final_codon

	# Protein sequences must match.
	if AA != cds.qualifiers['translation'][0]:
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print("AA seqs not equal")
		return False

	# The codon being modified must sit where codeline says it does.
	observed = dna[codon_start - 1:codon_end]
	if observed != initial_codon:
		print(observed + '!=' + initial_codon + "  :so codeline doesnt match up")
		return False

	# Starting codon must encode the expected amino acid.
	if translate(coding_initial) != iAA:
		return False

	# Final codon must encode the expected amino acid.
	if translate(coding_final) != fAA:
		return False
	return True
示例#2
0
def flip_record(toks):
    """Return a tab-joined record line with REF/ALT reverse-complemented.

    ``toks`` is a list of string fields; index 3 is reverse-complemented as
    a whole and index 4 is split on commas so that each entry is
    reverse-complemented separately. All other fields are copied unchanged.
    The returned string ends with a newline.
    """
    ref = toks[3]
    alts = toks[4].split(",")
    flipped_ref = reverse_complement(ref)
    flipped_alts = ",".join(reverse_complement(alt) for alt in alts)
    fields = toks[0:3] + [flipped_ref, flipped_alts] + toks[5:]
    return "\t".join(fields) + "\n"
示例#3
0
	def set_mirsseq(self, mirdict, famdict):
		"""Look up this object's sequence and derive its seed.

		``self.mirseq`` is taken from ``mirdict`` under ``self.mirid`` (the
		string "undef" when absent). The seed is the reverse complement of
		positions [1:7] of that sequence. When the id is not in ``mirdict``
		the entry from ``famdict`` is used instead and is also stored as
		``self.longseed``.

		Returns 0 when ``mirdict`` supplied the sequence, 1 when ``famdict``
		was used. Raises KeyError if the id is in neither mapping.
		"""
		self.mirseq = mirdict.get(self.mirid, "undef")
		if self.mirseq == "undef":
			family_entry = famdict[self.mirid]
			self.seed = reverse_complement(family_entry[1:7])
			self.longseed = family_entry
			return 1
		self.seed = reverse_complement(self.mirseq[1:7])
		return 0
示例#4
0
    def _c_to_g(self):
        """Build ``self.g`` from ``self.c`` and the CDS-to-genome mapping.

        The mapping returned by ``self._map_cds_to_genome()`` supplies the
        'seq_region_name', 'start' and 'end' values. When ``self.c.strand``
        is the string '-1', the ref and alt alleles are reverse-complemented
        before being passed to ``G``.
        """
        mapped = self._map_cds_to_genome()

        on_minus = self.c.strand == '-1'
        ref, alt = self.c.ref, self.c.alt
        if on_minus:
            ref, alt = reverse_complement(ref), reverse_complement(alt)

        self.g = G(
            mapped['seq_region_name'],
            mapped['start'],
            mapped['end'],
            ref,
            alt,
            self.edit_type,
        )
示例#5
0
文件: mirna.py 项目: afilipch/nrlbio
	def __init__(self, name, seq, seed_start, seed_stop):
		"""Store the sequence and derive its seed-related attributes.

		name       -- identifier stored as ``self.name``
		seq        -- sequence; must have at least 9 positions because
		              indexes 7 and 8 are read
		seed_start -- start index of the seed slice
		seed_stop  -- stop index (exclusive) of the seed slice

		``match``, ``m27`` and ``m38`` come from ``get_seed_match`` on the
		ranges (seed_start, seed_stop), (1, 7) and (2, 8); ``m8`` and ``m9``
		are the reverse complements of the single characters at indexes 7
		and 8.
		"""
		self.name = name
		self.seq = seq
		self.seed_start, self.seed_stop = seed_start, seed_stop

		self.expression = 0
		self.first = 'A'

		self.seed = seq[seed_start:seed_stop]
		self.match = get_seed_match(seq, seed_start, seed_stop)

		self.m27 = get_seed_match(seq, 1, 7)
		self.m38 = get_seed_match(seq, 2, 8)
		self.m8 = reverse_complement(seq[7])
		self.m9 = reverse_complement(seq[8])
示例#6
0
def getBgCounts_FAST_ACCURATE_BIGMEMORY( seqs, order=3, include_revComp=True ):
    """Count every k-mer of length 1 .. order+1 across ``seqs``.

    seqs            -- either a dict whose values expose a ``.seq`` attribute
                       (converted with ``str``), or any indexable collection
                       of sequence strings (numpy array, list, tuple, ...)
    order           -- highest order to count; k-mers of length ``order + 1``
                       are the longest ones counted
    include_revComp -- when true, k-mers of each sequence's reverse
                       complement are added to the same counts

    Returns a dict mapping each k-mer produced by ``generateAllKmers`` to its
    total count; k-mers that never occur are present with count 0.
    The current order is printed once per pass as a progress indicator.
    """
    from collections import Counter

    is_mapping = isinstance(seqs, dict)
    # List the dict values once; re-listing them for every index was
    # quadratic and does not work on Python 3 dict views.
    records = list(seqs.values()) if is_mapping else seqs

    d = {}
    for cur_order in range(order + 1):  # get counts for 1,2,3,...order
        print(cur_order)
        kmer_len = cur_order + 1
        all_combos = generateAllKmers(kmer_len)
        for kmer in all_combos:
            d[kmer] = 0
        for record in records:
            seq = str(record.seq) if is_mapping else record
            strands = [seq]
            if include_revComp:
                strands.append(reverse_complement(seq))
            counts = Counter()
            for strand_seq in strands:
                counts.update(strand_seq[pos:pos + kmer_len]
                              for pos in range(len(strand_seq) - cur_order))
            # Only k-mers listed by generateAllKmers are accumulated, so
            # windows containing other symbols are ignored.
            for kmer in all_combos:
                d[kmer] += counts[kmer]
    return d
def find_downstream_start(name,
                          transcript,
                          current_start,
                          strand):
    """Locate the next ATG start site via ``find_positive_next_ATG``.

    name          -- sequence name, used only in log messages
    transcript    -- the sequence to search
    current_start -- position of the currently annotated start
    strand        -- "+" or "-"

    For "+" the search runs on ``transcript`` directly (after logging a
    warning if the three bases at ``current_start`` are not "ATG"). For "-"
    the transcript is reverse-complemented, the search runs on the "+"
    strand from ``len(transcript) - current_start``, and the hit is converted
    back with ``len(transcript) - hit``; None is returned when nothing is
    found.

    Raises ValueError for any other ``strand`` value.
    """
    if strand not in ("+", "-"):
        raise ValueError("Bad strand value %r" % strand)

    if strand == "-":
        message = ("Looking for CAT (i.e. ATG rev-comp) before " +
                   " %d in sequence %s" % (current_start, name))
        logger.info(message)
        total = len(transcript)
        hit = find_positive_next_ATG(reverse_complement(transcript),
                                     total - current_start, "+")
        # No start codon found
        return None if hit is None else len(transcript) - hit

    message = ("Looking for ATG after %d in seq: %s" % (current_start,
                                                        name))
    logger.info(message)
    existing_codon = transcript[current_start:current_start+3]
    if existing_codon != "ATG":
        message = ("WARNING - existing annotation for " +
                   " %s does not start ATG" % name)
        logger.info(message)
    return find_positive_next_ATG(transcript, current_start, strand)
示例#8
0
def get_feature_nuc(f, parent_seq) :
    """Extract SeqFeature sequence from parent sequence (as Seq).

    A feature without sub-features is a plain slice of ``parent_seq`` between
    its non-fuzzy start and end. A feature with sub-features must use the
    "join" operator (otherwise ValueError is raised with the operator as its
    argument); the parts are concatenated in order. A strand -1 result is
    reverse-complemented once at the end, using the sequence's own method
    when it has one and the module-level function for plain strings.
    """
    if not f.sub_features :
        f_seq = parent_seq[f.location.nofuzzy_start:f.location.nofuzzy_end]
    else :
        if f.location_operator != "join" :
            raise ValueError(f.location_operator)
        sub_strands = set(child.strand for child in f.sub_features)
        if f.strand == -1 and sub_strands == set([-1]) :
            # All parts are on the reverse strand: take raw slices here and
            # reverse-complement the joined result once below, so that the
            # operation is not applied twice.
            parts = []
            for child in f.sub_features :
                parts.append(parent_seq[child.location.nofuzzy_start:
                                        child.location.nofuzzy_end])
        else :
            # Mixed strands: each part handles its own orientation.
            parts = [get_feature_nuc(child, parent_seq)
                     for child in f.sub_features]
        f_seq = parts[0]
        for piece in parts[1:] :
            f_seq += piece

    if f.strand != -1 :
        return f_seq
    try :
        return f_seq.reverse_complement()
    except AttributeError :
        assert isinstance(f_seq, str)
        return reverse_complement(f_seq)
示例#9
0
	def multitargeting(self):
		"""Summarise which ids and seeds are present in ``self.interactions``.

		Sets the following attributes:
		  mirids -- set of ids, obtained by splitting each interaction's
		            ``mirid`` on commas
		  seeds  -- set of seeds (reverse complement of ``mirseq[1:7]``)
		  s2m    -- defaultdict mapping each seed to a set of
		            ``tuple(inter.mirid)`` values
		  onehit -- exactly one id and the first interaction has
		            ``indreads == 1``
		  onemir -- exactly one id
		  onefam -- see the review note in the body
		  diffam -- more than one distinct seed
		"""
		self.mirids = set()
		self.s2m = defaultdict(set)
		for inter in self.interactions:
			self.mirids.update(inter.mirid.split(","))
			# NOTE(review): tuple() of a string yields a tuple of single
			# characters; tuple(inter.mirid.split(",")) may have been
			# intended - confirm before changing.
			self.s2m[reverse_complement(inter.mirseq[1:7])].add(tuple(inter.mirid))
		# Every interaction contributed its seed as a key above.
		self.seeds = set(self.s2m)

		self.onehit = len(self.mirids) == 1 and self.interactions[0].indreads == 1
		self.onemir = len(self.mirids) == 1

		self.onefam = False
		# .values() instead of the Python 2-only .iteritems(); the key was
		# never used.
		for id_tuples in self.s2m.values():
			# NOTE(review): this intersects the elements of one tuple with a
			# set of tuples; kept exactly as written - verify the intent.
			if len(id_tuples) > 1 and not set(list(id_tuples)[0]).intersection(id_tuples):
				self.onefam = True
				break
		self.diffam = len(self.seeds) > 1
示例#10
0
def writeSTF():
    """Search all six circular reading frames for each feature variation.

    Works entirely on module-level state: it reads ``record``,
    ``featureStatistic_container`` and ``feature``, and publishes its working
    values through the globals declared below so that ``addFeatureSTF`` /
    ``addFeatureComplSTF`` (called with no arguments) can read them.

    The record sequence is trimmed to a multiple of three. Frames two and
    three are built by rotating the leading one or two characters to the end
    before translating. Each translated frame is extended by its own first
    ``len(feature) - 1`` characters so that matches spanning the end/start
    junction are found. ``featureSeq`` is used as a regular-expression
    pattern.
    """
    global difference, seqRecordToCheck, seqRecordToCheckComplement, variation, featureName, featureSeq, seqLength, m

    difference = len(record.seq) % 3
    whole_sequence = str(record.seq)
    if difference != 0:
        seqRecordToCheck = whole_sequence[:-difference]
    else:
        seqRecordToCheck = whole_sequence
    seqRecordToCheckComplement = str(reverse_complement(seqRecordToCheck))

    def three_frames(nucleotides):
        # Frame 1 as-is; frames 2 and 3 rotate the leading bases to the end.
        return [
            translate(nucleotides),
            translate(nucleotides[1::] + nucleotides[0]),
            translate(nucleotides[2::] + nucleotides[0:2]),
        ]

    forward_frames = three_frames(seqRecordToCheck)
    complement_frames = three_frames(seqRecordToCheckComplement)

    for variation in featureStatistic_container[feature]:
        featureName = variation.note
        featureSeq = str(variation.seq)
        featureLength = len(variation.seq)
        seqLength = len(seqRecordToCheck)

        wrap = featureLength - 1
        pattern = featureSeq

        # Build all six match iterators before consuming any of them.
        forward_hits = []
        for frame in forward_frames:
            forward_hits.append(re.finditer(pattern, frame + frame[0:wrap]))
        complement_hits = []
        for frame in complement_frames:
            complement_hits.append(re.finditer(pattern, frame + frame[0:wrap]))

        # The current match is exposed to the handlers via the global ``m``.
        for hits in forward_hits:
            for m in hits:
                addFeatureSTF()

        for hits in complement_hits:
            for m in hits:
                addFeatureComplSTF()
示例#11
0
def align(request):
	"""Align the requested sequences with ``muscle`` and return them as JSON.

	Reads the parallel POST lists 'sequences[]' and 'directions[]'. Each
	named sequence is fetched with ``getdropboxsequence``; a direction of
	"1" uses it as-is, anything else uses its reverse complement. The FASTA
	text is piped to the ``muscle`` executable and its output is parsed as a
	FASTA alignment.

	Returns an HttpResponse whose JSON body maps record id to the aligned
	sequence string. Any exception is printed to stderr and re-raised.
	"""
	names = request.POST.getlist('sequences[]')
	directions = request.POST.getlist('directions[]')
	fasta_chunks = []
	aligned_sequences = {}
	try:
		for name, direction in zip(names, directions):
			seq = getdropboxsequence(request, name)
			header = ">" + name + "\n"
			print("direction"+str(direction))
			if direction == "1":
				print("forward")
				residues = str(seq.seq)
			else:
				print("reverse")
				residues = reverse_complement(str(seq.seq))
			fasta_chunks.append(header + residues + "\n")
		fasta_string = "".join(fasta_chunks)
		muscle = Popen(['muscle'], stdout=PIPE, stdin=PIPE)
		stdout_data, stderr_data = muscle.communicate(input=fasta_string.encode("utf-8"))
		alignment = AlignIO.read(io.StringIO(stdout_data.decode('utf-8')), "fasta")
		aligned_sequences.update((rec.id, str(rec.seq)) for rec in alignment)
	except Exception as e:
		print(str(e),file=sys.stderr)
		raise e
	payload = json.dumps(aligned_sequences)
	return HttpResponse(payload, content_type='application/json')
示例#12
0
 def get_spliced_seq(self, strand=None):
     """Return the exon sequences concatenated into one string.

     Returns None when ``self.seq`` is empty or unset. When ``strand`` is
     given (truthy) and differs from ``self.strand``, the concatenated
     sequence is reverse-complemented before being returned.
     """
     if not self.seq:
         return None
     spliced = ''.join(self.get_exon_seqs())
     if not strand or self.strand == strand:
         return spliced
     return reverse_complement(spliced)
示例#13
0
 def misprime_check(self):
     """Run a mis-priming check and record a warning per reported hit.

     An ``UnaFolder`` is configured from this primer's ``tm()`` and the
     construct settings. The target is the stick's fragment sequence, or
     its reverse complement when ``self.stick.top`` is false. If
     ``mis_prime`` reports a problem, existing warnings of type "mp" are
     deleted and one new ``Warning`` is created for each entry in the
     folder's ``warnings`` list.
     """
     melting_temp = self.tm()
     settings = self.construct.settings
     folder = UnaFolder(
         t=melting_temp,
         safety=settings.ss_safety,
         mg_salt=settings.mg_salt,
         na_salt=settings.na_salt,
     )

     if self.stick.top:
         target = str(self.stick.cfragment.sequence())
     else:
         target = str(reverse_complement(Seq(self.stick.cfragment.sequence())))

     if not folder.mis_prime(target, str(self.seq())):
         return

     self.warning.all().filter(type="mp").delete()
     for hit in folder.warnings:
         # hit[0]: length, hit[1]: distance from the 3' end,
         # hit[2]: position (bp), hit[3]: energy - as used in the text below.
         if hit[1] > 0:
             distance_part = str(hit[1]) + " bp from "
         else:
             distance_part = " of "
         message = (
             "Potentital mis-priming "
             + distance_part
             + "3' end of primer at bp "
             + str(hit[2])
             + ", length "
             + str(hit[0])
             + ", energy "
             + str(hit[3])
         )
         Warning.objects.create(primer=self, type="mp", text=message)
示例#14
0
 def test_mixed_strand_dna_join(self):
     """Feature on DNA (join, mixed strand)

     Joins a +1 part at 5..10 with a -1 part at 12..15 and hands the
     expected sequence and location string to ``self.check``.
     """
     parent = Seq("AAAAACCCCCTTTTTGGGGG", generic_dna)
     forward_part = SeqFeature(FeatureLocation(5,10), strand=+1)
     reverse_part = SeqFeature(FeatureLocation(12,15), strand=-1)
     joined = make_join_feature([forward_part, reverse_part])
     expected_seq = "CCCCC" + reverse_complement("TTT")
     expected_location = "join(6..10,complement(13..15))"
     self.check(parent, joined, expected_seq, expected_location)
示例#15
0
 def test_simple_dna_join_after(self):
     """Feature on DNA (join, strand -1, after position)

     Both parts are on strand -1 and the second part ends at an
     ``AfterPosition``; the expected sequence is the reverse complement of
     the two slices joined together.
     """
     parent = Seq("AAAAACCCCCTTTTTGGGGG", generic_dna)
     first_part = SeqFeature(FeatureLocation(5,10), strand=-1)
     second_part = SeqFeature(FeatureLocation(12,AfterPosition(15)), strand=-1)
     joined = make_join_feature([first_part, second_part])
     expected_seq = reverse_complement("CCCCC"+"TTT")
     expected_location = "complement(join(6..10,13..>15))"
     self.check(parent, joined, expected_seq, expected_location)
示例#16
0
 def __getitem__(self, index):
     """Index like the parent class, with support for reversed slices.

     A slice whose start and stop are both given and whose start is greater
     than its stop is swapped into ascending order, fetched from the parent
     class, and the result's ``seq`` is replaced by its reverse complement.
     Every other index is passed straight to the parent class.
     """
     descending = (
         isinstance(index, slice)
         and index.start is not None
         and index.stop is not None
         and index.start > index.stop
     )
     if not descending:
         return super().__getitem__(index)

     ascending = slice(index.stop, index.start, index.step)
     item = super().__getitem__(ascending)
     item.seq = reverse_complement(item.seq)
     return item
示例#17
0
 def test_mixed_strand_dna_multi_join(self):
     """Feature on DNA (multi-join, mixed strand)

     Three parts: +1 at 5..10, -1 at 12..15, and +1 starting at a
     ``BeforePosition`` and ending at 5.
     """
     parent = Seq("AAAAACCCCCTTTTTGGGGG", generic_dna)
     middle_fwd = SeqFeature(FeatureLocation(5,10), strand=+1)
     reverse_part = SeqFeature(FeatureLocation(12,15), strand=-1)
     leading_fwd = SeqFeature(FeatureLocation(BeforePosition(0),5), strand=+1)
     joined = make_join_feature([middle_fwd, reverse_part, leading_fwd])
     expected_seq = "CCCCC" + reverse_complement("TTT") + "AAAAA"
     expected_location = "join(6..10,complement(13..15),<1..5)"
     self.check(parent, joined, expected_seq, expected_location)
示例#18
0
 def test_mixed_strand_dna_join(self):
     """Extract feature from DNA (join, mixed strand)

     Checks the INSDC location string first, then the extracted sequence
     via ``self.check``.
     """
     parent = Seq("AAAAACCCCCTTTTTGGGGG", generic_dna)
     forward_part = SeqFeature(FeatureLocation(5,10), strand=+1)
     reverse_part = SeqFeature(FeatureLocation(12,15), strand=-1)
     joined = make_join_feature([forward_part, reverse_part])
     location_text = _insdc_feature_location_string(joined)
     self.assertEqual(location_text, "join(6..10,complement(13..15))")
     expected_seq = "CCCCC" + reverse_complement("TTT")
     self.check(parent, joined, expected_seq)
示例#19
0
 def test_simple_dna_join_after(self):
     """Extract feature from DNA (join, strand -1, after position)

     Checks the INSDC location string first, then the extracted sequence
     via ``self.check``.
     """
     parent = Seq("AAAAACCCCCTTTTTGGGGG", generic_dna)
     first_part = SeqFeature(FeatureLocation(5,10), strand=-1)
     second_part = SeqFeature(FeatureLocation(12,AfterPosition(15)), strand=-1)
     joined = make_join_feature([first_part, second_part])
     location_text = _insdc_feature_location_string(joined)
     self.assertEqual(location_text, "complement(join(6..10,13..>15))")
     expected_seq = reverse_complement("CCCCC"+"TTT")
     self.check(parent, joined, expected_seq)
示例#20
0
文件: maf2sam.py 项目: wwood/maf2sam
    def __str__(self):
        """Format this read as one tab-separated, SAM-style line.

        The flag starts at 0x10 for reads with ``ref_rc`` set (sequence is
        reverse-complemented and qualities reversed) and has 0x01, 0x40/0x80,
        0x08 and 0x02 added for paired reads as described in the body. Pad
        characters ("*") are removed from the sequence together with the
        quality values at the same positions. An ``RG:Z:`` tag looked up in
        the module-level ``read_group_ids`` is always appended.

        Side effect: ``self.template_name`` is filled in from
        ``self.read_name`` when it is empty.

        Raises ValueError when the CIGAR (without "D" operations) does not
        add up to the unpadded sequence length.
        """
        if self.ref_rc:
            flag = 0x10  # maps to reverse strand
            read_seq = reverse_complement(self.read_seq)
            read_qual = self.read_qual[::-1]
        else:
            flag = 0
            read_seq, read_qual = self.read_seq, self.read_qual

        mate_ref_name, mate_ref_pos = "*", 0

        if not self.template_name:
            assert self.read_name
            self.template_name = self.read_name

        if self.is_paired():
            flag += 1  # paired
            # 0x40: forward partner, 0x80: reverse partner
            flag += 0x40 if self.first_in_pair else 0x80
            try:
                mate = self.get_partner()
            except KeyError:
                # Paired, but the partner is not available.
                flag += 0x08  # mate unmapped
            else:
                mate_ref_name = mate.contig_name
                mate_ref_pos = mate.ref_pos
                if mate_ref_name == self.contig_name:
                    # Both reads are on the same contig.
                    flag += 0x02  # properly aligned

        assert not self.tags

        read_seq_unpadded = read_seq.replace("*", "")
        kept_quals = [q for (base, q) in zip(read_seq, read_qual) if base != "*"]
        read_qual_unpadded = "".join(kept_quals)

        cigar = self.cigar
        assert "M" not in cigar, cigar
        if "D" not in cigar:
            # With no M and no D entries, the I/S/=/X lengths should sum to
            # the unpadded sequence length.
            # TODO - Improve this check to consider D in CIGAR?
            unified = cigar.replace("I","=").replace("S","=").replace("X","=")
            cigar_total = sum(int(count) for count in unified.split("=") if count)
            if len(read_seq_unpadded) != cigar_total:
                raise ValueError("%s vs %i for %s" % (cigar, len(read_seq_unpadded), read_seq))
        assert len(read_seq_unpadded) == len(read_qual_unpadded)

        columns = (self.template_name, flag, self.contig_name, self.ref_pos,
                   self.map_qual, cigar,
                   mate_ref_name, mate_ref_pos, self.insert_size,
                   read_seq_unpadded, read_qual_unpadded)
        line = "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\t%s" % columns

        assert self.seq_tech
        line += "\tRG:Z:%s" % read_group_ids[(self.seq_tech, self.strain)]
        for tag in self.tags:
            assert not tag.startswith("RG:"), tag
            line += "\t" + tag
        return line
 def get_exon_seqs(self):
   """Return the list of exon sequences sliced out of ``self.seq``.

   Exon ``i`` spans ``blockStarts[i]`` to ``blockStarts[i] + blockSizes[i]``
   for ``i`` below ``blockCount``. On the '-' strand the exons are returned
   in reverse order, each one reverse-complemented.
   """
   exons = [
     self.seq[self.blockStarts[idx]:self.blockStarts[idx] + self.blockSizes[idx]]
     for idx in range(self.blockCount)
   ]
   if self.strand != '-':
     return exons
   # reverse complement
   return [reverse_complement(exon) for exon in reversed(exons)]
示例#22
0
    def extract(self, parent_sequence):
        """Extract feature sequence from the supplied parent sequence.

        The parent_sequence can be a Seq like object or a string, and the
        result is generally of the same type. The exception is a MutableSeq
        parent, which is converted first so that a Seq object is returned.

        Complex locations including complements, joins and fuzzy positions
        are handled, and so are mixed strand features. Features on protein
        sequences (e.g. domains) work too, although reverse strand features
        are not permitted there.

        >>> from Bio.Seq import Seq
        >>> from Bio.Alphabet import generic_protein
        >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
        >>> seq = Seq("MKQHKAMIVALIVICITAVVAAL", generic_protein)
        >>> f = SeqFeature(FeatureLocation(8,15), type="domain")
        >>> f.extract(seq)
        Seq('VALIVIC', ProteinAlphabet())

        Note - currently only sub-features of type "join" are supported;
        any other location operator raises ValueError.
        """
        if isinstance(parent_sequence, MutableSeq):
            # Convert first: the MutableSeq reverse complement acts in situ.
            parent_sequence = parent_sequence.toseq()

        if not self.sub_features:
            f_seq = parent_sequence[self.location.nofuzzy_start:
                                    self.location.nofuzzy_end]
        else:
            if self.location_operator != "join":
                raise ValueError(self.location_operator)
            if self.strand == -1:
                # Special case: take raw slices of every part and
                # reverse-complement the joined result once below, so the
                # operation is not applied twice.
                pieces = []
                for child in self.sub_features:
                    assert child.strand==-1
                    pieces.append(parent_sequence[child.location.nofuzzy_start:
                                                  child.location.nofuzzy_end])
            else:
                # Mixed strands: each part handles its own orientation.
                pieces = [child.extract(parent_sequence)
                          for child in self.sub_features]
            # Addition rather than a join, to avoid alphabet issues.
            f_seq = pieces[0]
            for piece in pieces[1:]:
                f_seq += piece

        if self.strand != -1:
            return f_seq
        # TODO - MutableSeq?
        try:
            return f_seq.reverse_complement()
        except AttributeError:
            assert isinstance(f_seq, str)
            return reverse_complement(f_seq)
示例#23
0
def gff_to_fasta(gff, genome, min_length, Max_length,
                 outfile, upstream, into_TSS):
    """Write the sequence of each GFF feature, plus a window around its start.

    gff        -- path of the GFF file (assumed sorted)
    genome     -- genome file passed to ``index_genome_file``
    min_length -- features must be strictly longer than this to be written
    Max_length -- features must be strictly shorter than this to be written
    outfile    -- FASTA output path for the feature sequences
    upstream   -- number of bases upstream of the feature start to include
    into_TSS   -- number of bases into the feature to include

    Two files are written: ``outfile`` and a second FASTA whose name is
    derived from ``outfile`` (text before the first ".fa") plus the
    ``upstream`` / ``into_TSS`` values. For "-" strand features both
    sequences are reverse-complemented. Window sequences containing "NNNN"
    or shorter than ``upstream`` are not written.

    All four numeric arguments are converted with ``int`` up front, so they
    may be given as strings. Lines whose strand is neither "+" nor "-" are
    skipped.
    """
    # Convert before any formatting or arithmetic uses these values.
    min_length = int(min_length)
    Max_length = int(Max_length)
    upstream = int(upstream)
    into_TSS = int(into_TSS)

    print("Indexing the genome")
    genome_database = index_genome_file(genome)
    print("Now iterating through the GFF. Assume it is sorted")
    bind_out = outfile.split(".fa")[0] + "_%dnt_upstream_%d_into_TSS.fasta" % (upstream,
                                                                               into_TSS)
    # Context managers close every handle even if a line fails to parse.
    with open(outfile, "w") as f_out, \
            open(bind_out, "w") as bind_out_fa, \
            open(gff, "r") as f_handle:
        for line in f_handle:
            line = check_line(line)
            if not line:
                continue
            scaff, source, feature, start, stop, score, \
                direction, frame, gene_info = split_line(line)
            seq_record = genome_database[scaff]
            # NOTE(review): a negative (start - upstream) is used as a
            # from-the-end slice index; behaviour left unchanged - confirm
            # whether clamping to 0 is wanted.
            if direction == "+":
                UTR = seq_record.seq[start:stop]
                bind_seq = seq_record.seq[(start - upstream):(start + into_TSS)]
            elif direction == "-":
                UTR = reverse_complement(seq_record.seq[start:stop])
                bind_seq = reverse_complement(seq_record.seq[(stop - into_TSS)
                                                             :(stop + upstream)])
            else:
                # Unknown strand: skip rather than reuse the previous
                # feature's sequences.
                continue
            if not (min_length < len(UTR) < Max_length):
                continue
            f_out.write(">%s\n%s\n" % (gene_info, UTR))
            if "NNNN" in bind_seq:
                continue  # we dont want NNNs
            if len(bind_seq) >= upstream:
                bind_out_fa.write(">%s_%d_upstream_TSS\n%s\n" % (gene_info,
                                                                 upstream,
                                                                 bind_seq))
示例#24
0
 def extractReadSequences(self, genoGenChr, lInsertsChr, barcode, lenRead):
     """
     Return one [forward read, reverse read] pair per insert.

     Each insert is indexed as (start, end), with start treated as 1-based.
     The forward read is the barcode followed by bases from the insert
     start, so that its total length is lenRead. The reverse read is the
     reverse complement of the last lenRead bases of the insert.

     >>> i = TestGbs()
     >>> genoGenChr = SeqRecord(Seq("AATTTAGGGA"), id="chr1")
     >>> i.extractReadSequences(genoGenChr, [[3,9]], "C", 3)
     [['CTT', 'CCC']]
     """
     read_pairs = []
     for insert in lInsertsChr:
         fwd_begin = insert[0] - 1
         fwd_stop = fwd_begin + lenRead - len(barcode)
         forward_read = str(barcode) + str(genoGenChr.seq[fwd_begin:fwd_stop])
         tail = genoGenChr.seq[(insert[1] - lenRead):insert[1]]
         reverse_read = reverse_complement(str(tail))
         read_pairs.append([forward_read, reverse_read])
     return read_pairs
示例#25
0
 def get_exon_seqs(self):
     """Return the list of exon sequences, or None when there is no sequence.

     Exon ``i`` spans ``blockStarts[i]`` to ``blockStarts[i] + blockSizes[i]``
     of ``self.seq`` for ``i`` below ``blockCount``. On the '-' strand the
     exons are returned in reverse order, each one reverse-complemented.
     """
     if not self.seq:
         return None
     exons = [
         self.seq[self.blockStarts[idx]:self.blockStarts[idx] + self.blockSizes[idx]]
         for idx in range(self.blockCount)
     ]
     if self.strand != '-':
         return exons
     # reverse complement
     return [reverse_complement(exon) for exon in reversed(exons)]
示例#26
0
 def seq(self):
     """Return this object's slice of the fragment sequence.

     The slice bounds come from ``self.start()`` / ``self.end()``; when the
     construct fragment's direction is 'r' they are instead measured back
     from ``self.cfragment.end()`` using ``self.length``. The slice is
     reverse-complemented when exactly one of ``self.top`` and "direction
     is 'r'" holds.
     """
     whole = Seq(self.cfragment.fragment.sequence)
     fragment_reversed = self.cfragment.direction == 'r'
     if fragment_reversed:
         stop = self.cfragment.end() - self.start()
         begin = stop - self.length
     else:
         begin, stop = self.start(), self.end()
     piece = whole[begin:stop]
     if self.top ^ fragment_reversed:
         return reverse_complement(piece)
     return piece
示例#27
0
def six_frame_translations(seq, genetic_code=1):
    """Formatted string showing the 6 frame translations and GC content.

    nice looking 6 frame translation with GC content - code from xbbtools
    similar to DNA Striders six-frame translation

    seq          -- the nucleotide sequence to translate
    genetic_code -- translation table passed through to ``translate``

    The result starts with a header (date, base counts, a shortened copy of
    the sequence, length and %GC) followed by one block per 60 bases showing
    the three forward frames, the sequence, its complement and the three
    reverse frames.

    e.g.
    from Bio.SeqUtils import six_frame_translations
    print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA"))
    """
    from Bio.Seq import reverse_complement, translate

    anti = reverse_complement(seq)
    comp = anti[::-1]
    length = len(seq)
    frames = {}
    for i in range(0, 3):
        frames[i + 1] = translate(seq[i:], genetic_code)
        frames[-(i + 1)] = reverse(translate(anti[i:], genetic_code))

    # create header
    if length > 20:
        short = "%s ... %s" % (seq[:10], seq[-10:])
    else:
        short = seq
    # TODO? Remove the date as this would spoil any unit test...
    date = time.strftime("%y %b %d, %X", time.localtime(time.time()))
    header = "GC_Frame: %s, " % date
    for nt in ["a", "t", "g", "c"]:
        header += "%s:%d " % (nt, seq.count(nt.upper()))

    header += "\nSequence: %s, %d nt, %0.2f %%GC\n\n\n" % (short.lower(), length, GC(seq))

    # Collect the pieces and join once instead of repeated concatenation.
    pieces = [header]
    for i in range(0, length, 60):
        subseq = seq[i : i + 60]
        csubseq = comp[i : i + 60]
        # Floor division keeps the slice index an integer on Python 3.
        p = i // 3
        pieces.append("%d/%d\n" % (i + 1, p + 1))
        # str.join iterates the residues directly; map(None, ...) is
        # Python 2-only.
        pieces.append("  " + "  ".join(frames[3][p : p + 20]) + "\n")
        pieces.append(" " + "  ".join(frames[2][p : p + 20]) + "\n")
        pieces.append("  ".join(frames[1][p : p + 20]) + "\n")
        # seq
        pieces.append(subseq.lower() + "%5d %%\n" % int(GC(subseq)))
        pieces.append(csubseq.lower() + "\n")
        # - frames
        pieces.append("  ".join(frames[-2][p : p + 20]) + " \n")
        pieces.append(" " + "  ".join(frames[-1][p : p + 20]) + "\n")
        pieces.append("  " + "  ".join(frames[-3][p : p + 20]) + "\n\n")
    return "".join(pieces)
示例#28
0
 def extract(self, parent_sequence):
     """Extract feature sequence from the supplied parent sequence.

     The slice between ``nofuzzy_start`` and ``nofuzzy_end`` is returned,
     reverse-complemented when ``self.strand`` is -1. A MutableSeq parent
     is converted with ``toseq()`` first.
     """
     if isinstance(parent_sequence, MutableSeq):
         # Convert first: the MutableSeq reverse complement acts in situ.
         parent_sequence = parent_sequence.toseq()
     piece = parent_sequence[self.nofuzzy_start:self.nofuzzy_end]
     if self.strand != -1:
         return piece
     try:
         return piece.reverse_complement()
     except AttributeError:
         # Plain strings have no such method; use the module function.
         assert isinstance(piece, str)
         return reverse_complement(piece)
示例#29
0
文件: models.py 项目: derjogi/Gibthon
	def misprime_check(self):
		"""Run a mis-priming check and store one warning per reported hit.

		The folder is configured from ``self.tm()`` and the construct
		settings. The target is the stick's fragment sequence (top strand)
		or its reverse complement. When ``mis_prime`` reports a problem the
		existing 'mp' warnings are deleted and replaced by new ones built
		from the folder's ``warnings`` list.
		"""
		cfg = self.construct.settings
		u = UnaFolder(t=self.tm(), safety=cfg.ss_safety, mg_salt=cfg.mg_salt, na_salt=cfg.na_salt)

		if self.stick.top:
			target = str(self.stick.cfragment.sequence())
		else:
			rc_target = reverse_complement(Seq(self.stick.cfragment.sequence()))
			target = str(rc_target)

		if u.mis_prime(target, str(self.seq())):
			self.warning.all().filter(type='mp').delete()
			for entry in u.warnings:
				# entry[1] is the distance from the 3' end; it is only
				# mentioned in the text when positive.
				where = (str(entry[1]) + ' bp from ') if entry[1] > 0 else ' of '
				text_parts = [
					'Potentital mis-priming ',
					where,
					'3\' end of primer at bp ',
					str(entry[2]),
					', length ',
					str(entry[0]),
					', energy ',
					str(entry[3]),
				]
				Warning.objects.create(
					primer = self,
					type = 'mp',
					text = ''.join(text_parts),
				)
示例#30
0
	def __init__(self, arws):
		"""Build a chimera from aligned-read wrappers sharing one query name.

		arws -- sequence of aligned-read wrappers; at least two are needed
		        because indexes 0 and 1 are read. Each must expose ``qname``,
		        ``rname``, ``qstart``, ``qend``, ``AS`` and ``aligned_read``.

		Sets ``arws``, ``control`` (true if any reference name starts with
		"random" before its first underscore), ``gap`` (second read's qstart
		minus first read's qend), ``AS`` (sum of alignment scores), an empty
		``coordinates`` list and ``gap_seq``.

		Raises ChimeraException if the wrappers do not all share a single
		``qname``; the message lists the names that were given.
		"""
		if(len(set([x.qname for x in arws])) != 1):
			# List the offending names. The previous comprehension iterated
			# an undefined name and raised NameError instead of this error.
			raise ChimeraException('Chimera cannot be made from aligned reads with different identifiers\nFollowing are given:\n%s\n' % "\n".join(["\t%s" % x.qname for x in arws]))

		self.arws = arws
		self.control = any([x.rname.split("_")[0] == "random" for x in self.arws])

		self.gap = arws[1].qstart - arws[0].qend
		self.AS = sum([x.AS for x in arws])
		self.coordinates = []

		# NOTE(review): the slice runs from the second read's qstart to the
		# first read's qend, so it is non-empty only when the two overlap on
		# the query - confirm this is the intended "gap" sequence.
		if(arws[0].aligned_read.is_reverse):
			self.gap_seq = reverse_complement(arws[0].aligned_read.query_sequence)[self.arws[1].qstart:self.arws[0].qend]
		else:
			self.gap_seq = arws[0].aligned_read.query_sequence[self.arws[1].qstart:self.arws[0].qend]
示例#31
0
def align_call(elem):
    """
    Align one read against one adapter on both strands with the SSW library.

    :param elem: pair ``(record, adapter)``; both must expose ``.seq`` and
        ``.id``
    :return: ``[adapter_id, read_id, strand, result]`` for the better-scoring
        orientation, where strand is 0 for the read as given and 1 for its
        reverse complement and ``result`` is what ``align_one`` returned
        (its first item is compared as the score). An empty list is returned
        when both orientations score the same.
    """
    record = elem[0]
    adapter = elem[1]
    nMatch = 2
    nMismatch = 1
    nOpen = 1
    nExt = -2
    nFlag = 0

    lEle = ['A', 'C', 'G', 'T', 'N']
    dEle2Int = {}
    for i, ele in enumerate(lEle):
        dEle2Int[ele] = i
        dEle2Int[ele.lower()] = i
    nEleNum = len(lEle)

    # Flattened nEleNum x nEleNum score matrix. The last symbol ('N') is left
    # out of both loops, so its row and column stay at 0.
    lScore = [0] * (nEleNum ** 2)
    for i in range(nEleNum - 1):
        for j in range(nEleNum - 1):
            lScore[i * nEleNum + j] = nMatch if lEle[i] == lEle[j] else -nMismatch
    mat = (len(lScore) * ct.c_int8)()
    mat[:] = lScore

    ssw = ssw_lib.CSsw("./")
    sQSeq = record.seq
    sQId = record.id
    # Floor division keeps the mask length an integer on Python 3 as well.
    nMaskLen = len(sQSeq) // 2 if len(sQSeq) > 30 else 15

    qNum = to_int(sQSeq, lEle, dEle2Int)
    qProfile = ssw.ssw_init(qNum, ct.c_int32(len(sQSeq)), mat, len(lEle), 2)
    sQRcSeq = reverse_complement(sQSeq)
    qRcNum = to_int(sQRcSeq, lEle, dEle2Int)
    qRcProfile = ssw.ssw_init(qRcNum, ct.c_int32(len(sQSeq)), mat, len(lEle), 2)

    outputAlign = []
    try:
        sRSeq = adapter.seq
        sRId = adapter.id
        rNum = to_int(sRSeq, lEle, dEle2Int)
        res = align_one(ssw, qProfile, rNum, len(sRSeq), nOpen, nExt, nFlag, nMaskLen)
        resRc = align_one(ssw, qRcProfile, rNum, len(sRSeq), nOpen, nExt, nFlag, nMaskLen)
        # A tie leaves outputAlign empty.
        if res[0] > resRc[0]:
            outputAlign = [sRId, sQId, 0, res]
        elif res[0] < resRc[0]:
            outputAlign = [sRId, sQId, 1, resRc]
    finally:
        # Release both profiles even when an alignment call fails.
        ssw.init_destroy(qProfile)
        ssw.init_destroy(qRcProfile)
    return outputAlign
示例#32
0
def handle_bulk_manual_swaps(genome_record, input_file, mg1655_genome_record):
    """Apply a tab-delimited list of manual codon swaps to genome_record.

    NOTE: This method is pretty much hard-coded to work with derivatives of
    MG1655.

    Args:
        genome_record: Record whose CDS features are modified (through
            swap_feature_codon_at_position).
        input_file: Path to a tab-delimited file with a header row containing
            the columns 'AGR ID', 'wt Genotype' and 'Destination Genotype'.
        mg1655_genome_record: Record used only as a source of gene synonyms.

    Raises:
        ValueError: If an 'AGR ID' cannot be parsed and is not the known
            prfB special case.
    """
    # After considering all the various options for finding the exact position
    # to change, I've decided to go with parsing the 'AGR ID', which has the
    # name of the gene as well as the codon position. Let's see how it goes.

    # Eventually we'll need a way to get from gene name to feature in the
    # MDS42 genome.  One problem that arises when matching gene names is we
    # have gene name synonyms.  Thus we can use the MG1655 record which
    # has a lot more synonyms recorded.

    # Create a bi-map linking various gene synonyms of the MG1655 genbank.
    # This allows us to handle more flexible cases.
    # TODO: In general, this synonym-finding functionality could be useful
    # elsewhere. Maybe use Ecocyc or regulondb data for this purpose.
    gene_to_synonym_bimap = {}
    for feature in mg1655_genome_record.features:
        if feature.type not in ('CDS', 'gene'):
            continue

        maybe_gene = get_feature_gene(feature)
        if not maybe_gene:
            continue

        if 'gene_synonym' in feature.qualifiers:
            # Build a list containing all synonyms which will serve as the
            # value of the bimap.
            synonym_list = [maybe_gene]

            # Each qualifier entry may pack several ';'-separated synonyms.
            for synonym_phrase in feature.qualifiers['gene_synonym']:
                for synonym in synonym_phrase.split(';'):
                    clean_synonym = synonym.strip()
                    if clean_synonym:
                        synonym_list.append(clean_synonym)

            for synonym in synonym_list:
                gene_to_synonym_bimap[synonym] = synonym_list

    # Create a map from gene name to CDS feature for that gene in genome_record.
    gene_to_feature_map = {}
    for feature in genome_record.features:
        if feature.type != 'CDS':
            continue

        maybe_gene = get_feature_gene(feature)
        if not maybe_gene:
            continue

        # Always add the feature for the gene. It's possible one of the
        # synonyms was added earlier, so we override it.
        gene_to_feature_map[maybe_gene] = feature

        # Get all synonyms and build up the map.
        if 'gene_synonym' in feature.qualifiers:
            synonym_set = set(
                    [maybe_gene] +
                    feature.qualifiers['gene_synonym'] +
                    gene_to_synonym_bimap.get(maybe_gene, []))
            for synonym in synonym_set:
                # Don't override if it's already there. We want actual genes
                # to get precedence over less reliable synonyms.
                if synonym not in gene_to_feature_map:
                    gene_to_feature_map[synonym] = feature

    # And use the bimap to pickup any missing synonym connections.
    for gene, synonym_list in gene_to_synonym_bimap.items():
        if gene in gene_to_feature_map:
            continue
        for synonym in synonym_list:
            if synonym in gene_to_feature_map:
                gene_to_feature_map[gene] = gene_to_feature_map[synonym]
                break

    # Now iterate through the manual fixes and make the changes.
    with open(input_file) as input_fh:
        reader = csv.DictReader(input_fh, delimiter='\t')
        for manual_fix in reader:
            clean_id = manual_fix['AGR ID'].strip()
            if clean_id in KNOWN_PROBLEM_IDS:
                continue

            parsed_id = re.match(
                    r'(?P<gene>[a-zA-Z]+)_[a-zA-Z]+_(?P<mutation_start>[0-9]+)?.*',
                    clean_id)

            # prfB is the only known weird case, so check that anything
            # without a parseable codon position is indeed prfB and skip it
            # for now. Previously this path fell through and crashed when
            # reading the missing 'mutation_start' group.
            if not parsed_id or parsed_id.group('mutation_start') is None:
                fallback = re.match(r'(?P<gene>[a-zA-Z]+)_.*', clean_id)
                if not fallback or fallback.group('gene') != 'prfB':
                    raise ValueError(
                            "Cannot parse codon position from AGR ID %r" %
                            clean_id)
                continue
            gene = parsed_id.group('gene')

            # Make it 0-indexed per the method that does the swap's API.
            mutation_start = int(parsed_id.group('mutation_start')) - 1

            if gene not in gene_to_feature_map:
                print("%s not in map." % gene)
                assert False

            feature = gene_to_feature_map[gene]

            # Parse the old and new sequence data.
            previous_seq = manual_fix['wt Genotype'].upper()
            new_seq = manual_fix['Destination Genotype'].upper()

            # Rows without a destination genotype carry no change to apply.
            if len(new_seq) == 0:
                continue

            assert len(previous_seq) == 3
            assert len(new_seq) == 3

            # Ugh, looks like the data gives changes always in the forward
            # strand.  We'll have to reverse it ourselves in the negative
            # strand case.
            if feature.strand == -1:
                mutation_start = len(feature) - mutation_start - len(previous_seq)
                previous_seq = reverse_complement(previous_seq)
                new_seq = reverse_complement(new_seq)

            swap_feature_codon_at_position(genome_record, feature.id,
                    mutation_start, previous_seq, new_seq)
示例#33
0
    type=str,
    help="name of input sequence file")
parser.add_argument('-o', '--output', dest='output',
    type=str, help="output file name for reverse complemented sequences (optional)")

args = parser.parse_args()
infile = args.seqfile
outseq = args.output

if args.seqfile:
    # File mode: write the reverse complement of every FASTA record.
    if args.output:
        output_name = outseq
    else:
        output_name = 'reverse_comp_'+infile
    # Context managers close both handles even if parsing or writing fails;
    # the input is opened first so a missing input does not leave an empty
    # output file behind.
    with open(infile, 'r') as seqfile, open(output_name, 'w') as outfile:
        for seq in SeqIO.parse(seqfile, 'fasta'):
            rec_id = str(seq.id)
            rec_seq = seq.seq
            rev = str(reverse_complement(rec_seq))
            new = SeqRecord(Seq(rev), id=rec_id, description='')

            SeqIO.write(new, outfile, 'fasta')

else:
    # Interactive mode: no input file given, so prompt for one sequence.
    DNAseq = input("enter sequence to reverse complement: ")
    from Bio.Seq import reverse_complement
    RCseq = reverse_complement(DNAseq)
    print('reverse complement is: ',RCseq)

示例#34
0
def gff_to_fasta(gff_file,
                 fasta_file,
                 protein_coding=False,
                 qc=None,
                 log=None):
    '''Convert a gff file with the appended FASTA to protein/all fasta file

    gff_file = input gff file (annotation lines followed by a "##FASTA"
        section holding the contig sequences)
    fasta_file = output file
    protein_coding = when true, write one translated record per CDS line;
        otherwise write the embedded nucleotide FASTA after quality control
    qc, log = passed through to qc_fasta (nucleotide mode only)

    output: a protein coding FASTA file OR nucleotide FASTA file

    Returns fasta_file in nucleotide mode when qc_fasta accepts the
    sequences; returns None in every other case (QC failure, or protein
    mode, where the result is only written to fasta_file).'''
    out_tmp = ''.join(
        random.choice(string.ascii_lowercase + string.ascii_uppercase +
                      string.digits) for _ in range(7)) + "_fasta.fa"
    contigs = {}
    # Both handles are closed even if a malformed line raises part-way.
    with open(gff_file) as f, open(out_tmp, "w") as out:
        fasta = False
        for line in f:
            if fasta:
                # Everything after "##FASTA" is sequence data: copy verbatim.
                out.write(line)
                continue
            if line.startswith("##FASTA"):
                fasta = True
                continue
            if line.startswith("#"):
                continue
            toks = line.strip().split()
            if not toks:  # blank line between annotation records
                continue
            if toks[2] != "CDS":
                continue

            name = toks[-1].split("|")[-1]
            contigs.setdefault(toks[0], []).append({
                "name": name,
                "start": int(toks[3]) - 1,  # 1-based inclusive -> 0-based
                "stop": int(toks[4]),
                "strand": toks[6]
            })

    # not protein coding, in this case we apply QC measures
    if not protein_coding:
        if qc_fasta(out_tmp, qc, log,
                    name=gff_file):  # if passed quality control
            # rename the temp to the final and return
            os.rename(out_tmp, fasta_file)
            return fasta_file
        os.remove(out_tmp)  # remove the temp file if didnt pass QC
        return None

    # read the contigs and save the final fasta file
    try:
        with open(fasta_file, "w") as out, open(out_tmp) as handle:
            for values in SimpleFastaParser(handle):
                curr_contig = values[0]
                if curr_contig not in contigs:  # no CDSs in this contig
                    continue
                for cds in contigs[curr_contig]:
                    out.write(">" + cds["name"] + "\n")
                    seq = values[1][cds["start"]:cds["stop"]]
                    if cds["strand"] == "-":
                        seq = reverse_complement(seq)
                    out.write(translate(seq) + "\n")
    finally:
        # The temporary FASTA is never needed past this point.
        os.remove(out_tmp)
    return None
示例#35
0
#! /home/a_filipchyk/soft/home/a_filipchyk/anaconda3/bin/python
'''Outputs reverse complement for the provided sequences'''

import pybedtools
import sys
from Bio.Seq import reverse_complement

# Print the reverse complement of each command-line argument, one per line.
for seq in sys.argv[1:]:
    flipped = reverse_complement(seq)
    print(flipped)
示例#36
0
    def __str__(self):
        """Format this read as one tab-separated alignment record.

        Fields, in order: template name, flag, contig name, reference
        position, mapping quality, CIGAR, mate contig, mate position, insert
        size, unpadded sequence, unpadded qualities, then an ``RG:Z:`` tag.
        (NOTE(review): this looks like SAM field order -- confirm.)

        Side effects: fills in ``self.template_name`` from ``self.read_name``
        when it is empty, and exits the process when ``self.read_group`` is
        set but missing from the module-level ``read_group_ids``.

        Raises ValueError when the CIGAR (without deletions) does not add up
        to the unpadded read length.
        """
        global cached_pairs, read_group_ids
        if self.ref_rc:
            bits = 0x10  # read maps to the reverse strand
            bases = reverse_complement(self.read_seq)
            quals = self.read_qual[::-1]
        else:
            bits = 0
            bases = self.read_seq
            quals = self.read_qual
        mate_contig, mate_pos = "*", 0
        if not self.template_name:
            assert self.read_name
            self.template_name = self.read_name
        if self.is_paired():
            bits |= 0x01  # paired
            # forward partner vs reverse partner
            bits |= 0x40 if self.first_in_pair else 0x80
            try:
                partner = self.get_partner()
            except KeyError:
                # Paired, but the partner is absent from the ACE file.
                bits |= 0x08  # mate unmapped
            else:
                mate_contig = partner.contig_name
                mate_pos = partner.ref_pos
                if mate_contig == self.contig_name:
                    # MIRA placed both reads on the same contig.
                    bits |= 0x02  # properly aligned

        assert not self.tags
        # '*' marks padding; drop it from the bases and the matching qualities.
        bases_unpadded = bases.replace("*", "")
        quals_unpadded = "".join(
            qual for base, qual in zip(bases, quals) if base != "*")
        cigar = self.cigar
        if "D" not in cigar:
            # Without deletions, the run lengths of the M/I/S/=/X operations
            # must sum to the unpadded sequence length.
            # TODO - Improve this check to consider D in CIGAR?
            uniform = cigar
            for op in ("I", "S"):
                uniform = uniform.replace(op, "=")
            uniform = uniform.replace("M", "X").replace("X", "=")
            op_total = sum(int(run) for run in uniform.split("=") if run)
            if len(bases_unpadded) != op_total:
                raise ValueError("%s vs %i for %s" %
                                 (cigar, len(bases_unpadded), bases))
        assert len(bases_unpadded) == len(quals_unpadded)
        fields = (self.template_name, bits, self.contig_name, self.ref_pos,
                  self.map_qual, cigar,
                  mate_contig, mate_pos, self.insert_size,
                  bases_unpadded, quals_unpadded)
        line = "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\t%s\t%s" % fields
        if not self.read_group:
            # Old MIRA: derive the read group from technology and strain.
            assert self.seq_tech
            line += "\tRG:Z:%s" % read_group_ids[(self.seq_tech, self.strain)]
        else:
            # MIRA v3.9+ assigns the read group itself.
            if self.read_group not in read_group_ids:
                log("Undeclared read group %r" % self.read_group)
                log(line)
                sys.exit(1)
            line += "\tRG:Z:%s" % self.read_group
        for extra in self.tags:
            assert not extra.startswith("RG:"), extra
            line += "\t" + extra
        return line
示例#37
0
 def antiparallel(self, seq):
     """Return the reverse complement of ``seq``."""
     flipped = reverse_complement(seq)
     return flipped
示例#38
0
# Running totals keyed by dinucleotide string; missing keys start at 0.
s_nucls = defaultdict(int)
e_nucls = defaultdict(int)
#print(starts)
# NOTE(review): `starts` and `ends` are defined earlier in the script;
# presumably they map (sequence name, strand) to per-position values --
# confirm against their construction.
for seqrecord in SeqIO.parse(args.genome, 'fasta'):
    s_plus = starts[(seqrecord.name, '+')];
    #print('bu')
    s_minus = starts[(seqrecord.name, '-')];
    e_plus = ends[(seqrecord.name, '+')];
    e_minus = ends[(seqrecord.name, '-')];
    # Plus strand: pair each value (from index 1 on) with the dinucleotide
    # formed by consecutive bases of the record.
    for s, e, n1, n2 in zip(s_plus[1:], e_plus[1:], seqrecord.seq, seqrecord.seq[1:]):
        nn = n1+n2
        s_nucls[nn] += s;
        e_nucls[nn] += e;
    # Minus strand: the dinucleotide is reverse-complemented and the
    # start/end values are added to the opposite tallies.
    for s, e, n1, n2 in zip(s_minus[1:], e_minus[1:], seqrecord.seq, seqrecord.seq[1:]):
        nn = reverse_complement(n1+n2)
        s_nucls[nn] += e;
        e_nucls[nn] += s;

def rnase_plot(ndict, output, title, normed=False):
    """Draw a bar chart of the 16 dinucleotide values stored in ``ndict``.

    ndict: mapping from dinucleotide string (letters from 'ACTG') to a number.
    output: not used by the code shown here.
    title: text set as the plot title.
    normed: when true, every bar is divided by the sum of all bars.
    """
    dinucleotides = ["".join(pair) for pair in product('ACTG', repeat=2)]
    heights = np.array([ndict[dinucleotide] for dinucleotide in dinucleotides])
    positions = range(16)
    ylabel = 'Counts'
    if normed:
        heights = heights/sum(heights)
        ylabel = 'Fraction'
    fig, ax = plt.subplots(figsize=(16, 9))
    ax.bar(positions, heights, 0.5, color='lightblue')
    plt.title(title)
from Bio.Seq import translate
from Bio.Seq import reverse_complement
dna = "TTCCGGTATTTAGAGGATCGGGGCCAGTAATGCGAAGGTGATTGTGCCTTCGCGCAGTTGAATGCGAAAGCATTGTCACCTTATAGGTTTGCGGATCATCAGCTTGATTTGAATACGCTTTGTCCTGCCCCCCTTGATACGATGAAACAGGATTTACTGCTGACAACTCAGAATGGAAAGCAAGGTATGATGCGCTGACTTGGTTAGAACGACCGGATCATCGAGATGGCGGTGAATTATTCAGACGTAACGTCGGGAAGCTTCACTTCCTACTGGCTACGCGATTTAAAACTCACTGCCGGTGTATGACCATATTATACATCGGATATCTGTATCGCTGTTTGTGCCGCGGTTATACGCCACTTGTTGCTAATGGTTATCTACTATTCTGCACAGATAGAGAGCATTTGATGTGGAAATGGGGAAAGGCGCTTTTCTAGCTAGAAAAGCGCCTTTCCCCATTTCCACATGCGGTATTTCTCAATGGAAACTTTCATGTAATCTGTCCCTTGAACAGAGCCGTTCAGTACAGCCCTACTCAAACGCATTTGCTCTGTTCACACCCTAGTGCAACATAAACTAGGGGATGTGAGTTACTCGCCTGTGAACCGAGGTCCTCTTTCATATCTTGGTATTAAAAACTCCCTTATGGCGTGCATCAATGAACTTTGCACTTACGGAAGCTGACTTGAATCCTCCCAGCCCGCATTGTATTGCGTTAGACACAGGTGTCGAATGCTGCAACCATCTAGCCGCTTAAGTCGTACCACCCTGCCGCGCAGGGTTACATATTTACTTATTGTTCTTACTACGATCGCAAGCTCAATTAGCTTCCCTGCATCCAAAGCAAAGGGCTCGGACGAGCAGCAGCCCATGACCA"
# Translate three offsets of the sequence and of its reverse complement.
rc_dna = reverse_complement(dna)
initial = []
for offset in range(3):
    stop = offset - 3
    initial.append(translate(dna[offset:stop]))
    initial.append(translate(rc_dna[offset:stop]))
longest = 0

# Keep each translation only up to (and including) its last stop codon;
# a translation without any stop codon is kept whole.
trimmed = []
for protein in initial:
    last_stop = protein.rfind('*')
    trimmed.append(protein if last_stop == -1 else protein[:last_stop + 1])
initial = trimmed

# Among all stop-delimited pieces, pick the longest one starting at its
# first 'M'; on ties the earliest candidate wins.
best = ""
for protein in initial:
    for piece in protein.split("*"):
        first_met = piece.find('M')
        if first_met != -1 and len(piece) - first_met > len(best):
            best = piece[first_met:]
print(best)
示例#40
0
def revcomp(dna_string):
    """Give back ``dna_string`` reverse-complemented."""
    flipped = reverse_complement(dna_string)
    return flipped
示例#41
0
			mylottia = ''
			if '|' in item[0]:
				if len(item[0].split('|')) > 6:
					mylottia = item[0].split('|')[3]
				else:
					mylottia = item[0].split('|')[2]
			else:
				mylottia = item[0]

#header includes 1. species name 2. assembled contig ID 3. coverage 4. percent sequence above a certain number of bp 5. lottia target 6. lot start 7. lot end 8. seq start 9. seq end
			out.write(">" + ID + '|' + item[1] + '|' + str(item[2])  + '|' + str(item[4]) + '|' + mylottia + '|' + myblastline[6]  + '|' + myblastline[7] + '|' + myblastline[8] + '|' + myblastline[9] +'\n')
			if int(myblastline[9]) > int(myblastline[8]):
				out.write(item[-2] + '\n')
			else:
				out.write(reverse_complement(item[-2]) + '\n')


			
				#if lengthcounter == 0:
				#	out.write('\t'.join([info[0], info[1], str(float(covcounter)/len(myseq)), '0', str(float(lengthcounter)/len(myseq))]) + '\n'), 
				#else:
				#	out.write('\t'.join([info[0], info[1], str(float(covcounter)/len(myseq)), str(float(hetcounter)/lengthcounter), str(float(lengthcounter)/len(myseq))]) + '\n'), 
					
				


		out.close()
		myfasta.close()
		mycov.close()
#		myhet.close()
示例#42
0
def six_frame_translations(seq, genetic_code=1):
    """Return pretty string showing the 6 frame translations and GC content.

    Nice looking 6 frame translation with GC content - code from xbbtools
    similar to DNA Striders six-frame translation

    >>> from Bio.SeqUtils import six_frame_translations
    >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA"))
    GC_Frame: a:5 t:0 g:8 c:5 
    Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC
    <BLANKLINE>
    <BLANKLINE>
    1/1
      G  H  C  N  G  P  L
     W  P  L  *  W  A  A
    M  A  I  V  M  G  R  *
    auggccauuguaaugggccgcuga   54 %
    uaccgguaacauuacccggcgacu
    A  M  T  I  P  R  Q 
     H  G  N  Y  H  A  A  S
      P  W  Q  L  P  G  S
    <BLANKLINE>
    <BLANKLINE>

    """  # noqa for pep8 W291 trailing whitespace
    from Bio.Seq import reverse_complement, translate
    anti = reverse_complement(seq)
    comp = anti[::-1]
    length = len(seq)

    # Frames 1..3 come from the sequence itself, -1..-3 from its reverse
    # complement (stored reversed so they line up under the forward strand).
    frames = {}
    for offset in range(3):
        stop = offset + 3 * ((length - offset) // 3)
        frames[offset + 1] = translate(seq[offset:stop], genetic_code)
        frames[-(offset + 1)] = translate(anti[offset:stop],
                                          genetic_code)[::-1]

    # create header
    short = '%s ... %s' % (seq[:10], seq[-10:]) if length > 20 else seq
    counts = ''.join('%s:%d ' % (nt, seq.count(nt.upper())) for nt in 'atgc')
    res = 'GC_Frame: ' + counts
    res += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),
                                                        length, GC(seq))

    # One section per 60 nucleotides (20 residues per frame).
    for i in range(0, length, 60):
        p = i // 3
        window = slice(p, p + 20)
        subseq = seq[i:i + 60]
        csubseq = comp[i:i + 60]
        res += '%d/%d\n' % (i + 1, p + 1)
        res += '  ' + '  '.join(frames[3][window]) + '\n'
        res += ' ' + '  '.join(frames[2][window]) + '\n'
        res += '  '.join(frames[1][window]) + '\n'
        # seq
        res += subseq.lower() + '%5d %%\n' % int(GC(subseq))
        res += csubseq.lower() + '\n'
        # - frames
        res += '  '.join(frames[-2][window]) + ' \n'
        res += ' ' + '  '.join(frames[-1][window]) + '\n'
        res += '  ' + '  '.join(frames[-3][window]) + '\n\n'
    return res
示例#43
0
def revcomp(seq):
    """Return the reverse complement of ``seq``."""
    flipped = reverse_complement(seq)
    return flipped
示例#44
0
def get_sequence(feature, raw_sequence, record, locusTags, rRNAnum,
                 trans_table):
    """Extract the sequence and overall span of one feature of ``record``.

    Args:
        feature: Index into ``record.features``.
        raw_sequence: Sliceable sequence of the whole record. For a
            multi-part feature on the minus strand each slice must also
            provide ``.reverse_complement()``.
        record: Object exposing ``features``; the selected feature must
            provide ``qualifiers``, ``location`` and ``strand``.
        locusTags: List of locus tags seen so far; this feature's tag is
            appended to it.
        rRNAnum: Number used to build a ``no_tag_<n>`` placeholder when the
            feature has no ``locus_tag`` qualifier.
        trans_table: Not used by this function.

    Returns:
        Tuple ``(geneSeq, cdsStart, cdsEnd, strand)``. For a multi-part
        ("join") location, ``cdsStart``/``cdsEnd`` are the smallest start and
        largest end over all parts and ``geneSeq`` is the concatenation of
        the parts in the order the location lists them.
    """
    feat = record.features[feature]

    # Get the locus tag, falling back to a numbered placeholder
    locus_tag = feat.qualifiers.get('locus_tag')
    if not locus_tag:
        locus_tag = "no_tag_%s" % rRNAnum
    else:
        locus_tag = locus_tag[0]

    # Check to see whether the locus has already been used
    if locus_tag in locusTags:
        # NOTE(review): `samelocus` is not defined inside this function; it
        # must exist at module level or this line raises NameError before
        # quit() is reached -- confirm.
        locus_tag = "%s_%s" % (locus_tag, samelocus + 1)
        quit()
    else:
        locusTags.append(locus_tag)

    # Get the location of the gene
    # Note, the start site is -1 to the true location
    location = str(feat.location)
    strand = feat.strand

    # If there is a join in the gene (multiple parts)
    if re.search('join', location):
        geneSeq = ''
        part_starts = []
        part_ends = []

        # Locate the splits: the comma-separated parts between the braces
        region = location[location.find("{") + 1:location.find("}")]
        splits = region.replace(', ', ',').split(',')

        for part in splits:
            locations = re.findall(r'\d+', part)
            part_start = int(locations[0])
            part_end = int(locations[1])
            part_starts.append(part_start)
            part_ends.append(part_end)

            seq = raw_sequence[part_start:part_end]
            if strand == -1:
                seq = seq.reverse_complement()
            geneSeq += seq

        # Overall span of the joined feature. min/max replaces the earlier
        # bookkeeping, which compared the end with itself (so it never grew)
        # and treated a genuine start of 0 as "not set yet".
        cdsStart = min(part_starts)
        cdsEnd = max(part_ends)

    # Otherwise
    else:
        # Locate the gene start (note -1 from true start site)
        cdsStart = feat.location.nofuzzy_start

        # Locate the gene end
        cdsEnd = feat.location.nofuzzy_end

        # Get the raw sequence
        seq = raw_sequence[cdsStart:cdsEnd]

        if strand == -1:
            geneSeq = reverse_complement(seq)
        else:
            geneSeq = seq

    return geneSeq, cdsStart, cdsEnd, strand
示例#45
0
 def frame(self, seq, frame, translation_table=1):
     """Translate ``seq`` in reading frame ``frame``.

     A negative ``frame`` selects the reverse complement; the first
     ``abs(frame) - 1`` letters are skipped before translating with
     ``translation_table``.
     """
     template = reverse_complement(seq) if frame < 0 else seq
     skip = abs(frame) - 1
     return translate(template[skip:], table=translation_table)
def __main__():
    parser = argparse.ArgumentParser(
        description='Generate proBED and proBAM from mz.sqlite')
    parser.add_argument('mzsqlite', help="mz.sqlite converted from mzIdentML")
    parser.add_argument(
        'genomic_mapping_sqlite',
        help="genomic_mapping.sqlite with feature_cds_map table")
    parser.add_argument('-R',
                        '--genomeReference',
                        default='Unknown',
                        help='Genome reference sequence in 2bit format')
    parser.add_argument('-t',
                        '--twobit',
                        default=None,
                        help='Genome reference sequence in 2bit format')
    parser.add_argument('-r',
                        '--reads_bam',
                        default=None,
                        help='reads alignment bam path')
    parser.add_argument('-g',
                        '--gffutils_sqlite',
                        default=None,
                        help='gffutils GTF sqlite DB')
    parser.add_argument('-B', '--probed', default=None, help='proBed path')
    parser.add_argument('-s', '--prosam', default=None, help='proSAM path')
    parser.add_argument('-b', '--probam', default=None, help='proBAM path')
    parser.add_argument('-l',
                        '--limit',
                        type=int,
                        default=None,
                        help='limit numbers of PSMs for testing')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
    parser.add_argument('-d', '--debug', action='store_true', help='Debug')
    args = parser.parse_args()

    def get_sequence(chrom, start, end):
        if twobit:
            if chrom in twobit and 0 <= start < end < len(twobit[chrom]):
                return twobit[chrom][start:end]
            contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
            if contig in twobit and 0 <= start < end < len(twobit[contig]):
                return twobit[contig][start:end]
            return ''
        return None

    twobit = TwoBitFile(args.twobit) if args.twobit else None
    samfile = pysam.AlignmentFile(args.reads_bam,
                                  "rb") if args.reads_bam else None
    seqlens = twobit.sequence_sizes()

    probed = open(args.probed, 'w') if args.probed else sys.stdout

    gff_cursor = get_connection(
        args.gffutils_sqlite).cursor() if args.gffutils_sqlite else None
    map_cursor = get_connection(args.genomic_mapping_sqlite).cursor()
    mz_cursor = get_connection(args.mzsqlite).cursor()

    unmapped_accs = set()
    timings = dict()

    def add_time(name, elapsed):
        if name in timings:
            timings[name] += elapsed
        else:
            timings[name] = elapsed

    XG_TYPES = [
        'N', 'V', 'W', 'J', 'A', 'M', 'C', 'E', 'B', 'O', 'T', 'R', 'I', 'G',
        'D', 'U', 'X', '*'
    ]
    FT_TYPES = ['CDS', 'five_prime_utr', 'three_prime_utr', 'transcript']

    def get_peptide_type(exons):
        ## XG classify peptide
        ##     N  Normal peptide. The peptide sequence is contained in the reference protein sequence.
        ##     V  Variant peptide. A single amino acid variation (SAV) is present as compared to the reference.
        ##     W  Indel peptide. An insertion or deletion is present as compared to the reference.
        ##     J  Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference.
        ##     A  Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference.
        ##     M  Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference.
        ##     C  Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic).
        ##     E  Extension peptide. A peptide that points to a non-canonical N-terminal protein extension.
        ##     B  3' UTR peptide. A peptide that maps to the 3' UTR region from the reference.
        ##     O  Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference.
        ##     T  Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation.
        ##     R  Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference.
        ##     I  Intron peptide. A peptide that is located in an intronic region of the reference isoform.
        ##     G  Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion.
        ##     D  Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy.
        ##     U  Unmapped peptide. A peptide that could not be mapped to a reference sequence.
        ##     X  Unknown.

        peptide_type = '*'
        if gff_cursor:
            ts = time()
            etypes = ['*'] * len(exons)
            efeatures = [None] * len(exons)
            if args.debug:
                print('exons:%d\t%s' % (len(exons), etypes), file=sys.stderr)
            for i, exon in enumerate(exons):
                (acc, gc, gs, ge, st, cs, ce) = exon
                fr = cs % 3
                if args.debug:
                    print('exon:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' %
                          (acc, gc, gs, ge, st, cs, ce, fr),
                          file=sys.stderr)
                ft_params = {
                    "seqid": str(gc).replace('chr', ''),
                    "start": gs,
                    "end": ge,
                    'strand': st,
                    'frame': fr,
                    'ftype': 'CDS'
                }
                features = [
                    f for f in gff_cursor.execute(FEATURE_ANY_QUERY, ft_params)
                ]
                efeatures[i] = features
            for i, exon in enumerate(exons):
                (acc, gc, gs, ge, st, cs, ce) = exon
                for f in efeatures[i]:
                    (id, seqid, start, end, featuretype, strand, frame,
                     in_frame) = f
                    if args.debug:
                        print('feat:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' %
                              (id, seqid, start, end, featuretype, strand,
                               frame, in_frame),
                              file=sys.stderr)
                    if strand == st:
                        if start <= gs and ge <= end:
                            if in_frame:
                                etypes[i] = 'N'
                                break
                            elif XG_TYPES.index('O') < XG_TYPES.index(
                                    etypes[i]):
                                etypes[i] = 'O'
                        break
                    else:
                        if XG_TYPES.index('O') < XG_TYPES.index(etypes[i]):
                            etypes[i] = 'O'
                peptide_type = etypes[i]
            te = time()
            add_time('pep_type', te - ts)
        return peptide_type

    def classify_exon(exon, exons, features):
        ##     N  Normal peptide. The peptide sequence is contained in the reference protein sequence.
        # 1 exon, contained, in_frame
        # 2+ exons, contained, in_frame, on_exon_boundary
        ##     V  Variant peptide. A single amino acid variation (SAV) is present as compared to the reference.
        # 1 exon, contained, in_frame, AA_mismatch
        # 2+ exons, contained, in_frame, on_exon_boundary, AA_mismatch
        ##     W  Indel peptide. An insertion or deletion is present as compared to the reference.
        # 1 exon, contained, in_frame, AA_mismatch
        # 2+ exons, contained, in_frame, on_exon_boundary or off by 3, AA_mismatch
        ##     J  Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference.
        # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons
        ##     A  Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference.
        # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons
        ##     M  Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference.
        ##     C  Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic).
        # 1 exon overlaps but not contained
        ##     E  Extension peptide. A peptide that points to a non-canonical N-terminal protein extension.
        ##     B  3' UTR peptide. A peptide that maps to the 3' UTR region from the reference.
        # exon overlaps a three_prime_utr
        ##     O  Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference.
        # exon contained but not in_frame
        ##     T  Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation.
        ##     R  Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference.
        ##     I  Intron peptide. A peptide that is located in an intronic region of the reference isoform.
        # exon contained in transcript, not not overlapping any exon
        ##     G  Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion.
        # exonis from different seqs, strand, or transcripts
        ##     D  Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy.
        ##     U  Unmapped peptide. A peptide that could not be mapped to a reference sequence.
        ##     X  Unknown.
        return '*'

    def get_variant_cds(exons, ref_prot, peptide, pep_cds):
        """Return pep_cds with codons patched so it encodes the observed peptide.

        Nothing is done unless the translated reference (ref_prot) differs
        from the identified peptide and an alignment file (samfile) is set.
        Otherwise, for every exon whose residues differ, the read pileup over
        the exon is fetched and, at each differing residue, get_pep_codon is
        asked for a read-supported codon encoding the peptide residue; when
        one is found it replaces three bases of pep_cds at that offset.

        exons    -- entries of (acc, chrom, start, end, strand, c_start,
                    c_end); c_start/c_end are base offsets into pep_cds
        ref_prot -- translation of the reference CDS
        peptide  -- identified peptide sequence
        pep_cds  -- coding sequence to patch (the patched copy is returned)

        Any exception is reported on stderr and the CDS as patched so far is
        returned; the timing entry is skipped in that case.
        """
        if ref_prot != peptide and samfile:
            try:
                if args.debug:
                    print('name: %s \nref: %s\npep: %s\n' %
                          (scan_name, ref_prot, peptide),
                          file=sys.stderr)
                ts = time()
                for exon in exons:
                    (acc, chrom, start, end, strand, c_start, c_end) = exon
                    # Floor division keeps these integers on Python 3 as well
                    # as Python 2 (plain '/' would produce floats and break
                    # the slices below).
                    # NOTE(review): these are base offsets rounded down to a
                    # codon boundary, yet they slice the protein strings;
                    # residue offsets (c_start // 3) may have been intended -
                    # confirm before changing.
                    a_start = c_start // 3 * 3
                    a_end = c_end // 3 * 3
                    if ref_prot[a_start:a_end] != peptide[a_start:a_end]:
                        pileup = get_exon_pileup(chrom, start, end)
                        for i, bi in enumerate(range(c_start, c_end)):
                            # ai: residue index, ao: offset within the codon
                            ai, ao = divmod(bi, 3)
                            # Only look at codon starts, plus the first base
                            # of the exon (which may fall mid-codon).
                            if ao == 0 or i == 0:
                                if ref_prot[ai] != peptide[ai]:
                                    codon = get_pep_codon(
                                        pileup, bi - c_start, peptide[ai], ao)
                                    if args.debug:
                                        print('%d %d %d   %s :  %s %s %s' %
                                              (bi, ai, ao, peptide[ai],
                                               str(pep_cds[:bi]), str(codon),
                                               str(pep_cds[bi + 3:])),
                                              file=sys.stderr)
                                    if codon:
                                        pep_cds = (pep_cds[:bi] + codon +
                                                   pep_cds[bi + 3:])
                te = time()
                add_time('var_cds', te - ts)
            except Exception as e:
                print('name: %s \nref: %s\npep: %s\n%s\n' %
                      (scan_name, ref_prot, peptide, e),
                      file=sys.stderr)
        return pep_cds

    def get_mapping(acc, pep_start, pep_end):
        """Return the genomic exon pieces covered by a peptide of protein acc.

        pep_start/pep_end are converted to CDS offsets p_start =
        (pep_start - 1) * 3 and p_end = pep_end * 3, MAP_QUERY is run with
        them, and every returned location is trimmed to the peptide range.

        Each result entry is
        [acc, chrom, start, end, strand, c_start, c_end], where c_start/c_end
        are running offsets accumulated from the trimmed exon lengths.
        """
        began = time()
        p_start = (pep_start - 1) * 3
        p_end = pep_end * 3
        map_params = {"acc": acc, "p_start": p_start, "p_end": p_end}
        if args.debug:
            print('%s' % map_params, file=sys.stderr)
        locs = list(map_cursor.execute(MAP_QUERY, map_params))
        exons = []
        # How each location relates to the peptide range:
        ##       =========	pep
        ##  ---			continue (ends before the peptide)
        ##      ---		trim
        ##          ---		copy
        ##              ---	trim
        ##                 ---  break (starts after the peptide)
        c_end = 0
        for loc in locs:
            (acc, chrom, start, end, strand, cds_start, cds_end) = loc
            if args.debug:
                print('Prot: %s\t%s:%d-%d\t%s\t%d\t%d' %
                      (acc, chrom, start, end, strand, cds_start, cds_end),
                      file=sys.stderr)
            c_start = c_end
            if cds_end < p_start:
                continue
            if cds_start >= p_end:
                break
            # Number of bases to drop at the CDS-leading / CDS-trailing side.
            lead_trim = p_start - cds_start if cds_start < p_start else 0
            tail_trim = cds_end - p_end if cds_end > p_end else 0
            if strand == '+':
                start += lead_trim
                end -= tail_trim
            else:
                # On the other strand the CDS-leading side is the genomic end.
                end -= lead_trim
                start += tail_trim
            c_end = c_start + abs(end - start)
            if args.debug:
                print('Pep:  %s\t%s:%d-%d\t%s\t%d\t%d' %
                      (acc, chrom, start, end, strand, cds_start, cds_end),
                      file=sys.stderr)
            exons.append([acc, chrom, start, end, strand, c_start, c_end])
        add_time('get_mapping', time() - began)
        return exons

    def get_cds(exons):
        """Return the concatenated coding sequence of the given exon pieces.

        Each exon's sequence is fetched with get_sequence over
        [min(start, end), max(start, end)] and reverse-complemented when the
        strand is '-'. An empty exon list yields ''.
        """
        began = time()
        pieces = []
        for exon in exons:
            (acc, chrom, start, end, strand, cds_start, cds_end) = exon
            low = min(start, end)
            high = max(start, end)
            piece = get_sequence(chrom, low, high)
            pieces.append(reverse_complement(piece) if strand == '-' else piece)
        add_time('get_cds', time() - began)
        if args.debug:
            print('CDS:  %s' % str(pieces), file=sys.stderr)
        return ''.join(pieces)

    def genomic_mapping_count(peptide):
        """Return the number of distinct genomic locations of a peptide.

        PEPTIDE_ACC_QUERY lists the (acc, pep_start, pep_end) hits of the
        peptide sequence. Returns -1 when there are no hits and 1 when there
        is exactly one. Otherwise both ends of every hit are resolved with
        GENOMIC_POS_QUERY and the count of distinct
        'chrom:pos-chrom:pos' strings is returned.

        An accession whose position lookup fails is added to unmapped_accs and
        skipped here and in later calls.
        """
        ts = time()
        params = {"sequence": peptide}
        acc_locs = list(mz_cursor.execute(PEPTIDE_ACC_QUERY, params))
        te = time()
        add_time('PEPTIDE_ACC_QUERY', te - ts)
        if not acc_locs:
            return -1
        if len(acc_locs) == 1:
            return 1
        locations = set()
        for (acc, pep_start, pep_end) in acc_locs:
            if acc in unmapped_accs:
                continue
            try:
                add_time('GENOMIC_POS_QUERY_COUNT', 1)
                ts = time()
                p_start = pep_start * 3
                p_end = pep_end * 3
                params = {"acc": acc, "cds_offset": p_start}
                # fetchone() returns None when nothing matches; the unpacking
                # then raises and the accession is recorded as unmapped.
                (start_chrom,
                 start_pos) = map_cursor.execute(GENOMIC_POS_QUERY,
                                                 params).fetchone()
                params = {"acc": acc, "cds_offset": p_end}
                (end_chrom,
                 end_pos) = map_cursor.execute(GENOMIC_POS_QUERY,
                                               params).fetchone()
                locations.add('%s:%s-%s:%s' %
                              (start_chrom, start_pos, end_chrom, end_pos))
                te = time()
                add_time('GENOMIC_POS_QUERY', te - ts)
            except Exception:
                # Deliberately best-effort, but no longer a bare except:
                # KeyboardInterrupt and SystemExit must still propagate.
                unmapped_accs.add(acc)
                if args.debug:
                    print('Unmapped: %s' % acc, file=sys.stderr)
        return len(locations)

    def spectrum_peptide_count(spectrum_id):
        """Return the first column of SPECTRUM_PEPTIDES_QUERY for a spectrum."""
        began = time()
        row = mz_cursor.execute(SPECTRUM_PEPTIDES_QUERY,
                                {"sr_id": spectrum_id}).fetchone()
        pep_count = row[0]
        add_time('SPECTRUM_PEPTIDES_QUERY', time() - began)
        return pep_count

    def get_exon_pileup(chrom, chromStart, chromEnd):
        """Summarise the read pileup of samfile over chromStart..chromEnd.

        Returns one dict per pileup column whose reference position lies in
        [chromStart, chromEnd]:
          'depth' -- reads counted at the column (deletions and reference
                     skips excluded)
          'cov'   -- the column's nsegments value
          'pos'   -- the column's reference position
          'bases' -- {base: number of counted reads showing that base}
        """
        columns = []
        for column in samfile.pileup(chrom, chromStart, chromEnd):
            if not (chromStart <= column.reference_pos <= chromEnd):
                continue
            base_counts = dict()
            summary = {
                'depth': 0,
                'cov': column.nsegments,
                'pos': column.reference_pos,
                'bases': base_counts
            }
            for read in column.pileups:
                if read.is_del or read.is_refskip:
                    continue
                summary['depth'] += 1
                base = read.alignment.query_sequence[read.query_position]
                base_counts[base] = base_counts.get(base, 0) + 1
            columns.append(summary)
        return columns

    # Codon table: three-letter DNA codon -> one-letter amino acid code.
    # '*' is the value used for the TAA, TAG and TGA codons (translation stop).
    # The lookup tables built right after this literal are derived from it.
    codon_map = {
        "TTT": "F",
        "TTC": "F",
        "TTA": "L",
        "TTG": "L",
        "TCT": "S",
        "TCC": "S",
        "TCA": "S",
        "TCG": "S",
        "TAT": "Y",
        "TAC": "Y",
        "TAA": "*",
        "TAG": "*",
        "TGT": "C",
        "TGC": "C",
        "TGA": "*",
        "TGG": "W",
        "CTT": "L",
        "CTC": "L",
        "CTA": "L",
        "CTG": "L",
        "CCT": "P",
        "CCC": "P",
        "CCA": "P",
        "CCG": "P",
        "CAT": "H",
        "CAC": "H",
        "CAA": "Q",
        "CAG": "Q",
        "CGT": "R",
        "CGC": "R",
        "CGA": "R",
        "CGG": "R",
        "ATT": "I",
        "ATC": "I",
        "ATA": "I",
        "ATG": "M",
        "ACT": "T",
        "ACC": "T",
        "ACA": "T",
        "ACG": "T",
        "AAT": "N",
        "AAC": "N",
        "AAA": "K",
        "AAG": "K",
        "AGT": "S",
        "AGC": "S",
        "AGA": "R",
        "AGG": "R",
        "GTT": "V",
        "GTC": "V",
        "GTA": "V",
        "GTG": "V",
        "GCT": "A",
        "GCC": "A",
        "GCA": "A",
        "GCG": "A",
        "GAT": "D",
        "GAC": "D",
        "GAA": "E",
        "GAG": "E",
        "GGT": "G",
        "GGC": "G",
        "GGA": "G",
        "GGG": "G",
    }

    # Reverse lookup: amino acid -> list of every codon that encodes it.
    aa_codon_map = dict()
    for triplet, amino in codon_map.items():
        aa_codon_map.setdefault(amino, []).append(triplet)

    # Nested lookup: amino acid -> {first base: {second base: set(third bases)}}
    aa_na_map = dict()
    for triplet, amino in codon_map.items():
        first, second, third = triplet
        by_second = aa_na_map.setdefault(amino, dict()).setdefault(
            first, dict())
        by_second.setdefault(second, set()).add(third)

    def get_pep_codon(pileup, idx, aa, ao):
        """Return a read-supported codon encoding aa, or None if there is none.

        pileup -- column summaries as produced by get_exon_pileup
        idx    -- pileup index of the position being examined
        aa     -- one-letter amino acid the codon has to encode
        ao     -- offset of that position within its codon (0, 1 or 2)

        For codon positions below ao the candidates are every base any codon
        of aa has at that position; for the other positions they are the bases
        seen in pileup[idx + i], most frequent first (ties broken by base,
        descending). The first combination present in aa_na_map[aa] wins.

        Failures (for instance idx + i outside the pileup, or an unknown aa)
        are logged to stderr and re-raised.
        """
        try:
            ts = time()
            bases = []
            for i in range(3):
                if i < ao:
                    bases.append(list(set([c[i] for c in aa_codon_map[aa]])))
                else:
                    # items() plus an index-based key works on Python 2 and 3;
                    # the former tuple-parameter lambda and iteritems() are
                    # Python-2-only.
                    ranked = sorted(pileup[idx + i]['bases'].items(),
                                    key=lambda kv: (kv[1], kv[0]),
                                    reverse=True)
                    bases.append([b for b, cnt in ranked])
                if args.debug:
                    print('%s' % bases, file=sys.stderr)
            # Lazily walk the candidates in preference order, stopping at the
            # first valid codon.
            codon = next(
                ('%s%s%s' % (b0, b1, b2)
                 for b0 in bases[0] if b0 in aa_na_map[aa]
                 for b1 in bases[1] if b1 in aa_na_map[aa][b0]
                 for b2 in bases[2] if b2 in aa_na_map[aa][b0][b1]),
                None)
            # Recorded on success too; the former early return skipped it.
            te = time()
            add_time('pep_codon', te - ts)
            return codon
        except Exception:
            print("get_pep_codon: %s %s %s %s" % (aa, ao, idx, pileup),
                  file=sys.stderr)
            # Bare raise keeps the original traceback.
            raise

    def write_probed(chrom, chromStart, chromEnd, strand, blockCount,
                     blockSizes, blockStarts, spectrum, protacc, peptide,
                     uniqueness, genomeReference, score=1000, psmScore='.',
                     fdr='.', mods='.', charge='.', expMassToCharge='.',
                     calcMassToCharge='.', psmRank='.', datasetID='.',
                     uri='.'):
        """Write one tab-separated 25-column line to the `probed` handle.

        chromStart/chromEnd are written twice (columns 2-3 and again as
        columns 7-8), column 9 is the constant '0', and blockSizes/blockStarts
        are written as comma-joined lists.
        """
        sizes_field = ','.join(map(str, blockSizes))
        starts_field = ','.join(map(str, blockStarts))
        fields = (chrom, chromStart, chromEnd, spectrum, score, strand,
                  chromStart, chromEnd, '0', blockCount,
                  sizes_field, starts_field,
                  protacc, peptide, uniqueness, genomeReference,
                  psmScore, fdr, mods, charge, expMassToCharge,
                  calcMassToCharge, psmRank, datasetID, uri)
        line = '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % fields
        probed.write(line)

    def get_genomic_location(exons):
        """Collapse exon pieces into BED-style block coordinates.

        exons are sequences whose items 1, 2, 3 and 4 are chrom, start, end
        and strand; chrom and strand are taken from the first exon.

        Returns (chrom, chromStart, chromEnd, strand, blockCount, blockSizes,
        blockStarts) with blockStarts relative to chromStart.
        """
        head = exons[0]
        chrom, strand = head[1], head[4]
        # Normalise every exon to (low, high) regardless of strand direction.
        spans = [(min(ex[2], ex[3]), max(ex[2], ex[3])) for ex in exons]
        chromStart = min(low for low, high in spans)
        chromEnd = max(high for low, high in spans)
        blockSizes = [high - low for low, high in spans]
        blockStarts = [low - chromStart for low, high in spans]
        return (chrom, chromStart, chromEnd, strand, len(exons), blockSizes,
                blockStarts)

    def get_psm_modifications(peptide_ref):
        """Return the modifications of a peptide as one ';'-joined string.

        Every row of PEP_MODS_QUERY becomes '<location>-<unimod>', or
        '<location>-<name><residue>' when the row has no unimod value.
        Returns '' when there are no rows.
        """
        began = time()
        rows = list(mz_cursor.execute(PEP_MODS_QUERY,
                                      {"peptide_ref": peptide_ref}))
        labels = []
        for (location, residue, name, modType, unimod) in rows:
            tag = unimod if unimod else '%s%s' % (name, residue)
            labels.append('%s-%s' % (location, tag))
        add_time('PEP_MODS_QUERY', time() - began)
        return ';'.join(labels)

    """
    QNAME
    FLAG
    RNAME
    POS
    CIGAR
    SEQ
    'NH' : 'i', #number of genomic locations to which the peptide sequence maps
    'XO' : 'Z', #uniqueness of the peptide mapping
    'XL' : 'i', #number of peptides to which the spectrum maps
    'XP' : 'Z', #peptide sequence
    'YP' : 'Z', #Protein accession ID from the original search result
    'XF' : 'Z', #Reading frame of the peptide (0, 1, 2)
    'XI' : 'f', #Peptide intensity
    'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;).
    'XR' : 'Z', #reference peptide sequence
    'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before).
    'YA' : 'Z', #Following amino acids (2 AA, A stands for after).
    'XS' : 'f', #PSM score
    'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP).
    'XC' : 'i', #peptide charge
    'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:partially unknown; 2:totally unknown;
    'XM' : 'Z', #Modifications
    'XN' : 'i', #Number of missed cleavages in the peptide (XP)
    'XT' : 'i', #Enzyme specificity
    'XE' : 'i', #Enzyme used in the experiment
    'XG' : 'A', #Peptide type
    'XU' : 'Z', #URI
    """
    # Walk every PSM returned by PSM_QUERY, map it onto the genome and collect
    # one proBed entry per mapped PSM, plus one proBAM entry when the
    # translated reference has the same length as the peptide. The collected
    # entries are written out after the loop.
    psm_cursor = get_connection(args.mzsqlite).cursor()
    ts = time()
    psms = psm_cursor.execute(PSM_QUERY)
    te = time()
    add_time('PSM_QUERY', te - ts)
    proBAM = ProBAM(species=None,
                    assembly=args.genomeReference,
                    seqlens=seqlens,
                    comments=[])
    proBED = ProBED(species=None, assembly=args.genomeReference, comments=[])
    for i, psm in enumerate(psms):
        probam_dict = PROBAM_DEFAULTS.copy()
        (acc, pep_start, pep_end, aa_pre, aa_post, peptide, spectrum_id,
         spectrum_title, rank, charge, calcmass, exprmass, pepref) = psm
        scan_name = spectrum_title if spectrum_title else spectrum_id
        if args.debug:
            print('\nPSM: %d\t%s' % (i, '\t'.join([
                str(v) for v in (acc, pep_start, pep_end, peptide, spectrum_id,
                                 scan_name, rank, charge, calcmass, exprmass)
            ])),
                  file=sys.stderr)
        exons = get_mapping(acc, pep_start, pep_end)
        if args.debug:
            print('%s' % exons, file=sys.stderr)
        if not exons:
            # No genomic mapping for this PSM: nothing to report.
            continue
        mods = get_psm_modifications(pepref)
        (chrom, chromStart, chromEnd, strand, blockCount, blockSizes,
         blockStarts) = get_genomic_location(exons)
        ref_cds = get_cds(exons)
        if args.debug:
            print('%s' % ref_cds, file=sys.stderr)
        ref_prot = translate(ref_cds)
        if args.debug:
            print('%s' % ref_prot, file=sys.stderr)
            print('%s' % peptide, file=sys.stderr)
        spectrum_peptides = spectrum_peptide_count(spectrum_id)
        peptide_locations = genomic_mapping_count(peptide)
        if args.debug:
            print('spectrum_peptide_count: %d\tpeptide_location_count: %d' %
                  (spectrum_peptides, peptide_locations),
                  file=sys.stderr)
        uniqueness = 'unique' if peptide_locations == 1 else 'not-unique[unknown]'
        ts = time()
        proBEDEntry = ProBEDEntry(chrom,
                                  chromStart,
                                  chromEnd,
                                  '%s_%s' % (acc, scan_name),
                                  1000,
                                  strand,
                                  blockCount,
                                  blockSizes,
                                  blockStarts,
                                  acc,
                                  peptide,
                                  uniqueness,
                                  args.genomeReference,
                                  charge=charge,
                                  expMassToCharge=exprmass,
                                  calcMassToCharge=calcmass,
                                  mods=mods if mods else '.',
                                  psmRank=rank)
        proBED.add_entry(proBEDEntry)
        te = time()
        add_time('add_probed', te - ts)
        if len(ref_prot) != len(peptide):
            # Length mismatch: the proBed entry is kept, no proBAM entry.
            continue
        ts = time()
        probam_dict['NH'] = peptide_locations
        probam_dict['XO'] = uniqueness
        # XL is the number of peptides the spectrum maps to; it used to be
        # filled with the genomic location count (same value as NH).
        probam_dict['XL'] = spectrum_peptides
        probam_dict['XP'] = peptide
        probam_dict['YP'] = acc
        probam_dict['XC'] = charge
        probam_dict['XB'] = '%f;%f;%f' % (exprmass - calcmass, exprmass,
                                          calcmass)
        probam_dict['XR'] = ref_prot  # ? dbSequence
        probam_dict['YA'] = aa_post
        probam_dict['YB'] = aa_pre
        probam_dict['XM'] = mods if mods else '*'
        # 16 is added for '-' strand mappings, 256 for ranks other than 1.
        flag = 16 if strand == '-' else 0
        if str(rank) != str(1) and rank != '*' and rank != [] and rank != "":
            flag += 256
        probam_dict['XF'] = ','.join([str(e[2] % 3) for e in exons])
        ## check for variation from ref_cds
        pep_cds = get_variant_cds(exons, ref_prot, peptide, ref_cds)
        ## XG classify peptide
        probam_dict['XG'] = get_peptide_type(exons)
        ## probam_dict['MD'] = peptide

        ## FIX  SAM sequence is forward strand
        seq = pep_cds if strand == '+' else reverse_complement(pep_cds)
        ## cigar based on plus strand
        cigar = ''
        if strand == '+':
            blkStarts = blockStarts
            blkSizes = blockSizes
        else:
            blkStarts = list(reversed(blockStarts))
            blkSizes = list(reversed(blockSizes))
        for j in range(blockCount):
            if j > 0:
                # Gap between consecutive blocks becomes an N operation.
                intron = blkStarts[j] - (blkStarts[j - 1] + blkSizes[j - 1])
                if intron > 0:
                    cigar += '%dN' % intron
            cigar += '%dM' % blkSizes[j]
        ## Mods TODO
        proBAMEntry = ProBAMEntry(qname=scan_name,
                                  flag=flag,
                                  rname=chrom,
                                  pos=chromStart + 1,
                                  cigar=cigar,
                                  seq=seq,
                                  optional=probam_dict)
        proBAM.add_entry(proBAMEntry)
        te = time()
        add_time('add_probam', te - ts)

        if args.debug:
            print('%s' % probam_dict, file=sys.stderr)

        if args.limit and i >= args.limit:
            break
    if args.probed:
        ts = time()
        with open(args.probed, 'w') as fh:
            proBED.write(fh)
        te = time()
        add_time('write_probed', te - ts)
    if args.prosam or args.probam:
        # A separate name keeps `samfile` (read by the pileup helpers) intact.
        sam_path = args.prosam if args.prosam else 'temp.sam'
        ts = time()
        with open(sam_path, 'w') as fh:
            proBAM.write(fh)
        te = time()
        add_time('write_prosam', te - ts)
        if args.probam:
            ts = time()
            # The BAM path comes from args.probam; the former unused
            # args.prosam.replace(...) raised when only --probam was given.
            pysam.view(sam_path, '-b', '-o', args.probam, catch_stdout=False)
            te = time()
            add_time('write_probam', te - ts)
            pysam.index(args.probam)

    print('\n%s\n' % str(timings), file=sys.stderr)
示例#47
0
 def sequence(self):
     """Return the slice start()..end() of the fragment sequence.

     When direction is 'r' the slice is taken from the reverse complement
     of the fragment sequence instead.
     """
     bases = self.fragment.sequence
     if self.direction == 'r':
         bases = str(reverse_complement(Seq(bases)))
     lower = self.start()
     upper = self.end()
     return bases[lower:upper]
示例#48
0
def filterLongReads(fastq_filename, min_length, max_length, wd, adapter, threads, a):
    """
    Filter reads by length and, when adapters are given, by adapter alignment.

    Reads whose length lies strictly between min_length and max_length are
    kept and renumbered (0, 1, 2, ...). If `adapter` names a FASTA file with
    one or two adapter sequences, every kept read is aligned against every
    adapter with align_call (in a pool of `threads` processes); only reads
    whose scores pass the threshold are written, reverse-complemented where
    the reported orientation asks for it. With no adapters all length-filtered
    reads are written.

    The output name is fastq_filename plus a suffix, prefixed with `wd` when
    `a` is true.

    Returns the output file name. If that file already exists nothing is
    recomputed and the tuple (out_filename, 0) is returned instead; this
    asymmetry is kept for existing callers.
    """
    seq_dict = {}
    first_dict_score = {}
    first_dict_seq = {}
    score_dict = {}
    listA_adapter = []
    final_seq = []
    list_seq_adap = []
    record_dict = {}
    if a and not adapter:
        out_filename = wd + fastq_filename + '.longreads.filtered.fasta'
    elif a and adapter:
        out_filename = wd + fastq_filename + '.longreads.filtered.oriented.fasta'
    else:
        out_filename = fastq_filename + '.longreads.filtered.fasta'
    if os.path.isfile(out_filename):
        sys.stdout.write(('Filtered FASTQ existed already: ' + out_filename + ' --- skipping\n'))
        return out_filename, 0

    if fastq_filename.endswith(('fastq', 'fq')):
        read_format = "fastq"
    elif fastq_filename.endswith(('fasta', 'fa')):
        read_format = "fasta"
    else:
        read_format = None
    if read_format is not None:
        shortest = int(min_length)
        longest = int(max_length)
        for record in SeqIO.parse(fastq_filename, read_format):
            # Both bounds are checked for either format; the FASTQ branch used
            # to test `len > min < max`, which never compared len with max.
            if shortest < len(record.seq) < longest:
                record.description = ""
                record.name = ""
                record.id = str(len(record_dict))
                record_dict[record.id] = record

    if adapter:
        for adpt in SeqIO.parse(adapter, "fasta"):
            listA_adapter.append(adpt.id)
            list_seq_adap.append(adpt)

    n_adapters = len(listA_adapter)
    align_resul = []
    if n_adapters in (1, 2):
        list_command = [[record_dict[key], adpter]
                        for key in record_dict
                        for adpter in list_seq_adap]
        with Pool(processes=int(threads), maxtasksperchild=1000) as p:
            align_resul = p.map(align_call, list_command, chunksize=1)

    # NOTE(review): from its use below an align_call result looks like
    # [adapter_id, read_id, orientation, score_sequence], or is empty when
    # there is nothing to report - confirm against align_call.
    if n_adapters == 1:
        for aling_res in align_resul:
            if len(aling_res) == 0:
                continue
            read_id = aling_res[1]
            seq_dict[read_id] = [record_dict[read_id], aling_res[2]]
            score_dict[read_id] = aling_res[3]
        numbers = [score_dict[key][0] for key in score_dict]
        # Threshold is the mean score (0.0 when nothing aligned).
        value_optimal = float(sum(numbers)) / max(len(numbers), 1)
        for key in score_dict:
            if not score_dict[key][0] > value_optimal:
                continue
            record, orientation = seq_dict[key]
            if orientation == 0:
                final_seq.append(record)
            elif orientation == 1:
                record.seq = reverse_complement(record.seq)
                final_seq.append(record)
    elif n_adapters == 2:
        for aling_res in align_resul:
            if len(aling_res) == 0:
                continue
            read_id = aling_res[1]
            if read_id in first_dict_seq:
                # Second adapter result for this read: complete the entries.
                seq_dict[read_id] = first_dict_seq[read_id] + [aling_res[2], aling_res[0]]
                score_dict[read_id] = first_dict_score[read_id] + [aling_res[0], aling_res[3]]
            else:
                first_dict_seq[read_id] = [record_dict[read_id], aling_res[2], aling_res[0]]
                first_dict_score[read_id] = [aling_res[0], aling_res[3]]
        # score_dict[key] = [adapter_a, score_a, adapter_b, score_b]
        max_score_first = 0
        max_score_second = 0
        for key in score_dict:
            score = score_dict[key]
            if score[0] == listA_adapter[0] and score[1][0] > max_score_first:
                max_score_first = score[1][0]
            if score[2] == listA_adapter[0] and score[3][0] > max_score_first:
                max_score_first = score[3][0]
            if score[0] == listA_adapter[1] and score[1][0] > max_score_second:
                max_score_second = score[1][0]
            if score[2] == listA_adapter[1] and score[3][0] > max_score_second:
                max_score_second = score[3][0]
        # Thresholds sit 1/30 below the best score seen for each adapter.
        value_optimal_first = max_score_first - (max_score_first/30)
        value_optimal_second = max_score_second - (max_score_second/30)
        listReadsOverLimit = []
        for key in score_dict:
            score = score_dict[key]
            if (score[0] == listA_adapter[0] and score[1][0] > value_optimal_first) and (score[2] == listA_adapter[1] and score[3][0] > value_optimal_second):
                listReadsOverLimit.append(key)
            elif (score[2] == listA_adapter[0] and score[3][0] > value_optimal_first) and (score[0] == listA_adapter[1] and score[1][0] > value_optimal_second):
                listReadsOverLimit.append(key)
        # seq_dict[key] = [record, orient_a, adapter_a, orient_b, adapter_b]
        for key in listReadsOverLimit:
            record, orient_a, adapter_a, orient_b, adapter_b = seq_dict[key]
            in_listed_order = (adapter_a == listA_adapter[0]
                               and adapter_b == listA_adapter[1])
            # Equality replaces the former substring test (`in`) on the ids.
            in_swapped_order = (adapter_a == listA_adapter[1]
                                and adapter_b == listA_adapter[0])
            if orient_a == 1 and orient_b == 0:
                if in_listed_order:
                    final_seq.append(record)
                elif in_swapped_order:
                    record.seq = reverse_complement(record.seq)
                    final_seq.append(record)
            elif orient_a == 0 and orient_b == 1:
                if in_listed_order:
                    record.seq = reverse_complement(record.seq)
                    final_seq.append(record)
                elif in_swapped_order:
                    final_seq.append(record)
    elif n_adapters == 0:
        # record_dict already holds only the length-filtered reads.
        final_seq.extend(record_dict.values())

    # Count what is actually written; the two-adapter branch used to report 0.
    filter_count = len(final_seq)
    # SeqIO.write opens the file itself, so no separate handle is opened (the
    # previous one was never used or closed).
    SeqIO.write(final_seq, out_filename, "fasta")

    fmtdate = '%H:%M:%S %d-%m'
    now = datetime.datetime.now().strftime(fmtdate)

    sys.stdout.write(("###FINISHED FILTERING AT:\t" + now +
                      "###\n\n###LOREAN KEPT\t\033[32m" + str(filter_count) + "\033[0m\tREADS AFTER LENGTH FILTERING###\n"))

    return out_filename
示例#49
0
         overlap_counter_m += 1
         if int(v[0]) <= min_coord:
             #print max_coord
             min_coord = int(v[0])
             #print max_coord
         #else:
         #	print "blah" # int(v[1])
         #~ print k,v, gene, coords
         #~ print int(int(coords[0])-50)
         #~ print int(v[1])
         #~ print int(coords[0])
 if overlap_counter_m > 0:
     #print overlap_counter_m,int(coords[1]),int(min_coord)
     sequence = record.seq.tomutable()
     sequence = sequence[int(coords[1]):int(min_coord)]
     reverse_seq = reverse_complement(sequence)
     #print sequence, gene.split("*")[1], int(coords[0])-500, "this on +, normal"
     if len(sequence) > 100:
         output.write(
             ">%s=%s=%s=%s=%s=%s=%s\n%s\n" %
             (record.id, gene.split("*")[1],
              int(coords[1]), int(min_coord),
              len(sequence), gene.split("*")[2],
              "OVERLAPPED", reverse_seq))
         print("%s\t%s\t%s\t%s\t%s" %
               (record.id, gene.split("*")[1],
                int(coords[1]), int(min_coord),
                len(sequence)))
     else:
         n_short_genes += 1
 elif overlap_counter_m == 0:
示例#50
0
import argparse

if __name__ == '__main__':
    # Copy a FASTA file to '<input name>.rc', reverse-complementing the
    # records whose id is listed in the `seqnames` file and leaving the others
    # unchanged. Sequences are wrapped at 64 characters per line.
    parser = argparse.ArgumentParser('')
    parser.add_argument('fa',
                        help='name of fasta file',
                        type=argparse.FileType('r'))
    parser.add_argument('seqnames',
                        help='file containing names sequence to complement',
                        type=argparse.FileType('r'))
    args = parser.parse_args()

    from Bio.SeqIO import parse
    from Bio.Seq import reverse_complement

    line_width = 64

    # A set gives O(1) membership tests; the handle is closed once read.
    with args.seqnames:
        ids = {line.strip() for line in args.seqnames}

    # Parse the handle argparse already opened instead of reopening by name,
    # and close it when done.
    with args.fa, open(args.fa.name + '.rc', 'w') as fout:
        for fasta in parse(args.fa, 'fasta'):
            if fasta.id in ids:
                seq = str(reverse_complement(fasta.seq))
            else:
                seq = str(fasta.seq)
            fout.write('>' + fasta.id + '\n')
            # Joining the chunks puts exactly one newline after each line; the
            # former regex wrap plus a trailing newline left a blank line
            # whenever the length was a multiple of the line width.
            fout.write('\n'.join(seq[offset:offset + line_width]
                                 for offset in range(0, len(seq), line_width))
                       + '\n')
示例#51
0
 #The interval [x2 y2] of the read (i.e., the unclipped
 #data, also called the 'clear range') aligns with the
 #interval [x1 y1] of the contig. If x1 > y1 (the contig
 #positions), then the reverse complement of the read is
 #aligned to the contig. For the read positions, x2 is
 #always < y2.
 #
 # If MIRA is used in mapping mode, the soft trimmed
 # region can contain gaps which must be discarded
 # for getting the CIGAR S operator count.
 if x1 > y1:
     current_read.ref_rc = True
     #SAM stores these backwards:
     cigar = make_cigar(
         padded_con_seq[y1 - 1:x1],
         reverse_complement(
             current_read.read_seq[x2 - 1:y2]))
     if x2 > 1:
         clipped = len(
             current_read.read_seq[:x2 - 1].replace(
                 "*", ""))
         cigar += "%iS" % clipped
     if y2 < len(current_read.read_seq):
         clipped = len(
             current_read.read_seq[y2:].replace(
                 "*", ""))
         cigar = "%iS%s" % (clipped, cigar)
 else:
     cigar = make_cigar(
         padded_con_seq[x1 - 1:y1],
         current_read.read_seq[x2 - 1:y2])
     if x2 > 1:
示例#52
0
def _record_fragment_call(calls, query_name, query_base, query_qual, c_count,
                          omit):
    """Merge one read's base call into the per-fragment calls of a column.

    ``calls`` maps a read (fragment) name to a ``(base, quality, c_count)``
    tuple and is updated in place.  Several reads with the same name can
    cover the same column; they are reconciled as follows:

    * first read seen for the name: stored as is;
    * bases disagree: the entry is dropped when ``omit`` is true, otherwise
      the read with the strictly higher base quality wins;
    * bases agree: the read with the strictly higher ``c_count`` wins.
    """
    previous = calls.get(query_name)
    if previous is None:
        calls[query_name] = (query_base, query_qual, c_count)
        return
    last_base, last_qual, last_c_count = previous
    if last_base != query_base:
        if omit:
            calls.pop(query_name)
        elif last_qual < query_qual:
            calls[query_name] = (query_base, query_qual, c_count)
    elif last_c_count < c_count:
        # The original code compared the stored count with itself here, so
        # this branch could never fire; compare with the current read.
        calls[query_name] = (query_base, query_qual, c_count)


def _write_strand_calls(tmp_file, chrom, pos, strand, ref_base, calls):
    """Write one tab-separated pileup line for a single strand."""
    bases = [call[0] for call in calls.values()]
    c_counts = [str(call[2]) for call in calls.values()]
    tmp_file.write("\t".join([
        chrom,
        str(pos), strand, ref_base, 'Genome',
        str(pos), ','.join(bases),
        ','.join(c_counts)
    ]))
    tmp_file.write("\n")


def worker(conting, start, stop):
    """Pile up one region of the input BAM and append per-strand base calls.

    Reads the module-level globals ``parent_pid``, ``dictRefSeq`` and
    ``options``.  Output is appended to the file
    ``tmp_<parent_pid>_<pid>`` in the current directory, one line per
    column and strand that has at least one accepted call:

        chrom, pos (1-based), strand, ref_base, 'Genome', pos,
        comma-joined bases, comma-joined C/G counts

    Reads whose ``YG`` tag is ``"C2T"`` contribute to the ``+`` line (count
    of ``C`` in the read); reads tagged ``"G2A"`` contribute to the ``-``
    line with their base reverse-complemented (count of ``G`` in the read),
    and the reference base on that line is reverse-complemented as well.
    Reads with any other ``YG`` value are ignored.

    Parameters
    ----------
    conting : str
        Reference (contig) name.  Names containing ``"GL"`` use
        ``options.rRNA_max_depth`` instead of ``options.max_depth``.
    start, stop : int
        Region bounds, passed straight to ``AlignmentFile.pileup``.

    A ``ValueError`` raised while processing the region is reported as a
    contig missing from the @SQ header and otherwise swallowed.
    """
    global parent_pid, dictRefSeq, options
    pid = os.getpid()
    tmp_file_name = 'tmp_' + str(parent_pid) + '_' + str(pid)
    if "GL" in conting:
        max_depth = options.rRNA_max_depth
    else:
        max_depth = options.max_depth
    min_qual = options.qual
    trim_head = options.trim_head
    trim_tail = options.trim_tail
    omit = options.omit
    # open() instead of the Python-2-only file() builtin.
    with pysam.AlignmentFile(options.input, 'rb') as samfile, \
            open(tmp_file_name, 'a') as tmp_file:
        try:
            for pileupcolumn in samfile.pileup(conting,
                                               start=start,
                                               stop=stop,
                                               max_depth=max_depth,
                                               ignore_orphans=False,
                                               ignore_overlaps=False,
                                               truncate=True):
                chrom = pileupcolumn.reference_name
                ref_seq = dictRefSeq.get(chrom)
                ref_base = ref_seq[pileupcolumn.pos].upper()
                pos = pileupcolumn.pos + 1
                positive_calls = {}
                negative_calls = {}
                for pileupread in pileupcolumn.pileups:
                    qpos = pileupread.query_position
                    if qpos is None:
                        continue
                    aln = pileupread.alignment
                    query_qual = aln.query_qualities[qpos]
                    if query_qual < min_qual:
                        continue
                    # Skip positions inside the trimmed head/tail of the
                    # read; head and tail swap ends for reverse-strand reads.
                    read_len = aln.query_length
                    if aln.is_reverse:
                        in_tail = trim_tail and qpos < trim_tail
                        in_head = trim_head and read_len - trim_head <= qpos
                    else:
                        in_tail = trim_tail and read_len - trim_tail <= qpos
                        in_head = trim_head and qpos < trim_head
                    if in_tail or in_head:
                        continue
                    query_name = aln.query_name
                    query_base = aln.query_sequence[qpos]
                    conversion = aln.get_tag("YG")
                    if conversion == "C2T":
                        _record_fragment_call(
                            positive_calls, query_name, query_base,
                            query_qual, aln.query_sequence.count('C'), omit)
                    elif conversion == "G2A":
                        _record_fragment_call(
                            negative_calls, query_name,
                            reverse_complement(query_base), query_qual,
                            aln.query_sequence.count('G'), omit)

                if positive_calls:
                    _write_strand_calls(tmp_file, chrom, pos, '+', ref_base,
                                        positive_calls)
                if negative_calls:
                    _write_strand_calls(tmp_file, chrom, pos, '-',
                                        reverse_complement(ref_base),
                                        negative_calls)
        except ValueError:
            print("Conting [%s] does not exist in @SQ header, pass" % conting)
示例#53
0
 def complement(self, seq):
     """Return seq complemented but not reversed.

     Implemented by reverse-complementing and then reversing the result.
     """
     #TODO - use Seq methods instead of this hack:?
     rev_comp = reverse_complement(seq)
     return rev_comp[::-1]
def six_frame_translations(seq, genetic_code=1):
    """Return the six-frame translations of seq together with a text header.

    Unlike the Biopython function of the same name, this variant does not
    return one pretty-printed string; it returns a 9-tuple::

        (header, frame_1, frame_2, frame_3, seq, comp,
         framecomp_2, framecomp_1, framecomp_3)

    * ``header`` -- text with the a/t/g/c counts (counted against the
      upper-case letters only), a shortened lower-case view of the sequence,
      its length and the value of ``GC(seq)``;
    * ``frame_1`` .. ``frame_3`` -- translations of ``seq`` starting at
      offsets 0, 1 and 2;
    * ``seq`` -- the input, unchanged;
    * ``comp`` -- the reverse complement of ``seq`` reversed again, i.e. its
      complement;
    * ``framecomp_N`` -- translation of the reverse complement starting at
      offset N-1, with the resulting protein string reversed.

    ``genetic_code`` is passed on to ``Bio.Seq.translate``.
    """
    from Bio.Seq import reverse_complement, translate

    anti = reverse_complement(seq)
    comp = anti[::-1]
    length = len(seq)

    frames = {}
    for offset in range(3):
        # Longest stretch from this offset that is a whole number of codons.
        stop = offset + 3 * ((length - offset) // 3)
        frames[offset + 1] = translate(seq[offset:stop], genetic_code)
        frames[-(offset + 1)] = translate(anti[offset:stop],
                                          genetic_code)[::-1]

    # Long sequences are abbreviated to their first and last ten letters.
    short = "%s ... %s" % (seq[:10], seq[-10:]) if length > 20 else seq
    counts = "".join("%s:%d " % (nt, seq.count(nt.upper())) for nt in "atgc")
    header = "GC_Frame: " + counts
    header += "\nSequence: %s, %d nt, %0.2f %%GC\n\n\n" % (
        short.lower(),
        length,
        GC(seq),
    )

    return (header, frames[1], frames[2], frames[3], seq, comp,
            frames[-2], frames[-1], frames[-3])
示例#55
0
 def format_alignment(self, alignment):
     """Return a string with a single alignment formatted as one PSL line.

     The first sequence of ``alignment.sequences`` is treated as the target
     and the second as the query; ``alignment.coordinates`` row 0 holds the
     target coordinates and row 1 the query coordinates.

     The returned line has 21 tab-separated fields and ends with a newline:
     matches, misMatches, repMatches, nCount, qNumInsert, qBaseInsert,
     tNumInsert, tBaseInsert, strand, qName, qSize, qStart, qEnd, tName,
     tSize, tStart, tEnd, blockCount, blockSizes, qStarts, tStarts.

     The four match counters are computed from the sequences, but if the
     alignment object carries attributes named ``matches``, ``misMatches``,
     ``repMatches`` or ``nCount`` those values are written instead.

     Returns an empty string if the coordinates array is empty.

     Raises TypeError if ``alignment`` is not an ``Alignment`` instance.
     """
     if not isinstance(alignment, Alignment):
         raise TypeError("Expected an Alignment object")
     coordinates = alignment.coordinates
     if not coordinates.size:  # alignment consists of gaps only
         return ""
     target, query = alignment.sequences
     # Each sequence may be a record-like object (with .id and .seq) or a
     # bare sequence; fall back to generic names / the object itself.
     try:
         qName = query.id
     except AttributeError:
         qName = "query"
     try:
         query = query.seq
     except AttributeError:
         pass
     try:
         tName = target.id
     except AttributeError:
         tName = "target"
     try:
         target = target.seq
     except AttributeError:
         pass
     tSize = len(target)
     qSize = len(query)
     # fmt: off
     dnax = None  # set to True for translated DNA aligned to protein,
                  # and to False for DNA/RNA aligned to DNA/RNA  # noqa: E114, E116
     if coordinates[1, 0] > coordinates[1, -1]:
         # DNA/RNA mapped to reverse strand of DNA/RNA
         strand = "-"
         query = reverse_complement(query, inplace=False)
         # Work on a copy so the caller's array is not modified, and flip the
         # query coordinates so they index the reverse-complemented query.
         coordinates = coordinates.copy()
         coordinates[1, :] = qSize - coordinates[1, :]
     elif coordinates[0, 0] > coordinates[0, -1]:
         # protein mapped to reverse strand of DNA
         strand = "-"
         target = reverse_complement(target, inplace=False)
         # Same treatment as above, but for the target row.
         coordinates = coordinates.copy()
         coordinates[0, :] = tSize - coordinates[0, :]
         dnax = True
     else:
         # mapped to forward strand
         strand = "+"
     # fmt: on
     wildcard = self.wildcard
     mask = self.mask
     # variable names follow those in the PSL file format specification
     matches = 0
     misMatches = 0
     repMatches = 0
     nCount = 0
     qNumInsert = 0
     qBaseInsert = 0
     tNumInsert = 0
     tBaseInsert = 0
     blockSizes = []
     qStarts = []
     tStarts = []
     # Walk consecutive coordinate columns; each pair of neighbouring columns
     # delimits one segment that is either a gap in one sequence or an
     # aligned block.
     tStart, qStart = coordinates[:, 0]
     for tEnd, qEnd in coordinates[:, 1:].transpose():
         if tStart == tEnd:
             # Segment advances only in the query. It is counted as an insert
             # only when it lies strictly inside the query, i.e. not when it
             # touches either end of the query.
             if qStart > 0 and qEnd < qSize:
                 qNumInsert += 1
                 qBaseInsert += qEnd - qStart
             qStart = qEnd
         elif qStart == qEnd:
             # Segment advances only in the target; same interior-only rule.
             if tStart > 0 and tEnd < tSize:
                 tNumInsert += 1
                 tBaseInsert += tEnd - tStart
             tStart = tEnd
         else:
             # Aligned block. Block sizes are recorded in query units.
             tCount = tEnd - tStart
             qCount = qEnd - qStart
             tStarts.append(tStart)
             qStarts.append(qStart)
             blockSizes.append(qCount)
             if tCount == qCount:
                 assert dnax is not True
                 dnax = False
             else:
                 # translated DNA aligned to protein, typically generated by
                 # blat -t=dnax -q=prot
                 assert tCount == 3 * qCount
                 assert dnax is not False
                 dnax = True
             tSeq = target[tStart:tEnd]
             qSeq = query[qStart:qEnd]
             # Convert both slices to bytes so the loops below iterate over
             # integer byte values; None marks a sequence with unknown
             # contents.
             try:
                 tSeq = bytes(tSeq)
             except TypeError:  # string
                 tSeq = bytes(tSeq, "ASCII")
             except UndefinedSequenceError:  # sequence contents is unknown
                 tSeq = None
             try:
                 qSeq = bytes(qSeq)
             except TypeError:  # string
                 qSeq = bytes(qSeq, "ASCII")
             except UndefinedSequenceError:  # sequence contents is unknown
                 qSeq = None
             if tSeq is None or qSeq is None:
                 # contents of at least one sequence is unknown;
                 # count all aligned letters as matches:
                 matches += qCount
             else:
                 if mask == "lower":
                     # Compare case-insensitively (upper-cased). A matching
                     # position counts as a plain match when the target
                     # letter is already upper case (u1 == c1), otherwise as
                     # a repMatch.
                     for u1, u2, c1 in zip(tSeq.upper(), qSeq.upper(), tSeq):
                         if u1 == wildcard or u2 == wildcard:
                             nCount += 1
                         elif u1 == u2:
                             if u1 == c1:
                                 matches += 1
                             else:
                                 repMatches += 1
                         else:
                             misMatches += 1
                 elif mask == "upper":
                     # Mirror image of the branch above: compare lower-cased;
                     # a matching position is a plain match when the target
                     # letter is already lower case, otherwise a repMatch.
                     for u1, u2, c1 in zip(tSeq.lower(), qSeq.lower(), tSeq):
                         if u1 == wildcard or u2 == wildcard:
                             nCount += 1
                         elif u1 == u2:
                             if u1 == c1:
                                 matches += 1
                             else:
                                 repMatches += 1
                         else:
                             misMatches += 1
                 else:
                     # No masking convention: repMatches is never incremented.
                     for u1, u2 in zip(tSeq.upper(), qSeq.upper()):
                         if u1 == wildcard or u2 == wildcard:
                             nCount += 1
                         elif u1 == u2:
                             matches += 1
                         else:
                             misMatches += 1
             tStart = tEnd
             qStart = qEnd
     # Counts stored on the alignment object take precedence over the values
     # computed above.
     try:
         matches = alignment.matches
     except AttributeError:
         pass
     try:
         misMatches = alignment.misMatches
     except AttributeError:
         pass
     try:
         repMatches = alignment.repMatches
     except AttributeError:
         pass
     try:
         nCount = alignment.nCount
     except AttributeError:
         pass
     # NOTE(review): if the loop found no aligned block, tStarts/qStarts are
     # empty and the indexing below raises IndexError - confirm callers never
     # pass such an alignment.
     tStart = tStarts[0]  # start of alignment in target
     qStart = qStarts[0]  # start of alignment in query
     # tCount/qCount still hold the size of the last aligned block seen in
     # the loop above.
     tEnd = tStarts[-1] + tCount  # end of alignment in target
     qEnd = qStarts[-1] + qCount  # end of alignment in query
     if strand == "-":
         # Convert the overall start/end back from the flipped coordinates
         # for whichever sequence was reverse-complemented.
         if dnax is True:
             tStart, tEnd = tSize - tEnd, tSize - tStart
         else:
             qStart, qEnd = qSize - qEnd, qSize - qStart
     blockCount = len(blockSizes)
     # List-valued fields are written comma-separated with a trailing comma.
     blockSizes = ",".join(map(str, blockSizes)) + ","
     qStarts = ",".join(map(str, qStarts)) + ","
     tStarts = ",".join(map(str, tStarts)) + ","
     if dnax:
         # Translated alignments get a two-character strand with "+" in front.
         strand = "+" + strand
     words = [
         str(matches),
         str(misMatches),
         str(repMatches),
         str(nCount),
         str(qNumInsert),
         str(qBaseInsert),
         str(tNumInsert),
         str(tBaseInsert),
         strand,
         qName,
         str(qSize),
         str(qStart),
         str(qEnd),
         tName,
         str(tSize),
         str(tStart),
         str(tEnd),
         str(blockCount),
         blockSizes,
         qStarts,
         tStarts,
     ]
     line = "\t".join(words) + "\n"
     return line
示例#56
0
def _hash_references(fasta_files, kmer, deletions, circular, simple,
                     del_hashes):
    """Add the k-mers of every sequence in the FASTA files to ``simple``.

    When ``deletions`` is true, the fragments produced by
    ``make_deletions`` for each (kmer + 1)-window are added to
    ``del_hashes``.  Both sets are updated in place.  For circular
    references the first ``kmer`` bases are appended to the sequence so that
    k-mers spanning the origin are included.

    Returns the number of (disambiguated) k-mers considered.
    """
    count = 0
    label = "circular" if circular else "linear"
    for fasta in fasta_files:
        sys.stderr.write("Hashing %s references in %s\n" % (label, fasta))
        # The context manager closes the handle even if hashing fails.
        with open(fasta) as handle:
            for upper_seq, _raw_read in fasta_iterator(handle):
                if circular:
                    #Want to consider wrapping round the origin, add k-mer length:
                    upper_seq += upper_seq[:kmer]
                    # One k-mer per original position; the extension supplies
                    # the wrapped-around tail.
                    n_starts = len(upper_seq) - kmer
                else:
                    # A sequence of length L has L - kmer + 1 k-mers; the
                    # previous bound of L - kmer dropped the last one.
                    n_starts = len(upper_seq) - kmer + 1
                #Note we do the disambiguate call on the fragments rather than
                #the whole reference to avoid too many levels of recursion.
                for i in range(n_starts):
                    for fragment in disambiguate(upper_seq[i:i + kmer]):
                        assert set(fragment).issubset("ACGT"), fragment
                        simple.add(fragment)
                        count += 1
                if deletions:
                    # NOTE(review): the last window produced by this range is
                    # only kmer (not kmer + 1) letters long; left unchanged
                    # because make_deletions is not visible here - confirm.
                    for i in range(0, len(upper_seq) - kmer + 1):
                        for fragment in make_deletions(
                                upper_seq[i:i + kmer + 1]):
                            del_hashes.add(fragment)
    return count


def build_filter(bloom_filename,
                 linear_refs,
                 circular_refs,
                 kmer,
                 mismatches,
                 inserts,
                 deletions,
                 error_rate=0.01,
                 rc=True):
    """Build a k-mer set and a matching Bloom filter from reference FASTA files.

    Parameters
    ----------
    bloom_filename
        Passed to ``pydablooms.Dablooms`` as the file backing the filter.
    linear_refs, circular_refs
        Iterables of FASTA filenames (either may be empty or None).
        Circular references also contribute the k-mers spanning the origin.
    kmer : int
        k-mer length.
    mismatches
        If true, passed to ``make_variants`` to add mismatch variants of
        every k-mer.
    inserts
        If true, the variants from ``make_inserts`` are added.
    deletions
        If true, the variants from ``make_deletions`` are added.
    error_rate : float
        Error rate passed to ``pydablooms.Dablooms``.
    rc : bool
        If true, the reverse complement of every reference k-mer is added
        before the fuzzy variants are generated.

    Returns
    -------
    (simple, bloom)
        The final set of k-mers and the populated, flushed Bloom filter.

    Progress messages are written to stderr.
    """
    #Using 5e-06 is close to a set for my example, both in run time
    #(a fraction more) and the number of reads kept (9528 vs 8058
    #with sets).
    simple = set()
    del_hashes = set()
    count = 0
    t0 = time.time()
    if linear_refs:
        count += _hash_references(linear_refs, kmer, deletions, False,
                                  simple, del_hashes)
    if circular_refs:
        count += _hash_references(circular_refs, kmer, deletions, True,
                                  simple, del_hashes)
    if rc:
        # The set comprehension is built in full before the update, so the
        # set is not modified while it is being iterated.
        simple |= {reverse_complement(fragment) for fragment in simple}
    if mismatches or inserts or deletions:
        sys.stderr.write("Have %i unique k-mers before consider fuzzy matches\n" \
                         % (len(simple)))
        if deletions:
            #Do this first to avoid 3 large sets in memory!
            new = del_hashes
            del del_hashes
            new.update(simple)
            sys.stderr.write("Adding deletions brings this to %i unique k-mers\n" \
                             % len(new))
        else:
            new = simple.copy()
        if mismatches:
            for fragment in simple:
                new.update(make_variants(fragment, mismatches))
            sys.stderr.write("Adding %i mis-matches per k-mer, have %i unique k-mers\n" \
                             % (mismatches, len(new)))
        if inserts:
            for fragment in simple:
                new.update(make_inserts(fragment))
            sys.stderr.write("Adding inserts brings this to %i unique k-mers\n" \
                             % len(new))
        simple = new
    capacity = len(simple)
    bloom = pydablooms.Dablooms(capacity, error_rate, bloom_filename)
    for fragment in simple:
        bloom.add(fragment)
    bloom.flush()
    sys.stderr.write(
        "Set and bloom filter of %i-mers created (%i k-mers considered, %i unique)\n"
        % (kmer, count, len(simple)))
    sys.stderr.write(
        "Using Bloom filter with capacity %i and error rate %r\n" %
        (capacity, error_rate))
    sys.stderr.write("Building filters took %0.1fs\n" % (time.time() - t0))
    return simple, bloom
    print(len(readsDict))
    for s in spacersDic:
        for rd in spacerReadsDic[s]:
            seqRead = str(readsDict.get(rd,"None"))
            if seqRead != "None":
                seqSpacer = spacersDic[s]
                seqRead = str(readsDict[rd].seq)
                alignment = pairwise2.align.localms(seqRead,seqSpacer,2,-.1,-3,-2, one_alignment_only=True)
                if alignment[0][2] >= (len(seqSpacer)*2)-(args.r*2.1):
                    print(alignment[0]), len(seqSpacer)*2
                    print(format_alignment(*alignment[0]))
                    readNameDic[s].append(readsDict[rd].id)
                    readQualDic[s].append(readsDict[rd].letter_annotations["phred_quality"][alignment[0][3]:alignment[0][4]])
                ## Reverse aligment read 1:
                alignmentR = pairwise2.align.localms(reverse_complement(seqRead),seqSpacer,2,-.1,-3,-2, one_alignment_only=True)
                if alignmentR[0][2] >= (len(seqSpacer)*2)-(args.r*2.1):
                    print(alignmentR[0]), len(seqSpacer)*2
                    print(format_alignment(*alignmentR[0]))
                    readNameDic[s].append(readsDict[rd].id)
                    rR = readsDict[rd].letter_annotations["phred_quality"]
                    rR.reverse()
                    readQualDic[s].append(rR[alignmentR[0][3]:alignmentR[0][4]])

readsDict = dict()  ## clean memory

# Generate out file:
with open(args.out+".resultQ.file.test.csv",'wt') as spacers_out_file:
    spacers_out_file.write("id\tNR\tNR100%\tAvQ100%\tstdQ100%\tAveQ100%List\n")

with open(args.out+".resultQ.file.test.full_report.txt",'wt') as spacers_out_file2:
示例#58
0
    def _exonic_transcript_effect(self, exon, exon_number, transcript):
        """Predict the effect of this variant on a transcript, given an exon
        of that transcript which the variant is already known to overlap.

        Parameters
        ----------
        exon : pyensembl.Exon
            Exon which this variant overlaps

        exon_number : int
            1-based index of the exon within the transcript's exons.

        transcript : pyensembl.Transcript

        Returns a StartLoss, FivePrimeUTR, ThreePrimeUTR or ExonicSpliceSite
        effect, or whatever ``coding_effect`` returns.

        Raises ValueError if the (clipped, strand-corrected) reference
        nucleotides do not match the transcript sequence at the computed
        offset.
        """
        ref_nucleotides = self.ref
        alt_nucleotides = self.alt

        # Clip the variant to the exon: drop whatever lies before its start...
        starts_before_exon = self.start < exon.start
        if starts_before_exon:
            assert len(ref_nucleotides) > 0, "Unexpected insertion into intron"
            leading_overhang = exon.start - self.start
            ref_nucleotides = ref_nucleotides[leading_overhang:]
            alt_nucleotides = alt_nucleotides[leading_overhang:]
            clipped_start = exon.start
        else:
            clipped_start = self.start

        # ...and whatever extends past its end.
        ends_after_exon = self.end > exon.end
        if ends_after_exon:
            trailing_overhang = self.end - exon.end
            ref_nucleotides = ref_nucleotides[:-trailing_overhang]
            alt_nucleotides = alt_nucleotides[:len(ref_nucleotides)]
            clipped_end = exon.end
        else:
            clipped_end = self.end

        offset = interval_offset_on_transcript(
            clipped_start, clipped_end, transcript)

        # Express both alleles on the transcript's strand.
        if transcript.on_backward_strand:
            strand_ref = reverse_complement(ref_nucleotides)
            strand_alt = reverse_complement(alt_nucleotides)
        else:
            strand_ref, strand_alt = ref_nucleotides, alt_nucleotides

        # Sanity check: the transcript must contain the reference allele at
        # the position we computed.
        expected_ref = str(
            transcript.sequence[offset:offset + len(strand_ref)])
        if strand_ref != expected_ref:
            message = (
                "Found ref nucleotides '%s' in sequence of %s at offset %d"
                " (chromosome positions %d:%d) but variant %s has '%s'")
            raise ValueError(message % (
                expected_ref,
                transcript,
                offset,
                clipped_start,
                clipped_end,
                self,
                strand_ref))

        utr5_length = min(transcript.start_codon_spliced_offsets)

        # Variant begins inside the 5' UTR: either it reaches past the UTR
        # into the coding region (start loss) or it stays within the UTR.
        if offset < utr5_length:
            if offset + len(strand_ref) > utr5_length:
                return StartLoss(self, transcript)
            return FivePrimeUTR(self, transcript)

        # Variant begins after the last stop-codon offset: 3' UTR.
        first_utr3_offset = max(transcript.stop_codon_spliced_offsets) + 1
        if offset >= first_utr3_offset:
            return ThreePrimeUTR(self, transcript)

        exon_start_offset = interval_offset_on_transcript(
            exon.start, exon.end, transcript)
        exon_end_offset = exon_start_offset + len(exon) - 1

        # The plain coding effect doubles as the "splicing unchanged"
        # alternative attached to an ExonicSpliceSite effect, so it is
        # computed before the splice-site check.
        plain_coding_effect = coding_effect(
            ref=strand_ref,
            alt=strand_alt,
            transcript_offset=offset,
            variant=self,
            transcript=transcript)

        alters_splice_site = changes_exonic_splice_site(
            transcript=transcript,
            transcript_ref=strand_ref,
            transcript_alt=strand_alt,
            transcript_offset=offset,
            exon_start_offset=exon_start_offset,
            exon_end_offset=exon_end_offset,
            exon_number=exon_number)
        if not alters_splice_site:
            return plain_coding_effect
        return ExonicSpliceSite(
            variant=self,
            transcript=transcript,
            exon=exon,
            alternate_effect=plain_coding_effect)
示例#59
0
 def antiparallel(self, seq):
     """Return the reverse complement of seq (the antiparallel strand)."""
     antiparallel_seq = reverse_complement(seq)
     return antiparallel_seq
示例#60
0
文件: utils.py 项目: shunsunsun/pydna
def eq(*args, **kwargs):
    """Compare two or more DNA sequences for equality.

    Sequences are equal when they represent the same DNA molecule.
    Comparisons are case insensitive.

    Parameters
    ----------
    args : iterable
        The sequences to compare, given either as separate arguments or as
        one or more iterables of sequences (one level of nesting is
        flattened). Sequences can be strings, Biopython Seq or SeqRecord,
        Dseqrecord or dsDNA objects.
    circular : bool, optional
        If True, compare all molecules as circular; if False, as linear.
        Ignored when ``linear`` is given as True or False.
    linear : bool, optional
        If True, compare all molecules as linear; if False, as circular.

    Returns
    -------
    eq : bool
        Returns True or False

    Raises
    ------
    ValueError
        If no topology keyword is set and the sequences do not all share
        the same ``circular`` attribute value (objects without the attribute
        count as ``None``).

    Notes
    -----

    Compares two or more DNA sequences for equality i.e. if they
    represent the same DNA molecule.

    Two linear sequences are considered equal if either:

    * They have the same sequence (case insensitive)
    * One sequence is the reverse complement of the other (case insensitive)

    Two circular sequences are considered equal if they are circular
    permutations:

    1. They have the same length, AND
    2. One sequence can be found in the concatenation of the other sequence
       with itself, OR
    3. The reverse complement can be found in the concatenation of the other
       sequence with itself.

    The topology for the comparison can be set using one of the keywords
    linear or circular to True or False.

    If circular or linear is not set (or is set to None), it will be deduced
    from the topology of each sequence for sequences that have a circular
    attribute (like Dseq and Dseqrecord).

    Examples
    --------

    >>> from pydna import eq, Dseqrecord
    >>> eq("aaa","AAA")
    True
    >>> eq("aaa","AAA","TTT")
    True
    >>> eq("aaa","AAA","TTT","tTt")
    True
    >>> eq("aaa","AAA","TTT","tTt", linear=True)
    True
    >>> eq("Taaa","aTaa", linear = True)
    False
    >>> eq("Taaa","aTaa", circular = True)
    True
    >>> a=Dseqrecord("Taaa")
    >>> b=Dseqrecord("aTaa")
    >>> eq(a,b)
    False
    >>> eq(a,b,circular=True)
    True
    >>> a=a.looped()
    >>> b=b.looped()
    >>> eq(a,b)
    True
    >>> eq(a,b,circular=False)
    False
    >>> eq(a,b,linear=True)
    False
    >>> eq(a,b,linear=False)
    True
    >>> eq("ggatcc","GGATCC")
    True
    >>> eq("ggatcca","GGATCCa")
    True
    >>> eq("ggatcca","tGGATCC")
    True


    """
    import itertools

    # Flatten one level of nesting. Strings and sequence-like objects are
    # themselves iterable (str has __iter__ on Python 3), so they must be
    # recognised explicitly; otherwise they would be split into single
    # characters and almost any input would compare equal.
    sequences = []
    for arg in args:
        is_single_sequence = (
            isinstance(arg, str)
            or hasattr(arg, "seq")
            or hasattr(arg, "watson")
            or hasattr(arg, "reverse_complement")
            or not hasattr(arg, "__iter__")
        )
        if is_single_sequence:
            sequences.append(arg)
        else:
            sequences.extend(arg)

    # An explicit keyword wins; "linear" takes precedence over "circular".
    # A keyword passed as None is treated as "not set" so that the topology
    # is still resolved instead of silently skipping the comparison.
    topology = None
    linear = kwargs.get("linear")
    circular = kwargs.get("circular")
    if linear is not None:
        topology = "linear" if linear else "circular"
    elif circular is not None:
        topology = "circular" if circular else "linear"

    if topology is None:
        # No usable keyword: every sequence must agree on its topology.
        found = set(getattr(seq, "circular", None) for seq in sequences)
        if len(found) != 1:
            raise ValueError("sequences have different topologies")
        topology = "circular" if found.pop() else "linear"

    # Reduce every input to a lower-case string: records contribute their
    # .seq, double-stranded objects their .watson strand.
    strings = []
    for seq in sequences:
        seq = seq.seq if hasattr(seq, "seq") else seq
        if hasattr(seq, "watson"):
            strings.append(seq.watson.lower())
        else:
            strings.append(str(seq).lower())

    if len(set(len(s) for s in strings)) != 1:
        return False

    # Identical strings are equal under either topology.
    if len(set(strings)) == 1:
        return True

    # Imported lazily: only needed once strands actually have to be compared.
    from Bio.Seq import reverse_complement

    pairs = itertools.combinations(strings, 2)
    if topology == "circular":
        # Equal-length circular permutations show up as a substring of the
        # other sequence concatenated with itself.
        return all(
            s1 in s2 + s2 or reverse_complement(s1) in s2 + s2
            for s1, s2 in pairs
        )
    return all(s1 == s2 or s1 == reverse_complement(s2) for s1, s2 in pairs)