def start(self, database, expanded, to_examine, working=''):
    """Extract the sequences of selected gene clusters into FASTA files.

    For every header listed in ``to_examine`` the cluster listing in
    ``expanded`` is scanned; the cluster that contains the header is
    collected up to the next ``--------`` delimiter line.  The sequences of
    all cluster members are then copied from the FASTA file ``database``
    into ``<dir of expanded>/extracted_genes/<first member>_extracted.fasta``.
    Members whose name contains ``_Reversed`` are written reverse
    complemented.

    Args:
        database: FASTA file holding all gene sequences.
        expanded: cluster listing; clusters are separated by lines that
            contain ``--------`` and members are matched as ``-<header>``.
        to_examine: file with one header per line (``>`` prefix optional).
        working: optional working directory; when given, the process
            changes into it (``os.chdir``).

    Returns:
        None.  Results are written to disk.
    """
    script_dir = os.path.dirname(__file__)
    if working != '':
        if working[-1] != '/':
            working += '/'
        if '../' in working:
            working = os.path.join(script_dir, working)
        os.chdir(working)
    else:
        working = script_dir

    def _resolve(path):
        # Relative paths are anchored at os.path.dirname(working).
        if '../' in path or path[0] != '/':
            return os.path.join(os.path.dirname(working), path)
        return path

    database = _resolve(database)
    expanded = _resolve(expanded)
    to_examine = _resolve(to_examine)

    infile_filename = expanded.split('/')[-1]
    infiledir = expanded.split(infile_filename)[0]
    extracted_path = infiledir + 'extracted_genes/'
    if not os.path.exists(extracted_path):
        os.makedirs(extracted_path)

    # Headers to look for: text after '>' or, failing that, the raw line.
    examin_headers = []
    with open(to_examine, 'r') as handle:
        for line in handle:
            if '>' in line:
                examin_headers.append(line.split('>')[1])
            elif line != '\n':
                examin_headers.append(line)

    with open(expanded, 'r') as handle:
        all_expanded = handle.readlines()

    # Collect, for each header, the cluster lines that follow its match.
    clustered_headers = []
    clusters = []
    record = False
    for header in examin_headers:
        for line in all_expanded:
            if record:
                if '--------' in line:
                    record = False
                    clustered_headers.append(clusters)
                    clusters = []
                else:
                    clusters.append(line)
            if ' ' in header:
                if '-' + header.split(' ')[0].replace('\n', '') in line:
                    clusters.append(header.split(' ')[0])
                    record = True
            else:
                if '-' + header.replace('\n', '') in line:
                    clusters.append(header.replace("signalal", "signal"))
                    record = True
    # Flush whatever was collected after the last delimiter.
    clustered_headers.append(clusters)

    # Keep only the first space-separated token of every database line so
    # that FASTA headers compare equal to the bare cluster member names.
    with open(database, 'r') as handle:
        all_genes = [
            gene.split(' ')[0] if ' ' in gene else gene for gene in handle
        ]

    # NOTE(review): `record` and `reverse` deliberately persist across
    # members and clusters exactly as before; a sequence that is the last
    # record of the database is never terminated by a following '>' line.
    record = False
    reverse = False
    for cluster in clustered_headers:
        to_write = ''
        for member in cluster:
            if '_Reversed' in member.replace('\n', ''):
                reverse = True
                to_reverse = ''
            for gene_line in all_genes:
                if record and not reverse:
                    if '>' in gene_line:
                        record = False
                        to_write += '\n'
                    else:
                        to_write += gene_line
                elif record and reverse:
                    if '>' in gene_line:
                        record = False
                        reverse = False
                        my_seq = Seq.Seq(str(to_reverse))
                        rev_comp = my_seq.reverse_complement()
                        to_write += str(rev_comp).replace('\n', '') + '\n'
                    else:
                        to_reverse += gene_line.replace('\n', '')
                gene_name = gene_line.split(" ")[0].replace('\n', '')
                if reverse:
                    wanted = ">" + member.split('_Reversed')[0]
                else:
                    wanted = ">" + member.replace('\n', "")
                if wanted == gene_name and member != '\n':
                    to_write += '>' + member
                    record = True
        if cluster != []:
            out_name = cluster[0].replace('\n', '') + '_extracted.fasta'
            with open(extracted_path + out_name, 'w') as handle:
                handle.write(to_write)
def test_append_proteins(self):
    """Append three protein sequences and expect seven entries in total."""
    for residues in ("K", "K-", "K@"):
        self.test_chars.append(Seq.Seq(residues))
    self.assertEqual(7, len(self.test_chars))
def setUp(self):
    """Create an immutable and a mutable sequence from the same bytes."""
    raw = b"TCAAAAGGATGCATCATG"
    self.s, self.mutable_s = Seq.Seq(raw), Seq.MutableSeq(raw)
def test_concatenation_of_seq(self):
    """Check that Seq + Seq compares equal to the concatenated string."""
    suffix = Seq.Seq("T")
    joined = self.s + suffix
    expected_text = str(self.s) + "T"
    self.assertEqual(expected_text, joined)
    self.assertEqual(self.s + Seq.Seq("T"), "TCAAAAGGATGCATCATGT")
def test_not_equal_comparsion(self):
    """Test __ne__ comparison method."""
    shorter = Seq.Seq("TCAAA")
    longer = Seq.Seq("TCAAAA")
    self.assertNotEqual(shorter, longer)
def project_to_genbank(filename, project, allblocks, construct_id=None):
    """Write the constructs of a project to a GenBank file.

    Args:
        filename: path of the GenBank file to create.
        project: mapping whose ``"components"`` entry lists the construct
            block ids (used when ``construct_id`` is None).
        allblocks: iterable of block dicts, each with ``"id"``,
            ``"metadata"`` and ``"components"`` entries.
        construct_id: optional single block id to export instead of all
            project components.

    Block ids that cannot be found in ``allblocks`` are skipped.
    """
    if construct_id is not None:
        blocks = [construct_id]
    else:
        blocks = project["components"]

    seq_obj_lst = []

    # For each of the construct in the project
    for block_id in blocks:
        # next(..., None) so that an unknown id reaches the guard below
        # instead of raising IndexError on an empty list.
        block = next((b for b in allblocks if b["id"] == block_id), None)
        if not block:
            continue

        metadata = block["metadata"]
        genbank_meta = metadata["genbank"] if "genbank" in metadata else {}

        # Grab the original ID that came from genbank before if available,
        # otherwise the GD Name as the name
        if "id" in genbank_meta:
            genbank_id = genbank_meta["id"]
        elif "name" in genbank_meta:
            genbank_id = genbank_meta["name"]
        else:
            genbank_id = "GC_DNA"

        sequence = build_sequence(block, allblocks)
        seq_obj = SeqIO.SeqRecord(
            Seq.Seq(sequence, Seq.Alphabet.DNAAlphabet()), genbank_id)

        # Create a 'source' feature
        sf = SeqFeature.SeqFeature()
        sf.type = "source"
        sf.location = SeqFeature.FeatureLocation(0, len(seq_obj.seq))
        add_GC_info(sf, block, allblocks)

        if "genbank" in metadata:
            # Set up all the annotations in the genbank record.
            # These came originally from genbank.
            if "annotations" in genbank_meta:
                for annot_key, annot_value in genbank_meta[
                        "annotations"].items():
                    seq_obj.annotations[annot_key] = annot_value

            # Set up all the references in the genbank record.
            # These came originally from genbank.
            if "references" in genbank_meta:
                for ref in genbank_meta["references"]:
                    genbank_ref = SeqFeature.Reference()
                    genbank_ref.authors = ref['authors']
                    genbank_ref.comment = ref['comment']
                    genbank_ref.consrtm = ref['consrtm']
                    genbank_ref.journal = ref['journal']
                    genbank_ref.medline_id = ref['medline_id']
                    genbank_ref.pubmed_id = ref['pubmed_id']
                    genbank_ref.title = ref['title']
                    if "references" not in seq_obj.annotations:
                        seq_obj.annotations["references"] = []
                    seq_obj.annotations["references"].append(genbank_ref)

            # Add the original annotations to the source feature
            if "feature_annotations" in genbank_meta:
                for annot_key, annot_value in genbank_meta[
                        "feature_annotations"].items():
                    sf.qualifiers[annot_key] = annot_value

        seq_obj.features.append(sf)

        if "description" in metadata:
            seq_obj.description = metadata["description"]

        if "name" in genbank_meta:
            seq_obj.name = genbank_meta["name"]
        elif "name" in metadata:
            # Fall back to the block name: spaces removed, first 5 chars.
            seq_obj.name = metadata["name"].replace(" ", "")[:5]
        else:
            seq_obj.name = "GC_DNA"

        convert_annotations(block, seq_obj, 0)

        # Add a block for each of the features, recursively
        start = 0
        for child_id in block['components']:
            child_block = [b for b in allblocks if b["id"] == child_id][0]
            start = add_features(child_block, allblocks, seq_obj, start)

        seq_obj_lst.append(seq_obj)

    # Context manager guarantees the output file is flushed and closed.
    with open(filename, "w") as handle:
        SeqIO.write(seq_obj_lst, handle, "genbank")
def test_gapped_seq_no_gap_char_given(self):
    """Translating a gapped sequence with gap=None raises TranslationError."""
    gapped = Seq.Seq("ATG---AAACTG")
    with self.assertRaises(TranslationError):
        gapped.translate(gap=None)
'''
Use a Seq object for a single sequence like a string.

-----------------------------------------------------------
(c) 2013 Allegra Via and Kristian Rother
    Licensed under the conditions of the Python License

    This code appears in section 19.3.1 of the book
    "Managing Biological Data with Python".
-----------------------------------------------------------
'''

from Bio import Seq

my_seq = Seq.Seq("AGCATCGTAGCATGCAC")

# print(...) with a single argument behaves the same on Python 2 and 3.
print(my_seq[0])        # first residue
print(my_seq[0:3])      # slice of the first three residues
print(my_seq.split('T'))
print(my_seq.count('A'))
# fraction of 'A' residues; float() keeps true division on Python 2
print(my_seq.count('A') / float(len(my_seq)))
def readToPrimerNonTargets(read, maxPrimerNonspec, refFa, primersInfo=None):
    """Collect the non-target genomic regions a primer read maps to.

    Args:
        read: aligned read; the attributes ``flag``, ``qname``, ``seq``,
            ``pos``, ``reference_name``, ``cigarstring`` and the methods
            ``has_tag``/``get_tag`` are used.
        maxPrimerNonspec: if the read has more candidate regions than this,
            they are returned unfiltered as raw strings.
        refFa: reference with a ``fetch(region=...)`` method and a
            ``filename`` attribute.
        primersInfo: optional mapping of primer pair number to primer
            information, used to recognise the intended target region.

    Returns:
        ``(read.qname, regions)``.  ``regions`` is ``[]`` for unmapped reads
        (flag 4), a list of ``'chrom,pos,cigar,NM'`` strings when the
        candidate count exceeds ``maxPrimerNonspec``, otherwise a list of
        ``[chrom, strand, position, length]`` for regions whose last or
        second-to-last base agrees with the primer's.

    The process is terminated via ``exit`` on unexpected input.
    """
    indelsPat = re.compile(r'(\d+)([ID])')
    matchPat = re.compile(r'(\d+)M')
    # Extract strand of the main match in genome
    if read.flag == 0:
        strand = 1
    elif read.flag == 16 or read.flag == 20:
        strand = -1
    elif read.flag == 4:
        return (read.qname, [])
    else:
        print('ERROR! Unknown value of FLAG:', read.flag)
        print(read)
        exit(1)

    # Default: no known target region.  Initialised up front so the name is
    # always bound, also when the primer is not listed in primersInfo.
    targetRegion = []
    if primersInfo:
        primersName = read.qname
        for primerPairNum, primerPairInfo in primersInfo.items():
            if primersName in primerPairInfo[1:3]:
                primerNumInPair = primerPairInfo[1:3].index(primersName)
                # +1 for the first primer of the pair, -1 for the second
                infoStrand = int((-1)**primerNumInPair)
                try:
                    pos = int(primersInfo[primerPairNum][5 + primerNumInPair])
                except IndexError:
                    print('ERROR (2): Incorrect index:')
                    print(primersInfo)
                    print(primersName)
                    print(primerPairNum)
                    print(primerNumInPair)
                    exit(2)
                try:
                    targetRegion = [
                        primersInfo[primerPairNum][4], infoStrand * pos
                    ]
                except TypeError:
                    print('ERROR!', primersInfo[primerPairNum][4],
                          primersInfo[primerPairNum][0], primerNumInPair)
                    exit(15)
                break

    if strand == -1:
        mainMapping = [
            read.reference_name, strand * (read.pos + len(read.qname))
        ]
    else:
        mainMapping = [read.reference_name, strand * (read.pos + 1)]
    if mainMapping != targetRegion:
        ## The next string is necessary for human genomes
        ## to exclude unsorted chromosome fragments from the analysis
        ## and '_' not in read.reference_name
        nonSpecRegions = [
            ','.join([
                read.reference_name,
                str(strand * (read.pos + 1)), read.cigarstring,
                str(read.get_tag('NM'))
            ])
        ]
    else:
        nonSpecRegions = []
    if read.has_tag('XA'):
        # XA tag of read ends with ; so the last element is empty
        nonSpecRegions.extend(read.get_tag('XA').split(';')[:-1])
    qname = read.qname
    if len(nonSpecRegions) > maxPrimerNonspec:
        return (qname, nonSpecRegions)

    primerNonSpecRegions = []
    for region in nonSpecRegions:
        # We go through all these regions and check
        # that 3'-nucleotide matches primer's 3'-end
        chrom, pos, cigar, subst = region.split(',')
        # Determine length of sequence of interest by parsing CIGAR.
        # The length depends only on the deletions
        # but not mismatches nor insertions into reference genome
        # So we count number of deletions
        indelsMatch = indelsPat.findall(cigar)
        matchMatch = matchPat.findall(cigar)
        if len(indelsMatch) == 0:
            regionLen = int(matchMatch[0])
        else:
            sumDeletions = 0
            sumMatch = 0
            for m in indelsMatch:
                if m[1] == 'D':
                    sumDeletions += int(m[0])
            for m in matchMatch:
                sumMatch += int(m)
            regionLen = sumMatch + sumDeletions
        if int(pos) < 0:
            pos2 = -(int(pos) + regionLen)
        else:
            pos2 = int(pos)
        # Skip the intended target and anything overlapping it
        if ([chrom, int(pos)] == targetRegion
                or (len(targetRegion) > 0 and chrom == targetRegion[0]
                    and abs(targetRegion[1] - pos2) <= len(read.seq))):
            continue

        # Fetching may fail transiently; retry up to 10 times.
        attempts = 0
        seq = None
        while seq is None:
            try:
                seq = refFa.fetch(region=chrom + ':' + str(abs(int(pos))) +
                                  '-' + str(abs(int(pos)) + regionLen - 1))
            except Exception:
                seq = None
                attempts += 1
                if attempts >= 10:
                    print('ERROR!')
                    print(refFa.filename)
                    logger.error(refFa.filename)
                    print(chrom, pos, regionLen)
                    exit(1)

        # Determine, which sequence we should take:
        # forward or reverse-complement
        # If primer is on + strand and found region is on opposite
        # or primer is on - strand and found region is on the same
        # we take reverse-complement
        if int(pos) > 0:
            regStrand = 1
        elif int(pos) < 0:
            regStrand = -1
        if (strand > 0 and int(pos) < 0) or (strand < 0 and int(pos) > 0):
            try:
                seq = str(Seq.Seq(seq).reverse_complement()).upper()
            # If there is an error like 'Mixed RNA/DNA found'
            except ValueError as e:
                if 'Mixed RNA/DNA found' in str(e):
                    print('ERROR (1): Mixed RNA/DNA found:')
                    print(seq)
                    exit(1)
                else:
                    print('ERROR: Unknown ValueError!')
                    # Non-zero status: this is a failure, not a clean exit.
                    exit(1)
        else:
            seq = seq.upper()

        # Check if read sequence is the same as read qname
        if read.qname == read.seq:
            primerSeq = read.seq
            regionSeq = seq
        else:
            primerSeq = read.qname
            regionSeq = revComplement(seq)
        if len(regionSeq) == 1:
            continue

        # If there is some insertions or deletion in found region
        if 'I' in cigar or 'D' in cigar:
            # We need to align its sequence with primer sequence
            align = pairwise2.align.globalxx(primerSeq, regionSeq)
            # Check that 3'-ends of primers are identical
            ## and one of two nucleotides before 3'-ends are identical, too
            if (align[0][0][-1] == align[0][1][-1]
                    or align[0][0][-2] == align[0][1][-2]):
                # Then we consider this region as a non-specific
                # for this primer
                primerNonSpecRegions.append(
                    [chrom, regStrand, abs(int(pos)), regionLen])
        else:
            try:
                if (regionSeq[-1] == primerSeq[-1]
                        or regionSeq[-2] == primerSeq[-2]):
                    # Then we consider this region as a non-specific
                    # for this primer
                    primerNonSpecRegions.append(
                        [chrom, regStrand, abs(int(pos)), regionLen])
            except IndexError:
                print('ERROR (2): incorrect index for sequences:')
                print('regionSeq:', regionSeq)
                print('primerSeq:', primerSeq)
                print('chr:', chrom)
                print('position:', pos)
                print('regionLen:', regionLen)
                print(-1, -2)
                exit(2)
    return (read.qname, primerNonSpecRegions)
seq = seq[::-1] BaseToLeftIsNoCoverage = False ResultingSeq = '' for base in seq: if base == NoCoverageChar: BaseToLeftIsNoCoverage = True ResultingSeq += NoCoverageChar elif base == GapChar: if BaseToLeftIsNoCoverage: ResultingSeq += NoCoverageChar else: ResultingSeq += GapChar else: BaseToLeftIsNoCoverage = False ResultingSeq += base if LeftToRightDone: ResultingSeq = ResultingSeq[::-1] else: ResultingSeq = PropagateNoCoverageChar(ResultingSeq, True) return ResultingSeq seqs = [] for seq in SeqIO.parse(open(args.FastaFile), 'fasta'): SeqAsString = str(seq.seq) SeqAsString = PropagateNoCoverageChar(SeqAsString) seq.seq = Seq.Seq(SeqAsString) seqs.append(seq) SeqIO.write(seqs, sys.stdout, "fasta")
def calculate_free_energy(seq, check=True, strict=True, c_seq=None, shift=0,
                          nn_table=RNA_NN3, tmm_table=DNA_TMM1,
                          imm_table=DNA_IMM1, de_table=RNA_DE2, dnac1=25,
                          dnac2=0, selfcomp=False, Na=20, K=50, Tris=0, Mg=0,
                          dNTPs=0, saltcorr=5):
    """Return the deltaG using nearest neighbor thermodynamics.

    deltaH and deltaS are accumulated from the dangling-end, terminal
    mismatch, initiation and nearest-neighbor tables and combined as
    ``(deltaH * 1000 - tao * deltaS) / 1000`` with ``tao = 273.15 + 22``.

    Args:
        seq: the sequence (anything ``str()`` accepts).
        check: pass ``seq`` and ``c_seq`` through ``_check`` first.
        strict: raise ``ValueError`` for neighbor pairs missing from the
            tables; otherwise only emit a ``BiopythonWarning``.
        c_seq: complementary sequence; defaults to the perfect complement.
        shift: offset of ``seq`` relative to ``c_seq`` (dangling ends).
        nn_table, tmm_table, imm_table, de_table: thermodynamic tables.
        dnac1, dnac2: accepted for interface compatibility; they only fed
            the melting-temperature term, which this function never
            returned.
        selfcomp: add the symmetry correction ``nn_table['sym']``.
        Na, K, Tris, Mg, dNTPs, saltcorr: forwarded to ``salt_correction``;
            only ``saltcorr == 5`` alters the result (entropy correction).

    Raises:
        ValueError: no table entry for a neighbor pair and ``strict``.
    """
    print(seq)
    seq = str(seq)
    if not c_seq:
        # c_seq must be provided by user if dangling ends or mismatches should
        # be taken into account. Otherwise take perfect complement.
        c_seq = Seq.Seq(seq).complement()
    c_seq = str(c_seq)
    if check:
        seq = _check(seq, 'Tm_NN')
        c_seq = _check(c_seq, 'Tm_NN')
    tmpseq = seq
    tmp_cseq = c_seq
    deltaH = 0
    deltaS = 0
    dH = 0  # Names for indexes
    dS = 1  # 0 and 1

    # Dangling ends?
    if shift or len(seq) != len(c_seq):
        # Align both sequences using the shift parameter
        if shift > 0:
            tmpseq = '.' * shift + seq
        if shift < 0:
            tmp_cseq = '.' * abs(shift) + c_seq
        if len(tmp_cseq) > len(tmpseq):
            tmpseq += (len(tmp_cseq) - len(tmpseq)) * '.'
        if len(tmp_cseq) < len(tmpseq):
            tmp_cseq += (len(tmpseq) - len(tmp_cseq)) * '.'
        # Remove 'over-dangling' ends
        while tmpseq.startswith('..') or tmp_cseq.startswith('..'):
            tmpseq = tmpseq[1:]
            tmp_cseq = tmp_cseq[1:]
        while tmpseq.endswith('..') or tmp_cseq.endswith('..'):
            tmpseq = tmpseq[:-1]
            tmp_cseq = tmp_cseq[:-1]
        # Now for the dangling ends
        if tmpseq.startswith('.') or tmp_cseq.startswith('.'):
            left_de = tmpseq[:2] + '/' + tmp_cseq[:2]
            deltaH += de_table[left_de][dH]
            deltaS += de_table[left_de][dS]
            tmpseq = tmpseq[1:]
            tmp_cseq = tmp_cseq[1:]
        if tmpseq.endswith('.') or tmp_cseq.endswith('.'):
            right_de = tmp_cseq[-2:][::-1] + '/' + tmpseq[-2:][::-1]
            deltaH += de_table[right_de][dH]
            deltaS += de_table[right_de][dS]
            tmpseq = tmpseq[:-1]
            tmp_cseq = tmp_cseq[:-1]

    # Now for terminal mismatches
    left_tmm = tmp_cseq[:2][::-1] + '/' + tmpseq[:2][::-1]
    if left_tmm in tmm_table:
        deltaH += tmm_table[left_tmm][dH]
        deltaS += tmm_table[left_tmm][dS]
        tmpseq = tmpseq[1:]
        tmp_cseq = tmp_cseq[1:]
    right_tmm = tmpseq[-2:] + '/' + tmp_cseq[-2:]
    if right_tmm in tmm_table:
        deltaH += tmm_table[right_tmm][dH]
        deltaS += tmm_table[right_tmm][dS]
        tmpseq = tmpseq[:-1]
        tmp_cseq = tmp_cseq[:-1]

    # Now everything 'unusual' at the ends is handled and removed and we can
    # look at the initiation.
    # One or several of the following initiation types may apply:

    # Type: General initiation value
    deltaH += nn_table['init'][dH]
    deltaS += nn_table['init'][dS]

    # Type: Duplex with no (allA/T) or at least one (oneG/C) GC pair
    if SeqUtils.GC(seq) == 0:
        deltaH += nn_table['init_allA/T'][dH]
        deltaS += nn_table['init_allA/T'][dS]
    else:
        deltaH += nn_table['init_oneG/C'][dH]
        deltaS += nn_table['init_oneG/C'][dS]

    # Type: Penalty if 5' end is T
    if seq.startswith('T'):
        deltaH += nn_table['init_5T/A'][dH]
        deltaS += nn_table['init_5T/A'][dS]
    if seq.endswith('A'):
        deltaH += nn_table['init_5T/A'][dH]
        deltaS += nn_table['init_5T/A'][dS]

    # Type: Different values for G/C or A/T terminal basepairs
    ends = seq[0] + seq[-1]
    AT = ends.count('A') + ends.count('T')
    GC = ends.count('G') + ends.count('C')
    deltaH += nn_table['init_A/T'][dH] * AT
    deltaS += nn_table['init_A/T'][dS] * AT
    deltaH += nn_table['init_G/C'][dH] * GC
    deltaS += nn_table['init_G/C'][dS] * GC

    # Finally, the 'zipping'
    for basenumber in range(len(tmpseq) - 1):
        neighbors = tmpseq[basenumber:basenumber + 2] + '/' + \
            tmp_cseq[basenumber:basenumber + 2]
        if neighbors in imm_table:
            deltaH += imm_table[neighbors][dH]
            deltaS += imm_table[neighbors][dS]
        elif neighbors[::-1] in imm_table:
            deltaH += imm_table[neighbors[::-1]][dH]
            deltaS += imm_table[neighbors[::-1]][dS]
        elif neighbors in nn_table:
            deltaH += nn_table[neighbors][dH]
            deltaS += nn_table[neighbors][dS]
        elif neighbors[::-1] in nn_table:
            deltaH += nn_table[neighbors[::-1]][dH]
            deltaS += nn_table[neighbors[::-1]][dS]
        else:
            # We haven't found the key...
            if strict:
                raise ValueError('no data for neighbors \'' + neighbors + '\'')
            else:
                warnings.warn(
                    'no data for neighbors \'' + neighbors +
                    '\'. Calculation will be wrong', BiopythonWarning)

    if selfcomp:
        deltaH += nn_table['sym'][dH]
        deltaS += nn_table['sym'][dS]

    if saltcorr:
        corr = salt_correction(Na=Na, K=K, Tris=Tris, Mg=Mg, dNTPs=dNTPs,
                               method=saltcorr, seq=seq)
    if saltcorr == 5:
        # Method 5 is an entropy correction; the other methods only
        # corrected the melting temperature, which is not computed here.
        deltaS += corr

    tao = 273.15 + 22  # Constant temperature tao in Kelvin
    deltaG = (deltaH * 1000 - tao * deltaS) / 1000
    return deltaG
from Bio import SeqIO
from Bio import Seq

CENPB = Seq.Seq("NTTCGNNNNANNCGGGN")
CENPB_3CtoT = Seq.Seq("NTTTGNNNNANNCGGGN")
CENPB_8GtoA = Seq.Seq("NTTCGNNNNANNCGAGN")


def check_CENPB(inputf, inputfmt):
    """Yield every 17-residue window that matches the CENPB motif.

    Each record parsed from ``inputf`` (format ``inputfmt``) is scanned with
    a sliding window of 17 residues.  A window matches when its positions
    1-4, 9 and 12-15 equal those of ``CENPB``.  For each match the list
    ``[record id, window start, window, record description]`` is yielded.
    """
    for seq_record in SeqIO.parse(inputf, inputfmt):
        # Only offsets that still leave a full 17-residue window.
        for offset in range(len(seq_record.seq) - 16):
            window = seq_record.seq[offset:offset + 17]
            fixed_positions_match = (window[1:5] == CENPB[1:5]
                                     and window[9] == CENPB[9]
                                     and window[12:16] == CENPB[12:16])
            if fixed_positions_match:
                yield [
                    seq_record.id, offset, window, seq_record.description
                ]


def check_CENPBrev(inputf, inputfmt):
    rec_iter = SeqIO.parse(inputf, inputfmt)
def density_adjusted(fname, chr_sam, minlength, maxlength, path_wig, path_den,
                     path_gff):
    '''Build per-position read densities from a SAM file and write them out.

    Density will be a size separated dictionary =
    {length : [reads at 0, reads at 1, ....]}; this makes it easier to select
    a size range later for analysis.

    adjusted: reads shorter than 23 are shifted by (24 - length) positions;
    plus-strand reads (flag 0) are counted at start + length - 1 + shift and
    minus-strand reads (flag 16) at start - shift.

    fname      -- name used in the binary output file names
    chr_sam    -- path of the chromosome-aligned SAM file
    minlength  -- shortest read length to count (inclusive)
    maxlength  -- longest read length to count (inclusive)
    path_wig   -- prefix for the wig output
    path_den   -- prefix/directory for pickled and binary densities
    path_gff   -- GFF input handed to GFF.parse

    Returns 0 when a length bound is negative, otherwise None.
    '''
    if minlength < 0 or maxlength < 0:
        print("Error. Length input not valid.")
        return (0)

    GFFgen = GFF.parse(path_gff)

    # dictionaries to hold read counts
    density_plus = {}
    density_minus = {}
    density_plus_sizesep = {}
    density_minus_sizesep = {}

    # Makes 2 sets of indices, one for all reads, and another for size
    # separated:
    for sequence in GFFgen:
        padded_len = len(sequence) + 20
        density_plus[sequence.id] = [0 for x in range(padded_len)]
        density_minus[sequence.id] = [0 for x in range(padded_len)]
        for length in range(minlength, maxlength + 1):
            density_plus_sizesep[length] = [0 for x in range(padded_len)]
            density_minus_sizesep[length] = [0 for x in range(padded_len)]

    mapped_reads = 0

    # Loop through the samfile; the context manager closes it afterwards.
    with open(chr_sam) as f_samfile:
        samfile = csv.reader(f_samfile, delimiter=' ')
        for read in samfile:
            if read[0][0] == '@':  # Ignore header lines.
                continue
            if read[1] == '4':  # A bowtie mismatch.
                continue
            chrom = read[2]  # chromosome identified for read in bowtie
            # start position. Need to subtract 1 since genomic sequence
            # starts at 1
            startp = int(read[3]) - 1
            seq = Seq.Seq(read[9])  # sequence of the read
            length = len(seq)  # length of read
            if length < 23:
                length_shift = 24 - length
            else:
                length_shift = 0

            if chrom not in density_plus:
                # NOTE(review): only reported; the lookup below will still
                # raise KeyError for such a read - confirm intended handling.
                print("Error: Bowtie index and GFF do not match")

            # Note that Bowtie reverse complements any sequence aligning to
            # the reverse strand, and so read[3] is the 3'-end of minus
            # strand reads.
            # Filter to get rid of reads of particular length.
            if (length < minlength or length > maxlength):
                continue
            mapped_reads += 1

            # 16 is the minus strand, 0 is the plus strand
            if (read[1] == '16'):
                start = startp - length_shift
                density_minus[chrom][start] += 1
                density_minus_sizesep[length][start] += 1
            if (read[1] == '0'):
                start = startp + length - 1 + length_shift
                density_plus[chrom][start] += 1
                density_plus_sizesep[length][start] += 1

    path_oldformat = path_den + "binary/"
    if not os.path.exists(path_oldformat):
        os.makedirs(path_oldformat)

    # Normalise to reads per million mapped reads.
    # NOTE(review): only the last sequence yielded by the GFF parser is
    # normalised (loop variable reuse) - presumably single-sequence input.
    density_plus[sequence.id] = [
        float(i) * 1000000 / float(mapped_reads)
        for i in density_plus[sequence.id]
    ]
    density_minus[sequence.id] = [
        float(i) * 1000000 / float(mapped_reads)
        for i in density_minus[sequence.id]
    ]

    ribo_util.writebin(density_plus, path_oldformat + fname + "_plus_")
    ribo_util.makePickle(density_plus, path_den + "plus")
    ribo_util.makePickle(density_plus_sizesep, path_den + "plus_sizesep")
    ribo_util.countstowig(density_plus, path_wig + "_plus")
    ribo_util.writebin(density_minus, path_oldformat + fname + "_minus_")
    ribo_util.makePickle(density_minus, path_den + "minus")
    ribo_util.makePickle(density_minus_sizesep, path_den + "minus_sizesep")
    ribo_util.countstowig(density_minus, path_wig + "_minus")
#print(annotation[annotation.sseqid == ID ].iloc[0]) sub = annotation[annotation.sseqid == ID] rows.append(sub.index[0]) annotation = annotation.loc[rows] rows = [] for ID in annotation.qstart.unique(): #print(annotation[annotation.sseqid == ID ].iloc[0]) sub = annotation[annotation.qstart == ID] rows.append(sub.index[0]) annotation = annotation.loc[rows] genes = {} prots = {} for i, r in annotation.iterrows(): genes[i] = qseq[r.qstart - 1:r.qend - 1].decode() #print(genes[i]) prots[i] = str(Seq.Seq(genes[i]).translate()) annotation = annotation.sort_values(['qstart']) annotation['prots'] = annotation.index.map(prots) annotation['genes'] = annotation.index.map(genes) aln_regions = np.array( list(zip(list(annotation.qstart), list(annotation.qend)))) aln_regions = aln_regions[1:, :] aln_len = np.array(list(annotation.qend - annotation.qstart)) annotation = pd.DataFrame.sort_values(annotation, by='qstart') print(annotation) annotation.to_csv(alnfile + 'annotation.csv') else: #just seperate sequence into dummy codons
# pssm = m.counts.normalize(pseudocounts=0.1).log_odds() cons_score = pssm.calculate(cons) cons_list = list(cons) cons_str = str(cons) deg_cons_str = str(deg_cons) # # for each position, generate a new test sequence for each possible nucleotide # at that position. Then score that test sequence relative to the original pssm. # Next, evaluate the absolute value of the score difference between every pair of # test sequence and classify each pair as either a transition or transversion. # for i, c in enumerate(cons_list): new_cons_str_A = Seq.Seq( "".join((cons_str[0:i], "A", cons_str[i + 1:])), IUPAC.unambiguous_dna) new_cons_str_C = Seq.Seq( "".join((cons_str[0:i], "C", cons_str[i + 1:])), IUPAC.unambiguous_dna) new_cons_str_G = Seq.Seq( "".join((cons_str[0:i], "G", cons_str[i + 1:])), IUPAC.unambiguous_dna) new_cons_str_T = Seq.Seq( "".join((cons_str[0:i], "T", cons_str[i + 1:])), IUPAC.unambiguous_dna) new_score_A = pssm.calculate(new_cons_str_A) new_score_C = pssm.calculate(new_cons_str_C) new_score_G = pssm.calculate(new_cons_str_G) new_score_T = pssm.calculate(new_cons_str_T) central_distance = 2 * (0.5 - float(i) / len(counts[1, :]))
def mk_detect(tree_filename, ali_basename, OutDirName):
    """Run the detection pipeline for one tree/alignment pair.

    Builds the gene tree, runs the estimations for every (e1, e2) category
    pair, derives the per-site PCOC / PC / OC values, filters sites by
    indel proportion and threshold, writes the filtered alignments and
    result tables, optionally plots, and finally removes the working
    folders.

    Relies on module-level configuration (``args``, ``logger``, ``ali``,
    ``n_sites``, ``nb_seq``, ``prefix_out``, ``NbCat_Est``,
    ``dict_p_filter_threshold``, ``positions_to_highlight`` and the
    ``rep*`` directories).  ``OutDirName`` is not used by this function.
    """
    start_detec = time.time()
    metadata_simu_dico = {}
    logger.debug("Tree: %s", os.path.basename(tree_filename))
    metadata_simu_dico["tree"] = os.path.basename(tree_filename)
    g_tree = events_placing.gene_tree(tree_filename, manual_mode_nodes)
    g_tree.init_inter_dir_det(repest0, reptree0, repfasta0, repbppconfig,
                              repseq)
    g_tree.auto_trim_tree = auto_trim_tree
    g_tree.init_tree_det(n_sites)
    metadata_simu_dico["numberOfLeaves"] = g_tree.numberOfLeafs

    if g_tree.manual_mode_nodes["T"] == []:
        logger.warning("No transition in the tree. End.")
    else:
        logger.debug("repfasta: %s", g_tree.repfasta)
        logger.debug("repest: %s", g_tree.repest)
        logger.debug("reptree: %s", g_tree.reptree)

        ### build the study trees:
        (allbranchlength,
         convbranchlength) = g_tree.mk_tree_for_simu(plot=args.plot)
        metadata_simu_dico["allbranchlength"] = allbranchlength
        metadata_simu_dico["convbranchlength"] = convbranchlength

        l_TPFPFNTN_mod_het = []
        l_TPFPFNTN_topo = []
        l_TPFPFNTN_obs_sub = []

        if not os.path.isfile(g_tree.repseq + "/" + ali_basename):
            logger.error("%s does not exist",
                         g_tree.repseq + "/" + ali_basename)
            sys.exit(1)

        c1 = 1  # useless but compatibility
        c2 = 2  # useless but compatibility

        set_e1e2 = []
        for e1 in range(1, (NbCat_Est + 1)):
            for e2 in range(1, (NbCat_Est + 1)):
                set_e1e2.append((e1, e2))

        for (e1, e2) in set_e1e2:
            logger.debug("Estime e1: %s e2: %s", e1, e2)
            # Positif
            bpp_lib.make_estim(ali_basename, e1, e2, g_tree,
                               NBCATest=NbCat_Est, suffix="_noOneChange",
                               OneChange=False, ext="",
                               max_gap_allowed=args.max_gap_allowed,
                               gamma=args.gamma, inv_gamma=args.inv_gamma)
            bpp_lib.make_estim(ali_basename, e1, e2, g_tree,
                               NBCATest=NbCat_Est, suffix="_withOneChange",
                               OneChange=True, ext="",
                               max_gap_allowed=args.max_gap_allowed,
                               gamma=args.gamma, inv_gamma=args.inv_gamma)

        ### post proba
        res, bilan = estim_data.dico_typechg_het_det(ali_basename, g_tree,
                                                     set_e1e2=set_e1e2,
                                                     NbCat_Est=NbCat_Est,
                                                     ID=date)
        l_TPFPFNTN_mod_het.extend(res)

        # Drop the entries that are not reported ("in" replaces the
        # Python-2-only dict.has_key).
        for p in [
                "p_max_OX_OXY", "p_max_XY_OXY", "p_mean_OX_OXY",
                "p_mean_XY_OXY"
        ]:
            if p in bilan[12]:
                del bilan[12][p]

        dict_values_pcoc = {}
        dict_values_pcoc["PCOC"] = bilan[12]["p_mean_X_OXY"]
        dict_values_pcoc["PC"] = bilan[12]["p_mean_X_XY"]
        dict_values_pcoc["OC"] = bilan[12]["p_mean_X_OX"]

        ### Get indel prop
        prop_indel = [0] * n_sites
        prop_indel_conv = [0] * n_sites
        for seq in ali:
            sp_conv = g_tree.annotated_tree.search_nodes(
                name=seq.name)[0].C == True
            for i in range(n_sites):
                if seq.seq[i] == "-":
                    prop_indel[i] += 1
                    if sp_conv:
                        prop_indel_conv[i] += 1

        # filter position:
        bilan_f = {}
        all_pos = range(1, n_sites + 1)

        # filter on indel prop:
        t_indel = args.max_gap_allowed_in_conv_leaves * float(
            g_tree.numberOfConvLeafs)
        all_pos_without_indel_sites = [
            p for p in all_pos if prop_indel_conv[p - 1] < t_indel
        ]

        dict_pos_filtered = {}
        for model in ["PCOC", "PC", "OC"]:
            dict_pos_filtered[model] = [
                p for p in all_pos_without_indel_sites
                if dict_values_pcoc[model][p - 1] >=
                dict_p_filter_threshold[model]
            ]
            if positions_to_highlight:
                dict_pos_filtered[model].extend(positions_to_highlight)
                dict_pos_filtered[model] = list(set(dict_pos_filtered[model]))
                dict_pos_filtered[model].sort()

        # filter dict_values_pcoc
        dict_values_pcoc_filtered = {}
        all_filtered_position = list(
            set(events_placing.unlist(dict_pos_filtered.values())))
        all_filtered_position.sort()
        dict_pos_filtered["union"] = all_filtered_position

        if args.reorder:
            # Rank positions by tier: >= 0.99, then [0.9, 0.99),
            # then [0.8, 0.9), then everything below 0.8.
            for model in ["PCOC", "PC", "OC", "union"]:
                m_list = [model]
                if model == "union":
                    m_list = ["PCOC", "PC", "OC"]
                nb_filtered_pos = len(dict_pos_filtered[model])
                new_order = [0] * nb_filtered_pos
                j = 0
                # 0.99
                for i in range(nb_filtered_pos):
                    p = dict_pos_filtered[model][i]
                    if any(
                        [dict_values_pcoc[m][p - 1] >= 0.99 for m in m_list]):
                        new_order[i] = j
                        j += 1
                # 0.9
                for i in range(nb_filtered_pos):
                    p = dict_pos_filtered[model][i]
                    if any([0.99 > dict_values_pcoc[m][p - 1] >= 0.9
                            for m in m_list]) and \
                       all([0.99 > dict_values_pcoc[m][p - 1]
                            for m in m_list]):
                        new_order[i] = j
                        j += 1
                # 0.8
                for i in range(nb_filtered_pos):
                    p = dict_pos_filtered[model][i]
                    if any([0.9 > dict_values_pcoc[m][p - 1] >= 0.8
                            for m in m_list]) and \
                       all([0.9 > dict_values_pcoc[m][p - 1]
                            for m in m_list]):
                        new_order[i] = j
                        j += 1
                # other
                for i in range(nb_filtered_pos):
                    p = dict_pos_filtered[model][i]
                    if all([dict_values_pcoc[m][p - 1] < 0.8
                            for m in m_list]):
                        new_order[i] = j
                        j += 1
                dict_pos_filtered[model] = reorder_l(dict_pos_filtered[model],
                                                     new_order)

        # filtered ali:
        ## Per model
        for model in ["PCOC", "PC", "OC", "union"]:
            filtered_ali = []
            for seq in ali:
                new_seq = SeqRecord.SeqRecord(
                    Seq.Seq("".join(
                        filter_l(list(seq.seq), dict_pos_filtered[model]))),
                    seq.id, "", "")
                filtered_ali.append(new_seq)
            SeqIO.write(filtered_ali,
                        g_tree.repfasta + "/filtered_ali." + model + ".faa",
                        "fasta")
            if model == "union":
                modelstr = "union"
            else:
                modelstr = model
            logger.info("%s model: # filtered position: %s/%s",
                        modelstr.upper(), len(dict_pos_filtered[model]),
                        n_sites)

        ## Output
        ### Table
        #### complete:
        df_bilan = pd.DataFrame.from_dict(dict_values_pcoc, orient='columns',
                                          dtype=None)
        df_bilan["Sites"] = all_pos
        df_bilan["Indel_prop"] = prop_indel
        df_bilan["Indel_prop"] = df_bilan["Indel_prop"] / nb_seq
        df_bilan["Indel_prop(ConvLeaves)"] = prop_indel_conv
        df_bilan["Indel_prop(ConvLeaves)"] = df_bilan[
            "Indel_prop(ConvLeaves)"] / float(g_tree.numberOfConvLeafs)
        df_bilan = df_bilan[[
            "Sites", "Indel_prop", "Indel_prop(ConvLeaves)", "PCOC", "PC",
            "OC"
        ]]

        #### filtered:
        df_bilan_f = df_bilan[df_bilan.Sites.isin(all_filtered_position)]
        df_bilan_f = df_bilan_f.copy()

        df_bilan.to_csv(prefix_out + ".results.tsv", index=False, sep='\t')
        if not df_bilan_f.empty:
            df_bilan_f.to_csv(prefix_out + ".filtered_results.tsv",
                              index=False, sep='\t')

        ### Plot
        if args.plot:
            # Every figure is written as PDF, and additionally as SVG when
            # requested.
            plot_exts = [".pdf", ".svg"] if args.svg else [".pdf"]

            if args.plot_complete_ali:
                for ext in plot_exts:
                    plot_data.make_tree_ali_detect_combi(
                        g_tree,
                        g_tree.repseq + "/" + ali_basename,
                        prefix_out + "_plot_complete" + ext,
                        dict_benchmark=dict_values_pcoc,
                        hp=positions_to_highlight,
                        title=args.plot_title)

            for model in ["PCOC", "PC", "OC"]:
                if dict_pos_filtered[
                        model] and dict_p_filter_threshold[model] <= 1:
                    dict_values_pcoc_filtered_model = {}
                    for (key, val) in dict_values_pcoc.items():
                        dict_values_pcoc_filtered_model[key] = filter_l(
                            val, dict_pos_filtered[model])
                    for ext in plot_exts:
                        plot_data.make_tree_ali_detect_combi(
                            g_tree,
                            g_tree.repfasta + "/filtered_ali." + model +
                            ".faa",
                            prefix_out + "_plot_filtered_" + model + ext,
                            hist_up=model,
                            dict_benchmark=dict_values_pcoc_filtered_model,
                            x_values=dict_pos_filtered[model],
                            hp=positions_to_highlight,
                            reorder=args.reorder,
                            det_tool=True,
                            title=args.plot_title)

            # all model
            if dict_pos_filtered["union"]:
                model = "union"
                dict_values_pcoc_filtered_model = {}
                for (key, val) in dict_values_pcoc.items():
                    dict_values_pcoc_filtered_model[key] = filter_l(
                        val, dict_pos_filtered[model])
                for ext in plot_exts:
                    plot_data.make_tree_ali_detect_combi(
                        g_tree,
                        g_tree.repfasta + "/filtered_ali." + model + ".faa",
                        prefix_out + "_plot_filtered_" + model + ext,
                        dict_benchmark=dict_values_pcoc_filtered_model,
                        x_values=dict_pos_filtered[model],
                        hp=positions_to_highlight,
                        reorder=False,
                        det_tool=True,
                        title=args.plot_title)

    if not args.no_cleanup:
        remove_folder(g_tree.repest)
        remove_folder(g_tree.repbppconfig)
        remove_folder(g_tree.reptree)
        if not args.no_cleanup_fasta:
            remove_folder(g_tree.repfasta)

    metadata_simu_dico["time"] = str(time.time() - start_detec)
def assign_umi_amplicons(trg_umi_cluster_file, trg_umi_fasta, amp_match_file,
                         amp_seq_fasta, outfilename):
    """Assign an amplicon call to every target-UMI cluster.

    Tallies the reads counted for each target UMI across each amplicon-call
    and writes a csv file (``outfilename``) with the columns
    (target umi cluster-index),(leading amplicon-call),
    (reads for leading amplicon-call),(total reads counted).
    A FASTA file with the same base name and a ``.fasta`` extension receives
    the consensus amplicon sequence of every accepted cluster.

    If ``amp_match_file`` cannot be opened, amplicon calls are instead
    obtained by un-gapped alignment of each cluster's consensus sequence to
    the references in ``amplicon_refs.txt``; in that mode the third csv
    column is -1 and every ``consensus*`` file is rewritten to keep only the
    accepted target-UMI indices (the original is kept as ``unfiltered_*``).

    All file names are relative to ``sysOps.globaldatapath``.
    """
    sysOps.throw_status('Loading cluster-file ' + sysOps.globaldatapath +
                        trg_umi_cluster_file)
    # dictionary with entries {uxi-sequence: [uxi-cluster-index, read-number]}
    trg_umi_cluster_dict = fileOps.load_cluster_file_to_dictionary(
        trg_umi_cluster_file)
    trg_umi_handle = open(sysOps.globaldatapath + trg_umi_fasta, "rU")
    amp_seq_handle = open(sysOps.globaldatapath + amp_seq_fasta, "rU")
    realign_amplicons = False
    amp_match_handle = None
    try:
        sysOps.throw_status('Loading ' + sysOps.globaldatapath +
                            amp_match_file)
        amp_match_handle = open(sysOps.globaldatapath + amp_match_file, "rU")
    except (IOError, OSError):
        # Only a missing/unreadable match file switches to realignment;
        # any other error is a real bug and must propagate.
        sysOps.throw_status(
            sysOps.globaldatapath + amp_match_file +
            ' not found. Alignments will occur from sequence-consenses directly.'
        )
        realign_amplicons = True
        if not sysOps.check_file_exists('amplicon_refs.txt'):
            sysOps.throw_exception('Error: ' + sysOps.globaldatapath +
                                   'amplicon_refs.txt not found.')
            sysOps.exitProgram()

    trg_umi_dict = dict()      # {cluster-index: {amplicon-call: read count}}
    trg_amp_seq_dict = dict()  # {cluster-index: baseTally of amplicon reads}
    try:
        for trg_umi_record, amp_seq_record in itertools.izip(
                SeqIO.parse(trg_umi_handle, "fasta"),
                SeqIO.parse(amp_seq_handle, "fasta")):
            if not realign_amplicons:
                amp_match = int(amp_match_handle.readline().strip('\n'))
            else:
                amp_match = -1
            trg_umi_seq = str(trg_umi_record.seq)
            if trg_umi_seq in trg_umi_cluster_dict:
                # uxi cluster-index
                trg_umi_index = str(trg_umi_cluster_dict[trg_umi_seq][0])
                if trg_umi_index not in trg_umi_dict:
                    trg_umi_dict[trg_umi_index] = dict()
                    trg_amp_seq_dict[trg_umi_index] = baseTally()
                # add 1, because every read is being entered
                call_counts = trg_umi_dict[trg_umi_index]
                call_counts[amp_match] = call_counts.get(amp_match, 0) + 1
                trg_amp_seq_dict[trg_umi_index].add_record(
                    str(amp_seq_record.seq), 1)
    finally:
        trg_umi_handle.close()
        amp_seq_handle.close()
        if amp_match_handle is not None:
            amp_match_handle.close()

    ref_sequences = list()
    if realign_amplicons and sysOps.check_file_exists('amplicon_refs.txt'):
        with open(sysOps.globaldatapath + 'amplicon_refs.txt',
                  'rU') as ref_file_handle:
            for ref_line in ref_file_handle:
                [ref_name,
                 ref_seq] = ref_line.strip('\n').upper().split('|')
                # amplicon_refs.txt will contain sequences in reverse
                # complementary orientation. We therefore reverse both
                # complementarity and order
                ref_sequences.append([
                    str(Seq.Seq(my_ref_seq).reverse_complement())
                    for my_ref_seq in reversed(ref_seq.split(','))
                ])
        mySettings = fileOps.read_settingsfile_to_dictionary('libsettings.txt')
        max_mismatch_amplicon = float(mySettings["-max_mismatch_amplicon"][0])

    trg_umi_index_dict = dict()
    accepted_consensus_sequences = 0
    inadmis_consensus_sequences = 0
    csv_path = sysOps.globaldatapath + outfilename
    fasta_path = (sysOps.globaldatapath +
                  outfilename[:outfilename.rfind('.')] + '.fasta')
    with open(csv_path, 'w') as csvfile, open(fasta_path, 'w') as fastafile:
        for trg_umi_index in trg_umi_dict:
            max_tally = 0
            tot_tally = 0
            for amp_match in trg_umi_dict[trg_umi_index]:
                my_tally = trg_umi_dict[trg_umi_index][amp_match]
                if my_tally >= max_tally:
                    max_tally = int(my_tally)
                    max_match = int(amp_match)
                tot_tally += int(my_tally)

            consensus_seq = str(
                trg_amp_seq_dict[trg_umi_index].get_str_consensus())

            if realign_amplicons:
                # perform direct, un-gapped alignment of consensus_seq to
                # reference options to obtain max_match
                max_match = -1
                # exclude max_tally as count, since alignment is happening
                # post-consensus
                max_tally = -1
                min_mismatch_count = -1
                for i, ref_subamplicons in enumerate(ref_sequences):
                    all_subamplicons_pass = True
                    start_index = 0
                    tot_mismatches = 0
                    # loop through sub-amplicon-sequences
                    for ref_subamplicon in ref_subamplicons:
                        ref_subamplicon_len = len(ref_subamplicon)
                        my_mismatches, minlen = alignOps.count_mismatches(
                            ref_subamplicon,
                            consensus_seq[start_index:(start_index +
                                                       ref_subamplicon_len)])
                        if minlen == 0:
                            all_subamplicons_pass = False
                            break
                        all_subamplicons_pass = all_subamplicons_pass and (
                            my_mismatches / float(minlen) <=
                            max_mismatch_amplicon)
                        start_index += ref_subamplicon_len
                        tot_mismatches += my_mismatches

                    # Keep the passing reference with the FEWEST mismatches.
                    # (The comparison used to be inverted and selected the
                    # worst passing reference instead.)
                    if all_subamplicons_pass and (
                            max_match < 0 or
                            tot_mismatches < min_mismatch_count):
                        max_match = int(i)
                        min_mismatch_count = int(tot_mismatches)

            if max_match >= 0:
                csvfile.write(trg_umi_index + "," + str(max_match) + "," +
                              str(max_tally) + "," + str(tot_tally) + "\n")
                fastafile.write(">" + trg_umi_index + '\n')
                fastafile.write(consensus_seq + '\n')
                if realign_amplicons:
                    trg_umi_index_dict[trg_umi_index] = True
                accepted_consensus_sequences += 1
            else:
                inadmis_consensus_sequences += 1

    sysOps.throw_status('Discarded ' + str(inadmis_consensus_sequences) +
                        '/' + str(accepted_consensus_sequences +
                                  inadmis_consensus_sequences) +
                        ' sequences in writing ' + sysOps.globaldatapath +
                        outfilename + ' due to inadequate amplicon match.')

    if realign_amplicons:
        # create a new consensus pairing file that's filtered with the
        # accepted trg umi indices
        [dirnames, filenames] = sysOps.get_directory_and_file_list()
        consensus_filenames = [
            filename for filename in filenames
            if filename.startswith('consensus')
        ]
        # find all consensus files present
        for consensus_filename in consensus_filenames:
            accepted_consensus_sequences = 0
            inadmis_consensus_sequences = 0
            os.rename(
                sysOps.globaldatapath + consensus_filename,
                sysOps.globaldatapath + 'unfiltered_' + consensus_filename)
            with open(sysOps.globaldatapath + consensus_filename,
                      'w') as new_consensus_file:
                with open(
                        sysOps.globaldatapath + 'unfiltered_' +
                        consensus_filename, 'rU') as old_consensus_file:
                    for old_consensus_file_line in old_consensus_file:
                        # [uei_index, bcn_umi_index, trg_umi_index,
                        #  read_count, (additional variables)]
                        consensus_list = old_consensus_file_line.strip(
                            '\n').split(',')
                        if consensus_list[2] in trg_umi_index_dict:
                            new_consensus_file.write(old_consensus_file_line)
                            accepted_consensus_sequences += 1
                        else:
                            inadmis_consensus_sequences += 1
            sysOps.throw_status('Discarded ' +
                                str(inadmis_consensus_sequences) + '/' +
                                str(accepted_consensus_sequences +
                                    inadmis_consensus_sequences) +
                                ' consensus-pairings in writing ' +
                                sysOps.globaldatapath + consensus_filename +
                                ' due to inadequate amplicon match.')
        if len(consensus_filenames) == 0:
            sysOps.throw_exception(
                'Error: no consensus files available to update with realigned amplicon information. Exiting.'
            )
            sysOps.exitProgram()
insertions = [[pos, '', 1-total_freq]] for insertion, c in list(ins[ref][pos].items()): ins_freq = 1.0*c.sum()/cov[pos] insertions.append([pos, insertion, ins_freq]) insertions.sort(key=lambda x:x[2]) if insertions[-2][2]>args.min_freq: insertions_to_include.append(insertions[-2]) seq = "".join(consensus_seq) if insertions_to_include: complete_seq = "" pos = 0 for ins_pos, ins, freq in sorted(insertions_to_include, key=lambda x:x[0]): complete_seq += seq[pos:ins_pos] + ins pos=ins_pos print(sample + ": inserted %s at position %d with frequency %f."%(ins, ins_pos, freq)) complete_seq += seq[pos:] seq=complete_seq any_minors = True if len(ac)==1: seq_name = sample+'_minor' else: seq_name = sample + '_minor_' + ref seqs.append(SeqRecord.SeqRecord(id=seq_name, name=seq_name, description="", seq=Seq.Seq(seq))) if any_minors: SeqIO.write(seqs, args.out_dir+'/minor.fasta', 'fasta') else: os.system("touch "+args.out_dir+'/minor.fasta')
import copy import unittest import warnings from Bio import BiopythonWarning, BiopythonDeprecationWarning from Bio import Seq from Bio.Data.IUPACData import ( ambiguous_dna_complement, ambiguous_rna_complement, ambiguous_dna_values, ambiguous_rna_values, ) from Bio.Data.CodonTable import TranslationError, standard_dna_table test_seqs = [ Seq.Seq("TCAAAAGGATGCATCATG"), Seq.Seq("T"), Seq.Seq("ATGAAACTG"), Seq.Seq("ATGAARCTG"), Seq.Seq("AWGAARCKG"), # Note no U or T Seq.Seq("".join(ambiguous_rna_values)), Seq.Seq("".join(ambiguous_dna_values)), Seq.Seq("AWGAARCKG"), Seq.Seq("AUGAAACUG"), Seq.Seq("ATGAAA-CTG"), Seq.Seq("ATGAAACTGWN"), Seq.Seq("AUGAAA==CUG"), Seq.Seq("AUGAAACUGWN"), Seq.Seq("AUGAAACTG"), # U and T Seq.MutableSeq("ATGAAACTG"), Seq.MutableSeq("AUGaaaCUG"),
def get_micro_homology_features(gene_names, learn_options, X):
    """Compute microhomology and out-of-frame scores for every guide in X.

    For each gene, the gene sequence is searched (in either orientation) for
    each guide's 30mer. When found, the 9 nucleotides to its left and the 21
    to its right are joined with the guide into a 60-mer that is scored by
    ``microhomology.compute_score``. Guides that cannot be located get a
    score of 0 for both features.

    Args:
        gene_names: pandas Series with the gene of each guide (``.unique()``
            and ``.values`` are used).
        learn_options: unused here; kept for interface compatibility.
        X: DataFrame with at least the columns ``'30mer'`` and ``'Strand'``.

    Returns:
        A float DataFrame indexed like ``X`` with the columns ``mh_score``
        and ``oof_score``.
    """
    # originally was flipping the guide itself as necessary, but now flipping
    # the gene instead
    print("building microhomology features")
    feat = pandas.DataFrame(index=X.index)
    feat["mh_score"] = ""
    feat["oof_score"] = ""

    # number of nucleotides to take to the left and right of the guide
    k_mer_length_left = 9
    k_mer_length_right = 21

    for gene in gene_names.unique():
        gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
        guide_inds = np.where(gene_names.values == gene)[0]
        print("getting microhomology for all %d guides in gene %s" %
              (len(guide_inds), gene))
        for ps in guide_inds:
            guide_seq = Seq.Seq(X['30mer'][ps])
            strand = X['Strand'][ps]
            if strand == 'sense':
                gene_seq = gene_seq.reverse_complement()

            # figure out the sequence to the left and right of this guide,
            # in the gene; retry on the opposite strand if the first
            # orientation does not contain it
            ind = gene_seq.find(guide_seq)
            if ind == -1:
                gene_seq = gene_seq.reverse_complement()
                ind = gene_seq.find(guide_seq)

            if ind == -1:
                # guide could not be located in either orientation
                mh_score = 0
                oof_score = 0
            else:
                assert gene_seq[ind:(
                    ind + len(guide_seq))] == guide_seq, "match not right"
                left_win = gene_seq[(ind - k_mer_length_left):ind]
                right_win = gene_seq[(ind + len(guide_seq)):(
                    ind + len(guide_seq) + k_mer_length_right)]
                # len(Seq) equals the length of its string form; the old
                # Seq.tostring() call no longer exists in current Biopython.
                assert len(left_win) == k_mer_length_left
                assert len(right_win) == k_mer_length_right
                sixtymer = str(left_win) + str(guide_seq) + str(right_win)
                assert len(sixtymer) == 60, "should be of length 60"
                mh_score, oof_score = microhomology.compute_score(sixtymer)

            # NOTE(review): .ix was removed in pandas 1.0. It is left as-is
            # because its label-vs-positional fallback depends on the type of
            # X.index, which is not visible here.
            feat.ix[ps, "mh_score"] = mh_score
            feat.ix[ps, "oof_score"] = oof_score
        print("computed microhomology of %s" % (str(gene)))

    return pandas.DataFrame(feat, dtype='float')
def test_translation_wrong_type(self):
    """Passing ambiguous_dna_complement as the table must raise ValueError."""
    seq = Seq.Seq("ATCGTA")
    self.assertRaises(
        ValueError, seq.translate, table=ambiguous_dna_complement)
orf = orf_dict[orf_key] # Get ref genome pos bp = pd.Series(orf.index, name='genome_pos') bp.index = bp.index + 1 # Get codon position codon_number = pd.Series(bp.index / 3) codon_number = codon_number.apply(math.ceil) pos_in_codon = pd.Series(bp.index % 3).replace({0: 3}) # Get reference protein sequence seq = ''.join(list(orf['Ref'])) seq = seq.upper() seq = Seq.Seq(seq) prot_seq = seq.translate() prot_seq = pd.Series(list(prot_seq)) prot_seq.index = prot_seq.index + 1 # match AA to codon number ref_AA = codon_number.apply( func=lambda x: prot_seq[x]) # -1 to match 0-based indexing to_concat = pd.DataFrame({ 'bp': bp.reset_index(drop=True), 'codon_number': codon_number, 'pos_in_codon': pos_in_codon, 'ref_AA': ref_AA }) orf_df = pd.concat([to_concat, orf.reset_index(drop=True)], axis=1)
def setUp(self):
    """Build the sequence fixtures: one reference Seq plus lists of DNA,
    RNA, nucleotide and protein-like sequences and a few test characters.
    """
    seq = Seq.Seq
    mutable = Seq.MutableSeq

    self.s = seq("TCAAAAGGATGCATCATG")
    self.dna = [seq("ATCG"), seq("gtca"), mutable("GGTCA"), seq("CTG-CA")]
    self.rna = [
        seq("AUUUCG"),
        mutable("AUUCG"),
        seq("uCAg"),
        mutable("UC-AG"),
        seq("U.CAG"),
    ]
    self.nuc = [seq("ATCG")]
    self.protein = [
        seq("ATCGPK"),
        seq("atcGPK"),
        seq("T.CGPK"),
        seq("T-CGPK"),
        seq("MEDG-KRXR*"),
        mutable("ME-K-DRXR*XU"),
        seq("MEDG-KRXR@"),
        seq("ME-KR@"),
        seq("MEDG.KRXR@"),
    ]
    self.test_chars = ["-", seq("-"), seq("*"), "-X@"]
def str2seq(s, prt=False):
    """Wrap ``s`` in a Seq, using ``Alphabet.ProteinAlphabet`` when ``prt``
    is truthy and ``Alphabet.generic_dna`` otherwise.
    """
    alphabet = Alphabet.ProteinAlphabet if prt else Alphabet.generic_dna
    return Seq.Seq(s, alphabet)
def test_append_nucleotides(self):
    """Appending one more Seq to test_chars must leave five entries."""
    nucleotide = Seq.Seq("A")
    self.test_chars.append(nucleotide)
    self.assertEqual(5, len(self.test_chars))
import sys

from Bio import SeqIO, Seq

# Read the FASTA file named on the command line and echo every record to
# stdout with a fixed run of gap characters added on each side.
# The input is opened in a ``with`` block so the handle is closed even if
# parsing or writing fails (it used to be left open).
with open(sys.argv[1]) as handle:
    for rec in SeqIO.parse(handle, "fasta"):
        seqstr = str(rec.seq)
        seqstr = '-------------------------------------------' + seqstr + '-------------------------------------------------------------------------------------------------------------'
        rec.seq = Seq.Seq(seqstr)
        SeqIO.write([rec], sys.stdout, "fasta")
def setUp(self):
    """Build mixed fixtures: each list holds Seq/MutableSeq objects followed
    by a plain Python string of the same kind.
    """
    seq = Seq.Seq
    mutable = Seq.MutableSeq

    self.dna = [
        seq("ATCG"),
        seq("gtca"),
        mutable("GGTCA"),
        seq("CTG-CA"),
        "TGGTCA",
    ]
    self.rna = [
        seq("AUUUCG"),
        mutable("AUUCG"),
        seq("uCAg"),
        mutable("UC-AG"),
        seq("U.CAG"),
        "UGCAU",
    ]
    self.nuc = [seq("ATCG"), "UUUTTTACG"]
    self.protein = [
        seq("ATCGPK"),
        seq("atcGPK"),
        seq("T.CGPK"),
        seq("T-CGPK"),
        seq("MEDG-KRXR*"),
        mutable("ME-K-DRXR*XU"),
        "TEDDF",
    ]
def SwissIterator(handle):
    """Yield one SeqRecord per entry of a Swiss-Prot/UniProt flat file.

    Each section from the ID line to the terminating // is turned into a
    single SeqRecord carrying its annotation and features.

    This handles the flat file "swiss" format as used by:

    - Swiss-Prot aka SwissProt
    - TrEMBL
    - UniProtKB aka UniProt Knowledgebase

    The format is called "swiss" for consistency with BioPerl and EMBOSS.
    See also the SeqIO support for the "uniprot-xml" format.

    You are expected to reach this parser through
    Bio.SeqIO.parse(..., format="swiss") rather than calling it directly.
    """
    # (attribute on the SwissProt record, date key, version key); each
    # attribute holds a (date, version) pair when present.
    dated_fields = (
        ('created', 'date', 'sequence_version'),
        ('sequence_update', 'date_last_sequence_update', 'sequence_version'),
        ('annotation_update', 'date_last_annotation_update', 'entry_version'),
    )

    for entry in SwissProt.parse(handle):
        # Convert the SwissProt record to a SeqRecord
        record = SeqRecord.SeqRecord(
            Seq.Seq(entry.sequence, Alphabet.generic_protein),
            id=entry.accessions[0],
            name=entry.entry_name,
            description=entry.description,
            features=[_make_seqfeature(*f) for f in entry.features],
        )
        record.description = entry.description

        # Database cross-references, without duplicates
        for xref in entry.cross_references:
            if len(xref) >= 2:
                dbxref = "%s:%s" % tuple(xref[:2])
                if dbxref not in record.dbxrefs:
                    record.dbxrefs.append(dbxref)

        info = record.annotations
        info['accessions'] = entry.accessions
        if entry.protein_existence:
            info['protein_existence'] = entry.protein_existence
        for attr, date_key, version_key in dated_fields:
            stamp = getattr(entry, attr)
            if stamp:
                info[date_key] = stamp[0]
                info[version_key] = stamp[1]
        if entry.gene_name:
            info['gene_name'] = entry.gene_name
        info['organism'] = entry.organism.rstrip(".")
        info['taxonomy'] = entry.organism_classification
        info['ncbi_taxid'] = entry.taxonomy_id
        if entry.host_organism:
            info['organism_host'] = entry.host_organism
        if entry.host_taxonomy_id:
            info['host_ncbi_taxid'] = entry.host_taxonomy_id
        if entry.comments:
            info['comment'] = "\n".join(entry.comments)

        if entry.references:
            citations = info['references'] = []
            for reference in entry.references:
                citation = SeqFeature.Reference()
                citation.comment = " ".join(
                    "%s=%s;" % pair for pair in reference.comments)
                for key, value in reference.references:
                    if key == 'PubMed':
                        citation.pubmed_id = value
                    elif key == 'MEDLINE':
                        citation.medline_id = value
                    elif key not in ('DOI', 'AGRICOLA'):
                        # DOI and AGRICOLA are recognised but not stored
                        raise ValueError(
                            "Unknown key %s found in references" % key)
                citation.authors = reference.authors
                citation.title = reference.title
                citation.journal = reference.location
                citations.append(citation)

        if entry.keywords:
            record.annotations['keywords'] = entry.keywords
        yield record
def setUp(self):
    """Create the reference sequence used by the tests."""
    reference = "TCAAAAGGATGCATCATG"
    self.s = Seq.Seq(reference)
def run(args):
    """Annotate a tree (optionally with ancestral sequences or a time tree)
    and write the tree, its node metadata and, for VCF input, a VCF.

    The tree in ``args.tree`` is read as newick or nexus. Depending on
    ``args.timetree`` / ``args.ancestral`` a time tree is inferred, ancestral
    sequences are reconstructed, or TreeAnc is instantiated only to name the
    internal nodes. The tree is written in newick format and the metadata as
    JSON.

    Returns:
        0 on success, -1 if the tree cannot be read or a required input
        (alignment, VCF reference, metadata) is missing.
    """
    # check alignment type, set flags, read in if VCF
    is_vcf = False
    ref = None
    tree_meta = {'alignment': args.alignment}
    attributes = ['branch_length']

    # check if tree is provided and can be read
    # T must be pre-bound: if both parsers fail, the check below would
    # otherwise raise NameError instead of reporting the error.
    T = None
    for fmt in ["newick", "nexus"]:
        try:
            T = Phylo.read(args.tree, fmt)
        except Exception:
            # not parseable in this format; try the next one
            continue
        tree_meta['input_tree'] = args.tree
        break
    if T is None:
        print("ERROR: reading tree from %s failed." % args.tree)
        return -1

    if not args.alignment:
        # fake alignment to appease treetime when only using it for naming
        # nodes...
        if args.ancestral or args.timetree:
            print(
                "ERROR: alignment is required for ancestral reconstruction or timetree inference"
            )
            return -1
        from Bio import SeqRecord, Seq, Align
        seqs = []
        for n in T.get_terminals():
            seqs.append(
                SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'),
                                    id=n.name,
                                    name=n.name,
                                    description=''))
        aln = Align.MultipleSeqAlignment(seqs)
    elif any([args.alignment.lower().endswith(x)
              for x in ['.vcf', '.vcf.gz']]):
        if not args.vcf_reference:
            print(
                "ERROR: a reference Fasta is required with VCF-format alignments"
            )
            return -1
        compress_seq = read_vcf(args.alignment, args.vcf_reference)
        sequences = compress_seq['sequences']
        ref = compress_seq['reference']
        is_vcf = True
        aln = sequences
    else:
        aln = args.alignment

    # NOTE(review): the default output names below are derived from
    # args.alignment, so they require an alignment path whenever args.output
    # / args.node_data are not given.
    if args.output:
        tree_fname = args.output
    else:
        tree_fname = '.'.join(args.alignment.split('.')[:-1]) + '_tt.nwk'

    if args.timetree and T:
        if args.metadata is None:
            print(
                "ERROR: meta data with dates is required for time tree reconstruction"
            )
            return -1
        metadata, columns = read_metadata(args.metadata)
        if args.year_limit:
            args.year_limit.sort()
        dates = get_numerical_dates(metadata,
                                    fmt=args.date_fmt,
                                    min_max_year=args.year_limit)
        for n in T.get_terminals():
            if n.name in metadata and 'date' in metadata[n.name]:
                n.raw_date = metadata[n.name]['date']

        # if anything but a list of seqs, don't send as a list
        if args.root and len(args.root) == 1:
            args.root = args.root[0]

        tt = timetree(
            tree=T,
            aln=aln,
            ref=ref,
            dates=dates,
            confidence=args.date_confidence,
            reroot=args.root or 'best',
            # Otherwise can't set to 0
            Tc=args.coalescent if args.coalescent is not None else 0.01,
            use_marginal=args.time_marginal or False,
            branch_length_mode=args.branch_length_mode or 'auto',
            clock_rate=args.clock_rate,
            n_iqd=args.n_iqd)

        tree_meta['clock'] = {
            'rate': tt.date2dist.clock_rate,
            'intercept': tt.date2dist.intercept,
            'rtt_Tmrca': -tt.date2dist.intercept / tt.date2dist.clock_rate
        }
        attributes.extend([
            'numdate', 'clock_length', 'mutation_length', 'mutations',
            'raw_date', 'date'
        ])
        if not is_vcf:
            # don't add sequences if VCF - huge!
            attributes.extend(['sequence'])
        if args.date_confidence:
            attributes.append('num_date_confidence')
    elif args.ancestral in ['joint', 'marginal']:
        tt = ancestral_sequence_inference(
            tree=T,
            aln=aln,
            ref=ref,
            marginal=args.ancestral,
            optimize_branch_length=args.branchlengths,
            branch_length_mode=args.branch_length_mode)
        attributes.extend(['mutation_length', 'mutations'])
        if not is_vcf:
            # don't add sequences if VCF - huge!
            attributes.extend(['sequence'])
    else:
        from treetime import TreeAnc
        # instantiate treetime for the sole reason to name internal nodes
        tt = TreeAnc(tree=T, aln=aln, ref=ref, gtr='JC69', verbose=1)

    if is_vcf:
        # TreeTime overwrites ambig sites on tips during ancestral reconst.
        # Put these back in tip sequences now, to avoid misleading
        tt.recover_var_ambigs()

    tree_meta['nodes'] = prep_tree(T, attributes, is_vcf)

    if T:
        import json
        tree_success = Phylo.write(T,
                                   tree_fname,
                                   'newick',
                                   format_branch_length='%1.8f')
        if args.node_data:
            node_data_fname = args.node_data
        else:
            node_data_fname = '.'.join(
                args.alignment.split('.')[:-1]) + '.node_data'
        with open(node_data_fname, 'w') as ofile:
            json.dump(tree_meta, ofile)
        # json.dump returns None, so its result cannot serve as a success
        # flag (that made this function return -1 even on success).
        # Reaching this line means the dump did not raise.
        meta_success = True

        # If VCF and ancestral reconst. was done, output VCF including new
        # ancestral seqs
        if is_vcf and (args.ancestral or args.timetree):
            if args.output_vcf:
                vcf_fname = args.output_vcf
            else:
                vcf_fname = '.'.join(args.alignment.split('.')[:-1]) + '.vcf'
            write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)

        return 0 if (tree_success and meta_success) else -1
    else:
        return -1