def doalnmuscle(cog):
    """Split one COG FASTA file by kingdom and submit each part to MUSCLE.

    The input file is ``cdd.<id>.fa`` where ``<id>`` is the text after the
    last ``|`` in ``cog``.  Every sequence tag is converted to ``int`` and
    looked up as a genome ``_id`` in ``mist.genomes``; the first element of
    the document's ``ta`` field is used as the kingdom.  For each distinct
    kingdom a ``cdd.<id>.<kingdom>.fa`` file is written and passed to
    ``submit_to_muscle``.

    NOTE(review): ``submit_to_muscle`` is resolved at module level here; in
    the visible code it only exists as a nested helper of
    ``aligncogbykingdom`` -- confirm a module-level definition exists.

    Returns the list of distinct kingdoms.
    """
    prefix = 'cdd.' + cog.split('|')[-1]
    seq_dic, tags = bitk.fastareader(prefix + '.fa')

    # Map genome id -> kingdom from the taxonomy stored in the database.
    genome_ids = [int(tag) for tag in tags]
    cursor = mist.genomes.find({'_id': {'$in': genome_ids}}, {'ta': 1, 'n': 1})
    kingdic = {}
    for doc in cursor:
        kingdic[doc['_id']] = doc['ta'][0]
    kinglist = list(set(kingdic.values()))
    print(kinglist)

    for kingdom in kinglist:
        print("Aligning " + cog + ' from kingdom ' + kingdom)
        records = [
            '>' + str(tag) + '\n' + seq_dic[str(tag)] + '\n'
            for tag in tags
            if kingdic[int(tag)] == kingdom
        ]
        fasta = prefix + '.' + kingdom + '.fa'
        with open(fasta, 'w') as handle:
            handle.write(''.join(records))
        submit_to_muscle(fasta)
    return kinglist
def concat(cog_list, filename='concat.fa', algo='linsi'):
    """Concatenate per-COG alignments into one FASTA file plus a partition file.

    For each entry of ``cog_list`` the file ``cdd.<id>.<algo>.fa`` is read
    (``<id>`` is the text after the last ``|``) and every sequence is appended
    to the running sequence kept under the same tag.  The 1-based column range
    each alignment occupies is recorded and written to
    ``<filename>.partition.dat`` as ``AUTO, gene<i> = <start>-<end>`` lines.

    Args:
        cog_list: COG identifiers whose alignments are concatenated in order.
            An empty list yields empty output files.
        filename: Output FASTA path; ``None`` falls back to ``'concat.fa'``.
        algo: Alignment suffix used in the input file names.

    Returns:
        The output file name.

    If two concatenated sequences differ in length, the partial output is
    written to ``<filename>.debug.fa`` and the process ends via ``sys.exit()``.
    """
    if filename is None:
        filename = 'concat.fa'
    merged = {}
    partition = [[0, 0]]
    # Pre-bound so that an empty cog_list no longer raises NameError below.
    tags = []
    print("Concatenating the alignments")
    for cog in cog_list:
        seq_dic, tags = bitk.fastareader(
            'cdd.' + cog.split('|')[-1] + '.' + algo + '.fa', 'r')
        last_end = partition[-1][-1]
        # Alignment width is taken from the first sequence of the file.
        partition.append([last_end + 1, last_end + len(seq_dic[tags[0]])])
        for tag in tags:
            # Direct dict membership; ``.keys()`` builds a list on Python 2.
            if tag in merged:
                merged[tag] += seq_dic[tag]
            else:
                merged[tag] = seq_dic[tag]

    print("Saving the full alignment and checking for bugs")
    # NOTE(review): only the tags of the LAST alignment read are written out;
    # tags seen solely in earlier alignments are dropped -- confirm intended.
    records = []
    expected_len = 0
    for tag in tags:
        records.append('>' + tag + '\n' + merged[tag] + '\n')
        if expected_len == 0:
            expected_len = len(merged[tag])
        elif expected_len != len(merged[tag]):
            print("Something is wrong with the genome: " + tag)
            print(len(merged[tag]))
            with open(filename + '.debug.fa', 'w') as fout:
                fout.write(''.join(records))
            sys.exit()
        print(tag + '\t' + str(len(merged[tag])))

    print(filename)
    with open(filename, 'w') as fout:
        fout.write(''.join(records))

    print("Making the partition file")
    with open(filename + ".partition.dat", "w") as f:
        f.write(''.join(
            "AUTO, gene" + str(i) + " = "
            + '-'.join([str(x) for x in partition[i]]) + '\n'
            for i in range(1, len(partition))))
    return filename
def concat(cog_list, filename='concat.fa', algo='linsi'):
    """Join per-COG alignments end to end and write FASTA + partition files.

    Each ``cog`` in ``cog_list`` contributes the alignment stored in
    ``cdd.<id>.<algo>.fa`` (``<id>`` is the text after the last ``|``).
    Sequences sharing a tag are appended to one another, and the 1-based
    column span of every alignment is written to ``<filename>.partition.dat``
    in the form ``AUTO, gene<i> = <start>-<end>``.

    Args:
        cog_list: COG identifiers, concatenated in the given order.  An empty
            list produces empty output files.
        filename: Output FASTA path; ``None`` means ``'concat.fa'``.
        algo: Alignment suffix that is part of the input file names.

    Returns:
        The name of the FASTA file that was written.

    When concatenated sequences have different lengths, what was built so far
    is dumped to ``<filename>.debug.fa`` and ``sys.exit()`` is called.
    """
    if filename is None:
        filename = 'concat.fa'
    joined = {}
    partition = [[0, 0]]
    # Bound up front: with an empty cog_list the original hit a NameError.
    tags = []
    print("Concatenating the alignments")
    for cog in cog_list:
        source = 'cdd.' + cog.split('|')[-1] + '.' + algo + '.fa'
        seq_dic, tags = bitk.fastareader(source, 'r')
        previous_end = partition[-1][-1]
        # Width of this alignment comes from its first sequence.
        width = len(seq_dic[tags[0]])
        partition.append([previous_end + 1, previous_end + width])
        for tag in tags:
            # Membership on the dict itself avoids a list scan on Python 2.
            if tag not in joined:
                joined[tag] = seq_dic[tag]
            else:
                joined[tag] += seq_dic[tag]

    print("Saving the full alignment and checking for bugs")
    # NOTE(review): output covers only the tags of the last alignment read;
    # confirm that all alignments are expected to hold the same tags.
    pieces = []
    reference_len = 0
    for tag in tags:
        pieces.append('>' + tag + '\n' + joined[tag] + '\n')
        current_len = len(joined[tag])
        if reference_len == 0:
            reference_len = current_len
        elif reference_len != current_len:
            print("Something is wrong with the genome: " + tag)
            print(current_len)
            with open(filename + '.debug.fa', 'w') as fout:
                fout.write(''.join(pieces))
            sys.exit()
        print(tag + '\t' + str(current_len))

    print(filename)
    with open(filename, 'w') as fout:
        fout.write(''.join(pieces))

    print("Making the partition file")
    lines = []
    for i in range(1, len(partition)):
        span = '-'.join([str(x) for x in partition[i]])
        lines.append("AUTO, gene" + str(i) + " = " + span + '\n')
    with open(filename + ".partition.dat", "w") as f:
        f.write(''.join(lines))
    return filename
def concatbykingdom(cog_list, kinglist=None, filename='concat.fa', algo='linsi'):
    """Concatenate per-COG alignments separately for every kingdom.

    For each ``kingdom`` the files ``cdd.<id>.<kingdom>.<algo>.fa`` of all
    entries in ``cog_list`` are read (``<id>`` is the text after the last
    ``|``) and sequences with the same tag are appended to each other.  The
    result goes to ``<filename minus its last 3 characters>.<kingdom>.fa``.

    Args:
        cog_list: COG identifiers, concatenated in the given order.
        kinglist: Kingdom names to process; ``None`` (default) means none.
        filename: Base output name; its last three characters are stripped,
            so a ``.fa``-style suffix is assumed.
        algo: Alignment suffix that is part of the input file names.

    Returns:
        List of the files written, one per kingdom.

    If concatenated sequences differ in length, the partial output is written
    to ``<filename>.debug.fa`` and the process ends via ``sys.exit()``.
    """
    if kinglist is None:
        kinglist = []
    print("Concatenating the alignments")
    filenames = []
    for kingdom in kinglist:
        merged = {}
        # Reset per kingdom; also keeps an empty cog_list from raising
        # NameError in the output loop below.
        tags = []
        for cog in cog_list:
            seq_dic, tags = bitk.fastareader(
                'cdd.' + cog.split('|')[-1] + '.' + kingdom + '.' + algo
                + '.fa', 'r')
            for tag in tags:
                # Direct dict membership; ``.keys()`` is a list on Python 2.
                if tag in merged:
                    merged[tag] += seq_dic[tag]
                else:
                    merged[tag] = seq_dic[tag]

        print("Saving the full alignment and checking for bugs")
        # NOTE(review): only tags of the last alignment read are written.
        records = []
        expected_len = 0
        for tag in tags:
            records.append('>' + tag + '\n' + merged[tag] + '\n')
            if expected_len == 0:
                expected_len = len(merged[tag])
            elif expected_len != len(merged[tag]):
                print("Something is wrong with the genome: " + tag)
                print(len(merged[tag]))
                with open(filename + '.debug.fa', 'w') as fout:
                    fout.write(''.join(records))
                sys.exit()
            print(len(merged[tag]))

        fname = filename[:-3] + '.' + kingdom + '.fa'
        with open(fname, 'w') as fout:
            fout.write(''.join(records))
        filenames.append(fname)
    return filenames
def concatbykingdom(cog_list, kinglist=None, filename='concat.fa', algo='linsi'):
    """Build one concatenated alignment file per kingdom.

    For every ``kingdom`` in ``kinglist`` the alignments
    ``cdd.<id>.<kingdom>.<algo>.fa`` of all COGs are read in order (``<id>``
    is the text after the last ``|``); sequences sharing a tag are joined end
    to end and written to ``<filename without last 3 chars>.<kingdom>.fa``.

    Args:
        cog_list: COG identifiers to concatenate.
        kinglist: Kingdom names; ``None`` (default) is treated as empty.
        filename: Base output name, expected to end in a three-character
            suffix such as ``.fa`` because the last three characters are cut.
        algo: Alignment suffix used in the input file names.

    Returns:
        The list of output file names, in ``kinglist`` order.

    A length mismatch between concatenated sequences dumps the partial output
    to ``<filename>.debug.fa`` and terminates through ``sys.exit()``.
    """
    kingdoms = [] if kinglist is None else kinglist
    print("Concatenating the alignments")
    filenames = []
    for kingdom in kingdoms:
        joined = {}
        # Fresh for each kingdom so an empty cog_list cannot trigger the
        # NameError the unbound name used to cause.
        tags = []
        for cog in cog_list:
            source = ('cdd.' + cog.split('|')[-1] + '.' + kingdom + '.'
                      + algo + '.fa')
            seq_dic, tags = bitk.fastareader(source, 'r')
            for tag in tags:
                # Test the dict directly instead of scanning ``.keys()``.
                if tag not in joined:
                    joined[tag] = seq_dic[tag]
                else:
                    joined[tag] += seq_dic[tag]

        print("Saving the full alignment and checking for bugs")
        # NOTE(review): the output is driven by the tags of the last
        # alignment read for this kingdom -- confirm that is intended.
        pieces = []
        reference_len = 0
        for tag in tags:
            pieces.append('>' + tag + '\n' + joined[tag] + '\n')
            current_len = len(joined[tag])
            if reference_len == 0:
                reference_len = current_len
            elif reference_len != current_len:
                print("Something is wrong with the genome: " + tag)
                print(current_len)
                with open(filename + '.debug.fa', 'w') as fout:
                    fout.write(''.join(pieces))
                sys.exit()
            print(current_len)

        fname = filename[:-3] + '.' + kingdom + '.fa'
        with open(fname, 'w') as fout:
            fout.write(''.join(pieces))
        filenames.append(fname)
    return filenames
def preptree(filename='concat.gb.fa', nogid=False):
    """Rewrite a FASTA file so that its headers carry genome names.

    Tags of ``filename`` are converted to ``int`` and looked up as ``_id`` in
    ``mist.genomes``; each header becomes ``<_id>|<n>``, or just ``<n>`` when
    ``nogid == True``.  The output name is the input name with ``names``
    inserted before its last dot-separated component.

    Returns the name of the file written.
    """
    print("Starting the preparation to make a tree")
    seq_dic, raw_tags = bitk.fastareader(filename)
    tags = [int(raw) for raw in raw_tags]

    # Build genome id -> header label from the database records.
    cursor = mist.genomes.find({'_id': {'$in': tags}}, {'n': 1, '_id': 1})
    names_dic = {}
    for doc in cursor:
        gid = doc['_id']
        if nogid == True:
            label = doc['n']
        else:
            label = str(gid) + '|' + doc['n']
        names_dic[gid] = label

    records = [
        '>' + names_dic[tag] + '\n' + seq_dic[str(tag)] + '\n'
        for tag in tags
    ]

    parts = filename.split('.')
    parts.insert(len(parts) - 1, 'names')
    out_name = '.'.join(parts)
    with open(out_name, 'w') as fout:
        fout.write(''.join(records))
    print("Done with prepping")
    return out_name
def doalnmuscle(cog):
    """Write one FASTA file per kingdom for a COG and align each with MUSCLE.

    Sequences come from ``cdd.<id>.fa`` (``<id>`` is the part of ``cog``
    after its last ``|``).  Tags are cast to ``int`` and queried as ``_id``
    in ``mist.genomes``; the first item of each document's ``ta`` field names
    the kingdom.  Each kingdom's sequences are saved to
    ``cdd.<id>.<kingdom>.fa`` and that file is given to ``submit_to_muscle``.

    NOTE(review): ``submit_to_muscle`` must be visible at module level; the
    only definition seen here is nested inside ``aligncogbykingdom`` --
    verify one exists elsewhere in the file.

    Returns the list of distinct kingdoms.
    """
    stem = 'cdd.' + cog.split('|')[-1]
    seq_dic, tags = bitk.fastareader(stem + '.fa')

    # genome id -> kingdom, as reported by the database.
    wanted_ids = [int(tag) for tag in tags]
    kingdic = {}
    for record in mist.genomes.find({'_id': {'$in': wanted_ids}},
                                    {'ta': 1, 'n': 1}):
        kingdic[record['_id']] = record['ta'][0]
    kinglist = list(set(kingdic.values()))
    print(kinglist)

    for kingdom in kinglist:
        print("Aligning " + cog + ' from kingdom ' + kingdom)
        chunks = []
        for tag in tags:
            if kingdic[int(tag)] != kingdom:
                continue
            chunks.append('>' + str(tag) + '\n' + seq_dic[str(tag)] + '\n')
        target = stem + '.' + kingdom + '.fa'
        with open(target, 'w') as f:
            f.write(''.join(chunks))
        submit_to_muscle(target)
    return kinglist
def preptree(filename='concat.gb.fa', nogid=False):
    """Produce a copy of a FASTA file whose headers use genome names.

    Each tag in ``filename`` is cast to ``int`` and fetched by ``_id`` from
    ``mist.genomes``.  Headers are written as ``<_id>|<n>`` unless
    ``nogid == True``, in which case only ``<n>`` is used.  The copy is saved
    under the input name with ``names`` placed before the final
    dot-separated component, and that name is returned.
    """
    print("Starting the preparation to make a tree")
    seq_dic, tags = bitk.fastareader(filename)
    tags = [int(item) for item in tags]

    # Header label for every genome id returned by the query.
    names_dic = {}
    for entry in mist.genomes.find({'_id': {'$in': tags}},
                                   {'n': 1, '_id': 1}):
        prefix = '' if nogid == True else str(entry['_id']) + '|'
        names_dic[entry['_id']] = prefix + entry['n']

    body = ''.join(
        ['>' + names_dic[tag] + '\n' + seq_dic[str(tag)] + '\n'
         for tag in tags])

    pieces = filename.split('.')
    pieces.insert(len(pieces) - 1, 'names')
    renamed = '.'.join(pieces)
    with open(renamed, 'w') as fout:
        fout.write(body)
    print("Done with prepping")
    return renamed
# Script fragment (truncated in this view: it stops inside an unfinished `try:`).
# With '-h' on the command line it prints the usage text and exits. Otherwise
# sys.argv[1] and sys.argv[2] are loaded as MD.Universe objects (ref / trg).
# When '-aln' is present, the FASTA file named right after it is read with
# bitk.fastareader; the one-letter form of ref's residue names is compared with
# the gap-stripped first and second sequences, bitk.alnpos_dic is called with
# the matching sequence first, and `pick` records which one matched (1 or 2).
# Where no alignment is used, alnlis falls back to ref.residues.resnames().
# The final loop selects backbone atoms of ref for each entry of alnlis.
# NOTE(review): all statements are collapsed onto one line, so the nesting of
# the if/elif/else branches cannot be determined here -- confirm against the
# original layout before editing.
import MDAnalysis.selections.base as MDS if '-h' in sys.argv: print 'Calculates the rmsd of aligned residues between two homologs. Output format is compatible to data import on VMD\n \ Sintax: homolog1.pdb homolog2.pdb -aln homolog1and2aligned.fa \n ' sys.exit() pdb1 = sys.argv[1] pdb2 = sys.argv[2] ref = MD.Universe(pdb1) trg = MD.Universe(pdb2) if ref.residues.resnames() == trg.residues.resnames(): if '-aln' in sys.argv: seq_dic, seq_list = bitk.fastareader(sys.argv[sys.argv.index('-aln')+1]) if bitk.threeLetter2oneLetter(ref.residues.resnames()) == seq_dic[seq_list[0]].replace('-',''): alndic, alnlis = bitk.alnpos_dic(seq_dic[seq_list[0]], seq_dic[seq_list[1]]) pick = 1 elif bitk.threeLetter2oneLetter(ref.residues.resnames()) == seq_dic[seq_list[1]].replace('-',''): alndic, alnlis = bitk.alnpos_dic(seq_dic[seq_list[1]], seq_dic[seq_list[0]]) pick = 2 else: print("fasta file irrelevant for requested operation... ignoring") alnlis = ref.residues.resnames() else: alnlis = ref.residues.resnames() for res in alnlis: try: refcrd = ref.selectAtoms("backbone and resid " + str(res)).coordinates()
def aligncogbykingdom(cog_list, algo='linsi', Np=20):
    """Align every COG separately for each kingdom found in its sequences.

    For each ``cog`` the file ``cdd.<id>.fa`` is read (``<id>`` is the text
    after the last ``|``), its sequences are split by kingdom into
    ``cdd.<id>.<kingdom>.fa`` files, and each file is aligned with the
    external program selected by ``algo``.

    Args:
        cog_list: COG identifiers to process.
        algo: ``'linsi'``, ``'einsi'`` or ``'muscle'``.  Any other value
            prints an error and ends the process via ``sys.exit()`` before
            any work is done.  (Previously the error was only reported on
            the serial path; with ``Np >= 2`` an unknown name silently
            returned ``[]``.)
        Np: Number of worker processes for the parallel MUSCLE path; capped
            at ``len(cog_list)``.  With fewer than two workers, or for
            ``linsi``/``einsi``, the COGs are processed serially.

    Returns:
        List of the distinct kingdoms encountered.
    """
    def submit_to_linsi(fasta=''):
        # Output name replaces the last three characters ('.fa' assumed).
        os.system(' linsi --quiet --thread 12 ' + fasta + ' > ' + fasta[:-3] + '.linsi.fa')
        return fasta[:-3] + '.linsi.fa'

    def submit_to_einsi(fasta=''):
        os.system(' einsi --quiet --thread 12 ' + fasta + ' > ' + fasta[:-3] + '.einsi.fa')
        return fasta[:-3] + '.einsi.fa'

    def submit_to_muscle(fasta=''):
        os.system('muscle -in ' + fasta + ' -out ' + fasta[:-3] + '.muscle.fa -maxiters 100')
        return fasta[:-3] + '.muscle.fa'

    aligners = {
        'linsi': submit_to_linsi,
        'einsi': submit_to_einsi,
        'muscle': submit_to_muscle,
    }
    # Validate once, up front, so an unsupported name can never fall through
    # both branches below and look like a successful empty run.
    if algo not in aligners:
        print("Alignment algorithm " + algo + " not supported")
        sys.exit()

    kingdic = {}
    kinglist = []
    if len(cog_list) < Np:
        Np = len(cog_list)

    if Np < 2 or algo in ('linsi', 'einsi'):
        run_aligner = aligners[algo]
        for cog in cog_list:
            prefix = 'cdd.' + cog.split('|')[-1]
            seq_dic, tags = bitk.fastareader(prefix + '.fa')
            if not kingdic:
                # Taxonomy is fetched for the first COG's tags only.
                # NOTE(review): later COGs holding other genome ids would
                # raise KeyError below -- presumably all COGs share one
                # genome set; confirm.
                tagsI = [int(tag) for tag in tags]
                taxinfo = mist.genomes.find({'_id': {'$in': tagsI}},
                                            {'ta': 1, 'n': 1})
                for i in taxinfo:
                    kingdic[i['_id']] = i['ta'][0]
                kinglist = list(set(kingdic.values()))
            print(kinglist)
            for kingdom in kinglist:
                print("Aligning " + cog + ' from kingdom ' + kingdom)
                records = [
                    '>' + str(tag) + '\n' + seq_dic[str(tag)] + '\n'
                    for tag in tags
                    if kingdic[int(tag)] == kingdom
                ]
                fasta = prefix + '.' + kingdom + '.fa'
                with open(fasta, 'w') as f:
                    f.write(''.join(records))
                run_aligner(fasta)
    else:
        # algo == 'muscle' with at least two workers: one task per COG.
        pool = multip.Pool(processes=Np)
        try:
            per_cog = pool.map(doalnmuscle, cog_list)
        finally:
            # Release the worker processes even if a task raised.
            pool.close()
            pool.join()
        # Merge the per-COG kingdom lists, keeping first-seen order.
        kinglist = []
        for kingdoms in per_cog:
            for kingdom in kingdoms:
                if kingdom not in kinglist:
                    kinglist.append(kingdom)
    print(kinglist)
    return kinglist
# Pass 1: for every input file (sys.argv[2:]) count the header lines -- lines
# containing '>' -- that belong to each known organism id.  The id is the text
# after the last '.' in the part of the line before its first '-'.
for f in range(2, len(sys.argv)):
    print(sys.argv[f])
    with open(sys.argv[f], 'r') as datafile:
        for line in datafile:
            if '>' not in line:
                continue
            orgid = line.split('-')[0].split('.')[-1]
            if orgid in orgid_list:
                # Column f - 2 of the counter row belongs to this file.
                orgid_dic[orgid][f - 2] += 1
print(orgid_dic)

# Pass 2: rewrite each file keeping only sequences of known organisms and
# appending that organism's per-file counts, joined by '-', to the tag.
for f in range(2, len(sys.argv)):
    print('Working on file ' + sys.argv[f])
    seq_dic, seq_list = bitk.fastareader(sys.argv[f])
    output = ''
    for tag in seq_list:
        orgid = tag.split('-')[0].split('.')[-1]
        if orgid not in orgid_list:
            continue
        output += '>' + tag + '-' + '-'.join([str(i) for i in orgid_dic[orgid]]) + '\n' + seq_dic[tag] + '\n'
    # Output name drops the last three characters of the input name.
    with open(sys.argv[f][:-3] + '.countche.fa', 'w') as datafile:
        datafile.write(output)
# Script fragment: the first three statements (append to orgid_list, seed
# orgid_dic[orgid] with a deep copy of start_list, close datafile) finish code
# that begins outside this view -- presumably a loop registering organism ids.
# The first `for f` loop then reads every file in sys.argv[2:] and, for lines
# containing '>', takes the text after the last '.' in the part before the
# first '-' as orgid and increments orgid_dic[orgid][f - 2] for known ids.
# The second `for f` loop re-reads each file with bitk.fastareader and writes
# the sequences of known ids to <input name minus last 3 chars>.countche.fa,
# with each tag extended by '-' plus that id's counts joined by '-'.
# NOTE(review): statements are collapsed onto one line, so exact nesting
# cannot be verified here -- confirm against the original layout.
orgid_list.append(orgid) orgid_dic[orgid] = copy.deepcopy(start_list) datafile.close() for f in range(2, len(sys.argv)): print sys.argv[f] datafile = open(sys.argv[f], 'r') for line in datafile: if '>' in line: orgid = line.split('-')[0].split('.')[-1] if orgid in orgid_list: orgid_dic[orgid][f - 2] += 1 datafile.close() print orgid_dic for f in range(2, len(sys.argv)): print 'Working on file ' + sys.argv[f] seq_dic, seq_list = bitk.fastareader(sys.argv[f]) output = '' for tag in seq_list: orgid = tag.split('-')[0].split('.')[-1] if orgid in orgid_list: output += '>' + tag + '-' + '-'.join( [str(i) for i in orgid_dic[orgid]]) + '\n' + seq_dic[tag] + '\n' datafile = open(sys.argv[f][:-3] + '.countche.fa', 'w') datafile.write(output) datafile.close()
def aligncogbykingdom(cog_list, algo='linsi', Np=20):
    """Split each COG by kingdom and align every subset.

    ``cdd.<id>.fa`` is read for each ``cog`` (``<id>`` is the text after the
    last ``|``); its sequences are grouped by kingdom, saved as
    ``cdd.<id>.<kingdom>.fa`` and aligned with the external tool chosen by
    ``algo``.

    Args:
        cog_list: COG identifiers to process.
        algo: One of ``'linsi'``, ``'einsi'``, ``'muscle'``.  Anything else
            prints an error and calls ``sys.exit()`` immediately.  (Before,
            an unknown name combined with ``Np >= 2`` returned ``[]`` without
            any message.)
        Np: Worker-process count for parallel MUSCLE runs, capped at
            ``len(cog_list)``.  Serial processing is used when fewer than
            two workers remain or when ``algo`` is ``linsi``/``einsi``.

    Returns:
        List of the distinct kingdoms encountered.
    """
    def submit_to_linsi(fasta=''):
        # The last three characters of the name ('.fa' assumed) are replaced.
        os.system(' linsi --quiet --thread 12 ' + fasta + ' > ' + fasta[:-3]
                  + '.linsi.fa')
        return fasta[:-3] + '.linsi.fa'

    def submit_to_einsi(fasta=''):
        os.system(' einsi --quiet --thread 12 ' + fasta + ' > ' + fasta[:-3]
                  + '.einsi.fa')
        return fasta[:-3] + '.einsi.fa'

    def submit_to_muscle(fasta=''):
        os.system('muscle -in ' + fasta + ' -out ' + fasta[:-3]
                  + '.muscle.fa -maxiters 100')
        return fasta[:-3] + '.muscle.fa'

    submitters = {
        'linsi': submit_to_linsi,
        'einsi': submit_to_einsi,
        'muscle': submit_to_muscle,
    }
    # Reject unknown algorithms before touching any file: otherwise such a
    # name skips both branches below and yields a misleading empty result.
    if algo not in submitters:
        print("Alignment algorithm " + algo + " not supported")
        sys.exit()

    kingdic = {}
    kinglist = []
    if len(cog_list) < Np:
        Np = len(cog_list)

    if Np < 2 or algo in ('linsi', 'einsi'):
        submit = submitters[algo]
        for cog in cog_list:
            stem = 'cdd.' + cog.split('|')[-1]
            seq_dic, tags = bitk.fastareader(stem + '.fa')
            if not kingdic:
                # Kingdoms are looked up for the first COG's tags only.
                # NOTE(review): a later COG with additional genome ids would
                # hit KeyError below -- presumably every COG covers the same
                # genomes; confirm.
                tagsI = [int(tag) for tag in tags]
                taxinfo = mist.genomes.find({'_id': {'$in': tagsI}},
                                            {'ta': 1, 'n': 1})
                for i in taxinfo:
                    kingdic[i['_id']] = i['ta'][0]
                kinglist = list(set(kingdic.values()))
            print(kinglist)
            for kingdom in kinglist:
                print("Aligning " + cog + ' from kingdom ' + kingdom)
                chunks = []
                for tag in tags:
                    if kingdic[int(tag)] == kingdom:
                        chunks.append(
                            '>' + str(tag) + '\n' + seq_dic[str(tag)] + '\n')
                fasta = stem + '.' + kingdom + '.fa'
                with open(fasta, 'w') as f:
                    f.write(''.join(chunks))
                submit(fasta)
    else:
        # Here algo == 'muscle' and Np >= 2: fan the COGs out to workers.
        pool = multip.Pool(processes=Np)
        try:
            results = pool.map(doalnmuscle, cog_list)
        finally:
            # Always shut the workers down, including after a failed task.
            pool.close()
            pool.join()
        # Flatten the per-COG lists without duplicates, first-seen order.
        kinglist = []
        for found in results:
            for kingdom in found:
                if kingdom not in kinglist:
                    kinglist.append(kingdom)
    print(kinglist)
    return kinglist