Exemplo n.º 1
0
def doalnmuscle(cog):
    """Split one COG FASTA file by kingdom and align each part with MUSCLE.

    Reads ``cdd.<id>.fa`` (``<id>`` is the text after the last ``|`` in
    *cog*), looks up the first ``ta`` entry of every genome id in
    ``mist.genomes``, writes one ``cdd.<id>.<kingdom>.fa`` file per distinct
    value and passes each of those files to ``submit_to_muscle``.

    Returns the list of distinct kingdoms found for this COG.
    """
    prefix = 'cdd.' + cog.split('|')[-1]
    seq_dic, tags = bitk.fastareader(prefix + '.fa')

    # FASTA tags are genome ids; the database is keyed by their integer form.
    genome_ids = [int(tag) for tag in tags]
    cursor = mist.genomes.find({'_id': {'$in': genome_ids}},
                               {'ta': 1, 'n': 1})
    kingdom_of = {}
    for record in cursor:
        kingdom_of[record['_id']] = record['ta'][0]

    kinglist = list(set(kingdom_of.values()))
    print(kinglist)

    for kingdom in kinglist:
        print("Aligning " + cog + ' from kingdom ' + kingdom)
        records = ['>' + str(tag) + '\n' + seq_dic[str(tag)] + '\n'
                   for tag in tags if kingdom_of[int(tag)] == kingdom]
        fasta_path = prefix + '.' + kingdom + '.fa'
        with open(fasta_path, 'w') as handle:
            handle.write(''.join(records))
        submit_to_muscle(fasta_path)
    return kinglist
Exemplo n.º 2
0
def concat(cog_list, filename='concat.fa', algo='linsi'):
    """Concatenate per-COG alignments into one FASTA plus a partition file.

    For every entry of *cog_list* the file ``cdd.<id>.<algo>.fa`` is read
    (``<id>`` is the text after the last ``|``) and each sequence is appended
    to the running sequence stored under the same tag.  The result is written
    to *filename*, and the start-end column range of every COG is written to
    ``<filename>.partition.dat``.

    Args:
        cog_list: COG identifiers, in concatenation order.  May be empty, in
            which case empty output files are produced.
        filename: Output FASTA path; ``None`` falls back to ``'concat.fa'``.
        algo: Alignment suffix used in the input file names.

    Returns:
        The path of the concatenated FASTA file.

    Raises:
        SystemExit: with status 1 when the concatenated sequences differ in
            length; the records collected so far are first saved to
            ``<filename>.debug.fa``.
    """
    if filename is None:
        filename = 'concat.fa'
    merged = {}
    # Sentinel entry so the first real range starts at column 1.
    partition = [[0, 0]]
    # Stays empty when cog_list is empty so the write-out loop is a no-op
    # instead of failing on an unbound name.
    tags = []
    print("Concatenating the alignments")
    for cog in cog_list:
        seq_dic, tags = bitk.fastareader(
            'cdd.' + cog.split('|')[-1] + '.' + algo + '.fa', 'r')
        last_end = partition[-1][-1]
        partition.append([last_end + 1, last_end + len(seq_dic[tags[0]])])
        for tag in tags:
            merged[tag] = merged.get(tag, '') + seq_dic[tag]

    print("Saving the full alignment and checking for bugs")

    records = []
    max_len = 0

    # NOTE(review): only the tags of the last file read are written out, so a
    # genome missing from that file is silently dropped - confirm intended.
    for tag in tags:
        records.append('>' + tag + '\n' + merged[tag] + '\n')
        if max_len == 0:
            max_len = len(merged[tag])
        elif max_len != len(merged[tag]):
            print("Something is wrong with the genome: " + tag)
            print(len(merged[tag]))
            with open(filename + '.debug.fa', 'w') as fout:
                fout.write(''.join(records))
            # Non-zero status so calling scripts can detect the failure.
            sys.exit(1)
        print(tag + '\t' + str(len(merged[tag])))

    print(filename)
    with open(filename, 'w') as fout:
        fout.write(''.join(records))

    print("Making the partition file")
    # Skip the sentinel at index 0; genes are numbered from 1.
    lines = []
    for i in range(1, len(partition)):
        lines.append("AUTO, gene" + str(i) + " = " +
                     '-'.join([str(x) for x in partition[i]]) + '\n')
    with open(filename + ".partition.dat", "w") as f:
        f.write(''.join(lines))

    return filename
Exemplo n.º 3
0
def concat(cog_list, filename='concat.fa', algo='linsi'):
    """Concatenate per-COG alignments into one FASTA plus a partition file.

    For every entry of *cog_list* the file ``cdd.<id>.<algo>.fa`` is read
    (``<id>`` is the text after the last ``|``) and each sequence is appended
    to the running sequence stored under the same tag.  The result is written
    to *filename*, and the start-end column range of every COG is written to
    ``<filename>.partition.dat``.

    Args:
        cog_list: COG identifiers, in concatenation order.  May be empty, in
            which case empty output files are produced.
        filename: Output FASTA path; ``None`` falls back to ``'concat.fa'``.
        algo: Alignment suffix used in the input file names.

    Returns:
        The path of the concatenated FASTA file.

    Raises:
        SystemExit: with status 1 when the concatenated sequences differ in
            length; the records collected so far are first saved to
            ``<filename>.debug.fa``.
    """
    if filename is None:
        filename = 'concat.fa'
    merged = {}
    # Sentinel entry so the first real range starts at column 1.
    partition = [[0, 0]]
    # Stays empty when cog_list is empty so the write-out loop is a no-op
    # instead of failing on an unbound name.
    tags = []
    print("Concatenating the alignments")
    for cog in cog_list:
        seq_dic, tags = bitk.fastareader(
            'cdd.' + cog.split('|')[-1] + '.' + algo + '.fa', 'r')
        last_end = partition[-1][-1]
        partition.append([last_end + 1, last_end + len(seq_dic[tags[0]])])
        for tag in tags:
            merged[tag] = merged.get(tag, '') + seq_dic[tag]

    print("Saving the full alignment and checking for bugs")

    records = []
    max_len = 0

    # NOTE(review): only the tags of the last file read are written out, so a
    # genome missing from that file is silently dropped - confirm intended.
    for tag in tags:
        records.append('>' + tag + '\n' + merged[tag] + '\n')
        if max_len == 0:
            max_len = len(merged[tag])
        elif max_len != len(merged[tag]):
            print("Something is wrong with the genome: " + tag)
            print(len(merged[tag]))
            with open(filename + '.debug.fa', 'w') as fout:
                fout.write(''.join(records))
            # Non-zero status so calling scripts can detect the failure.
            sys.exit(1)
        print(tag + '\t' + str(len(merged[tag])))

    print(filename)
    with open(filename, 'w') as fout:
        fout.write(''.join(records))

    print("Making the partition file")
    # Skip the sentinel at index 0; genes are numbered from 1.
    lines = []
    for i in range(1, len(partition)):
        lines.append("AUTO, gene" + str(i) + " = " +
                     '-'.join([str(x) for x in partition[i]]) + '\n')
    with open(filename + ".partition.dat", "w") as f:
        f.write(''.join(lines))

    return filename
Exemplo n.º 4
0
def concatbykingdom(cog_list, kinglist=None, filename='concat.fa',
                    algo='linsi'):
    """Concatenate per-COG alignments separately for every kingdom.

    For each kingdom the files ``cdd.<id>.<kingdom>.<algo>.fa`` (one per
    entry of *cog_list*, ``<id>`` being the text after the last ``|``) are
    read, and sequences sharing a tag are joined in COG order.  Each result
    is written to *filename* with its last three characters replaced by
    ``.<kingdom>.fa``.

    Args:
        cog_list: COG identifiers, in concatenation order.
        kinglist: Kingdom names to process; ``None`` means no kingdoms.
        filename: Base output path, expected to end in a three-character
            suffix such as ``.fa``.
        algo: Alignment suffix used in the input file names.

    Returns:
        The list of FASTA paths written, one per kingdom.

    Raises:
        SystemExit: with status 1 when the concatenated sequences of a
            kingdom differ in length; the records collected so far are first
            saved to ``<filename>.debug.fa``.
    """
    if kinglist is None:
        kinglist = []
    print("Concatenating the alignments")
    filenames = []
    for kingdom in kinglist:
        merged = {}
        # Reset per kingdom; stays empty when cog_list is empty so the
        # write-out loop is a no-op instead of failing on an unbound name.
        tags = []
        for cog in cog_list:
            seq_dic, tags = bitk.fastareader(
                'cdd.' + cog.split('|')[-1] + '.' + kingdom + '.' + algo +
                '.fa', 'r')
            for tag in tags:
                merged[tag] = merged.get(tag, '') + seq_dic[tag]

        print("Saving the full alignment and checking for bugs")

        records = []
        max_len = 0

        # NOTE(review): only the tags of the last file read are written out.
        for tag in tags:
            records.append('>' + tag + '\n' + merged[tag] + '\n')
            if max_len == 0:
                max_len = len(merged[tag])
            elif max_len != len(merged[tag]):
                print("Something is wrong with the genome: " + tag)
                print(len(merged[tag]))
                with open(filename + '.debug.fa', 'w') as fout:
                    fout.write(''.join(records))
                # Non-zero status so calling scripts can detect the failure.
                sys.exit(1)
            print(len(merged[tag]))

        fname = filename[:-3] + '.' + kingdom + '.fa'
        with open(fname, 'w') as fout:
            fout.write(''.join(records))
        filenames.append(fname)
    return filenames
Exemplo n.º 5
0
def concatbykingdom(cog_list, kinglist=None, filename='concat.fa',
                    algo='linsi'):
    """Concatenate per-COG alignments separately for every kingdom.

    For each kingdom the files ``cdd.<id>.<kingdom>.<algo>.fa`` (one per
    entry of *cog_list*, ``<id>`` being the text after the last ``|``) are
    read, and sequences sharing a tag are joined in COG order.  Each result
    is written to *filename* with its last three characters replaced by
    ``.<kingdom>.fa``.

    Args:
        cog_list: COG identifiers, in concatenation order.
        kinglist: Kingdom names to process; ``None`` means no kingdoms.
        filename: Base output path, expected to end in a three-character
            suffix such as ``.fa``.
        algo: Alignment suffix used in the input file names.

    Returns:
        The list of FASTA paths written, one per kingdom.

    Raises:
        SystemExit: with status 1 when the concatenated sequences of a
            kingdom differ in length; the records collected so far are first
            saved to ``<filename>.debug.fa``.
    """
    if kinglist is None:
        kinglist = []
    print("Concatenating the alignments")
    filenames = []
    for kingdom in kinglist:
        merged = {}
        # Reset per kingdom; stays empty when cog_list is empty so the
        # write-out loop is a no-op instead of failing on an unbound name.
        tags = []
        for cog in cog_list:
            seq_dic, tags = bitk.fastareader(
                'cdd.' + cog.split('|')[-1] + '.' + kingdom + '.' + algo +
                '.fa', 'r')
            for tag in tags:
                merged[tag] = merged.get(tag, '') + seq_dic[tag]

        print("Saving the full alignment and checking for bugs")

        records = []
        max_len = 0

        # NOTE(review): only the tags of the last file read are written out.
        for tag in tags:
            records.append('>' + tag + '\n' + merged[tag] + '\n')
            if max_len == 0:
                max_len = len(merged[tag])
            elif max_len != len(merged[tag]):
                print("Something is wrong with the genome: " + tag)
                print(len(merged[tag]))
                with open(filename + '.debug.fa', 'w') as fout:
                    fout.write(''.join(records))
                # Non-zero status so calling scripts can detect the failure.
                sys.exit(1)
            print(len(merged[tag]))

        fname = filename[:-3] + '.' + kingdom + '.fa'
        with open(fname, 'w') as fout:
            fout.write(''.join(records))
        filenames.append(fname)
    return filenames
Exemplo n.º 6
0
def preptree(filename='concat.gb.fa', nogid=False):
    """Relabel the sequences of a FASTA file with genome names.

    Reads *filename*, converts every tag to an integer id, fetches the ``n``
    field of those ids from ``mist.genomes`` and writes the sequences back
    under the new header: ``<n>`` when *nogid* equals ``True``, otherwise
    ``<id>|<n>``.  The output path is *filename* with ``names`` inserted
    before its last dot-separated component.

    Returns the path of the file written.
    """
    print("Starting the preparation to make a tree")
    seq_dic, tags = bitk.fastareader(filename)
    tags = [int(i) for i in tags]
    cursor = mist.genomes.find({'_id': {'$in': tags}}, {'n': 1, '_id': 1})

    labels = {}
    for record in cursor:
        gid = record['_id']
        if nogid == True:
            labels[gid] = record['n']
        else:
            labels[gid] = str(gid) + '|' + record['n']

    records = ['>' + labels[gid] + '\n' + seq_dic[str(gid)] + '\n'
               for gid in tags]

    # 'a.b.fa' -> 'a.b.names.fa'
    parts = filename.split('.')
    filename = '.'.join(parts[:-1] + ['names'] + parts[-1:])
    with open(filename, 'w') as fout:
        fout.write(''.join(records))
    print("Done with prepping")
    return filename
Exemplo n.º 7
0
def doalnmuscle(cog):
    """Split one COG FASTA file by kingdom and align each part with MUSCLE.

    Reads ``cdd.<id>.fa`` (``<id>`` is the text after the last ``|`` in
    *cog*), looks up the first ``ta`` entry of every genome id in
    ``mist.genomes``, writes one ``cdd.<id>.<kingdom>.fa`` file per distinct
    value and passes each of those files to ``submit_to_muscle``.

    Returns the list of distinct kingdoms found for this COG.
    """
    prefix = 'cdd.' + cog.split('|')[-1]
    seq_dic, tags = bitk.fastareader(prefix + '.fa')

    # FASTA tags are genome ids; the database is keyed by their integer form.
    genome_ids = [int(tag) for tag in tags]
    cursor = mist.genomes.find({'_id': {'$in': genome_ids}},
                               {'ta': 1, 'n': 1})
    kingdom_of = {}
    for record in cursor:
        kingdom_of[record['_id']] = record['ta'][0]

    kinglist = list(set(kingdom_of.values()))
    print(kinglist)

    for kingdom in kinglist:
        print("Aligning " + cog + ' from kingdom ' + kingdom)
        records = ['>' + str(tag) + '\n' + seq_dic[str(tag)] + '\n'
                   for tag in tags if kingdom_of[int(tag)] == kingdom]
        fasta_path = prefix + '.' + kingdom + '.fa'
        with open(fasta_path, 'w') as handle:
            handle.write(''.join(records))
        submit_to_muscle(fasta_path)
    return kinglist
Exemplo n.º 8
0
def preptree(filename='concat.gb.fa', nogid=False):
    """Relabel the sequences of a FASTA file with genome names.

    Reads *filename*, converts every tag to an integer id, fetches the ``n``
    field of those ids from ``mist.genomes`` and writes the sequences back
    under the new header: ``<n>`` when *nogid* equals ``True``, otherwise
    ``<id>|<n>``.  The output path is *filename* with ``names`` inserted
    before its last dot-separated component.

    Returns the path of the file written.
    """
    print("Starting the preparation to make a tree")
    seq_dic, tags = bitk.fastareader(filename)
    tags = [int(i) for i in tags]
    cursor = mist.genomes.find({'_id': {'$in': tags}}, {'n': 1, '_id': 1})

    labels = {}
    for record in cursor:
        gid = record['_id']
        if nogid == True:
            labels[gid] = record['n']
        else:
            labels[gid] = str(gid) + '|' + record['n']

    records = ['>' + labels[gid] + '\n' + seq_dic[str(gid)] + '\n'
               for gid in tags]

    # 'a.b.fa' -> 'a.b.names.fa'
    parts = filename.split('.')
    filename = '.'.join(parts[:-1] + ['names'] + parts[-1:])
    with open(filename, 'w') as fout:
        fout.write(''.join(records))
    print("Done with prepping")
    return filename
Exemplo n.º 9
0
import MDAnalysis.selections.base as MDS

# Print the usage text and stop when -h appears anywhere on the command line.
if '-h' in sys.argv:
	print 'Calculates the rmsd of aligned residues between two homologs. Output format is compatible to data import on VMD\n \
	Sintax: homolog1.pdb homolog2.pdb -aln homolog1and2aligned.fa \n '
	sys.exit()

# First two positional arguments (named as PDB files in the usage text).
pdb1 = sys.argv[1]
pdb2 = sys.argv[2]

# Load both structures: ref from the first file, trg from the second.
# NOTE(review): MD is presumably the MDAnalysis package alias - confirm
# against the imports at the top of the file.
ref = MD.Universe(pdb1)
trg = MD.Universe(pdb2)

# Continue only when both structures report the same residue names.
# NOTE(review): if resnames() returns a NumPy array, == is element-wise and
# the truth test of the result may be ambiguous - confirm.
if ref.residues.resnames() == trg.residues.resnames():
	if '-aln' in sys.argv:
		# The alignment FASTA path is the argument right after -aln.
		seq_dic, seq_list = bitk.fastareader(sys.argv[sys.argv.index('-aln')+1])
		# Find which of the first two alignment entries, with gaps removed,
		# equals the reference's one-letter sequence; that entry is passed
		# first to alnpos_dic and pick records which one matched (1 or 2).
		if bitk.threeLetter2oneLetter(ref.residues.resnames()) == seq_dic[seq_list[0]].replace('-',''):
			alndic, alnlis = bitk.alnpos_dic(seq_dic[seq_list[0]], seq_dic[seq_list[1]])
			pick = 1
		elif bitk.threeLetter2oneLetter(ref.residues.resnames()) == seq_dic[seq_list[1]].replace('-',''):
			alndic, alnlis = bitk.alnpos_dic(seq_dic[seq_list[1]], seq_dic[seq_list[0]])
			pick = 2
		else:
			# Neither entry matches the reference: fall back to the
			# reference's own residues. pick and alndic are not set here.
			print("fasta file irrelevant for requested operation... ignoring")
			alnlis = ref.residues.resnames()
	else:
		# No alignment given: use the reference's own residues.
		alnlis = ref.residues.resnames()

	# NOTE(review): in the two fallback branches alnlis holds residue names,
	# yet each entry is used below as a "resid" value - confirm intended.
	for res in alnlis:
		try:
			# Backbone coordinates of this residue in the reference.
			refcrd = ref.selectAtoms("backbone and resid " + str(res)).coordinates()
Exemplo n.º 10
0
def aligncogbykingdom(cog_list, algo='linsi', Np=20):
    """Split every COG FASTA file by kingdom and align each part.

    For each COG the file ``cdd.<id>.fa`` is read (``<id>`` is the text after
    the last ``|``), its sequences are grouped by the first ``ta`` entry of
    their genome in ``mist.genomes`` and written to
    ``cdd.<id>.<kingdom>.fa``, which is then aligned with the chosen program.

    Args:
        cog_list: COG identifiers to process.
        algo: ``'linsi'``, ``'einsi'`` or ``'muscle'``.
        Np: Maximum number of worker processes; capped at ``len(cog_list)``.
            Workers are only used for ``'muscle'`` when the capped value is
            at least 2, in which case each COG is handled by ``doalnmuscle``.

    Returns:
        The list of distinct kingdoms encountered.

    Raises:
        SystemExit: in the serial path, when *algo* is not supported (after
            the first per-kingdom file has been written).
    """
    def submit_to_linsi(fasta=''):
        os.system(' linsi --quiet --thread 12 ' + fasta + ' > ' +
                  fasta[:-3] + '.linsi.fa')
        return fasta[:-3] + '.linsi.fa'

    def submit_to_einsi(fasta=''):
        os.system(' einsi --quiet --thread 12 ' + fasta + ' > ' +
                  fasta[:-3] + '.einsi.fa')
        return fasta[:-3] + '.einsi.fa'

    def submit_to_muscle(fasta=''):
        os.system('muscle -in ' + fasta + ' -out ' + fasta[:-3] +
                  '.muscle.fa -maxiters 100')
        return fasta[:-3] + '.muscle.fa'

    aligners = {
        'linsi': submit_to_linsi,
        'einsi': submit_to_einsi,
        'muscle': submit_to_muscle,
    }

    kingdic = {}
    kinglist = []
    Np = min(Np, len(cog_list))

    if Np < 2 or algo == 'linsi' or algo == 'einsi':
        for cog in cog_list:
            prefix = 'cdd.' + cog.split('|')[-1]
            seq_dic, tags = bitk.fastareader(prefix + '.fa')
            # The genome -> kingdom map is built from the first COG that
            # yields any record and reused for all later COGs.
            if not kingdic:
                tagsI = [int(tag) for tag in tags]
                taxinfo = mist.genomes.find({'_id': {'$in': tagsI}},
                                            {'ta': 1, 'n': 1})
                for i in taxinfo:
                    kingdic[i['_id']] = i['ta'][0]
                kinglist = list(set(kingdic.values()))
                print(kinglist)
            for kingdom in kinglist:
                print("Aligning " + cog + ' from kingdom ' + kingdom)
                records = ['>' + str(tag) + '\n' + seq_dic[str(tag)] + '\n'
                           for tag in tags if kingdic[int(tag)] == kingdom]
                fasta = prefix + '.' + kingdom + '.fa'
                with open(fasta, 'w') as f:
                    f.write(''.join(records))
                if algo not in aligners:
                    print("Alignment algorithm " + algo + " not supported")
                    sys.exit()
                aligners[algo](fasta)

    elif algo == 'muscle':
        pool = multip.Pool(processes=Np)
        try:
            per_cog = pool.map(doalnmuscle, cog_list)
        finally:
            # Release the worker processes; they were previously left running.
            pool.close()
            pool.join()

        # Merge the per-COG kingdom lists, keeping first-seen order.
        kinglist = []
        for kingdoms in per_cog:
            for kingdom in kingdoms:
                if kingdom not in kinglist:
                    kinglist.append(kingdom)

    print(kinglist)

    return kinglist
Exemplo n.º 11
0
# Pass 1: for every input file (arguments from index 2 on), count the header
# lines per known organism id; column f - 2 of orgid_dic[orgid] belongs to
# the file at sys.argv[f].
for f in range(2, len(sys.argv)):
    print(sys.argv[f])
    with open(sys.argv[f], 'r') as datafile:
        for line in datafile:
            if '>' not in line:
                continue
            orgid = line.split('-')[0].split('.')[-1]
            if orgid in orgid_list:
                orgid_dic[orgid][f - 2] += 1

print(orgid_dic)

# Pass 2: rewrite every input file, keeping only sequences of known organism
# ids and appending that organism's per-file counts to the header.
for f in range(2, len(sys.argv)):
    print('Working on file ' + sys.argv[f])
    seq_dic, seq_list = bitk.fastareader(sys.argv[f])
    output = ''
    for tag in seq_list:
        orgid = tag.split('-')[0].split('.')[-1]
        if orgid not in orgid_list:
            continue
        counts = '-'.join([str(i) for i in orgid_dic[orgid]])
        output += '>' + tag + '-' + counts + '\n' + seq_dic[tag] + '\n'
    with open(sys.argv[f][:-3] + '.countche.fa', 'w') as datafile:
        datafile.write(output)






Exemplo n.º 12
0
    orgid_list.append(orgid)
    orgid_dic[orgid] = copy.deepcopy(start_list)

datafile.close()

# Pass 1: for every input file (arguments from index 2 on), count the header
# lines per known organism id; column f - 2 of orgid_dic[orgid] belongs to
# the file at sys.argv[f].
for f in range(2, len(sys.argv)):
    print(sys.argv[f])
    with open(sys.argv[f], 'r') as datafile:
        for line in datafile:
            if '>' not in line:
                continue
            orgid = line.split('-')[0].split('.')[-1]
            if orgid in orgid_list:
                orgid_dic[orgid][f - 2] += 1

print(orgid_dic)

# Pass 2: rewrite every input file, keeping only sequences of known organism
# ids and appending that organism's per-file counts to the header.
for f in range(2, len(sys.argv)):
    print('Working on file ' + sys.argv[f])
    seq_dic, seq_list = bitk.fastareader(sys.argv[f])
    output = ''
    for tag in seq_list:
        orgid = tag.split('-')[0].split('.')[-1]
        if orgid not in orgid_list:
            continue
        counts = '-'.join([str(i) for i in orgid_dic[orgid]])
        output += '>' + tag + '-' + counts + '\n' + seq_dic[tag] + '\n'
    with open(sys.argv[f][:-3] + '.countche.fa', 'w') as datafile:
        datafile.write(output)
Exemplo n.º 13
0
def aligncogbykingdom(cog_list, algo='linsi', Np=20):
    """Split every COG FASTA file by kingdom and align each part.

    For each COG the file ``cdd.<id>.fa`` is read (``<id>`` is the text after
    the last ``|``), its sequences are grouped by the first ``ta`` entry of
    their genome in ``mist.genomes`` and written to
    ``cdd.<id>.<kingdom>.fa``, which is then aligned with the chosen program.

    Args:
        cog_list: COG identifiers to process.
        algo: ``'linsi'``, ``'einsi'`` or ``'muscle'``.
        Np: Maximum number of worker processes; capped at ``len(cog_list)``.
            Workers are only used for ``'muscle'`` when the capped value is
            at least 2, in which case each COG is handled by ``doalnmuscle``.

    Returns:
        The list of distinct kingdoms encountered.

    Raises:
        SystemExit: in the serial path, when *algo* is not supported (after
            the first per-kingdom file has been written).
    """
    def submit_to_linsi(fasta=''):
        os.system(' linsi --quiet --thread 12 ' + fasta + ' > ' +
                  fasta[:-3] + '.linsi.fa')
        return fasta[:-3] + '.linsi.fa'

    def submit_to_einsi(fasta=''):
        os.system(' einsi --quiet --thread 12 ' + fasta + ' > ' +
                  fasta[:-3] + '.einsi.fa')
        return fasta[:-3] + '.einsi.fa'

    def submit_to_muscle(fasta=''):
        os.system('muscle -in ' + fasta + ' -out ' + fasta[:-3] +
                  '.muscle.fa -maxiters 100')
        return fasta[:-3] + '.muscle.fa'

    aligners = {
        'linsi': submit_to_linsi,
        'einsi': submit_to_einsi,
        'muscle': submit_to_muscle,
    }

    kingdic = {}
    kinglist = []
    Np = min(Np, len(cog_list))

    if Np < 2 or algo == 'linsi' or algo == 'einsi':
        for cog in cog_list:
            prefix = 'cdd.' + cog.split('|')[-1]
            seq_dic, tags = bitk.fastareader(prefix + '.fa')
            # The genome -> kingdom map is built from the first COG that
            # yields any record and reused for all later COGs.
            if not kingdic:
                tagsI = [int(tag) for tag in tags]
                taxinfo = mist.genomes.find({'_id': {'$in': tagsI}},
                                            {'ta': 1, 'n': 1})
                for i in taxinfo:
                    kingdic[i['_id']] = i['ta'][0]
                kinglist = list(set(kingdic.values()))
                print(kinglist)
            for kingdom in kinglist:
                print("Aligning " + cog + ' from kingdom ' + kingdom)
                records = ['>' + str(tag) + '\n' + seq_dic[str(tag)] + '\n'
                           for tag in tags if kingdic[int(tag)] == kingdom]
                fasta = prefix + '.' + kingdom + '.fa'
                with open(fasta, 'w') as f:
                    f.write(''.join(records))
                if algo not in aligners:
                    print("Alignment algorithm " + algo + " not supported")
                    sys.exit()
                aligners[algo](fasta)

    elif algo == 'muscle':
        pool = multip.Pool(processes=Np)
        try:
            per_cog = pool.map(doalnmuscle, cog_list)
        finally:
            # Release the worker processes; they were previously left running.
            pool.close()
            pool.join()

        # Merge the per-COG kingdom lists, keeping first-seen order.
        kinglist = []
        for kingdoms in per_cog:
            for kingdom in kingdoms:
                if kingdom not in kinglist:
                    kinglist.append(kingdom)

    print(kinglist)

    return kinglist