def get_orthology_matrix_merging_plasmids_biosqldb(server, biodatabase_name): all_orthogroups = get_all_orthogroup_size(server, biodatabase_name) #server, db = manipulate_biosqldb.load_db(biodatabase_name) #sql='select orthogroup, count(*) from biosqldb.orthology_detail_%s group by orthogroup' % biodatabase_name # based on the new orthology detail table #all_orthogroups = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,)) n_groups = len(all_orthogroups.keys()) print("ngroups", n_groups) all_taxons = manipulate_biosqldb.get_taxon_id_list(server, biodatabase_name) ##print all_taxons #print 'get dico ortho count' detailed_orthology_count = {} for group in all_orthogroups.keys(): detailed_orthology_count[group] = {} for taxon_id in all_taxons: detailed_orthology_count[group][int(taxon_id)] = 0 for taxon_id in all_taxons: ##print taxon_id dico = manipulate_biosqldb.taxon_id2orthogroup_size( server, biodatabase_name, taxon_id) ##print dico #import time #time.sleep(3) ##print "taxon", taxon_id for group in dico.keys(): detailed_orthology_count[group][int(taxon_id)] += dico[group] #print 'count ok' ##print detailed_orthology_count return detailed_orthology_count
def get_mysql_table(db_name, table_name): import numpy as np server, db = manipulate_biosqldb.load_db(db_name) all_taxons_id = manipulate_biosqldb.get_taxon_id_list(server, db_name) sql_taxons = "id, " for i in range(0, len(all_taxons_id) - 1): sql_taxons += ' `%s`,' % all_taxons_id[i] sql_taxons += ' `%s`' % all_taxons_id[-1] sql = "select %s from comparative_tables.%s_%s" % (sql_taxons, table_name, db_name) mat = np.array(server.adaptor.execute_and_fetchall(sql, )) f = open("%s_matrix.tab" % table_name, "w") taxonid2genome = manipulate_biosqldb.taxon_id2genome_description( server, db_name, True) taxons_ids = [taxonid2genome[int(i)] for i in all_taxons_id] f.write('"id"\t"' + '"\t"'.join(taxons_ids) + '"\n') for row in mat: row = [str(i) for i in row] f.write("\t".join(row) + "\n")
def collect_COGs(db_name): from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(db_name) taxon_id_list = manipulate_biosqldb.get_taxon_id_list(server, db_name) sql_head = 'INSERT INTO comparative_tables.COG_%s (id,' % db_name for taxon in taxon_id_list: sql_head += '`%s`,' % taxon sql_head = sql_head[0:-1] + ') values (' all_cogs_ids_sql = 'select COG_id from COG.locus_tag2gi_hit_%s group by COG_id;' % db_name all_cogs_ids = [ i[0] for i in server.adaptor.execute_and_fetchall(all_cogs_ids_sql, ) ] database_id = server.adaptor.execute_and_fetchall( 'select biodatabase_id from biosqldb.biodatabase where name="%s"' % db_name)[0][0] i = 0 for accession in all_cogs_ids: print(i, '/', len(all_cogs_ids), accession) i += 1 sql = 'select B.taxon_id, A.count from (select accession,count(*) as count from ' \ 'COG.locus_tag2gi_hit_%s ' \ 'where COG_id = "%s" group by accession) A ' \ 'left join biosqldb.bioentry as B on A.accession=B.accession and biodatabase_id = %s;' % (db_name, accession, database_id) data = server.adaptor.execute_and_fetchall(sql, ) taxon2count = {} for n in data: if n[0] not in taxon2count: taxon2count[n[0]] = int(n[1]) else: taxon2count[n[0]] += int(n[1]) sql_temp = sql_head + '"%s",' % accession for taxon in taxon_id_list: try: sql_temp += '%s,' % taxon2count[int(taxon)] except: sql_temp += '0,' sql_temp = sql_temp[0:-1] + ');' server.adaptor.execute(sql_temp, ) server.adaptor.commit()
def collect_EC(db_name): from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(db_name) sql_db_id = 'select biodatabase_id from biodatabase where name="%s"' % db_name biodb_id = server.adaptor.execute_and_fetchall(sql_db_id,)[0][0] print "biodb_id", biodb_id taxon_id_list = manipulate_biosqldb.get_taxon_id_list(server, db_name) sql_head = 'INSERT INTO comparative_tables.EC_%s (id,' % db_name for taxon in taxon_id_list: sql_head += '`%s`,' % taxon sql_head = sql_head[0:-1] + ') values (' all_EC_ids_sql = 'select distinct ec from enzyme.locus2ec_%s as t1 inner join enzyme.enzymes as t2 on t1.ec_id=t2.enzyme_id;' % (db_name) all_ec_ids = [i[0] for i in server.adaptor.execute_and_fetchall(all_EC_ids_sql,)] i = 0 for accession in all_ec_ids: print i,'/', len(all_ec_ids), accession i+=1 sql = 'select taxon_id, count(*) from (select taxon_id, t1.accession, ec_id from enzyme.locus2ec_%s as t1 ' \ ' inner join biosqldb.bioentry as t2 on t1.accession=t2.accession where biodatabase_id=%s) A ' \ ' inner join enzyme.enzymes as B on A.ec_id=B.enzyme_id where ec="%s" group by taxon_id;' % (db_name, biodb_id, accession) print sql #sql= 'select taxon_id, count(*) from enzyme.locus2ec_%s ' \ # ' where ec = "%s" group by taxon_id;' % (db_name, accession) data = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,)) sql_temp = sql_head + '"%s",' % accession for taxon in taxon_id_list: try: sql_temp += '%s,' % data[str(taxon)] except: sql_temp += '0,' sql_temp = sql_temp[0:-1] + ');' print sql_temp server.adaptor.execute(sql_temp,) server.adaptor.commit()
def biodb2randomized_matrix(bio_db_name): server, db = manipulate_biosqldb.load_db(bio_db_name) matrix = np.array( manipulate_biosqldb.get_orthology_table(server, bio_db_name)) taxon_id2description = manipulate_biosqldb.taxon_id2genome_description( server, bio_db_name) #print taxon_id2description #group_names = matrix[:,0] taxons_ids = manipulate_biosqldb.get_taxon_id_list(server, bio_db_name) print 'Number of taxons:', len(taxons_ids) taxons_ids = [taxon_id2description[str(i)] for i in taxons_ids] import re for i, accession in enumerate(taxons_ids): #print i, accession description = taxons_ids[i] description = re.sub(", complete genome\.", "", description) description = re.sub(", complete genome", "", description) description = re.sub(", complete sequence\.", "", description) description = re.sub("strain ", "", description) description = re.sub("str\. ", "", description) description = re.sub(" complete genome sequence\.", "", description) description = re.sub(" complete genome\.", "", description) description = re.sub(" chromosome", "", description) description = re.sub(" DNA", "S.", description) description = re.sub("Merged record from ", "", description) description = re.sub(", wgs", "", description) description = re.sub("Candidatus ", "", description) description = re.sub(".contig.0_1, whole genome shotgun sequence.", "", description) description = re.sub("Protochlamydia", "P.", description) description = re.sub("Chlamydia", "C.", description) description = re.sub("Chlamydophila", "E.", description) description = re.sub("Estrella", "E.", description) description = re.sub("Rhodopirellula", "R.", description) description = re.sub("Methylacidiphilum", "M.", description) description = re.sub(" phage", "", description) description = re.sub("Parachlamydia", "P.", description) description = re.sub("Neochlamydia", "Neo.", description) description = re.sub("Simkania", "S.", description) description = re.sub("Waddlia", "W.", description) description = re.sub("Pirellula", "P.", description) description = re.sub("Rhabdochlamydiaceae sp.", "Rhabdo", description) taxons_ids[i] = description M = matrix.astype(float) # [:, 1:] M = heatmap.randomize_table(M) return (M, taxons_ids)
def create_comparative_tables(db_name, table_name): # create id column + one_column per taxon_id from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(db_name) taxon_id_list = manipulate_biosqldb.get_taxon_id_list(server, db_name) sql = "CREATE TABLE comparative_tables.%s_%s(id VARCHAR(100) NOT NULL" % (table_name, db_name) for i in taxon_id_list: sql+=" ,`%s` INT" % i sql+=")" server.adaptor.execute(sql)
def get_conserved_core_groups(server, biodatabase_name): all_taxons_id = manipulate_biosqldb.get_taxon_id_list( server, biodatabase_name) sql_taxons = "" for i in range(0, len(all_taxons_id) - 1): sql_taxons += ' `%s` = 1 and' % all_taxons_id[i] sql_taxons += ' `%s` = 1' % all_taxons_id[-1] sql = "select orthogroup from orthology_%s where %s;" % (biodatabase_name, sql_taxons) #print sql result = server.adaptor.execute_and_fetchall(sql, ) return [i[0] for i in result]
def collect_EC(db_name): from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(db_name) sql_db_id = 'select biodatabase_id from biodatabase where name="%s"' % db_name biodb_id = server.adaptor.execute_and_fetchall(sql_db_id, )[0][0] taxon_id_list = manipulate_biosqldb.get_taxon_id_list(server, db_name) sql_head = 'INSERT INTO comparative_tables.EC_%s (id,' % db_name for taxon in taxon_id_list: sql_head += '`%s`,' % taxon sql_head = sql_head[0:-1] + ') values (' all_EC_ids_sql = 'select distinct ec from enzyme.seqfeature_id2ec_%s t1 inner join enzyme.enzymes t2 on t1.ec_id=t2.enzyme_id;' % ( db_name) all_ec_ids = [ i[0] for i in server.adaptor.execute_and_fetchall(all_EC_ids_sql, ) ] i = 0 for accession in all_ec_ids: print(i, '/', len(all_ec_ids), accession) i += 1 sql = 'select taxon_id,count(*) as n from enzyme.seqfeature_id2ec_%s t1 inner join enzyme.enzymes t2 on t1.ec_id=t2.enzyme_id inner join annotation.seqfeature_id2locus_%s t3 on t1.seqfeature_id=t3.seqfeature_id where ec="%s" group by taxon_id,ec;' % ( db_name, db_name, accession) data = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql_temp = sql_head + '"%s",' % accession for taxon in taxon_id_list: try: sql_temp += '%s,' % data[str(taxon)] except: sql_temp += '0,' sql_temp = sql_temp[0:-1] + ');' server.adaptor.execute(sql_temp, ) server.adaptor.commit()
def collect_ko(db_name): from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(db_name) sql_db_id = 'select biodatabase_id from biodatabase where name="%s"' % db_name biodb_id = server.adaptor.execute_and_fetchall(sql_db_id, )[0][0] taxon_id_list = manipulate_biosqldb.get_taxon_id_list(server, db_name) sql_head = 'INSERT INTO comparative_tables.ko_%s (id,' % db_name for taxon in taxon_id_list: sql_head += '`%s`,' % taxon sql_head = sql_head[0:-1] + ') values (' all_ko_ids_sql = 'select distinct ko_id from enzyme.locus2ko_%s;' % ( db_name) all_ko_ids = [ i[0] for i in server.adaptor.execute_and_fetchall(all_ko_ids_sql, ) ] i = 0 for accession in all_ko_ids: print(i, '/', len(all_ko_ids), accession) i += 1 sql = 'select taxon_id, count(*) from enzyme.locus2ko_%s where ko_id="%s" group by taxon_id;' % ( db_name, accession) data = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql_temp = sql_head + '"%s",' % accession for taxon in taxon_id_list: try: sql_temp += '%s,' % data[str(taxon)] except: sql_temp += '0,' sql_temp = sql_temp[0:-1] + ');' server.adaptor.execute(sql_temp, ) server.adaptor.commit()
def collect_interpro(db_name): from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(db_name) taxon_id_list = manipulate_biosqldb.get_taxon_id_list(server, db_name) sql_head = 'INSERT INTO comparative_tables.interpro_%s (id,' % db_name for taxon in taxon_id_list: sql_head += '`%s`,' % taxon sql_head = sql_head[0:-1] + ') values (' import sys #sys.exit() all_interpro_ids_sql = 'select interpro_accession from biosqldb.interpro_%s ' \ ' where interpro_accession != "0" group by interpro_accession;' % db_name all_interpro_ids = [ i[0] for i in server.adaptor.execute_and_fetchall(all_interpro_ids_sql, ) ] i = 0 for accession in all_interpro_ids: print(i, '/', len(all_interpro_ids), accession) i += 1 sql= 'select A.taxon_id, count(*) as n from (select taxon_id, locus_tag, interpro_accession ' \ ' from biosqldb.interpro_%s where interpro_accession="%s" group by locus_tag) A group by taxon_id;' % (db_name, accession) data = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql_temp = sql_head + '"%s",' % accession for taxon in taxon_id_list: try: sql_temp += '%s,' % data[str(taxon)] except: sql_temp += '0,' sql_temp = sql_temp[0:-1] + ');' server.adaptor.execute(sql_temp, ) server.adaptor.commit()
def write_ortho_matrix(bio_db_name): server, db = manipulate_biosqldb.load_db(bio_db_name) matrix = np.array( manipulate_biosqldb.get_orthology_table(server, bio_db_name)) taxon_id2description = manipulate_biosqldb.taxon_id2genome_description( server, bio_db_name) group_names = matrix[:, 0] taxons_ids = manipulate_biosqldb.get_taxon_id_list(server, bio_db_name) import re for i in taxon_id2description.keys(): taxon_id2description[i] = re.sub(" subsp\. aureus", "", taxon_id2description[i]) taxon_id2description[i] = re.sub(", complete genome\.", "", taxon_id2description[i]) taxon_id2description[i] = re.sub(", complete sequence\.", "", taxon_id2description[i]) taxon_id2description[i] = re.sub("strain ", "", taxon_id2description[i]) taxon_id2description[i] = re.sub("str\. ", "", taxon_id2description[i]) taxon_id2description[i] = re.sub(" complete genome sequence\.", "", taxon_id2description[i]) taxon_id2description[i] = re.sub(" complete genome\.", "", taxon_id2description[i]) taxon_id2description[i] = re.sub(" chromosome", "", taxon_id2description[i]) taxon_id2description[i] = re.sub(" DNA", "", taxon_id2description[i]) taxons_ids = [taxon_id2description[str(i)] for i in taxons_ids] #print taxon_id2description f = open("ortho_matrix.tab", "w") f.write("orthogroup\t" + "\t".join(taxons_ids) + "\n") for row in matrix: f.write("\t".join(row) + "\n") f.close()
def collect_pfam(db_name): from chlamdb.biosqldb import manipulate_biosqldb server, db = manipulate_biosqldb.load_db(db_name) taxon_id_list = manipulate_biosqldb.get_taxon_id_list(server, db_name) sql_head = 'INSERT INTO comparative_tables.Pfam_%s (id,' % db_name for taxon in taxon_id_list: sql_head += '`%s`,' % taxon sql_head = sql_head[0:-1] + ') values (' all_pfam_ids_sql = 'select signature_accession from interpro_%s where analysis="Pfam" ' \ 'group by signature_accession;' % db_name all_pfam_ids = [ i[0] for i in server.adaptor.execute_and_fetchall(all_pfam_ids_sql, ) ] i = 0 for accession in all_pfam_ids: print(i, '/', len(all_pfam_ids), accession) i += 1 sql = 'select taxon_id, count(*) from biosqldb.interpro_%s ' \ ' where analysis="Pfam" and signature_accession="%s" group by taxon_id;' % (db_name, accession) data = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) sql_temp = sql_head + '"%s",' % accession for taxon in taxon_id_list: try: sql_temp += '%s,' % data[str(taxon)] except: sql_temp += '0,' sql_temp = sql_temp[0:-1] + ');' server.adaptor.execute(sql_temp, ) server.adaptor.commit()
def plot_orthogroup_size_distrib(server, biodatabase_name, out_name="orthogroup_size_distrib.svg", taxon_id=False): from collections import Counter import numpy as np import pandas import itertools import matplotlib.pyplot as plt import matplotlib as mpl import math import re if not taxon_id: all_taxons = manipulate_biosqldb.get_taxon_id_list( server, biodatabase_name) for key in all_taxons: sql = 'select description from bioentry where taxon_id=%s and description not like "%%%%plasmid%%%%" limit 1;' #print sql, key description = server.adaptor.execute_and_fetchall(sql, key)[0][0] #print all_taxons all_data = {} for taxon in all_taxons: sql = 'select orthogroup,`%s` from comparative_tables.orthology_%s where `%s` >0' % ( taxon, biodatabase_name, taxon) result = manipulate_biosqldb._to_dict( server.adaptor.execute_and_fetchall(sql, )) all_data[taxon] = Counter(result.values()) # get maximum group size and max number of proteins for a single group max_orthogroup_size = 0 max_n_groups = 0 for key in all_data.keys(): dico = all_data[key] temp = int(max(dico.keys())) temp2 = int(max(dico.values())) if temp > max_orthogroup_size: max_orthogroup_size = temp if temp2 > max_n_groups: max_n_groups = temp2 #for key in all_data.keys(): # dico= all_data[key] # for i in range(1, max_orthogroup_size): # if i not in dico.keys(): # dico[i] = 0 # transform into pandas serie objects complete_data = {} for key in all_data.keys(): dico = all_data[key] df = pandas.Series(dico) complete_data[key] = df # create figure with one subplot per genome # plot per genome group size distribution n = int(math.ceil(len(all_taxons) / 2.0)) #print n if n % 2 == 0: n += 1 fig, axes = plt.subplots(nrows=n, ncols=2) for key, location in zip(complete_data.keys(), list(itertools.product(range(n), repeat=2))): serie = complete_data[key] sql = 'select description from bioentry where taxon_id=%s and description not like "%%%%plasmid%%%%" limit 1;' #print sql, key description = server.adaptor.execute_and_fetchall(sql, key)[0][0] #print description description = re.sub(", complete genome\.", "", description) description = re.sub(", complete genome", "", description) description = re.sub(", complete sequence\.", "", description) description = re.sub("strain ", "", description) description = re.sub("str\. ", "", description) description = re.sub(" complete genome sequence\.", "", description) description = re.sub(" complete genome\.", "", description) description = re.sub(" chromosome", "", description) description = re.sub(" DNA", "S.", description) description = re.sub("Merged record from ", "", description) description = re.sub(", wgs", "", description) description = re.sub("Candidatus ", "", description) description = re.sub(".contig.0_1, whole genome shotgun sequence.", "", description) p = serie.plot(ax=axes[location[1], location[0]], kind="bar", alpha=1, color='r') p.set_ylim(-100, max_n_groups + 0.1 * max_n_groups) p.set_title(description) for rect in p.patches: height = rect.get_height() p.text(rect.get_x() + rect.get_width() / 2., height + 0.02 * max_n_groups, int(height), ha='center', va='bottom') # plot all genomes group size distribution #server, db = manipulate_biosqldb.load_db(db_name) sql = 'select orthogroup, count(*) from biosqldb.orthology_detail_%s group by orthogroup' % biodatabase_name # based on the new orthology detail table all_grp_size = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) # based on biosqldb tables #all_grp_size = get_all_orthogroup_size(server, biodatabase_name) df = pandas.Series(Counter(all_grp_size.values())) p = df.plot(kind="bar", alpha=0.5, color='b') p.set_ylim(-100, max(df) + 0.1 * max(df)) p.set_title("Orthogroup size distribution") # add group size as text at the top of the bars for rect in p.patches: height = rect.get_height() p.text(rect.get_x() + rect.get_width() / 2., height + 0.02 * max_n_groups, int(height), ha='center', va='bottom') fig.set_size_inches(18, 6 * n) plt.savefig(out_name, format="svg") if taxon_id: #print "ID!!!" sql = 'select orthogroup,`%s` from %s where `%s` >0' othogroup_id2size = manipulate_biosqldb._to_dict( server.adaptor.execute_and_fetchall( sql, (taxon_id, biodatabase_name, taxon_id))) serie = pandas.Series(Counter(othogroup_id2size.values())) p = serie.plot(kind="bar", alpha=0.5, color='b') p.set_ylim(-100, max(serie) + 0.1 * max(serie)) sql = 'select description from bioentry where taxon_id=%s limit 1;' description = server.adaptor.execute_and_fetchall(sql, taxon_id) p.set_title(description[0][0]) for rect in p.patches: height = rect.get_height() p.text(rect.get_x() + rect.get_width() / 2., height + 0.02 * max(serie), int(height), ha='center', va='bottom') #plt.show() plt.savefig(out_name, format="svg")
print 'getting single copy core groups' core_ortho = get_conserved_core_groups(server, args.db_name) import shutil for group in core_ortho: shutil.copy( os.path.join( asset_path, "%s_fasta_by_taxons/%s.txt" % (args.db_name, group)), os.path.join(asset_path, "%s_fasta_core/%s.txt" % (args.db_name, group))) get_nucleotide_core_fasta(server, db, args.db_name, "./nucl") if args.core_groups_path: taxon_ids = manipulate_biosqldb.get_taxon_id_list(server, args.db_name) sql_include = '' if len(taxon_ids) > 0: for i in range(0, len(taxon_ids) - 1): sql_include += ' `%s` = 1 and ' % taxon_ids[i] sql_include += '`%s` = 1' % taxon_ids[-1] sql = 'select orthogroup from orthology_%s where %s' % (args.db_name, sql_include) print sql group_list = [i[0] for i in server.adaptor.execute_and_fetchall(sql, )] locus_or_protein_id2taxon_id = manipulate_biosqldb.locus_or_protein_id2taxon_id( server, args.db_name) print 'Number of core groups: %s' % len(group_list) get_one_group_data_taxon(group_list, locus_or_protein_id2taxon_id, args.db_name, args.core_groups_path)