def get_pairwise_connexions(accession_1, accession_2, biodb):
    '''
    Build a table of homolog coordinates linking two records of a biodatabase.

    The closest-homolog pairs stored in
    comparative_tables.identity_closest_homolog2_<biodb> for the taxons of the
    two accessions are looked up, and the start/stop of both loci are attached
    to each pair.

    :param accession_1: accession of the first record (its taxon is used as taxon_1)
    :param accession_2: accession of the second record (its taxon is used as taxon_2)
    :param biodb: biodatabase name, used as suffix of the queried tables
    :return: pandas.DataFrame with columns start1, end1, start2, end2, identity;
        pairs for which one locus has no known location on the two accessions
        are left out. The frame is empty when no pair could be located.
    '''
    import manipulate_biosqldb
    import numpy
    import pandas

    server, db = manipulate_biosqldb.load_db(biodb)

    # presumably to_dict maps the first column to the remaining ones, i.e.
    # seqfeature_id -> (start, stop) -- verify against manipulate_biosqldb
    sql1 = 'select seqfeature_id, start, stop from biosqldb.orthology_detail_%s where accession in ("%s","%s") ' % (biodb, accession_1, accession_2)
    seqfeature_id2location = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql1,))
    print(list(seqfeature_id2location.keys())[0:10])

    sql2 = 'select accession, taxon_id from biodatabase t1 inner join bioentry t2 on t1.biodatabase_id=t2.biodatabase_id' \
           ' where t1.name="%s"' % biodb
    print(sql2)
    accession2taxon_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql2,))

    comp1_sql = 'select locus_1,locus_2,identity from (select * from ' \
                ' comparative_tables.identity_closest_homolog2_%s where taxon_1=%s and taxon_2=%s) A ' \
                ' inner join biosqldb.orthology_detail_%s B on A.locus_1=B.seqfeature_id;' % (biodb,
                                                                                           accession2taxon_id[accession_1],
                                                                                           accession2taxon_id[accession_2],
                                                                                           biodb)
    data = server.adaptor.execute_and_fetchall(comp1_sql,)

    comparison_table = []
    for row in data:
        print(row)
        try:
            location_1 = seqfeature_id2location[int(row[0])]
            location_2 = seqfeature_id2location[int(row[1])]
        except (KeyError, TypeError, ValueError):
            # one locus of the pair is not located on the two accessions
            # (or its id is not a usable integer): skip the pair
            continue
        # index 0 is the start, index 1 the stop of the (start, stop) pair;
        # the stop columns must not reuse the start value
        comparison_table.append([location_1[0], location_1[1],
                                 location_2[0], location_2[1],
                                 row[2]])

    columns = ['start1', 'end1', 'start2', 'end2', 'identity']
    # reshape keeps a 2-D (n, 5) array even when no pair was found, so that an
    # empty result gives an empty DataFrame instead of a shape error
    data = numpy.array(comparison_table).reshape(-1, len(columns))
    df = pandas.DataFrame(data, columns=columns)
    return df
def get_profile_fasta(biodb, taxon_id):
    '''
    Write the orthogroup presence/absence profile of every taxon as a
    pseudo-sequence in "profiles_all.fasta" (current working directory).

    - the orthology table is restricted to the orthogroups of <taxon_id>
    - counts larger than 1 are set to 1
    - the table is transposed => each row is a different taxon, and each row
      is written as one fasta record named after the taxon accession

    :param biodb: biodatabase name, used as suffix of the queried tables
    :param taxon_id: taxon whose loci select the rows of the orthology table
    :return: None
    '''
    import manipulate_biosqldb
    import pandas
    import numpy
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio import SeqIO

    server, db = manipulate_biosqldb.load_db(biodb)

    # accessions of all non-plasmid records of the biodatabase
    accession_query = 'select taxon_id, accession from biodatabase t1 inner join bioentry t2 on t1.biodatabase_id=t2.biodatabase_id' \
                      ' where (t1.name="%s" and t2.description not like "%%%%plasmid%%%%")' % biodb
    taxon2accession = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(accession_query, ))

    # taxon ids are column names of the orthology table: backquote them
    quoted_taxon_columns = '`%s`' % '`,`'.join(taxon2accession.keys())

    profile_query = 'select t2.locus_tag,%s from comparative_tables.orthology_%s t1 inner join orthology_detail_%s t2 on t1.orthogroup=t2.orthogroup' \
                    ' where t2.taxon_id=%s' % (quoted_taxon_columns, biodb, biodb, taxon_id)
    columns_query = 'show columns from comparative_tables.orthology_%s' % (biodb)

    profile_rows = server.adaptor.execute_and_fetchall(profile_query, )
    matrix = numpy.array([list(one_row) for one_row in profile_rows])
    column_rows = server.adaptor.execute_and_fetchall(columns_query, )
    all_cols = [column[0] for column in column_rows]

    # NOTE(review): the frame is labelled with the complete column list of the
    # orthology table while the query selects locus_tag + the non-plasmid
    # taxon columns -- confirm that both always line up
    count_df = pandas.DataFrame(matrix, columns=all_cols)
    count_df = count_df.set_index(['orthogroup'])
    count_df = count_df.apply(pandas.to_numeric, args=('coerce', ))
    # presence/absence only: any count above 1 becomes 1
    count_df[(count_df > 1)] = 1

    transposed_table = count_df.transpose()

    all_records = []
    for taxon, counts in transposed_table.iterrows():
        profile = ''.join([str(value) for value in counts])
        all_records.append(SeqRecord(Seq(profile),
                                     id=taxon2accession[taxon],
                                     description=""))

    with open("profiles_all.fasta", 'w') as fasta_handle:
        SeqIO.write(all_records, fasta_handle, 'fasta')
def get_module_count_all_db(biodb, category=False):
    '''
    Count, for each KEGG module, the KO it contains and the KO found in <biodb>.

    :param biodb: <biodatabase name>
    :param category: KEGG module sub category (optional); when given, only the
        modules whose module_sub_cat equals it are reported
    :return: result of manipulate_biosqldb.to_dict on rows of
        (module_name, count_all, count_db, count_db/count_all), where count_all
        is the number of KO of the module in enzyme.module2ko and count_db the
        number of distinct KO of enzyme.locus2ko_<biodb> belonging to the module
    '''
    import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    # the id itself is not used below; the lookup raises IndexError when the
    # biodatabase name is unknown
    sql_biodb_id = 'select biodatabase_id from biodatabase where name="%s"' % biodb
    database_id = server.adaptor.execute_and_fetchall(sql_biodb_id, )[0][0]

    # optional restriction on the module sub category
    if category:
        category_clause = ' where module_sub_cat="%s"' % (category,)
    else:
        category_clause = ''

    # AA: distinct KO of the database joined with the modules (count_db)
    # BB: total number of KO of each module (count_all)
    query_template = ('select BB.module_name,count_all,count_db,count_db/count_all from (select module_id, count(*) '
                      ' as count_db from (select distinct ko_id from enzyme.locus2ko_%s) as t1'
                      ' inner join enzyme.module2ko as t2 on t1.ko_id=t2.ko_id group by module_id) AA '
                      ' right join (select t1.module_id,module_name, count_all from (select module_id, count(*) as count_all '
                      'from enzyme.module2ko group by module_id) t1 inner join enzyme.kegg_module as t2 '
                      'on t1.module_id=t2.module_id%s)BB on AA.module_id=BB.module_id;')
    sql_pathway_count = query_template % (biodb, category_clause)

    rows = server.adaptor.execute_and_fetchall(sql_pathway_count, )
    return manipulate_biosqldb.to_dict(rows)
def plot_tree(ete3_tree, orthogroup, biodb, mysql_host="localhost", mysql_user="******", mysql_pwd="baba", mysql_db="blastnr"):
    '''
    Annotate the leaves of a phylogeny with the taxonomy of their BLAST hits.

    The tree is modified in place: it is rooted on its midpoint outgroup, and
    each leaf is renamed and given a name face coloured by phylum.

    - leaves found among the swissprot/refseq hits of <orthogroup> are named
      "<name>|<scientific name>-<phylum>" and coloured according to the phylum
    - all other leaves are shown in red and named "<name>| <organism>"
      ("<name>| ??" if the locus is unknown in orthology_detail_<biodb>)

    :param ete3_tree: ete3 tree whose leaf names are hit accessions or locus tags
    :param orthogroup: orthogroup whose BLAST hits are retrieved
    :param biodb: biodatabase name, used as suffix of the queried tables
    :param mysql_host: MySQL host
    :param mysql_user: MySQL user
    :param mysql_pwd: MySQL password
    :param mysql_db: MySQL database holding the blast tables
    :return: (ete3_tree, TreeStyle) with leaf names hidden and branch support shown
    '''
    import MySQLdb
    import manipulate_biosqldb
    from ete3 import TreeStyle, AttrFace

    # same query for both hit tables, only the table prefix differs
    hits_template = ('select subject_accession,subject_scientific_name,t4.phylum from biosqldb.orthology_detail_%s t1 '
                     ' inner join custom_tables.locus2seqfeature_id_%s t2 '
                     ' on t1.locus_tag=t2.locus_tag '
                     ' inner join %s_%s t3 on t2.seqfeature_id=t3.seqfeature_id '
                     ' inner join blastnr_taxonomy as t4 on t3.subject_taxid=t4.taxon_id '
                     ' where t1.orthogroup="%s"')

    conn = MySQLdb.connect(host=mysql_host,  # your host, usually localhost
                           user=mysql_user,  # your username
                           passwd=mysql_pwd,  # your password
                           db=mysql_db)  # name of the data base
    try:
        cursor = conn.cursor()

        print('get uniprot taxnomy')
        sql1 = hits_template % (biodb, biodb, 'blast_swissprot', biodb, orthogroup)
        cursor.execute(sql1, )
        accession2name_and_phylum = manipulate_biosqldb.to_dict(cursor.fetchall())

        print('get refseq taxonomy')
        sql2 = hits_template % (biodb, biodb, 'blastnr', biodb, orthogroup)
        print(sql2)
        cursor.execute(sql2, )
        accession2name_and_phylum.update(manipulate_biosqldb.to_dict(cursor.fetchall()))

        sql = 'select locus_tag, organism from biosqldb.orthology_detail_%s' % biodb
        cursor.execute(sql, )
        locus2organism = manipulate_biosqldb.to_dict(cursor.fetchall())
    finally:
        # every result set was fetched: release the connection in all cases
        conn.close()

    print('plotting tree')

    # one colour per phylum seen among the hits
    phylum_list = list(set([name_and_phylum[1] for name_and_phylum in accession2name_and_phylum.values()]))
    phylum2col = dict(zip(phylum_list, get_spaced_colors(len(phylum_list))))

    # root the tree on its midpoint outgroup
    R = ete3_tree.get_midpoint_outgroup()
    ete3_tree.set_outgroup(R)

    for lf in ete3_tree.iter_leaves():
        try:
            name_and_phylum = accession2name_and_phylum[lf.name]
            col = phylum2col[name_and_phylum[1]]
            lf.name = '%s|%s-%s' % (lf.name, name_and_phylum[0], name_and_phylum[1])
        except (KeyError, IndexError):
            # not a BLAST hit: fall back on the organism of the locus
            col = 'red'
            lf.name = '%s| %s' % (lf.name, locus2organism.get(lf.name, '??'))
        ff = AttrFace("name", fsize=12)
        ff.fgcolor = col
        lf.add_face(ff, column=0)

    ts = TreeStyle()
    # names are drawn by the faces added above
    ts.show_leaf_name = False
    ts.show_branch_support = True
    return ete3_tree, ts
outname="%s_swiss_homologs.faa" % grp, swissprot=True, refseq=True) if alignment: t = aafasta2phylogeny("%s_swiss_homologs.faa" % grp) tree, ts = plot_tree(t, grp, "chlamydia_04_16", mysql_pwd=sqlpsw) out_name = "%s.svg" % grp tree.render(out_name, tree_style=ts) else: server, db = manipulate_biosqldb.load_db(args.biodb) sql = 'select orthogroup, count(*) as n from orthology_detail_%s group by orthogroup' % args.biodb print('gettig orthogroup2n_hits refseq') orthgroup2orthogroup_size = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql, )) filter = '"' + '","'.join(exclude) + '"' sql2 = 'select orthogroup, count(*) from ' \ ' (select locus_tag, count(*) as n from custom_tables.locus2seqfeature_id_%s t1 ' \ ' inner join blastnr.blastnr_%s as t2 on t1.seqfeature_id=t2.seqfeature_id ' \ ' inner join blastnr.blastnr_taxonomy t3 on t2.subject_taxid=t3.taxon_id ' \ ' where t3.phylum not in (%s) group by t1.seqfeature_id) A ' \ ' inner join biosqldb.orthology_detail_%s B on A.locus_tag=B.locus_tag ' \ ' group by orthogroup;' % (args.biodb, args.biodb, filter, args.biodb) group2n_blast_refseq = manipulate_biosqldb.to_dict( server.adaptor.execute_and_fetchall(sql2, )) print('gettig orthogroup2n_hits swissprot')
def get_set_data(biodb, set_list_restrict=None, frequency=False, six_frame_translation=False, return_lists=False, score_cutoff=0):
    '''
    Count, for each taxon, the HMM of each HMM set having at least one hit.

    :param biodb: biodatabase name, used as suffix of the queried tables
    :param set_list_restrict: restrict the counts to specific sets, given by
        their entry in hmm.hmm_sets as returned by to_dict (None or an empty
        list means all sets)
    :param frequency: return a percentage instead of a count: the count is
        divided by the number of rows of the taxon in the COG hit table
        (NOTE(review): the original documentation announced "n genes
        identified/n genes in the set" -- confirm which denominator is wanted)
    :param six_frame_translation: get data from the six frame translation
        analysis instead of the annotated genome
    :param return_lists: return the taxon2list dictionary instead of the
        set2taxon2count dictionary
    :param score_cutoff: only hits with a bitscore above this value are counted
    :return: (set2taxon2count, set_list) OR, if return_lists, (taxon2list, set_list).
        set2taxon2count: set -> str(taxon_id) -> count or percentage;
        taxon2list: taxon_id -> list of the set ids found in the taxon
        (never restricted); set_list: all the sets seen, in order of first
        occurrence (never restricted)
    '''
    import manipulate_biosqldb

    if set_list_restrict is None:
        set_list_restrict = []

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select biodatabase_id from biodatabase where name="%s"' % biodb
    db_id = server.adaptor.execute_and_fetchall(sql, )[0][0]

    if six_frame_translation:
        hmm_table = 'hmm_hits_six_frame_genome'
    else:
        hmm_table = 'hmm_hits_annotated_genome'

    # inner query: one row per (taxon, set, hmm) with a hit above the cutoff
    # outer query: number of such hmm per (taxon, set)
    sql = 'select taxon_id,set_id, count(*) from ' \
          ' (select t1.*,t2.set_id from hmm.%s_%s t1 ' \
          ' inner join hmm.hmm_sets_entry t2 on t1.hmm_id=t2.hmm_id where t1.bitscore>%s' \
          ' group by taxon_id,set_id,t1.hmm_id) A group by taxon_id,set_id;' % (hmm_table, biodb, score_cutoff)
    data = server.adaptor.execute_and_fetchall(sql, )

    taxon_id2count = {}
    if frequency:
        sql = 'select taxon_id,count(*) as n from COG.locus_tag2gi_hit_%s t1 ' \
              ' inner join COG.cog_names_2014 t2 on t1.COG_id=t2.COG_id ' \
              ' inner join biosqldb.bioentry as t3 on t1.accession=t3.accession ' \
              ' where biodatabase_id=%s group by taxon_id;' % (biodb, db_id)
        taxon_id2count = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select * from hmm.hmm_sets'
    set_id2description = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql, ))

    set2taxon2count = {}
    taxon2list = {}
    set_list = []
    seen_sets = set()
    for taxon_id, set_id, n_hmm in data:
        taxon2list.setdefault(taxon_id, []).append(set_id)

        set_name = set_id2description[str(set_id)]
        if set_name not in seen_sets:
            seen_sets.add(set_name)
            set_list.append(set_name)

        # the restriction only applies to set2taxon2count
        if set_list_restrict and set_name not in set_list_restrict:
            continue

        taxon_counts = set2taxon2count.setdefault(set_name, {})
        if frequency:
            taxon_counts[str(taxon_id)] = round(
                (float(n_hmm) / float(taxon_id2count[str(taxon_id)])) * 100, 2)
        else:
            taxon_counts[str(taxon_id)] = int(n_hmm)

    if return_lists:
        return taxon2list, set_list
    return set2taxon2count, set_list
import argparse
import manipulate_biosqldb

# command line: -i <input file> -l <locus tag prefix>
parser = argparse.ArgumentParser()
parser.add_argument("-i", '--input', type=str, help="input genbank")
parser.add_argument("-l", '--locus', type=str, help="locus_tag_prefix")
args = parser.parse_args()

# residues counted in each protein (cysteine and selenocysteine one-letter
# codes, both cases)
target_aa = ['U', 'C', 'u', 'c']

server, db = manipulate_biosqldb.load_db('chlamydia_03_15')

sql = 'select locus_tag, SP, TM from orthology_detail_chlamydia_03_15'
sql2 = 'select protein_id, locus_tag from orthology_detail_chlamydia_03_15'

protein_id2locus_tag = manipulate_biosqldb.to_dict(
    server.adaptor.execute_and_fetchall(sql2, ))
data = server.adaptor.execute_and_fetchall(sql, )

# locus_tag -> [SP, TM]
locus_tag2SP_TM = dict((row[0], [row[1], row[2]]) for row in data)

from Bio import SeqIO

# NOTE(review): the option help announces a genbank file but the input is
# parsed as fasta -- confirm the expected format
handle = open(args.input, "rU")
print('protein_id\tlocus\ttransmembrane_domains\tsignal_peptide\tcystein(%)\tn_C_U\tprotein_length\tdescription')
for record in SeqIO.parse(handle, "fasta"):
    protein_length = len(record.seq)
    # number of target residues in the protein
    target_n = sum(1 for aa in record.seq if aa in target_aa)
def get_whole_db_uniprot_crossref(biodb):
    '''
    Map every locus of <biodb> to a UniProt entry and store its GO terms,
    annotations and database cross references in the custom_tables database.

    For each locus having a protein_id (CHUV protein ids excluded) and not yet
    present in uniprot_annotation_<biodb>, the UniProt accession is searched
    successively with:

    1. the protein id and the genome accession
    2. the protein id alone
    3. the old locus tag (if any) and the genus of the organism
    4. the protein sequence

    The tables uniprot_id2seqfeature_id_<biodb>, db_xref,
    uniprot_db_xref_<biodb>, uniprot_go_terms_<biodb> and
    uniprot_annotation_<biodb> are created if missing.

    All values are sent to MySQL as query parameters: they come from UniProt
    and may contain quotes or backslashes.

    :param biodb: biodatabase name, used as suffix of the tables
    :return: None
    '''
    import MySQLdb
    from datetime import datetime
    import time
    import manipulate_biosqldb
    import re

    # columns of uniprot_annotation_<biodb>, identical to the keys of the
    # annotation dictionary
    annotation_fields = ("comment_function",
                         "ec_number",
                         "comment_similarity",
                         "comment_catalyticactivity",
                         "comment_pathway",
                         "keywords",
                         "comment_subunit",
                         "gene",
                         "recommendedName_fullName",
                         "proteinExistence",
                         "developmentalstage")

    # NOTE(review): credentials are hard coded here; they should come from the
    # environment (see the SQLPSW variable used elsewhere) or from the caller
    conn = MySQLdb.connect(host="localhost",  # your host, usually localhost
                           user="******",  # your username
                           passwd="estrella3",  # your password
                           db="custom_tables")  # name of the data base
    try:
        cursor = conn.cursor()

        sql1 = 'CREATE TABLE IF NOT EXISTS uniprot_id2seqfeature_id_%s (seqfeature_id INT UNIQUE, uniprot_id INT AUTO_INCREMENT,' \
               ' uniprot_accession varchar(400), uniprot_status varchar(400), annotation_score INT, insert_date varchar(300), INDEX uniprot_id(uniprot_id))' % biodb
        sql2 = 'CREATE TABLE IF NOT EXISTS db_xref (db_xref_id INT AUTO_INCREMENT, db_xref_name varchar(200) UNIQUE, INDEX db_xref_id(db_xref_id))'
        sql3 = 'CREATE TABLE IF NOT EXISTS uniprot_db_xref_%s (uniprot_id INT, db_xref_id INT, db_accession varchar(200), ' \
               ' INDEX db_xref_id(db_xref_id), index uniprot_id(uniprot_id))' % biodb
        sql4 = 'CREATE TABLE IF NOT EXISTS uniprot_go_terms_%s (seqfeature_id INT, go_term_id varchar(400), go_description TEXT, ' \
               ' INDEX seqfeature_id(seqfeature_id))' % biodb
        sql5 = 'CREATE TABLE IF NOT EXISTS uniprot_annotation_%s (seqfeature_id INT, comment_function TEXT,' \
               ' ec_number TEXT,comment_similarity TEXT,comment_catalyticactivity TEXT,comment_pathway TEXT,keywords TEXT,' \
               ' comment_subunit TEXT, gene TEXT, recommendedName_fullName TEXT, proteinExistence TEXT, ' \
               ' developmentalstage TEXT, index seqfeature_id(seqfeature_id))' % biodb
        print(sql1)
        for create_sql in (sql1, sql2, sql3, sql4, sql5):
            cursor.execute(create_sql, )
        conn.commit()

        sql1 = 'select locus_tag, seqfeature_id from locus2seqfeature_id_%s' % biodb
        sql2 = 'select locus_tag, old_locus_tag from biosqldb.locus_tag2old_locus_tag'
        # attention EDIT!!!!!!!!!!!!!!
        sql3 = 'select locus_tag, protein_id from biosqldb.orthology_detail_%s where protein_id not like "%%%%CHUV%%%%"' % biodb
        # loci already annotated by a previous run
        sql4 = 'select locus_tag,t2.seqfeature_id from locus2seqfeature_id_%s t1 inner join uniprot_annotation_%s t2' \
               ' on t1.seqfeature_id=t2.seqfeature_id group by locus_tag;' % (biodb, biodb)
        sql5 = 'select locus_tag, organism from biosqldb.orthology_detail_%s' % biodb
        sql6 = 'select locus_tag, translation from biosqldb.orthology_detail_%s' % biodb
        sql7 = 'select locus_tag, accession from biosqldb.orthology_detail_%s' % biodb

        cursor.execute(sql1, )
        locus2seqfeature_id = manipulate_biosqldb.to_dict(cursor.fetchall())
        cursor.execute(sql2, )
        locus2old_locus = manipulate_biosqldb.to_dict(cursor.fetchall())
        cursor.execute(sql3, )
        locus2protein_id = manipulate_biosqldb.to_dict(cursor.fetchall())
        cursor.execute(sql4, )
        locus2uniprot_id = manipulate_biosqldb.to_dict(cursor.fetchall())
        cursor.execute(sql5, )
        locus2organism = manipulate_biosqldb.to_dict(cursor.fetchall())
        cursor.execute(sql6, )
        locus2sequence = manipulate_biosqldb.to_dict(cursor.fetchall())
        cursor.execute(sql7, )
        locus2genome_accession = manipulate_biosqldb.to_dict(cursor.fetchall())

        # statements with placeholders: only the table suffix is formatted in,
        # the values are passed to cursor.execute
        insert_go_sql = 'insert into uniprot_go_terms_%s (seqfeature_id, go_term_id, go_description) ' \
                        'values (%%s, %%s, %%s)' % biodb
        insert_uniprot_sql = 'insert into uniprot_id2seqfeature_id_%s (seqfeature_id,uniprot_accession, uniprot_status, annotation_score,insert_date) ' \
                             ' values (%%s, %%s, %%s, %%s, %%s)' % biodb
        select_uniprot_sql = 'select t1.uniprot_id from uniprot_id2seqfeature_id_%s as t1 where t1.seqfeature_id=%%s' % biodb
        insert_annotation_sql = 'insert into uniprot_annotation_%s (seqfeature_id, %s) values (%s)' % (
            biodb,
            ', '.join(annotation_fields),
            ', '.join(['%s'] * (len(annotation_fields) + 1)))
        select_xref_sql = 'select db_xref_id from db_xref where db_xref_name=%s'
        insert_xref_sql = 'insert into db_xref (db_xref_name) values (%s)'
        insert_crossref_sql = 'insert into uniprot_db_xref_%s (uniprot_id, db_xref_id, db_accession) values (%%s, %%s, %%s)' % biodb

        n_loci = len(locus2protein_id)
        for i, locus in enumerate(locus2protein_id):
            protein_id = locus2protein_id[locus]
            print("%s -- %s : %s / %s" % (locus, protein_id, i, n_loci))

            # already into database
            if locus in locus2uniprot_id:
                continue

            genome_accession = locus2genome_accession[locus]
            uniprot_id = ncbi_accession2uniprotid(protein_id, genome_accession=genome_accession)
            if not uniprot_id:
                uniprot_id = ncbi_accession2uniprotid(protein_id)
            if not uniprot_id:
                old_locus = locus2old_locus.get(locus, False)
                if old_locus:
                    genus = locus2organism[locus].split(' ')[0]
                    print('trying with old_locus_tag')
                    uniprot_id = ncbi_accession2uniprotid(old_locus, gene=True, organism=genus)
            if not uniprot_id:
                print('trying to match with sequence: %s' % locus)
                try:
                    uniprot_id = sequence2uniprot_id(locus2sequence[locus])
                except Exception:
                    # best effort: any failure of the sequence search skips the
                    # locus (KeyboardInterrupt is no longer swallowed)
                    continue
                print('ok: %s' % uniprot_id)

            if not uniprot_id:
                print('UNIPRITID NOT FOUND')
                continue

            # insert uniprot_id into mysql table
            # 1. get seqfeatureid of the corresponding locus
            seqid = locus2seqfeature_id[locus]
            try:
                uniprot_score, uniprot_status, go_data = uniprot_accession2go_and_status(uniprot_id)
            except Exception:
                print('echec, continue')
                continue

            # add go data
            if go_data:
                for one_go in go_data:
                    cursor.execute(insert_go_sql, (seqid, one_go, go_data[one_go]))
                    conn.commit()

            # insert uniprot_id
            now = datetime.now()
            str_date = "%s-%s-%s" % (now.year, now.month, now.day)
            try:
                cursor.execute(insert_uniprot_sql,
                               (seqid, uniprot_id, uniprot_status, uniprot_score, str_date))
                conn.commit()
            except conn.IntegrityError:
                # seqfeature_id is UNIQUE: if it was already inserted, no need
                # to insert it again
                print('%s already into uniprot_id2seqfeature_id_%s' % (seqid, biodb))

            # auto-incremented id of the uniprot entry
            cursor.execute(select_uniprot_sql, (seqid,))
            uniprot_db_id = cursor.fetchall()[0][0]

            uniprot_record = uniprot_id2record(uniprot_id)
            if not uniprot_record:
                # second and last attempt after a short pause
                time.sleep(5)
                uniprot_record = uniprot_id2record(uniprot_id)

            alldbref = uniprot_record2db_refs(uniprot_record)
            annotation = uniprot_record2annotations(uniprot_record)

            # add annotation
            # double quotes are still removed so that new rows look like the
            # rows stored by previous runs
            annotation_values = [seqid]
            for field in annotation_fields:
                annotation_values.append(re.sub('"', '', annotation[field]))
            cursor.execute(insert_annotation_sql, annotation_values)
            conn.commit()

            # add dbxrefs
            if alldbref:
                for database in alldbref:
                    # 1. check if cross ref database already in the database list
                    cursor.execute(select_xref_sql, (database,))
                    xref_rows = cursor.fetchall()
                    if not xref_rows:
                        # insert new database name
                        cursor.execute(insert_xref_sql, (database,))
                        conn.commit()
                        cursor.execute(select_xref_sql, (database,))
                        xref_rows = cursor.fetchall()
                    database_index = xref_rows[0][0]

                    for crossref in alldbref[database]:
                        # insert cross reference into database
                        cursor.execute(insert_crossref_sql,
                                       (uniprot_db_id, database_index, crossref))
                        conn.commit()
            else:
                print('echec ----------------')
                print('echec ----------------')
    finally:
        conn.close()
def _iter_cds_locus_tags(region):
    '''Yield the locus_tag of every CDS feature of <region> without a "pseudo" qualifier.'''
    for feature in region.features:
        if feature.type == 'CDS' and 'pseudo' not in feature.qualifiers:
            yield feature.qualifiers['locus_tag'][0]


def find_clusters_of_locus(db_name, identity_cutoff, distance_cutoff=20000):
    '''
    Identify loci whose genomic neighbourhood is conserved in other genomes.

    For each locus of each orthogroup with more than one member, the
    orthogroups found around the locus are compared to the orthogroups found
    around each of its closest homologs encoded by other taxons. A neighbour
    is "linked" each time its orthogroup is found in both neighbourhoods.
    Neighbours linked in more than 50% of the comparisons are inserted into
    interactions.colocalization_table_locus_<db_name>.

    WARNING: all paralogs are taken into account. With 1 protein in genome A
    and 3 in genome B, protein 1A is compared to its best hit of B, while
    proteins 1B, 2B and 2C are all compared to 1A. In any case the approach is
    redundant since both A vs B and B vs A are compared.

    Records are cached in the memcached server running on 127.0.0.1. When the
    reference orthogroup is missing from the shared orthogroups of a
    comparison, a line (orthogroup, locus, compared homolog) is appended to
    "problems.txt".

    :param db_name: biodatabase name
    :param identity_cutoff: median ortholog identity cutoff: pairs of genomes
        with a median identity equal or above the cutoff are not compared
        (if genomes are too close, too many clusters are found)
    :param distance_cutoff: size of the considered window; half of it is
        passed to get_feature_neighborhood
    :return: (locus2linked_locus, locus2linked_taxons).
        locus2linked_locus: locus -> empty dictionary for every processed
        locus (the link counts are written to the database instead);
        locus2linked_taxons: locus -> linked locus -> list of
        [reference orthogroup, linked orthogroup, taxon_a, taxon_b]
    '''
    import manipulate_biosqldb
    import mysqldb_plot_genomic_feature
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    import pylibmc

    # for memory storage of all biorecords
    mc = pylibmc.Client(["127.0.0.1"], binary=True, behaviors={"tcp_nodelay": True, "ketama": True})

    server, db = manipulate_biosqldb.load_db(db_name)

    sql = 'select locus_tag, orthogroup from orthology_detail_%s' % db_name
    locus2orthogroup = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))

    sql = 'select locus_tag, start, stop from orthology_detail_%s' % db_name
    locus2start_end = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))

    orthogroup2locus_list = {}
    for locus in locus2orthogroup:
        orthogroup2locus_list.setdefault(locus2orthogroup[locus], []).append(locus)

    sql_identity = 'select taxon_1, taxon_2, median_identity from comparative_tables.shared_orthogroups_average_identity_%s' % db_name
    taxon2taxon_median_id = {}
    for row in server.adaptor.execute_and_fetchall(sql_identity,):
        taxon2taxon_median_id.setdefault(row[0], {})[row[1]] = row[2]

    sql = 'select locus_tag, accession from orthology_detail_%s' % db_name
    locus2accession = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))
    accession_list = set(locus2accession.values())

    sql = 'select locus_1,locus_2 from comparative_tables.identity_closest_homolog_%s' % db_name
    locus2closest_locus_list = {}
    for row in server.adaptor.execute_and_fetchall(sql,):
        locus2closest_locus_list.setdefault(row[0], []).append(row[1])

    # storage of all records into memory
    accession2record = {}
    for accession in accession_list:
        rec_raw = db.lookup(accession=accession)
        cache_key = db_name + "_" + accession
        try:
            new_record_reformat = mc[cache_key]
            print('%s in memory' % accession)
        except KeyError:
            print('%s NOT in memory' % accession)
            new_record_reformat = SeqRecord(Seq(rec_raw.seq.data, rec_raw.seq.alphabet),
                                            id=rec_raw.id, name=rec_raw.name,
                                            description=rec_raw.description,
                                            dbxrefs=rec_raw.dbxrefs,
                                            features=rec_raw.features,
                                            annotations=rec_raw.annotations)
            mc[cache_key] = new_record_reformat
        accession2record[accession] = new_record_reformat

    accession2taxon = manipulate_biosqldb.accession2taxon_id(server, db_name)

    locus2linked_locus = {}
    locus2linked_taxons = {}
    size = distance_cutoff/2

    # iterate all orthogroups
    orthogroup_list = list(set(locus2orthogroup.values()))
    n_orthogroups = len(orthogroup_list)
    for t, ref_ortho in enumerate(orthogroup_list):
        print('group %s / %s' % (t, n_orthogroups))

        # locus list of the considered group
        locus_list = orthogroup2locus_list[ref_ortho]
        # if a single locus, skip
        if len(locus_list) == 1:
            continue

        # iter all locus of the orthogroup
        for locus_a in locus_list:
            locus2linked_locus[locus_a] = {}
            locus2linked_taxons[locus_a] = {}
            # linked locus -> [n links, n comparisons]
            linked_counts = {}
            # for each locus, initiate the count of comparisons
            comp_count = 0

            # extract region
            start_a = locus2start_end[locus_a][0]
            end_a = locus2start_end[locus_a][1]
            record = accession2record[locus2accession[locus_a]]
            region_a = mysqldb_plot_genomic_feature.get_feature_neighborhood(start_a, end_a, record, size, 'rec')

            # get list of orthogroups in the neighbourhood & get
            # correspondance between groups and locus
            grp_list = []
            grp2locus = {}
            for neighbour_locus in _iter_cds_locus_tags(region_a):
                neighbour_group = locus2orthogroup[neighbour_locus]
                if neighbour_group not in grp2locus:
                    grp2locus[neighbour_group] = [neighbour_locus]
                    grp_list.append(neighbour_group)
                else:
                    grp2locus[neighbour_group].append(neighbour_locus)

            # compare neighbours of all closest homologs to the reference locus
            try:
                closet_locus = locus2closest_locus_list[locus_a]
            except KeyError:
                continue

            taxon_a = accession2taxon[locus2accession[locus_a]]
            for locus_b in closet_locus:
                taxon_b = accession2taxon[locus2accession[locus_b]]
                # if both locus are encoded by the same taxon, skip the comparison
                if taxon_a == taxon_b:
                    continue
                # the identity table may store the pair in either order
                try:
                    identity = taxon2taxon_median_id[taxon_a][taxon_b]
                except KeyError:
                    identity = taxon2taxon_median_id[taxon_b][taxon_a]

                # if the 2 considered genomes are too closely related, skip the comparison
                if identity < identity_cutoff:
                    # increment the number of effective comparisons
                    comp_count += 1

                    start_b = locus2start_end[locus_b][0]
                    # if border of contig/chromosome
                    if start_b < 0:
                        start_b = 0
                    end_b = locus2start_end[locus_b][1]
                    record_b = accession2record[locus2accession[locus_b]]
                    region_b = mysqldb_plot_genomic_feature.get_feature_neighborhood(start_b, end_b, record_b, size, 'rec')

                    # get group list b; a separate name is used for the
                    # neighbours so that locus_b stays the compared homolog
                    grp_list_b = []
                    for neighbour_locus in _iter_cds_locus_tags(region_b):
                        neighbour_group = locus2orthogroup[neighbour_locus]
                        if neighbour_group not in grp_list_b:
                            grp_list_b.append(neighbour_group)

                    # get list of common groups
                    common = list(set(grp_list).intersection(set(grp_list_b)))
                    # remove ref group
                    try:
                        common.remove(ref_ortho)
                    except ValueError:
                        with open('problems.txt', 'a') as f:
                            f.write('%s\t%s\t%s\n' % (ref_ortho, locus_a, locus_b))

                    # store locus and taxons linked
                    for linked_group in common:
                        # we can have more than one locus/group (i.e identical genes side by side)
                        for linked_locus in grp2locus[linked_group]:
                            link = [ref_ortho, linked_group, taxon_a, taxon_b]
                            if linked_locus not in linked_counts:
                                # store n link and n comparisons
                                linked_counts[linked_locus] = [1, '-']
                                locus2linked_taxons[locus_a][linked_locus] = [link]
                            else:
                                linked_counts[linked_locus][0] += 1
                                locus2linked_taxons[locus_a][linked_locus].append(link)

            # end of loop for locus_a: store the number of comparisons done
            for linked_locus in linked_counts:
                linked_counts[linked_locus][1] = comp_count

            for linked_locus in linked_counts:
                n_links, n_comparisons = linked_counts[linked_locus]
                ratio = n_links/float(n_comparisons)
                # only add minimum of 50% links
                if ratio > 0.5:
                    sql = 'insert into interactions.colocalization_table_locus_%s (locus_1, locus_2, n_links, n_comparisons, ratio)' \
                          ' values ("%s","%s",%s,%s,%s)' % (db_name,
                                                            locus_a,
                                                            linked_locus,
                                                            n_links,
                                                            n_comparisons,
                                                            ratio)
                    server.adaptor.execute(sql,)
                    server.adaptor.commit()

            if len(locus2linked_taxons[locus_a]) == 0:
                del locus2linked_taxons[locus_a]

    return locus2linked_locus, locus2linked_taxons
def _neighborhood_orthogroups(region, locus2orthogroup):
    '''Return the distinct orthogroups of the non-pseudo CDS features of *region*, in feature order.'''
    orthogroups = []
    for feature in region.features:
        if feature.type == 'CDS' and 'pseudo' not in feature.qualifiers:
            orthogroup = locus2orthogroup[feature.qualifiers['locus_tag'][0]]
            if orthogroup not in orthogroups:
                orthogroups.append(orthogroup)
    return orthogroups


def _record_linked_taxons(group2linked_taxons, ref_ortho, linked_group, taxon_pair):
    '''Store *taxon_pair* once for the unordered pair of groups (ref_ortho, linked_group).'''
    ref_links = group2linked_taxons.get(ref_ortho)
    reverse_links = group2linked_taxons.get(linked_group)

    # "group a vs group b" == "group b vs group a": reuse whichever
    # orientation was stored first instead of creating the mirror entry
    if ref_links is not None and linked_group in ref_links:
        pairs = ref_links[linked_group]
    elif reverse_links is not None and ref_ortho in reverse_links:
        pairs = reverse_links[ref_ortho]
    elif ref_links is not None:
        ref_links[linked_group] = [taxon_pair]
        return
    elif reverse_links is not None:
        reverse_links[ref_ortho] = [taxon_pair]
        return
    else:
        group2linked_taxons[ref_ortho] = {linked_group: [taxon_pair]}
        return

    # paralogs can yield the same taxon pair several times: keep it once
    if taxon_pair not in pairs:
        pairs.append(taxon_pair)


def find_clusters_of_orthogroups(db_name, identity_cutoff, distance_cutoff=10000):
    '''
    Identify orthogroups that are found in the same genomic neighbourhood in
    several genomes.

    WARNING: all paralogs are taken into account.
    If there is 1 protein in genome A and 3 in genome B:
    protein 1A is compared to its best hit in B,
    proteins 1B, 2B and 2C are compared to 1A.
    In any case this approach is redundant because A vs B and B vs A are
    always both compared...

    :param db_name: biodatabase name
    :param identity_cutoff: median ortholog identity cutoff: two genomes are
        only compared if their median identity is below this value (genomes
        that are too close would yield too many clusters)
    :param distance_cutoff: size of the considered window (half of it is
        passed to get_feature_neighborhood)
    :return: tuple (group2linked_groups, group2linked_taxons) where
        group2linked_groups[ref][linked] = [n_links, n_comparisons] and
        group2linked_taxons[group_a][group_b] = list of [taxon_a, taxon_b]
        pairs (each unordered pair of groups is stored in one orientation only)
    '''

    import manipulate_biosqldb
    import mysqldb_plot_genomic_feature
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    import pylibmc

    # memcached client: in-memory storage of all biorecords
    mc = pylibmc.Client(["127.0.0.1"], binary=True,
                        behaviors={"tcp_nodelay": True, "ketama": True})

    server, db = manipulate_biosqldb.load_db(db_name)

    sql = 'select locus_tag, orthogroup from orthology_detail_%s' % db_name
    locus2orthogroup = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))

    sql = 'select locus_tag, start, stop from orthology_detail_%s' % db_name
    locus2start_end = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))

    orthogroup2locus_list = {}
    for locus in locus2orthogroup:
        orthogroup2locus_list.setdefault(locus2orthogroup[locus], []).append(locus)

    sql_identity = 'select taxon_1, taxon_2, median_identity from comparative_tables.shared_orthogroups_average_identity_%s' % db_name
    taxon2taxon_median_id = {}
    for row in server.adaptor.execute_and_fetchall(sql_identity,):
        taxon2taxon_median_id.setdefault(row[0], {})[row[1]] = row[2]

    sql = 'select locus_tag, accession from orthology_detail_%s' % db_name
    locus2accession = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))

    # closest-homolog relation, stored in both directions
    locus2closest_loci = {}
    sql = 'select locus_1,locus_2 from comparative_tables.identity_closest_homolog_%s' % db_name
    for locus_1, locus_2 in server.adaptor.execute_and_fetchall(sql,):
        locus2closest_loci.setdefault(locus_1, set()).add(locus_2)
        locus2closest_loci.setdefault(locus_2, set()).add(locus_1)

    # storage of all records into memory
    accession2record = {}
    for accession in set(locus2accession.values()):
        cache_key = db_name + "_" + accession
        try:
            record = mc[cache_key]
        except KeyError:
            # not cached yet: only now query the database for the record
            rec_raw = db.lookup(accession=accession)
            record = SeqRecord(Seq(rec_raw.seq.data, rec_raw.seq.alphabet),
                               id=rec_raw.id,
                               name=rec_raw.name,
                               description=rec_raw.description,
                               dbxrefs=rec_raw.dbxrefs,
                               features=rec_raw.features,
                               annotations=rec_raw.annotations)
            mc[cache_key] = record
        accession2record[accession] = record

    accession2taxon = manipulate_biosqldb.accession2taxon_id(server, db_name)

    # the window extends half of the cutoff on each side (loop invariant)
    size = distance_cutoff / 2

    group2linked_groups = {}
    group2linked_taxons = {}

    # iter all orthogroups
    for ref_ortho in set(locus2orthogroup.values()):
        comp_count = 0
        linked_groups = group2linked_groups[ref_ortho] = {}
        locus_list = orthogroup2locus_list[ref_ortho]

        # if no homologs, skip
        if len(locus_list) == 1:
            continue

        # iter all locus of the orthogroup
        for x, locus_a in enumerate(locus_list):
            # a locus without any closest homolog cannot be compared
            closest_loci = locus2closest_loci.get(locus_a)
            if closest_loci is None:
                continue

            # extract the region around the reference locus
            start_a, end_a = locus2start_end[locus_a][0], locus2start_end[locus_a][1]
            record_a = accession2record[locus2accession[locus_a]]
            region_a = mysqldb_plot_genomic_feature.get_feature_neighborhood(start_a, end_a, record_a, size, 'rec')
            groups_a = set(_neighborhood_orthogroups(region_a, locus2orthogroup))

            # compare neighbours of all other locus to the reference locus
            for locus_b in locus_list[x + 1:]:
                # Only consider the "best hit", the locus with the highest identity.
                # With a 1 vs 3 relationship there is a single comparison for
                # genome A vs B but 3 for the comparison B vs A...
                # Alternative: if there are several pairs, only compare the closest pair.
                # In any case this links groups that include multiple paralogs.
                # keep all comparisons in memory and do it only once?
                # case of multiple paralogs side by side
                if locus_b not in closest_loci:
                    continue

                taxon_a = accession2taxon[locus2accession[locus_a]]
                taxon_b = accession2taxon[locus2accession[locus_b]]
                # if both locus are encoded by the same taxon, skip the comparison
                if taxon_a == taxon_b:
                    continue

                # the identity table may hold the pair in either orientation
                try:
                    identity = taxon2taxon_median_id[taxon_a][taxon_b]
                except KeyError:
                    identity = taxon2taxon_median_id[taxon_b][taxon_a]

                # if the 2 considered genomes are too closely related, skip the comparison
                if not identity < identity_cutoff:
                    continue
                comp_count += 1

                start_b = locus2start_end[locus_b][0]
                if start_b < 0:
                    start_b = 0
                end_b = locus2start_end[locus_b][1]
                record_b = accession2record[locus2accession[locus_b]]
                region_b = mysqldb_plot_genomic_feature.get_feature_neighborhood(start_b, end_b, record_b, size, 'rec')
                groups_b = set(_neighborhood_orthogroups(region_b, locus2orthogroup))

                # groups shared by both neighbourhoods, reference group excluded
                common = [group for group in groups_a.intersection(groups_b)
                          if group != ref_ortho]

                # store groups and taxons linked
                for linked_group in common:
                    _record_linked_taxons(group2linked_taxons, ref_ortho,
                                          linked_group, [taxon_a, taxon_b])
                    # store counts of links out of the total number of comparisons
                    if linked_group in linked_groups:
                        linked_groups[linked_group][0] += 1
                    else:
                        linked_groups[linked_group] = [1]

        # all comparisons of the orthogroup are done: [n_links] -> [n_links, n_comparisons]
        for linked_group in linked_groups:
            linked_groups[linked_group].append(comp_count)

    return group2linked_groups, group2linked_taxons
def edit_svg_map(map_path, keep_ko_list, biodb_name, map_name, taxon_id=False):
    '''
    Add javascript click/hover handlers to the text labels of a KEGG SVG map.

    For each child of every <text> element, depending on the child's text:
    - text not starting with "K" and matching a pathway description: link to
      the pathway page (organism specific page if taxon_id is given)
    - text found in keep_ko_list: link to the KO family page
    - text containing "...": link to the multi-KO page of the map
    The handlers are set on the parent <text> element.

    :param map_path: path of the SVG file to parse
    :param keep_ko_list: KO identifiers that should become clickable
    :param biodb_name: biodatabase name (used to open the database connection)
    :param map_name: map name inserted in the multi-KO links
    :param taxon_id: optional taxon id for organism specific pathway links
    :return: the modified xml.etree.ElementTree.ElementTree
    '''

    import manipulate_biosqldb
    from xml.etree import ElementTree

    server, db = manipulate_biosqldb.load_db(biodb_name)

    sql = 'select description,pathway_name from enzyme.kegg_pathway;'
    description2map = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    highlight_on = "this.style.stroke = '#ff0000'; this.style['stroke-width'] = 1;"
    highlight_off = "this.style.stroke = '#000000'; this.style['stroke-width'] = 0;"

    def make_clickable(element, onclick):
        # same hover highlight for every kind of link
        element.set("onclick", onclick)
        element.set("onmouseover", highlight_on)
        element.set("onmouseout", highlight_off)

    keep_kos = set(keep_ko_list)

    tree = ElementTree.parse(map_path)
    for element in tree.iter():
        # tags are namespace qualified ("{uri}text"); taking the last piece
        # also copes with documents that declare no namespace
        if element.tag.rsplit("}", 1)[-1] != 'text':
            continue
        for child in element:
            label = child.text
            # children without text carry no label to link
            if not label:
                continue

            if label[0] != 'K':
                # not a KO label: only known pathway descriptions are linked
                if label not in description2map:
                    continue
                if not taxon_id:
                    add = 'window.open("/chlamdb/KEGG_mapp_ko/%s", "_top");' % (
                        description2map[label])
                else:
                    add = 'window.open("/chlamdb/KEGG_mapp_ko_organism/%s/%s", "_top");' % (
                        description2map[label], taxon_id)
                make_clickable(element, add)

            if label in keep_kos:
                add = 'window.open("/chlamdb/fam/%s/ko", "_top");' % (label)
                make_clickable(element, add)

            if '...' in label:
                # truncated label standing for several KO
                add = 'window.open("/chlamdb/kegg_multi/%s/%s/", "_top");' % (
                    map_name, label.replace('...', ''))
                make_clickable(element, add)
    return tree
def biodb2all_connections(biodb):
    '''
    Fetch the STRING interactions of every STRING accession of the database
    matching "CPn" and insert them into string.interactions_<biodb>.

    One row is inserted per interaction: taxon id, seqfeature ids and locus
    tags of both partners, both labels and the 8 STRING scores. Loci that can
    not be mapped to a seqfeature id are stored with NULL ids.
    The "create table" statement is only printed (its execution is disabled),
    so the table has to exist already.

    :param biodb: biodatabase name
    :return: None
    '''

    import manipulate_biosqldb
    import time

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select db_accession from custom_tables.uniprot_id2seqfeature_id_%s t0 ' \
          ' inner join custom_tables.uniprot_db_xref_%s t1 on t0.uniprot_id=t1.uniprot_id ' \
          ' inner join custom_tables.db_xref t2 on t1.db_xref_id=t2.db_xref_id where db_xref_name="string" and db_accession like "%%%%CPn%%%%";' % (biodb, biodb)
    all_string_accessions = [
        i[0] for i in server.adaptor.execute_and_fetchall(sql, )
    ]

    sql = 'select seqfeature_id, taxon_id from custom_tables.locus2seqfeature_id_%s' % biodb
    seqfeature_id2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select locus_tag,seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb
    new_locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select old_locus_tag,seqfeature_id from custom_tables.seqfeature_id2old_locus_tag_%s' % biodb
    old_locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'create table if not exists string.interactions_%s (taxon_id INT, ' \
          ' seqfeature_id_1 INT, ' \
          ' seqfeature_id_2 INT,' \
          ' old_locus_tag_1 varchar(400), ' \
          ' old_locus_tag_2 varchar (400), ' \
          ' label_1 varchar(400), ' \
          ' label_2 varchar (400), ' \
          ' global_score FLOAT,' \
          ' neighborhood FLOAT,' \
          ' gene_fusion FLOAT,' \
          ' cooccurence FLOAT,' \
          ' coexpression FLOAT,' \
          ' experiments FLOAT,' \
          ' biodatabases FLOAT,' \
          ' textmining FLOAT, ' \
          ' index seqfeature_id_1 (seqfeature_id_1),' \
          ' index seqfeature_id_2 (seqfeature_id_2),' \
          ' INDEX old_locus_tag_1 (old_locus_tag_1),' \
          ' index old_locus_tag_2 (old_locus_tag_2))' % biodb
    # the statement is shown but not executed
    print(sql)

    # STRING score keys, in the column order of the table
    score_keys = ('score', 'nscore', 'fscore', 'pscore',
                  'ascore', 'escore', 'dscore', 'tscore')

    # values are handed to the driver as parameters (not pasted into the
    # statement) so that quotes in STRING labels cannot break the query
    insert_sql = 'insert into string.interactions_%s values (' % biodb \
                 + ', '.join(['%s'] * (7 + len(score_keys))) + ')'

    def resolve_locus(locus):
        # Return (locus tag to store, seqfeature id or None).
        # locus tag correspondance between old and new RefSeq annotation
        if locus in old_locus_tag2seqfeature_id:
            return locus, old_locus_tag2seqfeature_id[locus]
        # special case trachomatis: new locus tags are spelled "CT_xxx"
        renamed = locus.replace('CT', 'CT_')
        return renamed, new_locus_tag2seqfeature_id.get(renamed)

    seen_ref_loci = set()

    for n, string_accession in enumerate(all_string_accessions):
        print("%s / %s" % (n, len(all_string_accessions)))

        # False signals a failed request: wait and retry until it succeeds
        interactions = string_id2connexions(string_accession)
        while interactions is False:
            print('trying again...')
            time.sleep(10)
            interactions = string_id2connexions(string_accession)

        # an empty/None result simply means no interaction to store
        for one_interaction in interactions or ():
            print('%s %s' % (string_accession, one_interaction))

            # identifiers look like "<prefix>:<taxon>.<locus>"
            if string_accession in one_interaction[0]:
                ref_field, link_field = one_interaction[0], one_interaction[1]
            elif string_accession in one_interaction[1]:
                ref_field, link_field = one_interaction[1], one_interaction[0]
            else:
                # connection does not contain reference link, skiping
                continue
            ref_locus = ref_field.split(':')[1].split('.')[1]
            link_locus = link_field.split(':')[1].split('.')[1]

            seen_ref_loci.add(ref_locus)
            if link_locus in seen_ref_loci:
                # not a new connection
                continue

            label_1 = one_interaction[2]
            label_2 = one_interaction[3]

            ref_locus, ref_locus_seqfeature_id = resolve_locus(ref_locus)
            if ref_locus_seqfeature_id is None:
                print('ref_locus %s' % ref_locus)

            # locus tag correspondance OK but pseudogene: no taxon, store NULL
            taxon_id = seqfeature_id2taxon_id.get(str(ref_locus_seqfeature_id))

            link_locus, link_locus_seqfeature_id = resolve_locus(link_locus)

            # scores come as "name:value|name:value|..."; missing ones stay 0
            scores = dict.fromkeys(score_keys, 0)
            for one_score in one_interaction[4].split('|'):
                score, value = one_score.split(':')
                if score in scores:
                    scores[score] = float(value)
                else:
                    print('unkonwn score type %s %s' % (score, value))

            values = [taxon_id,
                      ref_locus_seqfeature_id,
                      link_locus_seqfeature_id,
                      ref_locus,
                      link_locus,
                      label_1,
                      label_2] + [scores[key] for key in score_keys]

            print('%s %s %s' % (taxon_id, insert_sql, values))
            server.adaptor.execute(insert_sql, tuple(values))
            server.commit()
def biodb2string_pmid_data(biodb):
    '''
    Store the PubMed references attached to the STRING accessions of the
    database (accessions matching "CPn") into
    string.seqfeature_id2string_pmid_<biodb>.

    The table is created if needed. For every accession that can be mapped to
    a seqfeature id, the list of PMIDs is fetched and one row is inserted per
    PMID with its authors, title, abstract and source.

    :param biodb: biodatabase name
    :return: None
    '''

    import manipulate_biosqldb
    import pubmed_utils
    import time

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select db_accession from custom_tables.uniprot_id2seqfeature_id_%s t0 ' \
          ' inner join custom_tables.uniprot_db_xref_%s t1 on t0.uniprot_id=t1.uniprot_id ' \
          ' inner join custom_tables.db_xref t2 on t1.db_xref_id=t2.db_xref_id where db_xref_name="string" and db_accession like "%%%%CPn%%%%";' % (biodb, biodb)
    all_string_accessions = [
        i[0] for i in server.adaptor.execute_and_fetchall(sql, )
    ]

    sql = 'create table if not exists string.seqfeature_id2string_pmid_%s (taxon_id INT, ' \
          ' seqfeature_id INT, ' \
          ' pmid INT, ' \
          ' authors TEXT,' \
          ' title TEXT,' \
          ' abstract TEXT, ' \
          ' source TEXT,' \
          ' INDEX seqfeature_id(seqfeature_id))' % biodb
    server.adaptor.execute(sql, )
    server.commit()

    sql = 'select seqfeature_id, taxon_id from custom_tables.locus2seqfeature_id_%s' % biodb
    seqfeature_id2taxon_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select old_locus_tag, seqfeature_id from custom_tables.seqfeature_id2old_locus_tag_%s' % biodb
    old_locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    sql = 'select locus_tag,seqfeature_id from custom_tables.locus2seqfeature_id_%s' % biodb
    new_locus_tag2seqfeature_id = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    # text values are handed to the driver as parameters: no manual quote
    # stripping or "%" doubling is needed and the stored text is unaltered
    insert_sql = 'insert into string.seqfeature_id2string_pmid_%s values (' % biodb \
                 + ', '.join(['%s'] * 7) + ')'

    for n, string_accession in enumerate(all_string_accessions):
        print("%s / %s" % (n, len(all_string_accessions)))

        # accession looks like "<taxon>.<locus>"
        old_locus_tag = string_accession.split('.')[1]
        if old_locus_tag in old_locus_tag2seqfeature_id:
            seqfeature_id = old_locus_tag2seqfeature_id[old_locus_tag]
        else:
            # special case trachomatis: new locus tags are spelled "CT_xxx"
            old_locus_tag = old_locus_tag.replace('CT', 'CT_')
            if old_locus_tag not in new_locus_tag2seqfeature_id:
                continue
            seqfeature_id = new_locus_tag2seqfeature_id[old_locus_tag]

        # unknown taxon is stored as NULL (None), as for the interactions table
        taxon_id = seqfeature_id2taxon_id.get(str(seqfeature_id))

        # False signals a failed request: wait and retry until it succeeds
        pmid_list = string_id2pubmed_id_list(string_accession)
        print('miidjdjnjdhd %s' % (pmid_list,))
        while pmid_list is False:
            print('trying again')
            time.sleep(10)
            pmid_list = string_id2pubmed_id_list(string_accession)

        if not pmid_list:
            print('0 pmid for %s' % string_accession)
            continue

        for one_pmid in pmid_list:
            abstract_data = pubmed_utils.pmid2abstract_info(one_pmid)
            print('data %s' % (abstract_data,))

            values = (taxon_id,
                      seqfeature_id,
                      abstract_data['pmid'],
                      # authors keep their historical stored form:
                      # str() of the value without single quotes
                      str(abstract_data['authors']).replace("'", ""),
                      abstract_data['title'],
                      abstract_data['abstract'],
                      abstract_data['source'])

            print('%s %s' % (insert_sql, values))
            server.adaptor.execute(insert_sql, values)
            server.commit()
def plot_cog_eatmap(biodb, ref_tree, taxon_id_list=None, frequency=False, group_by_cog_id=False):
    '''
    Build a heatmap of the number (or frequency) of proteins per COG category
    and per taxon, drawn next to a reference tree.

    :param biodb: biodatabase name
    :param ref_tree: reference tree handed to ete_motifs.multiple_profiles_heatmap
    :param taxon_id_list: optional list of taxon ids; when not empty the
        counts are restricted to these taxons
    :param frequency: if True, report percentages of the COG annotated
        proteins of each taxon instead of raw counts (the "TOTAL" and "-"
        rows are only built for raw counts)
    :param group_by_cog_id: without taxon filter, count each COG once per
        taxon instead of counting every hit
    :return: the object returned by ete_motifs.multiple_profiles_heatmap
    '''

    import manipulate_biosqldb
    import ete_motifs

    # taxon ids are joined into SQL and compared with dictionary keys as strings
    taxon_ids = [str(taxon) for taxon in (taxon_id_list or [])]

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'select biodatabase_id from biodatabase where name="%s"' % biodb
    db_id = server.adaptor.execute_and_fetchall(sql, )[0][0]

    if taxon_ids:
        # restrict to a subset of the available taxons
        taxon_filter = ','.join(taxon_ids)
        sql = 'select taxon_id, code, count(*) as n from COG.seqfeature_id2best_COG_hit_%s t1 ' \
              ' inner join biosqldb.bioentry t2 on t1.bioentry_id=t2.bioentry_id' \
              ' inner join COG.cog_id2cog_category t3 on t1.hit_cog_id=t3.COG_id ' \
              ' inner join COG.code2category t4 on t3.category_id=t4.category_id ' \
              ' where t2.biodatabase_id=%s and taxon_id in (%s)' \
              ' group by taxon_id, code;' % (biodb, db_id, taxon_filter)
        print(sql)
    elif not group_by_cog_id:
        sql = 'select taxon_id,functon,count(*) as n ' \
              ' from COG.locus_tag2gi_hit_%s t1 ' \
              ' inner join COG.cog_names_2014 t2 on t1.COG_id=t2.COG_id ' \
              ' inner join biosqldb.bioentry as t3 on t1.accession=t3.accession ' \
              ' where biodatabase_id=%s group by taxon_id,functon' % (biodb, db_id)
    else:
        sql = ' select A.taxon_id,B.functon,count(*) from (select t1.COG_id, t3.taxon_id from COG.locus_tag2gi_hit_%s t1 ' \
              ' inner join biosqldb.orthology_detail_%s t3 on t1.locus_tag=t3.locus_tag ' \
              ' group by taxon_id,t1.COG_id) A inner join COG.cog_names_2014 B on A.COG_id=B.COG_id ' \
              ' group by A.taxon_id,B.functon;' % (biodb, biodb)

    data = server.adaptor.execute_and_fetchall(sql, )

    code2taxon2count = {}
    if frequency:
        # ATTENTION: based on total annotated with COG and not genome size
        sql = 'select taxon_id, count(*) as n from COG.seqfeature_id2best_COG_hit_%s t1' \
              ' inner join biosqldb.bioentry t2 on t1.bioentry_id=t2.bioentry_id' \
              ' where t2.biodatabase_id=%s group by taxon_id;' % (biodb, db_id)
        taxon_id2count = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql, ))
        cog_list = []
    else:
        # proteins without any COG hit, per taxon
        sql = 'select taxon_id, count(*) from biosqldb.orthology_detail_%s t1 left join COG.locus_tag2gi_hit_%s t2 ' \
              ' on t1.locus_tag=t2.locus_tag where COG_id is NULL group by t1.taxon_id;' % (biodb, biodb)
        taxon2count_no_GOG = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql, ))

        # proteome size, per taxon
        sql = 'select taxon_id, count(*) from orthology_detail_%s group by taxon_id' % biodb
        taxon2proteome_size = manipulate_biosqldb.to_dict(
            server.adaptor.execute_and_fetchall(sql, ))

        code2taxon2count['-'] = {}
        code2taxon2count['TOTAL'] = {}
        # NOTE(review): without a taxon filter nothing is added here, so the
        # "TOTAL" and "-" rows stay empty -- confirm this is intended
        wanted_taxons = set(taxon_ids)
        for taxon in taxon2count_no_GOG:
            if str(taxon) in wanted_taxons:
                code2taxon2count['-'][taxon] = int(taxon2count_no_GOG[taxon])
                code2taxon2count['TOTAL'][taxon] = int(
                    taxon2proteome_size[taxon])
        cog_list = ['TOTAL', '-']

    sql = 'select code, description from COG.code2category;'
    code2description = manipulate_biosqldb.to_dict(
        server.adaptor.execute_and_fetchall(sql, ))

    # rows are (taxon_id, category code, count)
    for row in data:
        descr = "%s (%s)" % (code2description[row[1]], row[1])
        if descr not in cog_list:
            cog_list.append(descr)

        taxon = str(row[0])
        if frequency:
            # percentage of the COG annotated proteins of the taxon
            value = round(
                (float(row[2]) / float(taxon_id2count[taxon])) * 100, 2)
        else:
            value = int(row[2])
        code2taxon2count.setdefault(descr, {})[taxon] = value

    tree2 = ete_motifs.multiple_profiles_heatmap(biodb,
                                                 cog_list,
                                                 code2taxon2count,
                                                 show_labels=True,
                                                 column_scale=True,
                                                 tree=ref_tree,
                                                 as_float=frequency)
    return tree2