def locus2inference_table(biodb):
    """Create and populate the ``locus_tag2uniprot_hit_<biodb>`` table.

    For every locus tag returned by
    ``manipulate_biosqldb.locus_tag2seqfeature_id_dict``, the first qualifier
    value of its seqfeature that contains ``UniProtKB`` is fetched, and the
    third ``:``-separated field of that value is stored as ``uniprot_id``.

    This is a best-effort loader:

    * a locus with no matching qualifier value, or whose value has fewer than
      three ``:``-separated fields, is skipped;
    * a failing insert prints the offending SQL statement and the loop moves
      on to the next locus.

    Every successful insert is committed immediately.

    :param biodb: biodatabase name; also used as the suffix of the table name.
    """
    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'CREATE TABLE locus_tag2uniprot_hit_%s (locus_tag varchar(400),' \
          ' uniprot_id varchar(400), index locus_tag(locus_tag))' % biodb
    server.adaptor.execute(sql, )

    locus2seqfeature_id = manipulate_biosqldb.locus_tag2seqfeature_id_dict(server, biodb)

    for locus, seqfeature_id in locus2seqfeature_id.items():
        # The quadruple '%' becomes '%%' after this formatting step; presumably
        # the DB driver's own %-formatting then reduces it to the single '%'
        # LIKE wildcard -- confirm against the driver in use.
        sql = 'select value from seqfeature_qualifier_value where seqfeature_id=%s and value like "%%%%UniProtKB%%%%"' % (
            seqfeature_id)
        try:
            data = server.adaptor.execute_and_fetchall(sql, )[0][0]
            uniprot_id = data.split(':')[2]
        except Exception:
            # Deliberate best-effort: no hit for this locus (or an unexpected
            # value layout) simply means there is nothing to record.
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
            # SystemExit still stop the loop.
            continue

        sql2 = 'insert into locus_tag2uniprot_hit_%s values ("%s", "%s")' % (
            biodb, locus, uniprot_id)
        try:
            server.adaptor.execute(sql2, )
            server.commit()
        except Exception:
            # Report the statement that could not be executed and keep going.
            print(sql2)
def locus_list2identity_in_other_genomes(locus_list, biodb,
                                         reference_locus='wcw_1594',
                                         reference_orthogroup='group_417'):
    """Return a tab-separated table with one row per locus and one column per genome.

    The header line is ``orthogroup`` followed by one shortened genome
    description per taxon.  The taxon columns (and their order) are taken from
    the keys of ``locus_tag2identity_best_hit_all_genomes`` called for
    ``reference_locus`` / ``reference_orthogroup``.

    Each following line starts with the orthogroup of the locus and then holds
    the value returned by ``locus_tag2identity_best_hit_all_genomes`` for each
    header taxon (presumably the identity of the best hit in that genome --
    confirm against that function).  Cells are always written in header
    order; a taxon absent from a row's result yields an empty cell, so rows
    can never drift out of alignment with the header.

    Every line, header included, ends with a tab followed by a newline.

    :param locus_list: iterable of locus tags; each must be a key of the
        locus_tag -> seqfeature_id mapping of ``biodb``.
    :param biodb: biodatabase name.
    :param reference_locus: locus tag used only to discover the taxon columns.
    :param reference_orthogroup: orthogroup passed along with
        ``reference_locus``.
    :return: the whole table as a single string.
    :raises KeyError: if a locus tag is unknown, or a header taxon has no
        genome description.
    """
    import re

    server, db = manipulate_biosqldb.load_db(biodb)

    locus_tag2seqfeature_id = manipulate_biosqldb.locus_tag2seqfeature_id_dict(server, biodb)
    taxon_id2description = manipulate_biosqldb.taxon_id2genome_description(server, biodb)

    # Boilerplate removed from genome descriptions to keep column titles
    # short.  The patterns are applied in this exact order (an earlier
    # substitution can change what a later one matches).
    boilerplate_patterns = (
        r" subsp\. aureus",
        r", complete genome\.",
        r", complete sequence\.",
        r"strain ",
        r"str\. ",
        r" complete genome sequence\.",
        r" complete genome\.",
        r" chromosome",
        r"Staphylococcus aureus ",
    )
    for taxon_id in list(taxon_id2description):
        description = taxon_id2description[taxon_id]
        for pattern in boilerplate_patterns:
            description = re.sub(pattern, "", description)
        taxon_id2description[taxon_id] = description

    # The reference call fixes which taxa appear as columns and in what order.
    reference_hits = locus_tag2identity_best_hit_all_genomes(
        biodb, reference_locus, reference_orthogroup)
    taxon_order = list(reference_hits.keys())

    header_cells = ['orthogroup']
    header_cells.extend(taxon_id2description[taxon] for taxon in taxon_order)
    lines = ['\t'.join(header_cells) + '\t\n']

    for locus in locus_list:
        seqfeature_id = locus_tag2seqfeature_id[locus]
        orthogroup = manipulate_biosqldb.seqfeature_id2orthogroup(server, seqfeature_id, biodb)
        identities = locus_tag2identity_best_hit_all_genomes(biodb, locus, orthogroup)

        row_cells = ['%s' % orthogroup]
        # Follow the header's taxon order instead of this row's own key order.
        row_cells.extend('%s' % identities.get(taxon, '') for taxon in taxon_order)
        lines.append('\t'.join(row_cells) + '\t\n')

    return ''.join(lines)
def locus_list2presence_absence_all_genomes(locus_list, biodb_name):
    """Return a tab-separated table with one row per locus and one column per genome.

    The header line is ``orthogroup`` followed by the shortened description of
    every taxon returned by ``manipulate_biosqldb.get_genome_taxons_list``.
    Each following line starts with the orthogroup of the locus, then holds,
    for every genome in that same order, the value that
    ``heatmap_presence_absence`` reports for the orthogroup.

    Every line, header included, ends with a tab followed by a newline.

    :param locus_list: iterable of locus tags; each must be a key of the
        locus_tag -> seqfeature_id mapping of ``biodb_name``.
    :param biodb_name: biodatabase name.
    :return: the whole table as a single string.
    :raises KeyError: if a locus tag is unknown, a genome has no description,
        or ``heatmap_presence_absence`` has no entry for a genome.
    """
    import re

    server, db = manipulate_biosqldb.load_db(biodb_name)

    locus_tag2seqfeature_id = manipulate_biosqldb.locus_tag2seqfeature_id_dict(server, biodb_name)
    taxon_id2description = manipulate_biosqldb.taxon_id2genome_description(server, biodb_name)

    # Boilerplate removed from genome descriptions to keep column titles
    # short.  The patterns are applied in this exact order (an earlier
    # substitution can change what a later one matches).
    boilerplate_patterns = (
        r" subsp\. aureus",
        r", complete genome\.",
        r", complete sequence\.",
        r"strain ",
        r"str\. ",
        r" complete genome sequence\.",
        r" complete genome\.",
        r" chromosome",
        r"Staphylococcus aureus ",
    )
    for taxon_id in list(taxon_id2description):
        description = taxon_id2description[taxon_id]
        for pattern in boilerplate_patterns:
            description = re.sub(pattern, "", description)
        taxon_id2description[taxon_id] = description

    genomes = manipulate_biosqldb.get_genome_taxons_list(server, biodb_name)

    header_cells = ['orthogroup']
    header_cells.extend(taxon_id2description[taxon] for taxon in genomes)
    lines = ['\t'.join(header_cells) + '\t\n']

    for locus in locus_list:
        seqfeature_id = locus_tag2seqfeature_id[locus]
        orthogroup = manipulate_biosqldb.seqfeature_id2orthogroup(server, seqfeature_id, biodb_name)
        presence_by_taxon = heatmap_presence_absence(biodb_name, orthogroup)

        row_cells = ['%s' % orthogroup]
        row_cells.extend('%s' % presence_by_taxon[taxon] for taxon in genomes)
        lines.append('\t'.join(row_cells) + '\t\n')

    return ''.join(lines)
def create_locus_tag2seqfeature_table(biodb, locus2seqfeature_id=False, locus2taxon_id=False):
    """Create (if needed) and fill ``custom_tables.locus2seqfeature_id_<biodb>``.

    One row ``(locus_tag, seqfeature_id, taxon_id)`` is inserted per locus tag
    of ``locus2seqfeature_id``.  A locus without a taxon id (absent from
    ``locus2taxon_id`` or mapped to ``None``; pseudogenes according to the
    original code comment) is stored with ``taxon_id`` set to SQL ``NULL``.
    All inserts are committed together once the loop has finished.

    :param biodb: biodatabase name; also used as the suffix of the table name.
    :param locus2seqfeature_id: optional locus_tag -> seqfeature_id mapping.
        When falsy (the default, or an empty mapping) it is loaded with
        ``manipulate_biosqldb.locus_tag2seqfeature_id_dict``.
    :param locus2taxon_id: optional locus_tag -> taxon_id mapping.  When falsy
        it is loaded with ``manipulate_biosqldb.locus_tag2genome_taxon_id``.
    """
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)

    sql = 'CREATE TABLE IF NOT EXISTS custom_tables.locus2seqfeature_id_%s (locus_tag varchar(400), ' \
          ' seqfeature_id INT, ' \
          ' taxon_id INT,' \
          ' index locus_tag (locus_tag), ' \
          ' index seqfeature_id(seqfeature_id), ' \
          ' index taxon_id (taxon_id))' % biodb
    server.adaptor.execute(sql)
    server.commit()

    if not locus2seqfeature_id:
        locus2seqfeature_id = manipulate_biosqldb.locus_tag2seqfeature_id_dict(server, biodb)
    if not locus2taxon_id:
        locus2taxon_id = manipulate_biosqldb.locus_tag2genome_taxon_id(server, biodb)

    insert_template = 'insert into custom_tables.locus2seqfeature_id_%s values ("%s", %s, %s)'

    for locus in locus2seqfeature_id:
        # Only a missing taxon id triggers the NULL fallback; a genuine
        # database error on the insert is no longer mistaken for a pseudogene
        # and retried, it propagates to the caller.
        try:
            taxon_id = locus2taxon_id[locus]
        except KeyError:
            # pseudogenes
            taxon_id = "NULL"
        if taxon_id is None:
            # A literal None would not be valid SQL; store NULL instead.
            taxon_id = "NULL"

        sql = insert_template % (biodb, locus, locus2seqfeature_id[locus], taxon_id)
        server.adaptor.execute(sql)

    server.commit()
if args.create_tables: create_sql_blastnr_tables(args.mysql_database, mysql_host, mysql_user, mysql_pwd, mysql_db, main_blastnr_table=True, alternate_tables=True) if args.load_tables: server, db = manipulate_biosqldb.load_db(biodb) sys.stdout.write("creating locus_tag2seqfeature_id") locus_tag2seqfeature_id = manipulate_biosqldb.locus_tag2seqfeature_id_dict( server, biodb) sys.stdout.write("creating protein_id2seqfeature_id") protein_id2seqfeature_id = manipulate_biosqldb.protein_id2seqfeature_id_dict( server, biodb) sys.stdout.write("getting seqfeature_id2locus_tag") seqfeature_id2locus_tag = manipulate_biosqldb.seqfeature_id2locus_tag_dico( server, biodb) sys.stdout.write("getting locus_tag2accession") locus_tag2accession = manipulate_biosqldb.locus_tag2accession( server, args.mysql_database) blastnr2biosql(seqfeature_id2locus_tag, locus_tag2seqfeature_id, protein_id2seqfeature_id, locus_tag2accession, biodb,
# Option selecting the biodatabase this script works on.
parser.add_argument("-d", '--db_name', type=str, help="db name")

args = parser.parse_args()

server, db = manipulate_biosqldb.load_db(args.db_name)
# NOTE(review): hard-coded, developer-specific path; it is not used by the
# statements in this part of the script.
asset_path = "/home/trestan/work/dev/django/chlamydia/assets"

print("parsing orthofinder file")
# parse_orthomcl_output returns four values, unpacked in the order below.
# NOTE(review): the second positional argument (True) presumably selects the
# OrthoFinder input format, given the message printed above -- confirm.
locus_tag2orthogroup_id, \
orthomcl_groups2locus_tag_list, \
genome_orthomcl_code2proteins, \
protein_id2genome_ortho_mcl_code = parse_orthomcl_output(args.mcl, True)

print("get locus_tag2seqfeature_id")
# Two locus_tag -> seqfeature_id mappings are built: the default one and one
# requested with all=False.
# NOTE(review): judging by the variable name, all=False restricts the mapping
# to CDS features -- confirm in manipulate_biosqldb.
locus_tag2seqfeature_id = manipulate_biosqldb.locus_tag2seqfeature_id_dict(
    server, args.db_name)
locus_tag2seqfeature_id_CDS = manipulate_biosqldb.locus_tag2seqfeature_id_dict(
    server, args.db_name, all=False)

# Report the size of each mapping so mismatches between the orthogroup file
# and the database are visible before anything is written.
print("number of groups:", len(orthomcl_groups2locus_tag_list))
print("number of locus in locus_tag2orthogroup_id:",
      len(locus_tag2orthogroup_id))
print("number of locus in locus_tag2seqfeature_id:",
      len(locus_tag2seqfeature_id))
print("number of locus in locus_tag2seqfeature_id_CDS:",
      len(locus_tag2seqfeature_id_CDS))

print("adding orthogroup to seqfeature_qualifier_values")
# Uses the default mapping, not the all=False one built above.
add_orthogroup_to_seq(server, locus_tag2orthogroup_id, locus_tag2seqfeature_id)
help= "Path to save core orthogroup fasta. Taxon id as header for concatenation.", default=None) parser.add_argument("-o", '--orthofinder', action="store_true", help="orthofinder input file (and not orthomcl)") args = parser.parse_args() server, db = manipulate_biosqldb.load_db(args.db_name) asset_path = "/home/trestan/work/dev/django/chlamydia/assets" if not args.get_sequences and not args.core_groups_path: print "creating locus_tag2seqfeature_id" locus_tag2seqfeature_id = manipulate_biosqldb.locus_tag2seqfeature_id_dict( server, args.db_name) print "creating protein_id2seqfeature_id" protein_id2seqfeature_id = manipulate_biosqldb.protein_id2seqfeature_id_dict( server, args.db_name) print "creating locus_tag2taxon_id dictionnary..." locus_tag2genome_taxon_id = manipulate_biosqldb.locus_tag2genome_taxon_id( server, args.db_name) print "creating protein_id2taxon_id dictionnary..." protein_id2genome_taxon_id = manipulate_biosqldb.protein_id2genome_taxon_id( server, args.db_name) print "creating locus_tag2accession dictionnary..." locus_tag2accession = manipulate_biosqldb.locus_tag2accession( server, args.db_name)