def main():
    """Dump the human ``exon_map`` table into ``<afs_dumps>/exon_map/exon_map.sql``.

    The human database name is looked up through ``get_species``, the output
    directory is created if missing, and ``mysqldump`` is run through the
    shell with the credentials returned by ``read_creds``.  Both the command
    line and whatever ``mysqldump`` writes to stdout/stderr are printed.

    Returns:
        True once the dump command has been run (its exit status is not
        inspected).

    Raises:
        SystemExit: with status 1 if there is no ``.creds`` file in the
            current working directory.
    """
    # Local import: subprocess.getoutput is the Python 3 replacement for the
    # Python 2-only commands.getoutput.
    import subprocess

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)
    # the connection is only needed for the database-name lookup
    cursor.close()
    db.close()

    outdir = "{0}/exon_map".format(cfg.get_path('afs_dumps'))
    if not os.path.exists(outdir):
        mkdir_p(outdir)
    outfile = "{0}/exon_map.sql".format(outdir)

    if not os.path.exists('.creds'):
        print("creds not found")
        raise SystemExit(1)
    [user, passwd, host, port] = read_creds()

    # The original formatted an undefined name (`password`) here, which
    # raised NameError before mysqldump could ever be started.
    credentials = " -h {0} -P {1} -u {2} -p{3}".format(host, port, user, passwd)
    cmd = "mysqldump {0} {1} exon_map > {2}".format(
        credentials, ensembl_db_name['homo_sapiens'], outfile)
    # NOTE(review): this echoes the password to stdout, and the command goes
    # through the shell unquoted -- fine for trusted local credentials only.
    print(cmd)
    ret = subprocess.getoutput(cmd)
    print(ret)

    return True
def main():
    """Write a ``mysqldump`` of the human ``exon_map`` table to the AFS dump area.

    The dump goes to ``<afs_dumps>/exon_map/exon_map.sql``; the directory is
    created when it does not exist.  Connection credentials come from
    ``read_creds`` and require a ``.creds`` file in the current directory.
    The command line and the captured output of the command are printed.

    Returns:
        True after the command has been run; the exit status of
        ``mysqldump`` is not checked.

    Raises:
        SystemExit: with status 1 when ``.creds`` is missing.
    """
    # Local import: the Python 2-only `commands` module no longer exists;
    # subprocess.getoutput is its Python 3 equivalent.
    import subprocess

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)
    # nothing below talks to the server through this connection
    cursor.close()
    db.close()

    outpath = cfg.get_path('afs_dumps')
    outdir = "{0}/exon_map".format(outpath)
    if not os.path.exists(outdir):
        mkdir_p(outdir)
    outfile = "{0}/exon_map.sql".format(outdir)

    if os.path.exists('.creds'):
        [user, passwd, host, port] = read_creds()
    else:
        print("creds not found")
        raise SystemExit(1)

    # Fix: use `passwd` (the name unpacked above); `password` was never
    # defined and made this line raise NameError.
    credentials = " -h {0} -P {1} -u {2} -p{3}".format(host, port, user, passwd)
    cmd = "mysqldump {0} {1} exon_map > {2}".format(
        credentials, ensembl_db_name['homo_sapiens'], outfile)
    # NOTE(review): the printed command includes the password in clear text.
    print(cmd)
    ret = subprocess.getoutput(cmd)
    print(ret)

    return True
def main():
    """(Re)create the ``exon_map`` table in every representative species' database.

    The representative species are read from
    ``exolocator_meta.taxonomy_groups``.  For each one an existing
    ``exon_map`` table is dropped, a new one is made with
    ``make_exon_map_table``, and two indices are created on it.
    """
    connection = connect_to_mysql(Config.mysql_conf_file)
    cur = connection.cursor()
    [_all_species, db_name_for] = get_species(cur)

    # exon_map records which exons from other species map onto the exons
    # of the representative species
    table = "exon_map"
    # (index name, indexed columns)
    index_specs = (
        ('exon_index', ('exon_id',)),
        ('cognate_exon_index', ('cognate_exon_id', 'cognate_genome_db_id')),
    )

    qry = "select representative_species from exolocator_meta.taxonomy_groups"
    representatives = [row[0] for row in hard_landing_search(cur, qry)]

    for representative_species in representatives:
        print(f"adding exon_map to {representative_species}")
        db_name = db_name_for[representative_species]

        if check_table_exists(cur, db_name, table):
            check_and_drop_table(cur, db_name, table)
        else:
            print(table, " not found in ", db_name)
        # either way we start from a freshly made table
        make_exon_map_table(cur, db_name)

        for index_name, columns in index_specs:
            create_index(cur, db_name, table, index_name, list(columns))

    cur.close()
    connection.close()
def main():
    """Run the enabled size checks over all species.

    Two hand-edited switches select the checks: ``check_genome_sizes`` is
    currently on, ``check_table_sizes`` is currently off.
    """
    # flip these by hand to choose which checks run
    run_genome_size_check = True
    run_table_size_check = False

    connection = connect_to_mysql()
    cur = connection.cursor()
    species_list, db_name_for = get_species(cur)

    if run_genome_size_check:
        check_genome_sizes(cur, species_list, db_name_for)
    if run_table_size_check:
        check_table_sizes(cur, species_list, db_name_for)

    cur.close()
    connection.close()
def main():
    """Connect to MySQL and call whichever size checks are switched on.

    As written, only ``check_genome_sizes`` runs; ``check_table_sizes`` is
    switched off.
    """
    # manual toggles, edited in place when a different check is wanted
    genome_sizes_enabled = True
    table_sizes_enabled = False

    mysql_db = connect_to_mysql()
    mysql_cursor = mysql_db.cursor()
    species_names, ensembl_dbs = get_species(mysql_cursor)

    if genome_sizes_enabled:
        check_genome_sizes(mysql_cursor, species_names, ensembl_dbs)

    if table_sizes_enabled:
        check_table_sizes(mysql_cursor, species_names, ensembl_dbs)

    mysql_cursor.close()
    mysql_db.close()
def main():
    """Rebuild the ``gene2exon`` table, with its indices, for every species.

    For each species database this calls ``check_and_drop_table`` and
    ``make_table`` for ``gene2exon``, runs ``optimize table`` on it
    (printing the result), and creates the ``eg_index`` and ``gene_id_idx``
    indices.
    """
    connection = connect_to_mysql(Config.mysql_conf_file)
    cur = connection.cursor()
    [species_list, db_name_for] = get_species(cur)

    # Only gene2exon is rebuilt here; the other exon-related tables
    # (exon_seq, sw_exon, usearch_exon, coding_region, problems) are not.
    tables_to_rebuild = ('gene2exon',)
    # (index name, indexed columns) for gene2exon
    gene2exon_indices = (
        ('eg_index', ('exon_id', 'gene_id')),
        ('gene_id_idx', ('gene_id',)),
    )

    for species in species_list:
        print(species)
        db_name = db_name_for[species]
        switch_to_db(cur, db_name)

        for table in tables_to_rebuild:
            check_and_drop_table(cur, db_name, table)
            make_table(cur, db_name, table)

        print("optimizing gene2exon")
        optimize_result = search_db(cur, "optimize table gene2exon")
        print(optimize_result)

        # create_index signature: (cursor, db_name, table, index_name, columns, verbose=False)
        for index_name, columns in gene2exon_indices:
            create_index(cur, db_name, 'gene2exon', index_name, list(columns), verbose=True)

    cur.close()
    connection.close()
def main():
    """Build the species tree and print the string from ``Tree.nhx_string``.

    Every species returned by ``get_species`` becomes a leaf ``Node``; the
    tree is then assembled by ``tree.build(cursor)`` and printed between two
    blank lines.
    """
    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    tree = Tree()
    for species in all_species:
        tree.leafs.append(Node(species))
    tree.build(cursor)

    # print() calls replace the Python 2 print statements, which are syntax
    # errors under Python 3; the output is unchanged.
    print()
    print(tree.nhx_string())
    print()

    cursor.close()
    db.close()
def main():
    """Assemble the species tree and print it, framed by blank lines.

    One leaf ``Node`` is added per species known to ``get_species``;
    ``tree.build`` then builds the tree using the open cursor, and the string
    returned by ``tree.nhx_string()`` is written to stdout.
    """
    connection = connect_to_mysql(Config.mysql_conf_file)
    cur = connection.cursor()
    species_list, _db_name_for = get_species(cur)

    tree = Tree()
    for species_name in species_list:
        new_leaf = Node(species_name)
        tree.leafs.append(new_leaf)
    tree.build(cur)

    # blank line, tree string, blank line
    print("", tree.nhx_string(), "", sep="\n")

    cur.close()
    connection.close()
def main():
    """Create and populate the ``exolocator_config`` database.

    Steps, in order:
      1. define the pipeline parameters, directory paths and utility paths;
      2. exit with status 1 if any utility path does not exist, or any
         directory path does not exist or is not a directory;
      3. create ``exolocator_config`` if the server does not list it, and
         make whichever of the ``util_path``, ``dir_path`` and ``parameter``
         tables are missing;
      4. store (or update) one row per utility, directory and parameter;
      5. call ``feed_trivial_names`` and ``feed_name_shorthands`` for all
         species.

    Raises:
        SystemExit: with status 1 when a configured path fails the checks
            in step 2.
    """
    # in case I ever have to handle multiple versions of ensembl
    # (but for now I don't have enough space)
    # note though that there are functions in el_utils/mysql.py that assume
    # that whatever ensembl stuff is available to the mysql server
    # corresponds to the same release
    release_number = '76'

    parameter = {
        'ensembl_release_number': release_number,
        # kept as a string: it is used as such when formatting the blastp cmd
        'blastp_e_value': "1.e-10",
        # minimum acceptable exon similarity
        'min_accptbl_exon_sim': 0.33333,
    }

    afs_home = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/'
    dir_path = {
        'ensembl_fasta': '/mnt/ensembl-mirror/release-' + release_number + '/fasta',
        # local juggling of data from one database base to the other
        'afs_dumps': afs_home + 'ExoLocator/results/dumpster',
        'resources': afs_home + 'pypeworks/exolocator/resources',
        'scratch': '/tmp',
        'maxentscan': afs_home + 'pypeworks/exolocator/pl_utils/maxentscan',
    }

    util_path = {
        'mafft': '/usr/bin/mafft',
        'blastall': '/usr/bin/blastall',
        'fastacmd': '/usr/bin/fastacmd',
        'sw#': '/usr/bin/swsharp',
        'usearch': '/usr/bin/usearch',
        'score3': dir_path['maxentscan'] + '/score3.pl',
        'score5': dir_path['maxentscan'] + '/score5.pl',
    }

    # check if the paths are functioning (at this point at least)
    for util in util_path.values():
        if not os.path.exists(util):
            print(util, " not found ")
            sys.exit(1)
    for directory in dir_path.values():
        if not os.path.exists(directory):
            print(directory, " not found ")
            sys.exit(1)
        if not os.path.isdir(directory):
            print(directory, " is not a directory ")
            sys.exit(1)

    db = connect_to_mysql()
    cursor = db.cursor()

    #######################################################
    # check if the config db exists -- if not, make it
    db_name = "exolocator_config"
    qry = "show databases like'%s'" % db_name
    rows = search_db(cursor, qry)
    if not rows:
        print(db_name, "database not found")
        qry = "create database %s " % db_name
        rows = search_db(cursor, qry)
        if rows:
            # anything returned by the create statement is treated as an
            # error; repeat the statement verbosely to show what went wrong
            print("some problem creating the database ...")
            rows = search_db(cursor, qry, verbose=True)
    else:
        print(db_name, "database found")

    qry = "use %s " % db_name
    search_db(cursor, qry)

    # make tables
    for table in ['util_path', 'dir_path', 'parameter']:
        if check_table_exists(cursor, db_name, table):
            print(table, " found in ", db_name)
        else:
            print(table, " not found in ", db_name)
            make_table(cursor, table)

    # fill util, dir and parameter tables; `name` identifies the row, the
    # second dict carries the column to be written
    for name, path in util_path.items():
        store_or_update(cursor, 'util_path', {'name': name}, {'path': path})

    for name, path in dir_path.items():
        store_or_update(cursor, 'dir_path', {'name': name}, {'path': path})

    for name, value in parameter.items():
        store_or_update(cursor, 'parameter', {'name': name}, {'value': value})

    #######################################################
    # add trivial names to ncbi_taxonomy.names
    [all_species, ensembl_db_name] = get_species(cursor)
    feed_trivial_names(cursor, all_species)

    #######################################################
    # add species shorthands (used in ENS* names formation;
    # not needed until the paralogue alignment reconstruction point)
    feed_name_shorthands(cursor, all_species)

    cursor.close()
    db.close()
def main():
    """Check stored exon peptides against a translation of the stored exon DNA.

    For every known protein-coding human gene, each coding exon that is not
    covered by another exon has its stored DNA sliced to
    ``[pepseq_transl_start:pepseq_transl_end]``, translated (with the
    vertebrate mitochondrial table when ``is_mitochondrial`` says so), and
    compared with the stored peptide.  Per-exon details and a final tally
    are printed.

    Raises:
        SystemExit: with status 1 if a gene has no exons at all.
    """
    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    for species in all_species:
        # the check is currently restricted to human
        if species != 'homo_sapiens':
            continue
        print()
        print(species)

        switch_to_db(cursor, ensembl_db_name[species])

        # The else branch is unreachable while the filter above is in place;
        # it is kept so the filter can be lifted without further edits.
        if species == 'homo_sapiens':
            gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1)
        else:
            gene_ids = get_gene_ids(cursor, biotype='protein_coding')

        tot_exons = 0
        no_exon_seq = 0
        short_dna = 0
        pepseq_ok = 0
        stored_incorrect = 0
        translation_fail = 0

        for gene_id in gene_ids:
            # get _all_ exons
            exons = gene2exon_list(cursor, gene_id)
            if not exons:
                print('no exons for gene', gene_id)
                sys.exit(1)

            for exon in exons:
                if not exon.is_coding:
                    print(exon.exon_id, " not coding ")
                    continue
                if exon.covering_exon > 0:
                    print(exon.exon_id, " is covered by ", exon.covering_exon)
                    continue
                tot_exons += 1

                # exon seqs are its aa translation, left_flank, right_flank, and dna_seq
                exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
                if not exon_seqs:
                    no_exon_seq += 1
                    print("no exon seqs for ", gene_id, exon.exon_id)
                    continue

                (_exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end,
                 _left_flank, _right_flank, dna_seq) = exon_seqs

                if len(dna_seq) < 3:
                    # shorter than a single codon
                    short_dna += 1
                    print("short_dna:", dna_seq)
                    continue

                # NOTE(review): -10 looks like a sentinel for "translation
                # failed" written by whatever fills the exon sequence table
                # -- confirm.  A None start is not caught here: the slice
                # below would then simply begin at position 0.
                if pepseq_transl_start == -10:
                    translation_fail += 1
                    print("pepseq_transl_start:", pepseq_transl_start)
                    continue

                mitochondrial = is_mitochondrial(cursor, gene_id)
                dnaseq = Seq(dna_seq[pepseq_transl_start:pepseq_transl_end], generic_dna)
                if mitochondrial:
                    pepseq2 = dnaseq.translate(table="Vertebrate Mitochondrial").tostring()
                else:
                    pepseq2 = dnaseq.translate().tostring()

                print(exon.exon_id)
                print("pep stored:", pepseq)
                print("dna transl:", pepseq2)
                print("dna begin:", dna_seq[:12])
                print("start:", pepseq_transl_start, "end:", pepseq_transl_end)
                print()

                if pepseq == pepseq2:
                    pepseq_ok += 1
                else:
                    stored_incorrect += 1

        print("total coding exons ", tot_exons)
        print("no exon seq info ", no_exon_seq)
        print("short dna ", short_dna)
        print("transl failure ", translation_fail)
        print("stored pepseq does not correspond to the translation of stored dna: ", stored_incorrect)
        print("pepseq ok ", pepseq_ok)

    cursor.close()
    db.close()
def main():
    """Choose a representative species for each taxonomy group and store it.

    The species tree is read from ``exolocator_meta.taxonomy`` (or built and
    stored there when absent).  For each named group the member species are
    ranked by the number of rows in their ``transcript`` table; the top one
    is stored as ``representative_species`` in ``taxonomy_groups``, and the
    rest as a comma-separated ``members`` list.
    """
    connection = connect_to_mysql(Config.mysql_conf_file)
    cur = connection.cursor()
    [all_species, ensembl_db_name] = get_species(cur)

    # do we have the tree?
    stored_tree = error_intolerant_search(
        cur, "select value from exolocator_meta.taxonomy where name = 'species_tree'")
    if stored_tree:
        print("reading the species tree ...")
        tree = Tree(stored_tree[0][0])
        print(" tree done.")
    else:
        print("building the species tree ...")
        tree = species_tree(cur, all_species)
        print(" tree done.")
        print("storing")
        insert_qry = f"insert into exolocator_meta.taxonomy (name,value) values ('species_tree','{tree.nhx_string()}') "
        error_intolerant_search(cur, insert_qry)

    # tree node name -> trivial (everyday) name of the group
    trivial_name = {
        'Archosauria': "birds_and_crocs",
        'Testudines': "turtles",
        'Lepidosauria': "lizards_and_snakes",
        'Eutheria': "mammals",
        'Marsupialia': "marsupials",
        'ornithorhynchus_anatinus': "platypus",
        'Anura': "frogs",
        'latimeria_chalumnae': "coelacanth",
        # ray-finned fish
        'Euteleosteomorpha': "euteleosts",
        'Otomorpha': "otomorpha",
        'Osteoglossiformes': "osteoglossiforms",
        # more ray-finned fish
        'lepisosteus_oculatus': "spotted_gar",
        'erpetoichthys_calabaricus': "snakefish",
        # Australian ghostshark or elephant shark
        'callorhinchus_milii': "elephant_shark",
        'Cyclostomata': "nightmare stuff",
    }

    # which species in the subtree has the best annotation so far?
    switch_to_db(cur, "exolocator_meta")
    row_format = "%50s: transcripts: %6d genes: %6d"
    for tax_group, group_trivial_name in trivial_name.items():
        node = tree.get_node(tax_group)
        print()
        print(tax_group)

        if node.is_leaf:
            group_species = [node.name]
        else:
            group_species = node.subtree_leafs()

        number_of_transcripts = {}
        number_of_genes = {}
        for species in group_species:
            species_db = ensembl_db_name[species]
            transcript_rows = hard_landing_search(cur, f"select count(*) from {species_db}.transcript")
            number_of_transcripts[species] = transcript_rows[0][0]
            gene_rows = hard_landing_search(cur, f"select count(*) from {species_db}.gene")
            number_of_genes[species] = gene_rows[0][0]

        # the reported number of transcripts serves as an ad hoc measure of
        # how reliable the genome annotation is
        ranked_species = sorted(group_species, key=lambda s: number_of_transcripts[s], reverse=True)
        for species in ranked_species:
            print(row_format % (species, number_of_transcripts[species], number_of_genes[species]))

        representative, *other_members = ranked_species
        fixed_fields = {'name': tax_group}
        update_fields = {
            'trivial_name': group_trivial_name,
            'representative_species': representative,
            'members': ",".join(other_members),
        }
        print(fixed_fields)
        print(update_fields)
        print()
        store_or_update(cur, "taxonomy_groups", fixed_fields, update_fields)

    cur.close()
    connection.close()
def main():
    """Compare each stored exon peptide with the translation of its stored DNA.

    Only human is processed.  For each known protein-coding gene, every
    coding exon not covered by another exon is looked up with
    ``get_exon_seqs``; the DNA slice
    ``dna_seq[pepseq_transl_start:pepseq_transl_end]`` is translated (using
    the "Vertebrate Mitochondrial" table for mitochondrial genes) and
    compared with the stored peptide.  Details are printed per exon and a
    summary of the counters is printed at the end.

    Raises:
        SystemExit: with status 1 if ``gene2exon_list`` returns nothing for
            a gene.
    """
    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    for species in all_species:
        # everything except human is skipped for now
        if species != 'homo_sapiens':
            continue
        print()
        print(species)

        switch_to_db(cursor, ensembl_db_name[species])

        # With the filter above only the first branch can run; the second is
        # kept for the day the filter is removed.
        if species == 'homo_sapiens':
            gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1)
        else:
            gene_ids = get_gene_ids(cursor, biotype='protein_coding')

        # tallies reported at the end of the species
        tot_exons = 0
        no_exon_seq = 0
        short_dna = 0
        pepseq_ok = 0
        stored_incorrect = 0
        translation_fail = 0

        for gene_id in gene_ids:
            # get _all_ exons
            exons = gene2exon_list(cursor, gene_id)
            if not exons:
                print('no exons for gene', gene_id)
                sys.exit(1)

            for exon in exons:
                if not exon.is_coding:
                    print(exon.exon_id, " not coding ")
                    continue
                if exon.covering_exon > 0:
                    print(exon.exon_id, " is covered by ", exon.covering_exon)
                    continue
                tot_exons += 1

                # exon seqs are its aa translation, left_flank, right_flank, and dna_seq
                exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
                if not exon_seqs:
                    no_exon_seq += 1
                    print("no exon seqs for ", gene_id, exon.exon_id)
                    continue

                (_exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end,
                 _left_flank, _right_flank, dna_seq) = exon_seqs

                if len(dna_seq) < 3:
                    # not even one codon to translate
                    short_dna += 1
                    print("short_dna:", dna_seq)
                    continue

                # NOTE(review): -10 is presumably the marker stored when the
                # translation could not be determined -- confirm with the
                # code that stores exon sequences.  If the start is None
                # instead, it passes this test and the slice below starts
                # from the beginning of dna_seq.
                if pepseq_transl_start == -10:
                    translation_fail += 1
                    print("pepseq_transl_start:", pepseq_transl_start)
                    continue

                mitochondrial = is_mitochondrial(cursor, gene_id)
                dnaseq = Seq(dna_seq[pepseq_transl_start:pepseq_transl_end], generic_dna)
                if mitochondrial:
                    pepseq2 = dnaseq.translate(table="Vertebrate Mitochondrial").tostring()
                else:
                    pepseq2 = dnaseq.translate().tostring()

                print(exon.exon_id)
                print("pep stored:", pepseq)
                print("dna transl:", pepseq2)
                print("dna begin:", dna_seq[:12])
                print("start:", pepseq_transl_start, "end:", pepseq_transl_end)
                print()

                if pepseq == pepseq2:
                    pepseq_ok += 1
                else:
                    stored_incorrect += 1

        print("total coding exons ", tot_exons)
        print("no exon seq info ", no_exon_seq)
        print("short dna ", short_dna)
        print("transl failure ", translation_fail)
        print("stored pepseq does not correspond to the translation of stored dna: ", stored_incorrect)
        print("pepseq ok ", pepseq_ok)

    cursor.close()
    db.close()