def main(): no_threads = 1 db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db (cursor, db_name) cfg = ConfigurationReader(user="******", passwd="tooiram", check=False) in_path = cfg.get_path('resources') if (not os.path.exists(in_path)): print in_path, "not found" ############### if not check_table_exists (cursor, db_name, 'name_resolution'): make_name_resolution_table (cursor) ############### os.chdir(in_path) filenames = glob.glob("*name_resolution.txt") for infile in filenames: store (cursor, in_path, infile) ############### cursor.close() db .close()
def main (): db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) cursor.close() db .close() outpath = cfg.get_path('afs_dumps') outdir = "{0}/exon_map".format(outpath) if (not os.path.exists(outdir)): mkdir_p(outdir) outfile = "{0}/exon_map.sql".format(outdir) if os.path.exists('.creds'): [user, passwd, host, port] = read_creds() else: print "creds not found" exit(1) credentials = " -h {0} -P {1} -u {2} -p{3}".format(host, port, user, password) cmd = "mysqldump {0} {1} exon_map > {2}".format (credentials, ensembl_db_name['homo_sapiens'], outfile) print cmd ret = commands.getoutput(cmd) print ret return True
def main (): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db (cursor, db_name) cfg = ConfigurationReader (user="******", passwd="tooiram", check=False) inpath = cfg.get_path('afs_dumps') indir = "%s/exon_map" % inpath infile = "%s/exon_map.sql" % indir if (not os.path.exists(infile)): print "not found: ", infile sys.exit(1) print "reading", infile qry = "drop table exon_map" rows = search_db(cursor, qry) # I could not get this to run, though it runs fine directly from the mysql shell: #qry = "source %s" % infile #rows = search_db(cursor, qry, verbose=True) cursor.close() db.close() credentials = " -u marioot -ptooiram" cmd = "mysql %s exolocator_db < %s" % (credentials, infile) print cmd ret = commands.getoutput(cmd) print ret return True
def main(): no_threads = 1 db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db (cursor, db_name) cfg = ConfigurationReader(user="******", passwd="tooiram", check=False) # afs is killing me here ... in_path = cfg.get_path('afs_dumps')+"/exons" if (not os.path.exists(in_path)): print in_path, "not found" cursor.close() db .close() ############### os.chdir(in_path) filenames = glob.glob("*exon_dump.txt") parallelize (no_threads, load_from_infiles, filenames, in_path)
def main(): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db(cursor, db_name) cfg = ConfigurationReader(user="******", passwd="tooiram", check=False) inpath = cfg.get_path('afs_dumps') indir = "%s/exon_map" % inpath infile = "%s/exon_map.sql" % indir if (not os.path.exists(infile)): print "not found: ", infile sys.exit(1) print "reading", infile qry = "drop table exon_map" rows = search_db(cursor, qry) # I could not get this to run, though it runs fine directly from the mysql shell: #qry = "source %s" % infile #rows = search_db(cursor, qry, verbose=True) cursor.close() db.close() credentials = " -u marioot -ptooiram" cmd = "mysql %s exolocator_db < %s" % (credentials, infile) print cmd ret = commands.getoutput(cmd) print ret return True
def main(): db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) cursor.close() db.close() outpath = cfg.get_path('afs_dumps') outdir = "{0}/exon_map".format(outpath) if (not os.path.exists(outdir)): mkdir_p(outdir) outfile = "{0}/exon_map.sql".format(outdir) if os.path.exists('.creds'): [user, passwd, host, port] = read_creds() else: print "creds not found" exit(1) credentials = " -h {0} -P {1} -u {2} -p{3}".format( host, port, user, password) cmd = "mysqldump {0} {1} exon_map > {2}".format( credentials, ensembl_db_name['homo_sapiens'], outfile) print cmd ret = commands.getoutput(cmd) print ret return True
def main(): db = connect_to_mysql() cr = ConfigurationReader() cursor = db.cursor() fasta_path = cr.get_path('ensembl_fasta') [all_species, ensembl_db_name] = get_species (cursor) for species in all_species: #for species in ['danio_rerio']: print species dna_path = "{0}/{1}/dna".format(fasta_path, species) if (not os.path.exists(dna_path)): print "problem:", dna_path, "not found" exit(1) fasta_files = [] for r,d,files in os.walk(dna_path): for file in files: if (not file[-3:] == ".fa"): continue fasta_files.append(file) name2file = {} for file in fasta_files: print dna_path, file cmd = "grep '>' {0}/{1}".format(dna_path, file) ret = commands.getoutput(cmd) headers = ret.split("\n") print "number of headers: ", len(headers) for hdr in headers: fields = hdr.split(" ") name = fields[0].replace (">", "") #print name if (not name2file.has_key(name)): name2file[name] = [] name2file[name].append(file) qry = "use "+ensembl_db_name[species] search_db (cursor, qry) for name in name2file.keys(): file_names = "" for file in name2file[name]: if file_names: file_names += " " file_names += file store_seq_filenames (cursor, name, file_names) cursor.close() db .close()
def main(): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db (cursor, db_name) cfg = ConfigurationReader (user="******", passwd="tooiram", check=False) in_path = cfg.get_path('afs_dumps') in_path += "/para_dump" if (not os.path.exists(in_path)): print in_path, "not found" sys.exit(1) # exit on non-existent outdir ############### if 1: qry = "drop table paralog" search_db (cursor, qry) qry = "create table paralog (id int(10) primary key auto_increment) " search_db (cursor, qry) qry = "alter table paralog ADD gene_id1 varchar(30) " search_db (cursor, qry) qry = "alter table paralog ADD gene_id2 varchar(30) " search_db (cursor, qry) create_index (cursor, db_name,'gene_id_index', 'paralog', ['gene_id1', 'gene_id2']) ############### os.chdir(in_path) filenames = glob.glob("*_para_dump.txt") ############### for infile in filenames: print infile store(cursor, infile) cursor.close() db .close()
def dump_orthos (species_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() # find db ids adn common names for each species db [all_species, ensembl_db_name] = get_species (cursor) # in the afa headers use 'trivial' names for the species: cow, dog, pig, ... trivial_name = translate_to_trivial(cursor, all_species) out_path = cfg.get_path('afs_dumps') outfile = "{0}/orthologue_dump.txt".format(out_path) print outfile of = erropen (outfile,"w") species = 'homo_sapiens' switch_to_db (cursor, ensembl_db_name[species]) qry = "select * from orthologue" rows = search_db (cursor, qry) for row in rows: [pair_id, human_gene_id, cognate_gene_id, genome_db_id, source] = row species = genome_db_id2species (cursor, genome_db_id) switch_to_db (cursor, ensembl_db_name['homo_sapiens']) human_stable_id = gene2stable(cursor, human_gene_id) switch_to_db (cursor, ensembl_db_name[species]) cognate_stable_id = gene2stable(cursor, cognate_gene_id) print >>of, orthos_tabstring ([human_stable_id, cognate_stable_id, species, trivial_name[species]]) of.close() cursor.close() db .close()
def main(): db_name = "exolocator_db" db = connect_to_mysql(user="******", passwd="tooiram") cursor = db.cursor() switch_to_db(cursor, db_name) cfg = ConfigurationReader(user="******", passwd="tooiram", check=False) in_path = cfg.get_path('afs_dumps') in_path += "/para_dump" if (not os.path.exists(in_path)): print in_path, "not found" sys.exit(1) # exit on non-existent outdir ############### if 1: qry = "drop table paralog" search_db(cursor, qry) qry = "create table paralog (id int(10) primary key auto_increment) " search_db(cursor, qry) qry = "alter table paralog ADD gene_id1 varchar(30) " search_db(cursor, qry) qry = "alter table paralog ADD gene_id2 varchar(30) " search_db(cursor, qry) create_index(cursor, db_name, 'gene_id_index', 'paralog', ['gene_id1', 'gene_id2']) ############### os.chdir(in_path) filenames = glob.glob("*_para_dump.txt") ############### for infile in filenames: print infile store(cursor, infile) cursor.close() db.close()
def dump_exons(species_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() out_path = "{0}/exons".format(cfg.get_path('afs_dumps')) if not os.path.exists(out_path): print out_path, "not found" exit(1) # exit on failed output dir check for species in species_list: #if (not species=='homo_sapiens'): # continue outfile = "{0}/{1}_exon_dump.txt".format(out_path, species) of = erropen(outfile, "w") if not of: continue switch_to_db(cursor, ensembl_db_name[species]) if (species == 'homo_sapiens'): gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1, ref_only=True) else: gene_ids = get_gene_ids(cursor, biotype='protein_coding') source = get_analysis_dict(cursor) ct = 0 for gene_id in gene_ids: ct += 1 if (not ct % 1000): print species, ct, len(gene_ids) # get _all_ exons exons = gene2exon_list(cursor, gene_id) if (not exons): print 'no exons for ', gene_id continue for exon in exons: if exon.covering_exon > 0: continue # exons seqs are its aa translation, left_flank, right_flank, and dna_seq exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known) if (not exon_seqs): continue # human readable string describing the source of annotation for this exon if exon.is_known == 2: analysis = 'sw_sharp' elif exon.is_known == 3: analysis = 'usearch' else: analysis = source[exon.analysis_id] # the first field return by get_exon_seqs is the exon_seq_id, so get rid of it gene_stable_id = gene2stable(cursor, gene_id) if (exon.is_known == 1): exon_stable_id = exon2stable(cursor, exon.exon_id) elif (exon.is_known == 2): exon_stable_id = 'sw_sharp_' + str(exon.exon_id) elif (exon.is_known == 3): exon_stable_id = 'usearch_' + str(exon.exon_id) else: exon_stable_id = "anon" print >> of, exon_tabstring(exon, gene_stable_id, exon_stable_id, species, analysis, exon_seqs[1:]) of.close() print species, "done" cursor.close() db.close()
def dump_exons (species_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() out_path = "{0}/exons".format(cfg.get_path('afs_dumps')) if not os.path.exists(out_path): print out_path, "not found" exit (1) # exit on failed output dir check for species in species_list: #if (not species=='homo_sapiens'): # continue outfile = "{0}/{1}_exon_dump.txt".format(out_path, species) of = erropen (outfile,"w") if not of: continue switch_to_db (cursor, ensembl_db_name[species]) if (species=='homo_sapiens'): gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1, ref_only=True) else: gene_ids = get_gene_ids (cursor, biotype='protein_coding') source = get_analysis_dict(cursor) ct = 0 for gene_id in gene_ids: ct += 1 if (not ct%1000): print species, ct, len(gene_ids) # get _all_ exons exons = gene2exon_list(cursor, gene_id) if (not exons): print 'no exons for ', gene_id continue for exon in exons: if exon.covering_exon > 0: continue # exons seqs are its aa translation, left_flank, right_flank, and dna_seq exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known) if (not exon_seqs): continue # human readable string describing the source of annotation for this exon if exon.is_known==2: analysis = 'sw_sharp' elif exon.is_known==3: analysis = 'usearch' else: analysis = source[exon.analysis_id] # the first field return by get_exon_seqs is the exon_seq_id, so get rid of it gene_stable_id = gene2stable(cursor,gene_id) if ( exon.is_known == 1): exon_stable_id = exon2stable(cursor,exon.exon_id) elif ( exon.is_known == 2): exon_stable_id = 'sw_sharp_'+str(exon.exon_id) elif ( exon.is_known == 3): exon_stable_id = 'usearch_'+str(exon.exon_id) else: exon_stable_id = "anon" print >> of, exon_tabstring (exon, gene_stable_id, exon_stable_id, species, analysis, exon_seqs[1:]) of.close() print species, "done" cursor.close() db .close()