示例#1
0
def collect_orthologues(gene_list, db_info):
    
    [local_db, ensembl_db_name] = db_info

    db     = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)

    db_human  = connect_to_mysql()
    cursor_human = db_human.cursor()
    switch_to_db (cursor_human, ensembl_db_name['homo_sapiens'])

    ensembl_compara_name = get_compara_name(cursor)
    print ensembl_compara_name
 
    db_compara     = connect_to_mysql()
    cursor_compara = db_compara.cursor()
    switch_to_db (cursor_compara, ensembl_compara_name)


    ortho_table = {}
    ortho_table ['ortholog_one2one']          = 'orthologue'
    ortho_table ['apparent_ortholog_one2one'] = 'orthologue'
    ortho_table ['possible_ortholog']         = 'unresolved_ortho'
    ortho_table ['ortholog_one2many']         = 'unresolved_ortho'
    ortho_table ['ortholog_many2many']        = 'unresolved_ortho'
    ct = 0
    for gene_id in gene_list:
        ct += 1
        # find stable
        stable_id = gene2stable(cursor_human, gene_id=gene_id)
        # memebr id refers to entries in compara db
        member_id = stable2member(cursor_compara, stable_id)

        #print gene_id, stable_id, member_id
        if ( not ct%100): print ct , "out of ", len(gene_list)
        # in compara table, get everything that homology has to say about
        # the possible orthologues
        # find all orthologous pairs suggested for this gene
        for ortho_type in ['ortholog_one2one','possible_ortholog', 'apparent_ortholog_one2one',
                           'ortholog_one2many','ortholog_many2many']:
            orthos = get_orthologues(cursor_compara, ortho_type, member_id)

            if ( orthos):
                store_orthologues (cursor_human, ortho_table[ortho_type], cursor, all_species, 
                                   ensembl_db_name, gene_id, orthos)
        
 
    cursor.close()
    db.close()
    cursor_human.close()
    db_human.close()
    cursor_compara.close()
    db_compara.close()
def main ():

    
    db_name = "exolocator_db"
    db      = connect_to_mysql(user="******", passwd="tooiram")
    cursor  = db.cursor()
    switch_to_db (cursor, db_name)

    cfg     = ConfigurationReader (user="******", passwd="tooiram", check=False)

    inpath = cfg.get_path('afs_dumps')
    indir   = "%s/exon_map"     % inpath
    infile  = "%s/exon_map.sql" % indir
    if (not os.path.exists(infile)):
        print "not found: ", infile
        sys.exit(1)
    print "reading", infile

    qry = "drop table exon_map"
    rows = search_db(cursor, qry)
    # I could not get this to run, though it runs fine directly from the mysql shell:
    #qry = "source %s" % infile
    #rows = search_db(cursor, qry, verbose=True)
    cursor.close()
    db.close()

    credentials = " -u marioot -ptooiram"
    cmd = "mysql %s  exolocator_db  <  %s" % (credentials, infile)
    print cmd
    ret = commands.getoutput(cmd)
    print ret

 
    return True
def main():

    db     = connect_to_mysql()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species (cursor)

    for species in all_species:
        print species

        switch_to_db (cursor, ensembl_db_name[species])

        qry  = "select seq_region.name, seq_region.file_name from seq_region, gene "
        qry += " where gene.biotype='protein_coding' and gene.seq_region_id =  seq_region.seq_region_id "
            

        rows = search_db (cursor, qry)
        if (not rows):
            print "\t no seq region info found "
            continue
        tot = 0
        no_file = 0
        for row in rows:
            [name,  file_name] = row
            #print name, file_name
            tot += 1
            if (not file_name):
                no_file += 1
                print name, file_name
                #exit (1)

        print "\t tot seq_regions: ", tot, " no file: ", no_file
 
    cursor.close()
    db    .close()
def main():

    db_name = "exolocator_db"
    db = connect_to_mysql(user="******", passwd="tooiram")
    cursor = db.cursor()
    switch_to_db(cursor, db_name)

    cfg = ConfigurationReader(user="******", passwd="tooiram", check=False)

    inpath = cfg.get_path('afs_dumps')
    indir = "%s/exon_map" % inpath
    infile = "%s/exon_map.sql" % indir
    if (not os.path.exists(infile)):
        print "not found: ", infile
        sys.exit(1)
    print "reading", infile

    qry = "drop table exon_map"
    rows = search_db(cursor, qry)
    # I could not get this to run, though it runs fine directly from the mysql shell:
    #qry = "source %s" % infile
    #rows = search_db(cursor, qry, verbose=True)
    cursor.close()
    db.close()

    credentials = " -u marioot -ptooiram"
    cmd = "mysql %s  exolocator_db  <  %s" % (credentials, infile)
    print cmd
    ret = commands.getoutput(cmd)
    print ret

    return True
def all_species_all_genes_loop(species_list, db_info):

    [local_db, ensembl_db_name] = db_info
    db     = connect_to_mysql()
    cursor = db.cursor()
    #####################################
    for species in species_list:
        print
        print "############################"
        print  species
        sys.stdout.flush()

        if not switch_to_db(cursor, ensembl_db_name[species]):
            return False
        if (species=='homo_sapiens'):
            gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)
        else:
            gene_ids = get_gene_ids (cursor, biotype='protein_coding')
        #for all protein coding genes in a species
        #for gene_id in [10093105]:
        for gene_id in gene_ids:
            # for all exons in the gene
            exons = gene2exon_list(cursor, gene_id)
            if (not exons):
                print 'no exons for gene', gene_id
                continue            
            ####################################
            pep_seqs(cursor, gene_id, exons)
            ####################################
            if not gene_ids.index(gene_id)%1000:
                print "%50s:  %5.1f%% " %  (species, 100*(float( gene_ids.index(gene_id) +1 )/len(gene_ids))  )
                sys.stdout.flush()    
        print species, "done"
    cursor.close()
    db.close()
示例#6
0
def main():

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species(cursor)
    cursor.close()
    db.close()

    outpath = cfg.get_path('afs_dumps')
    outdir = "{0}/exon_map".format(outpath)
    if (not os.path.exists(outdir)):
        mkdir_p(outdir)

    outfile = "{0}/exon_map.sql".format(outdir)
    if os.path.exists('.creds'):
        [user, passwd, host, port] = read_creds()
    else:
        print "creds not found"
        exit(1)
    credentials = " -h {0} -P {1} -u {2}  -p{3}".format(
        host, port, user, password)
    cmd = "mysqldump {0} {1} exon_map > {2}".format(
        credentials, ensembl_db_name['homo_sapiens'], outfile)

    print cmd
    ret = commands.getoutput(cmd)

    print ret

    return True
def ortologues_for_given_genes_loop (gene_list, db_info):

    [local_db, ensembl_db_name] = db_info
    db     = connect_to_mysql()
    cursor = db.cursor()

    #####################################
    for gene_id in gene_list:

        switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
        orthologues  = get_orthos (cursor, gene_id, 'orthologue') # get_orthos changes the db pointer

        switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
        orthologues += get_orthos (cursor, gene_id, 'unresolved_ortho')

        for [ortho_gene_id, ortho_species] in [[gene_id,'homo_sapiens']] + orthologues:
 
            print ">>> ", ortho_species, ortho_gene_id
            switch_to_db (cursor, ensembl_db_name[ortho_species])

            # for all exons in the gene
            exons = gene2exon_list(cursor, ortho_gene_id)
            if (not exons):
                if verbose: print 'no exons for gene', ortho_gene_id
                continue
            ##############################
            pep_seqs(cursor, ortho_gene_id, exons)
            
        ####################################
        if not gene_list.index(gene_id)%1000:
            print "%5.1f%% " %  (100*(float( gene_list.index(gene_id) +1 )/len(gene_list))  )
            sys.stdout.flush()
            
    cursor.close()
    db.close()
def main():
    
    special    = None
    no_threads = 1
    db  = connect_to_mysql()

    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)
    total = 0
    for species in all_species:
        print species
        switch_to_db (cursor, ensembl_db_name[species])
        qry  = "select count(1) from usearch_exon"
        rows = search_db (cursor, qry)
        count = int(rows[0][0])
        print "\t usearch exons: ", count 
        total += count
        qry  = "select count(1) from sw_exon"
        rows = search_db (cursor, qry)
        count = int(rows[0][0])
        print "\t sw exons: ", count 
        total += count
    print
    print 'total: ', total
    cursor.close()
    db.close()
def main():

    special = None
    no_threads = 1
    db = connect_to_mysql()

    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)
    total = 0
    for species in all_species:
        print species
        switch_to_db(cursor, ensembl_db_name[species])
        qry = "select count(1) from usearch_exon"
        rows = search_db(cursor, qry)
        count = int(rows[0][0])
        print "\t usearch exons: ", count
        total += count
        qry = "select count(1) from sw_exon"
        rows = search_db(cursor, qry)
        count = int(rows[0][0])
        print "\t sw exons: ", count
        total += count
    print
    print 'total: ', total
    cursor.close()
    db.close()
示例#10
0
def main ():

    
    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species (cursor)
    cursor.close()
    db    .close()

    outpath = cfg.get_path('afs_dumps')
    outdir   = "{0}/exon_map".format(outpath)
    if (not os.path.exists(outdir)):
        mkdir_p(outdir)

    outfile  = "{0}/exon_map.sql".format(outdir)
    if os.path.exists('.creds'):
        [user, passwd, host, port] = read_creds()
    else:
        print "creds not found"
        exit(1)
    credentials = " -h {0} -P {1} -u {2}  -p{3}".format(host, port, user, password)
    cmd = "mysqldump {0} {1} exon_map > {2}".format (credentials, ensembl_db_name['homo_sapiens'], outfile)

    print cmd
    ret = commands.getoutput(cmd)
    
    print ret

    return True
def main():
    """Hand the list of all species to dump_orthos through parallelize(), using one thread."""
    no_threads = 1

    db = connect_to_mysql()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species (cursor)

    # only the species list and the db names are needed from this connection;
    # release it before the work is handed over
    cursor.close()
    db.close()

    # NOTE(review): local_db is not defined in this function; presumably a
    # module-level name - confirm
    parallelize (no_threads, dump_orthos, all_species, [local_db, ensembl_db_name])
def main():
    """Run check_alt_splices() for homo_sapiens."""
    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    # human and mouse are the only two species that have CCDs info;
    # only human is checked here
    species_to_check = ['homo_sapiens']
    for species in species_to_check:
        check_alt_splices(cursor, species, ensembl_db_name)

    cursor.close()
    db.close()
示例#13
0
def main():

    db = connect_to_mysql()
    cr = ConfigurationReader()

    cursor = db.cursor()
    fasta_path = cr.get_path('ensembl_fasta')

    [all_species, ensembl_db_name] = get_species (cursor)

    for species in all_species:
    #for species in ['danio_rerio']:
        print species
        dna_path = "{0}/{1}/dna".format(fasta_path, species)
        if (not os.path.exists(dna_path)):
            print "problem:", dna_path, "not found"
            exit(1)

        fasta_files = []
        for r,d,files in os.walk(dna_path):
            for file in files:
                if (not file[-3:] == ".fa"):
                    continue
                fasta_files.append(file)
        
        name2file = {}
        for file in fasta_files:
            print dna_path, file
            cmd = "grep '>' {0}/{1}".format(dna_path, file)
            ret = commands.getoutput(cmd)
            headers = ret.split("\n")
            print "number of headers: ", len(headers)
            for hdr in headers:
                fields = hdr.split(" ")
                name = fields[0].replace (">", "")
                #print name
                if (not name2file.has_key(name)):
                    name2file[name] = []
                name2file[name].append(file)

        qry = "use "+ensembl_db_name[species]
        search_db (cursor, qry)

        for name in name2file.keys():
            file_names = ""
            for file in  name2file[name]:
                if file_names:
                    file_names += " "
                file_names += file
            store_seq_filenames (cursor, name, file_names)
 
    cursor.close()
    db    .close()
示例#14
0
def collect_paralogues(species_list, db_info):
    
    [local_db, ensembl_db_name] = db_info

    db_species  = connect_to_mysql()
    cursor_species = db_species.cursor()

    ensembl_compara_name = get_compara_name(cursor_species)
    print ensembl_compara_name
 
    db_compara     = connect_to_mysql()
    cursor_compara = db_compara.cursor()
    switch_to_db (cursor_compara, ensembl_compara_name)

    for species in species_list:
        switch_to_db (cursor_species,  ensembl_db_name[species])
        # it looks I cannot demand that the gene is known, because for many species
        # most of the genes still have 'predicted' status
        gene_list = get_gene_ids (cursor_species, biotype='protein_coding')
        ct = 0
        for gene_id in gene_list:
            ct += 1
            # find stable
            stable_id = gene2stable(cursor_species, gene_id=gene_id)
            # memebr id refers to entries in compara db
            member_id = stable2member(cursor_compara, stable_id)

            #print gene_id, stable_id, member_id
            if (not ct%100):
                print species, ct , "out of ", len(gene_list) 
            # find all paralogue pairs suggested for this gene
            ortho_type = 'within_species_paralog'
            paralogues = get_orthologues(cursor_compara, ortho_type, member_id)
            if not paralogues: continue
            store_paralogues (cursor_species, gene_id, paralogues)
        print species, 'done'
    cursor_species.close()
    db_species.close()
    cursor_compara.close()
    db_compara.close()
def main():
    """Call check_alt_splices() for each species in a fixed list (currently human only)."""
    db     = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)

    # human and mouse are the only two species that have CCDs info
    ccds_species = ['homo_sapiens']
    for species in ccds_species:
        check_alt_splices (cursor, species, ensembl_db_name)

    cursor.close()
    db.close()
示例#16
0
def main():
    """Distribute all species over 10 threads running collect_paralogues.  Returns True."""
    no_threads = 10

    # the connection is only needed to look up the species and their db names
    db     = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)
    cursor.close()
    db.close()

    # local_db is not assigned in this function - presumably module-level; TODO confirm
    worker_info = [local_db, ensembl_db_name]
    parallelize (no_threads, collect_paralogues, all_species, worker_info)

    return True
def main():
    """Run make_alignments over all species through parallelize(), on a single thread.  Returns True."""
    no_threads = 1

    # the connection is only needed to look up the species and their db names
    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)
    cursor.close()
    db.close()

    # local_db is not assigned in this function - presumably module-level; TODO confirm
    worker_info = [local_db, ensembl_db_name]
    parallelize (no_threads, make_alignments, all_species, worker_info)

    return True
示例#18
0
def main():
    """Run the genome size check; the table size check is present but switched off."""
    # switches for the two available checks
    run_genome_size_check = True
    run_table_size_check = False

    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    if run_genome_size_check:
        check_genome_sizes(cursor, all_species, ensembl_db_name)

    if run_table_size_check:
        check_table_sizes(cursor, all_species, ensembl_db_name)

    cursor.close()
    db.close()
示例#19
0
def main():
    """Check the genome sizes of all species; checking the table sizes is disabled."""
    db     = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)

    # flip these to choose which of the checks are run
    do_genome_sizes = True
    do_table_sizes  = False

    if do_genome_sizes:
        check_genome_sizes (cursor, all_species, ensembl_db_name)
    if do_table_sizes:
        check_table_sizes (cursor, all_species, ensembl_db_name)

    cursor.close()
    db.close()
示例#20
0
def main():
    """Run collect_orthologues on the known human protein coding genes, using 10 threads.

    Returns True.
    """
    no_threads = 10

    db     = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)

    # the gene list comes from the human database
    switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
    gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1)

    cursor.close()
    db.close()

    # local_db is not assigned in this function - presumably module-level; TODO confirm
    worker_info = [local_db, ensembl_db_name]
    parallelize (no_threads, collect_orthologues, gene_list, worker_info)

    return True
def main():
    """Call alt_splice_almt() for human and for mouse."""
    db  = connect_to_mysql()
    acg = AlignmentCommandGenerator()
    cfg = ConfigurationReader()

    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)

    # human and mouse are the only two species that have CCDs info
    ccds_species = ['homo_sapiens', 'mus_musculus']
    for species in ccds_species:
        alt_splice_almt (cursor, cfg, acg, species, ensembl_db_name)

    cursor.close()
    db.close()
def main():
    
    no_threads = 12
    special    = None


    if len(sys.argv) > 1 and  len(sys.argv)<3:
        print "usage: %s <set name> <no of processes>" % sys.argv[0]
        exit(1) # after usage statement
    elif len(sys.argv)>=3:

        special = sys.argv[1]
        special = special.lower()
        if special == 'none': special = None

        no_processes = int(sys.argv[2])

    db  = connect_to_mysql()
    cfg = ConfigurationReader()

    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species (cursor)

    print "running ", sys.argv[0]

    if special:
        print "using", special, "set"
        if special == 'complement':
            gene_list = get_complement_ids(cursor, ensembl_db_name, cfg)
        else:
            gene_list = get_theme_ids (cursor,  ensembl_db_name, cfg, special )

    else:
        print "using all protein coding genes"
        switch_to_db (cursor,  ensembl_db_name['homo_sapiens'])
        gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1)
 

    cursor.close()
    db.close()

    parallelize (no_processes, multiple_exon_alnmt, gene_list, [local_db, ensembl_db_name])
    
    return True
示例#23
0
def main():
	"""Build a tree with one leaf per species and print its nhx string."""
	db = connect_to_mysql(Config.mysql_conf_file)
	cursor = db.cursor()
	[all_species, ensembl_db_name] = get_species(cursor)

	# every species becomes a leaf; build() then works on those leaves
	tree = Tree()
	for species_name in all_species:
		tree.leafs.append(Node(species_name))
	tree.build(cursor)

	# the string is set off by an empty line before and after
	print()
	print(tree.nhx_string())
	print()

	cursor.close()
	db.close()
示例#24
0
def main():
    
    db     = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)
    
    tree   = Tree()
    for species in all_species:
        leaf = Node(species)
        tree.leafs.append(leaf)

    tree.build(cursor)

    print
    print tree.nhx_string()
    print
    
    cursor.close()
    db.close()
示例#25
0
def main():

    no_threads = 1
    special    = ''
    db     = connect_to_mysql()
    cfg    = ConfigurationReader()

    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)

    species = ''
    if len(sys.argv) > 1 and  len(sys.argv)<3  or len(sys.argv) >= 2 and sys.argv[1]=="-h":
        print "usage: %s <set name/species> <number of processes>" % sys.argv[0]
        exit(1) # after usage statement
    elif len(sys.argv)==3:
        special = sys.argv[1].lower()
        if special == 'none': 
            special = None
        elif special in all_species:
            species = special
        no_threads = int(sys.argv[2])
        
    print '======================================='
    print sys.argv[0]
    if species:
        print species, "only"
        switch_to_db (cursor, ensembl_db_name[species])
        if (species=='homo_sapiens'):
            gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1, ref_only=True)
        else:
            gene_ids = get_gene_ids (cursor, biotype='protein_coding')
        parallelize_args = [no_threads, one_species_all_genes_loop, gene_ids,  [local_db, ensembl_db_name, species]]
    elif special:
        print "using", special, "set"
        gene_list = get_theme_ids (cursor,  ensembl_db_name, cfg, special )
        parallelize_args = [no_threads, ortologues_for_given_genes_loop, gene_list,  [local_db, ensembl_db_name]]
    else:
        parallelize_args = [no_threads, all_species_all_genes_loop, all_species, [local_db, ensembl_db_name]]
        
    cursor.close()
    db    .close()

    parallelize (*parallelize_args)
def make_alignments(species_list, db_info):

    [local_db, ensembl_db_name] = db_info

    verbose = False
    flank_length = 10

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()

    # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species(cursor)

    max_days = 60

    for species in species_list:

        if species == "homo_sapiens":
            species_shorthand = "HSA"
        else:
            species_shorthand = get_species_shorthand(cursor, species)
        print species, species_shorthand

        directory = check_directory(cfg, species, species_shorthand, "pep")
        if not directory:
            continue

        removed = 0
        remaining = 0
        for dirname, dirnames, filenames in os.walk(directory):
            for filename in filenames:
                full_name = os.path.join(dirname, filename)
                time_modified = os.path.getmtime(full_name)
                number_of_days_since_modified = (time.time() - time_modified) / (60 * 60 * 24)
                if number_of_days_since_modified > max_days:
                    # print "removing", filename, "made", number_of_days_since_modified, "ago"
                    os.remove(full_name)
                else:
                    remaining += 1
        print species, "done, removed", removed, "files, remaining", remaining
def dump_orthos (species_list, db_info):

    
    [local_db, ensembl_db_name] = db_info
    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    cursor = db.cursor()

     # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species (cursor)

    # in the afa headers use 'trivial' names for the species: cow, dog, pig, ...
    trivial_name   = translate_to_trivial(cursor, all_species)

    out_path = cfg.get_path('afs_dumps')
    outfile  = "{0}/orthologue_dump.txt".format(out_path)
    print outfile
    of       = erropen (outfile,"w")

    species  = 'homo_sapiens'
    switch_to_db (cursor,  ensembl_db_name[species])


    qry = "select * from orthologue"
    rows = search_db (cursor, qry)
    for row in rows:
        [pair_id, human_gene_id, cognate_gene_id, genome_db_id, source] =  row
        species = genome_db_id2species (cursor, genome_db_id)
        switch_to_db (cursor,  ensembl_db_name['homo_sapiens'])
        human_stable_id = gene2stable(cursor, human_gene_id)
        switch_to_db (cursor,  ensembl_db_name[species])
        cognate_stable_id = gene2stable(cursor, cognate_gene_id)
        print  >>of,  orthos_tabstring ([human_stable_id, cognate_stable_id, species, trivial_name[species]])


    of.close()
    
    cursor.close()
    db    .close()
def main():

    db_name = "exolocator_db"
    db      = connect_to_mysql(user="******", passwd="tooiram")
    cursor  = db.cursor()
    switch_to_db (cursor, db_name)

    cfg      = ConfigurationReader (user="******", passwd="tooiram", check=False)
    in_path  = cfg.get_path('afs_dumps')
    in_path += "/para_dump"
    if (not os.path.exists(in_path)):
        print in_path, "not found"
        sys.exit(1) # exit on non-existent outdir

    
    ###############
    if 1:
        qry = "drop table paralog"
        search_db (cursor, qry)
        qry = "create table paralog (id int(10) primary key auto_increment) "
        search_db (cursor, qry)
        qry = "alter table paralog  ADD gene_id1 varchar(30) " 
        search_db (cursor, qry)
        qry = "alter table paralog  ADD gene_id2 varchar(30) " 
        search_db (cursor, qry)
        create_index (cursor, db_name,'gene_id_index', 'paralog', ['gene_id1', 'gene_id2'])
        

    ###############
    os.chdir(in_path)
    filenames = glob.glob("*_para_dump.txt")

    ###############
    for infile in filenames:
        print infile
        store(cursor, infile)

    cursor.close()
    db    .close()
def make_alignments (species_list, db_info):
    """Delete the files older than 60 days from the "pep" directory of each species.

    species_list -- names of the species to process
    db_info      -- [local_db, ensembl_db_name]; both are unpacked, but
                    ensembl_db_name is replaced by what get_species()
                    returns, and neither is used below

    The directory of a species is obtained from check_directory(); a species
    for which it returns nothing is skipped.  The directory is walked
    recursively, and the age of a file is measured from its modification
    time.  The number of removed and of remaining files is printed per
    species.
    """
    [local_db, ensembl_db_name] = db_info

    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    # not used below; kept in case the constructor has side effects -
    # TODO confirm and drop
    acg    = AlignmentCommandGenerator()
    cursor = db.cursor()

    max_days        = 60
    seconds_per_day = 60*60*24

    try:
        # find db ids and common names for each species db
        [all_species, ensembl_db_name] = get_species (cursor)

        for species in species_list:

            species_shorthand = get_species_shorthand(cursor, species)
            print(species, species_shorthand)

            directory = check_directory (cfg, species, species_shorthand, "pep")
            if not directory: continue

            removed   = 0
            remaining = 0
            for dirname, dirnames, filenames in os.walk(directory):
                for filename in filenames:
                    full_name   = os.path.join(dirname, filename)
                    age_in_days = (time.time() - os.path.getmtime(full_name))/seconds_per_day
                    if age_in_days > max_days:
                        os.remove(full_name)
                        removed += 1
                    else:
                        remaining += 1
            print(species, "done, removed", removed, "files, remaining", remaining)
    finally:
        cursor.close()
        db.close()
示例#30
0
def main():

    db_name = "exolocator_db"
    db = connect_to_mysql(user="******", passwd="tooiram")
    cursor = db.cursor()
    switch_to_db(cursor, db_name)

    cfg = ConfigurationReader(user="******", passwd="tooiram", check=False)
    in_path = cfg.get_path('afs_dumps')
    in_path += "/para_dump"
    if (not os.path.exists(in_path)):
        print in_path, "not found"
        sys.exit(1)  # exit on non-existent outdir

    ###############
    if 1:
        qry = "drop table paralog"
        search_db(cursor, qry)
        qry = "create table paralog (id int(10) primary key auto_increment) "
        search_db(cursor, qry)
        qry = "alter table paralog  ADD gene_id1 varchar(30) "
        search_db(cursor, qry)
        qry = "alter table paralog  ADD gene_id2 varchar(30) "
        search_db(cursor, qry)
        create_index(cursor, db_name, 'gene_id_index', 'paralog',
                     ['gene_id1', 'gene_id2'])

    ###############
    os.chdir(in_path)
    filenames = glob.glob("*_para_dump.txt")

    ###############
    for infile in filenames:
        print infile
        store(cursor, infile)

    cursor.close()
    db.close()
示例#31
0
def one_species_all_genes_loop(gene_ids, db_info):
    [local_db, ensembl_db_name, species] = db_info
    db     = connect_to_mysql()
    cursor = db.cursor()
    
    switch_to_db (cursor, ensembl_db_name[species])
    #for gene_id in [10092907]:
    for gene_id in gene_ids:
        # for all exons in the gene
        exons = gene2exon_list(cursor, gene_id)
        if (not exons):
            if verbose: print 'no exons for gene', gene_id
            continue
        ####################################
        pep_seqs(cursor, gene_id, exons)
        ####################################
        if not gene_ids.index(gene_id) % 100:
            print "\t done with  %d out of %d (%5.1f%%) " % (gene_ids.index(gene_id) + 1, len(gene_ids), 
                                             100 * (float(gene_ids.index(gene_id) + 1) / len(gene_ids)))
            sys.stdout.flush()      
                                      
    cursor.close()
    db.close()
def main():

    db     = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)

    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

    magical_list = ['APC', 'BUB1', 'BUB1B', 'BUB3', 'C11orf51', 
                    'CDC20', 'CDC27', 'CENPF', 'TERF1', 'TPR', 
                    'TTK', 'UBE2C', 'UBE2D1', 'UBE2E1', 'TP53', 'BCL',
                    ' RAS', ' MIC ', 'actin']
    for gene_name in magical_list:
        description = ""
        gene_id = gene_name2gene_id(cursor, gene_name)
        if (not gene_id): 
            [gene_id, description] = search_description (cursor, gene_name)
        if (not gene_id): continue

        print gene_name, " ** ",  gene_id, description


    cursor.close()
    db    .close()
def main():
    """Look up the human gene id for each name in a fixed list and print what is found.

    gene_name2gene_id() is tried first and search_description() second; the
    latter also provides a description.  Nothing is printed for a name that
    neither lookup resolves.
    """
    db = connect_to_mysql()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

    magical_list = [
        'APC', 'BUB1', 'BUB1B', 'BUB3', 'C11orf51', 'CDC20', 'CDC27', 'CENPF',
        'TERF1', 'TPR', 'TTK', 'UBE2C', 'UBE2D1', 'UBE2E1', 'TP53', 'BCL',
        ' RAS', ' MIC ', 'actin'
    ]
    for gene_name in magical_list:
        description = ""
        gene_id = gene_name2gene_id(cursor, gene_name)
        if not gene_id:
            # second attempt: search by description
            [gene_id, description] = search_description(cursor, gene_name)
            if not gene_id:
                continue

        print(gene_name, " ** ", gene_id, description)

    cursor.close()
    db.close()
示例#34
0
def main():
    """Report known protein coding genes that have no canonical coding exons.

    For each gene id returned by get_gene_ids() the stable id is printed;
    genes without a canonical translation are reported and skipped.  For the
    remaining genes the canonical coding exons are looked up, and a gene
    without any is reported together with the running counts.
    """
    local_db = False

    db     = connect_to_mysql()
    acg    = AlignmentCommandGenerator()

    cursor = db.cursor()

    try:
        [all_species, ensembl_db_name] = get_species (cursor)

        species = 'homo_sapiens'

        # NOTE(review): the cursor is not switched to the human database
        # before this query - confirm that get_species() leaves it there
        gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)

        ct  = 0  # genes for which no canonical coding exons were found
        tot = 0  # genes looked at so far
        for gene_id in gene_ids:
            tot += 1

            print(gene2stable (cursor, gene_id = gene_id), end=' ')

            # what is the length of the canonical transcript according to Ensembl
            canonical_translation = get_canonical_transl (acg, cursor, gene_id, species, strip_X=False)
            if not canonical_translation:
                print("no canonical transl found for ", gene2stable (cursor, gene_id = gene_id))
                continue

            # find all canonical coding exons associated with the gene id
            exons = get_canonical_coding_exons (cursor, gene_id)
            if not exons:
                ct +=1
                print(gene_id, gene2stable (cursor, gene_id = gene_id), " no exons found ", ct, tot)

            # NOTE(review): this stops the script after the first gene that
            # has a canonical translation - looks like a debugging leftover;
            # confirm before removing
            exit(1)
    finally:
        cursor.close()
        db.close()
示例#35
0
def main():

    local_db = False

    db     = connect_to_mysql()
    acg    = AlignmentCommandGenerator()

    cursor = db.cursor()
    
    [all_species, ensembl_db_name] = get_species (cursor)
    
    species = 'homo_sapiens'

    gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)

    for gene_id in gene_ids:
        
        print gene2stable (cursor, gene_id = gene_id),

        # what is the length of the canonical transcript according to Ensembl
        canonical_translation = get_canonical_transl (acg, cursor, gene_id, species, strip_X=False)
        if ( not canonical_translation):
            print "no canonical transl found for ", gene2stable (cursor, gene_id = gene_id)
            continue

        # find all canonical coding exons associated with the gene id
        exons = get_canonical_coding_exons (cursor, gene_id)
        if (not exons):
            ct +=1
            print gene_id, gene2stable (cursor, gene_id = gene_id), " no exons found ", ct, tot

        exit(1)


    cursor.close()
    db.close()
def main():


    no_threads = 1
    special    = None

    if len(sys.argv) > 1 and  len(sys.argv)<3:
        print "usage: %s <set name> <number of threads> " % sys.argv[0]
        exit(1)
    elif len(sys.argv)==3:

        special = sys.argv[1]
        special = special.lower()
        if special == 'none': special = None

        no_threads = int(sys.argv[2])

    db  = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species (cursor)
    species                        = 'homo_sapiens'
    switch_to_db (cursor,  ensembl_db_name[species])

    if special:
        print "using", special, "set"
        gene_list = get_theme_ids (cursor,  ensembl_db_name, cfg, special )
    else:
        print "using all protein coding genes"
        switch_to_db (cursor,  ensembl_db_name['homo_sapiens'])
        gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1)
        
    incomplete = 0
    genes_checked = 0
    #for gene_id in gene_list: 
    #for gene_id in [743609]: 
    for sampling_count in range(1000):
 
        gene_id = choice(gene_list)
        genes_checked += 1
        with_map = 0
        tot      = 0
        switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
        print  gene2stable(cursor, gene_id), get_description (cursor, gene_id)

        # find all exons we are tracking in the database
        human_exons = gene2exon_list(cursor, gene_id)
        human_exons.sort(key=lambda exon: exon.start_in_gene)
        has_a_map = False
        for human_exon in human_exons:
            if (not human_exon.is_canonical or  not human_exon.is_coding): continue
            if verbose:
                print  
                print "\t human",   human_exon.exon_id,  human_exon.is_known
                print "\t ", get_exon_pepseq(cursor, human_exon, ensembl_db_name['homo_sapiens'])
                print "\t checking maps ..."
            maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known)
            tot += 1
            if maps:
                has_a_map = True
                with_map += 1
                #print "ok"
            else:
                print"no maps for exon", human_exon.exon_id
                continue
            if verbose:
                for map in maps:
                    species            = map.species_2
                    exon               = map2exon(cursor, ensembl_db_name, map)
                    unaligned_sequence = get_exon_pepseq(cursor, exon, ensembl_db_name[species])
                    if ( map.similarity):
                        print "\t", species,  map.source, map.exon_id_2, map.exon_known_2
                        print "\tmaps to ",  map.exon_id_1, map.exon_known_1
                        print "\tsim",  map.similarity,
                        print "\tsource",  map.source
                        print "\t", unaligned_sequence
                        if not map.bitmap:
                            print "\t bitmap not assigned"
                        else:
                            bs = Bits(bytes=map.bitmap)
                            reconst_pepseq = ''
                            if (not bs.count(1) == len(unaligned_sequence)): 
                                print "\talnd seq mismatch"
                            
                            else:
                                usi = iter(unaligned_sequence)
                                for c in bs.bin:
                                    if c == '0': reconst_pepseq += '-'
                                    else:        reconst_pepseq += next(usi)
                                print "\tbinary   : ", bs.bin
                                print "\talnd seq: ", reconst_pepseq
                        print
        if not tot== with_map:
            print "####  gene id: %d   total exons: %d     with map:  %d   ( = %d%%) " % \
                (gene_id,  tot,  with_map, int(float(with_map)/tot*100) )
            incomplete += 1

    print "genes checked: %d,  incomplete: %d"  %  (genes_checked, incomplete)
    cursor.close()
    db.close()

    print tot, with_map
def main():

    if (len(sys.argv) < 2):
        print "Usage: %s  <stable gene id> [<exon1> <exon2> ... ]" % sys.argv[0]
        exit(1)

    stable_id = sys.argv[1]
    species = 'homo_sapiens'
    selected_exons = sys.argv[2:]

    db = connect_to_mysql()
    cursor = db.cursor()

    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    print species, stable_id, ensembl_db_name[species]

    switch_to_db(cursor, ensembl_db_name[species])
    gene_id = stable2gene(cursor, stable_id)

    print get_description(cursor, gene_id)
    print "gene id:", gene_id

    # find all exons we are tracking in the database
    human_exons = gene2exon_list(cursor, gene_id)
    canonical_human_exons = []
    for human_exon in human_exons:
        if not human_exon.is_canonical or not human_exon.is_coding:
            continue
        canonical_human_exons.append(human_exon)

    # the exons are not guaranteed to be in order
    canonical_human_exons.sort(key=lambda exon: exon.start_in_gene)

    print "exons:"
    for exon in canonical_human_exons:
        if selected_exons and not str(exon.exon_id) in selected_exons: continue
        switch_to_db(cursor, ensembl_db_name[species])
        exon_seqs = get_exon_seqs(cursor, exon.exon_id, 1)
        [
            exon_pep_seq, trsl_from, trsl_to, exon_left_flank,
            exon_right_flank, exon_dna_seq
        ] = exon_seqs[1:]
        print "exon:", exon.exon_id, "covering exon:", exon.covering_exon, "pepseq:", exon_pep_seq
        if not exon.covering_exon == -1:
            [
                exon_pep_seq_2, trsl_from, trsl_to, exon_left_flank,
                exon_right_flank, exon_dna_seq
            ] = get_exon_seqs(cursor, exon.covering_exon, 1)[1:]
            print "\t", exon.covering_exon, " seq:", exon_pep_seq_2

        if 1:
            print
            print 'exon_alignments:'

            maps = get_maps(cursor, ensembl_db_name, exon.exon_id,
                            exon.is_known)
            if not maps:
                print "no maps for exon", exon.exon_id
            else:
                for map in maps:
                    species_2 = map.species_2
                    exon_2 = map2exon(cursor, ensembl_db_name, map)
                    unaligned_sequence = get_exon_pepseq(
                        cursor, exon_2, ensembl_db_name[species_2])
                    if (map.similarity):
                        print "\t", species_2, map.source, map.exon_id_2, map.exon_known_2
                        print "\tmaps to ", map.exon_id_1, map.exon_known_1
                        print "\tsim", map.similarity,
                        print "\tsource", map.source
                        print "\t", unaligned_sequence
                        if not map.bitmap:
                            print "\t bitmap not assigned"
                        else:
                            bs = Bits(bytes=map.bitmap)
                            reconst_pepseq = ''
                            if (not bs.count(1) == len(unaligned_sequence)):
                                print "\talnd seq mismatch"

                            else:
                                usi = iter(unaligned_sequence)
                                for c in bs.bin:
                                    if c == '0': reconst_pepseq += '-'
                                    else: reconst_pepseq += next(usi)
                                print "\tbinary   : ", bs.bin
                                print "\talnd seq: ", reconst_pepseq
                        print

    cursor.close()
    db.close()
def main():

    no_threads = 1
    special = None

    if len(sys.argv) > 1 and len(sys.argv) < 3:
        print "usage: %s <set name> <number of threads> " % sys.argv[0]
        exit(1)
    elif len(sys.argv) == 3:

        special = sys.argv[1]
        special = special.lower()
        if special == 'none': special = None

        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species(cursor)
    species = 'homo_sapiens'
    switch_to_db(cursor, ensembl_db_name[species])

    if special:
        print "using", special, "set"
        gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
    else:
        print "using all protein coding genes"
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

    incomplete = 0
    genes_checked = 0
    #for gene_id in gene_list:
    #for gene_id in [743609]:
    for sampling_count in range(1000):

        gene_id = choice(gene_list)
        genes_checked += 1
        with_map = 0
        tot = 0
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        print gene2stable(cursor, gene_id), get_description(cursor, gene_id)

        # find all exons we are tracking in the database
        human_exons = gene2exon_list(cursor, gene_id)
        human_exons.sort(key=lambda exon: exon.start_in_gene)
        has_a_map = False
        for human_exon in human_exons:
            if (not human_exon.is_canonical or not human_exon.is_coding):
                continue
            if verbose:
                print
                print "\t human", human_exon.exon_id, human_exon.is_known
                print "\t ", get_exon_pepseq(cursor, human_exon,
                                             ensembl_db_name['homo_sapiens'])
                print "\t checking maps ..."
            maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id,
                            human_exon.is_known)
            tot += 1
            if maps:
                has_a_map = True
                with_map += 1
                #print "ok"
            else:
                print "no maps for exon", human_exon.exon_id
                continue
            if verbose:
                for map in maps:
                    species = map.species_2
                    exon = map2exon(cursor, ensembl_db_name, map)
                    unaligned_sequence = get_exon_pepseq(
                        cursor, exon, ensembl_db_name[species])
                    if (map.similarity):
                        print "\t", species, map.source, map.exon_id_2, map.exon_known_2
                        print "\tmaps to ", map.exon_id_1, map.exon_known_1
                        print "\tsim", map.similarity,
                        print "\tsource", map.source
                        print "\t", unaligned_sequence
                        if not map.bitmap:
                            print "\t bitmap not assigned"
                        else:
                            bs = Bits(bytes=map.bitmap)
                            reconst_pepseq = ''
                            if (not bs.count(1) == len(unaligned_sequence)):
                                print "\talnd seq mismatch"

                            else:
                                usi = iter(unaligned_sequence)
                                for c in bs.bin:
                                    if c == '0': reconst_pepseq += '-'
                                    else: reconst_pepseq += next(usi)
                                print "\tbinary   : ", bs.bin
                                print "\talnd seq: ", reconst_pepseq
                        print
        if not tot == with_map:
            print "####  gene id: %d   total exons: %d     with map:  %d   ( = %d%%) " % \
                (gene_id,  tot,  with_map, int(float(with_map)/tot*100) )
            incomplete += 1

    print "genes checked: %d,  incomplete: %d" % (genes_checked, incomplete)
    cursor.close()
    db.close()

    print tot, with_map
def multiple_exon_alnmt(species_list, db_info):


    [local_db, ensembl_db_name] = db_info

    verbose  = False

    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    acg    = AlignmentCommandGenerator()
    cursor = db.cursor()


    for species in species_list:

        print
        print "############################"
        print  species

        switch_to_db (cursor,  ensembl_db_name[species])
        gene_ids = get_gene_ids (cursor, biotype='protein_coding')
        #gene_ids = get_theme_ids(cursor, cfg, 'wnt_pathway')
        if not gene_ids:
            print "no gene_ids"
            continue


        gene_ct       = 0
        tot           = 0
        ok            = 0
        no_maps       = 0
        no_pepseq     = 0
        no_paralogues = 0
        for gene_id in gene_ids:

            if verbose: start = time()
            gene_ct += 1
            if not gene_ct%100: print species, gene_ct, "genes out of", len(gene_ids)
            if verbose: 
                print
                print gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id)

            # get the paralogues - only the representative for  the family will have this 
            paralogues = get_paras (cursor, gene_id)  
            if not paralogues:
                if verbose:  print "\t not a template or no paralogues"
                continue

            if verbose:  print "paralogues: ", paralogues

            # get _all_ exons
            template_exons = gene2exon_list(cursor, gene_id)
            if (not template_exons):
                if verbose: print 'no exons for ', gene_id
                continue

            # find all template  exons we are tracking in the database
            for template_exon in template_exons:

                if verbose: print template_exon.exon_id
                maps = get_maps(cursor, ensembl_db_name, template_exon.exon_id,
                                template_exon.is_known, species=species, table='para_exon_map')

                if not maps:
                    no_maps += 1
                    continue

                # output to fasta:
                seqname        = "{0}:{1}:{2}".format('template', template_exon.exon_id, template_exon.is_known)
                exon_seqs_info =  get_exon_seqs (cursor, template_exon.exon_id, template_exon.is_known)
                if not exon_seqs_info: continue
                [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, 
                 left_flank, right_flank, dna_seq] = exon_seqs_info
                if (not pepseq):
                    if ( template_exon.is_coding and  template_exon.covering_exon <0): # this should be a master exon
                        print "no pep seq for",  template_exon.exon_id, "coding ", template_exon.is_coding,
                        print "canonical: ",  template_exon.is_canonical
                        print "length of dna ", len(dna_seq)
                        no_pepseq += 1
                    continue
                
                tot += 1

                sequences = {seqname:pepseq}
                headers   = [seqname]
                for map in maps:
                    exon    = map2exon(cursor, ensembl_db_name, map, paralogue=True)
                    pepseq  = get_exon_pepseq (cursor,exon)
                    if (not pepseq):
                        continue
                    seqname = "{0}:{1}:{2}".format('para', map.exon_id_2, map.exon_known_2)
                    headers.append(seqname)
                    sequences[seqname] = pepseq

                fasta_fnm = "{0}/{1}_{2}_{3}.fa".format( cfg.dir_path['scratch'], species, template_exon.exon_id, template_exon.is_known)
                output_fasta (fasta_fnm, headers, sequences)

                if (len(headers) <=1 ):
                    print "single species in the alignment (?)"
                    no_paralogues += 1
                    continue

                # align
                afa_fnm  = "{0}/{1}_{2}_{3}.afa".format( cfg.dir_path['scratch'], species, template_exon.exon_id, template_exon.is_known)
                mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm)
                ret      = commands.getoutput(mafftcmd)

                # read in the alignment
                inf = erropen(afa_fnm, "r")
                if not inf:
                    print gene_id
                    continue
                template_seq_seen = False
                for record in SeqIO.parse(inf, "fasta"):
                    ### store the alignment as bitstring
                    # Generate the bitmap
                    bs         = Bits(bin='0b' + re.sub("[^0]","1", str(record.seq).replace('-','0')))
                    msa_bitmap = bs.tobytes()
                    # Retrieve information on the cognate
                    label, cognate_exon_id, cognate_exon_known = record.id.split(':')
                    if (label == 'template'):
                        template_seq_seen = True
                    # Write the bitmap to the database
                    #print "updating: ", template_exon.exon_id
                    store_or_update(cursor, "para_exon_map", {"cognate_exon_id"    :cognate_exon_id,
                                                         "cognate_exon_known" :cognate_exon_known,
                                                         "exon_id"            :template_exon.exon_id,
                                                         "exon_known"         :template_exon.is_known},
                                    {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)})
                inf.close()
                ok += 1
                commands.getoutput("rm "+afa_fnm+" "+fasta_fnm)
            if verbose: print " time: %8.3f\n" % (time()-start);
 
        outstr  =  species + " done \n"
        outstr +=  "tot: %d   ok: %d  \n" % (tot,  ok)
        outstr +=  "no maps       %d  \n" % no_pepseq
        outstr +=  "no pepseq     %d  \n" % no_pepseq
        outstr +=  "no paralogues %d  \n" % no_paralogues
        outstr += "\n"
        print outstr
def main():

    db = connect_to_mysql()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species(cursor)

    if len(sys.argv) > 1:
        species_list = sys.argv[1:]
    else:
        species_list = all_species

    ############################
    for species in species_list:
        print
        print "############################"
        print species

        switch_to_db(cursor, ensembl_db_name[species])

        if (species == 'homo_sapiens'):
            gene_ids = get_gene_ids(cursor,
                                    biotype='protein_coding',
                                    is_known=1)
        else:
            gene_ids = get_gene_ids(cursor, biotype='protein_coding')

        ct = 0
        tot = 0

        for tot in range(1000):
            #for gene_id in gene_ids:
            #tot += 1
            gene_id = choice(gene_ids)
            # find all canonical coding exons associated with the gene id
            exons = get_canonical_coding_exons(cursor, gene_id)
            if (not exons):
                ct += 1
                print gene_id, gene2stable(
                    cursor, gene_id=gene_id), " no exons found ", ct, tot

            if not tot % 100:
                print species, tot, ct

            # add up the coding length of the canonical exons
            exons.sort(key=lambda exon: exon.start_in_gene)

            inside_the_coding_range = False
            start_properly_marked = False
            length = 0
            for exon in exons:

                if not exon.canon_transl_start is None:
                    start_properly_marked = True  # if it is not propermy marked, we'll never start reading
                    inside_the_coding_range = True
                    length -= exon.canon_transl_start - 1

                if not exon.canon_transl_end is None:
                    inside_the_coding_range = False
                    length += exon.canon_transl_end

                if inside_the_coding_range:
                    length += exon.end_in_gene - exon.start_in_gene + 1

            # take that all exons are coding full length if there is no start and end annotation
            # (this I believe is the case for predicted transcripts)
            if not start_properly_marked:
                length = 0
                for exon in exons:
                    length += exon.end_in_gene - exon.start_in_gene + 1

            if (not length):
                print gene2stable(
                    cursor, gene_id=gene_id), " no exons marked as canonical"
                continue

            # what is the length of the canonical transcript according to Ensembl
            canonical_translation = get_canonical_transl(acg,
                                                         cursor,
                                                         gene_id,
                                                         species,
                                                         strip_X=False)
            if (not canonical_translation):
                print "no canonical transl found for ", gene_id
                continue

            if (abs(length / 3 - len(canonical_translation)) > 3):
                ct += 1
                print gene_id, gene2stable(cursor, gene_id), get_description(
                    cursor, gene_id)
                print "(length of all exons)/3 ", length / 3,
                print " does not match reported canonical transl len ", len(
                    canonical_translation)
                if False:
                    # print out all exons
                    print "exons:"
                    inspect(exons)
                    print
                    print 'canonical sequence'
                    print re.sub(
                        "(.{50})", "\\1\n", canonical_translation
                    )  # print canonical sequence with \n stuck in every 50 positions
                    print
                    # print out exons more carefully filtered to belong to the canonical version of the translation
                    print
                    get_translated_region_talkative(cursor, gene_id, species)
                    all_exons = gene2exon_list(cursor, gene_id)
                    print "all exons:"
                    inspect(all_exons)
                    print
                    compare_seqs(canonical_translation,
                                 translated_seq,
                                 verbose=False)
                    exit(1)

        print species, "checked a sample of ", tot + 1, "genes;  problematic:", ct

    cursor.close()
    db.close()
    #
    #    print 'Note: some problems could not have be resolved up to this point,'
    #    print 'becasue we have not really looged at the exons seqs yet.'
    #    print 'For example, for MP furo the, start fo the cannonical translation'
    #    print 'is sometimes given in the middle of NNNNN region.'
    #
    return True
def main():

    db     = connect_to_mysql()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species (cursor)    

    for species in all_species:

        if not species=='homo_sapiens': continue

        print
        print species

        switch_to_db (cursor,  ensembl_db_name[species])

        if (species=='homo_sapiens'):
            gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)
        else:
            gene_ids = get_gene_ids (cursor, biotype='protein_coding')

        tot_exons   = 0
        no_exon_seq = 0
        short_dna   = 0
        pepseq_ok   = 0
        mismatch    = 0
        stored_incorrect = 0
        translation_fail = 0
        #####################################
        #for gene_id in [10092907]:
        for gene_id in gene_ids:
        #for tot in range(1000):
            #gene_id = choice(gene_ids)

            # get _all_ exons
            exons = gene2exon_list(cursor, gene_id)
            if (not exons):
                print 'no exons for gene', gene_id
                sys.exit(1)

            for exon in exons:

                #####################################                
                if not exon.is_coding:
                    print exon.exon_id, " not coding "
                    continue
                if exon.covering_exon >0:
                    print exon.exon_id, " is covered by ", exon.covering_exon 
                    continue
                    

                tot_exons += 1
                # exons seqs are its aa translation, left_flank, right_flank, and dna_seq
                exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
                if (not exon_seqs):
                    no_exon_seq += 1
                    print "no exon seqs for  ", gene_id, exon.exon_id
                    #exit(1)
                    continue                   

                [exon_seq_id, pepseq, pepseq_transl_start, 
                 pepseq_transl_end, left_flank, right_flank, dna_seq] = exon_seqs

                if len(dna_seq)<3:
                    short_dna += 1
                    print "short_dna:", dna_seq
                    continue

                if (pepseq_transl_start == -10): # ??? what is this shit? adn what happens downstream if the pepseq_transl_start is None?
                    translation_fail += 1
                    print "pepseq_transl_start:", pepseq_transl_start
                    continue

                mitochondrial        = is_mitochondrial(cursor, gene_id)
                dnaseq  = Seq (dna_seq[pepseq_transl_start:pepseq_transl_end], generic_dna)
                if (mitochondrial):
                    pepseq2 = dnaseq.translate(table="Vertebrate Mitochondrial").tostring()
                else:
                    pepseq2 = dnaseq.translate().tostring()

                if True:
                    print exon.exon_id
                    print "pep stored:", pepseq
                    print "dna transl:", pepseq2
                    print "dna begin:", dna_seq[:12]
                    print "start:" , pepseq_transl_start, 
                    print "end:",  pepseq_transl_end
                    print

                if (not pepseq == pepseq2):
                    stored_incorrect += 1
                else:
                    pepseq_ok += 1

        print "total coding exons ", tot_exons
        print "no exon seq info   ", no_exon_seq
        print "short dna          ", short_dna
        print "transl failure     ", translation_fail
        print "stored pepseq does not correspond to the translation of stored dna:   ", stored_incorrect
        print "pepseq ok          ", pepseq_ok

    cursor.close()
    db    .close()
def main():
    
    special    = None
    no_threads = 1
    db  = connect_to_mysql()
    cfg = ConfigurationReader()

    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)

    if special:
        print "using", special, "set"
        gene_list = get_theme_ids (cursor,  ensembl_db_name, cfg, special)
    else:
        print "using all protein coding genes"
        switch_to_db (cursor,  ensembl_db_name['homo_sapiens'])
        gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1)

    # loop over all genes
    sw_count = 0
    tot_count = 0
    for human_gene_id in gene_list:
        
        switch_to_db (cursor,  ensembl_db_name['homo_sapiens'])
 	human_stable      = gene2stable    (cursor, human_gene_id)
        human_description = get_description(cursor, human_gene_id)
        tot_count += 1
	#print human_gene_id, human_stable, human_description
   
  	human_exons = [e for e in gene2exon_list(cursor, human_gene_id, verbose=True) 
                       if e.covering_exon < 0 and e.is_canonical and e.is_known]
        if not human_exons: 
            #print "\t\t", human_stable, "no exons found"
            continue

	human_exons.sort(key=lambda exon: exon.start_in_gene)
        # loop over all exons in this gene
        maps_for_exon = {}
        for he in human_exons:
            he.stable_id = exon2stable (cursor, he.exon_id, ensembl_db_name['homo_sapiens'])
            he.pepseq = get_exon_pepseq (cursor, he,  ensembl_db_name['homo_sapiens'])
            # maps cleanup: get rid of maps that have "none" as similarity

            maps_for_exon[he] =  get_maps(cursor, ensembl_db_name, he.exon_id, he.is_known) # exon data
            if not maps_for_exon[he]: continue

            #maps_for_exon[he] = filter (lambda m: m.source == 'sw_sharp' or m.source == 'usearch', 
            #                            maps_for_exon[he])
            maps_for_exon[he] = filter (lambda m: m.source == 'usearch', 
                                        maps_for_exon[he])

            if not maps_for_exon[he]: 
                #print "\t\t", human_stable,  "no maps found"
                continue

            sw_count += len(maps_for_exon[he])
            #break

        print "tot count: ", tot_count
        print "sw count: ", sw_count


    #print "tot count: ", tot_count
    #print "sw count: ", sw_count
    
    cursor.close()
    db.close()
def multiple_exon_alnmt(gene_list, db_info):


    print "process pid: %d, length of gene list: %d" % ( get_process_id(), len(gene_list))

    [local_db, ensembl_db_name] = db_info

    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    acg    = AlignmentCommandGenerator()
    cursor = db.cursor()

    # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species (cursor)
    

    species  = 'homo_sapiens'
    switch_to_db (cursor,  ensembl_db_name[species])
    gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)

    # for each human gene
    gene_ct = 0
    tot  = 0
    ok   = 0
    no_maps        = 0
    no_pepseq      = 0
    no_orthologues = 0
    min_similarity = cfg.get_value('min_accptbl_exon_sim')

    #gene_list.reverse()
    for gene_id in gene_list:

        start = time()
        gene_ct += 1
        if  not gene_ct%10: print gene_ct, "genes out of", len(gene_list)

        switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
        print gene_ct, len(gene_ids),  gene_id,  gene2stable(cursor, gene_id), get_description (cursor, gene_id)

        human_exons = filter (lambda e: e.is_known==1 and e.is_coding and e.covering_exon<0, gene2exon_list(cursor, gene_id))
        human_exons.sort(key=lambda exon: exon.start_in_gene)

        ##################################################################
        for human_exon in human_exons:
            
            tot += 1

            # find all orthologous exons the human exon  maps to
            maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known)
            if verbose: 
                print "\texon no.", tot, " id", human_exon.exon_id,
                if not maps: 
                    print " no maps"
                    print human_exon
                print 
            if not maps: 
                no_maps += 1
                continue

  
            # human sequence to fasta:
            seqname   = "{0}:{1}:{2}".format('homo_sapiens', human_exon.exon_id, human_exon.is_known)
            switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
            [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, 
             left_flank, right_flank, dna_seq] = get_exon_seqs (cursor, human_exon.exon_id, human_exon.is_known)
            if (not pepseq):
                if verbose and  human_exon.is_coding and  human_exon.covering_exon <0: # this should be a master exon
                    print "no pep seq for",  human_exon.exon_id, "coding ", human_exon.is_coding,
                    print "canonical: ",  human_exon.is_canonical
                    print "length of dna ", len(dna_seq)
                no_pepseq += 1
                continue

            # collect seq from all maps, and output them in fasta format
            hassw = False
            headers   = []
            sequences = {}
            exons_per_species = {}

            for map in maps:

                switch_to_db (cursor, ensembl_db_name[map.species_2])
                if map.similarity < min_similarity: continue
                exon    = map2exon(cursor, ensembl_db_name, map)
                pepseq  = get_exon_pepseq (cursor,exon)
                if (not pepseq):
                    continue
                if  map.source == 'sw_sharp':
                    exon_known_code = 2
                    hassw = True
                elif  map.source == 'usearch':
                    exon_known_code = 3
                    hassw = True
                else:
                    exon_known_code = map.exon_known_2
                seqname = "{0}:{1}:{2}".format(map.species_2, map.exon_id_2, exon_known_code)
                headers.append(seqname)
                sequences[seqname] = pepseq
                # for split exon concatenation (see below)
                if not map.species_2 in exons_per_species.keys():
                    exons_per_species[map.species_2] = []
                exons_per_species[map.species_2].append ([ map.exon_id_2, exon_known_code]);
                
                    
            if (len(headers) <=1 ):
                if verbose: print "single species in the alignment"
                no_orthologues += 1
                continue
            
            # concatenate exons from the same gene - the alignment program might go wrong otherwise
            concatenated = concatenate_exons (cursor, ensembl_db_name, sequences, exons_per_species)

            fasta_fnm = "{0}/{1}.fa".format( cfg.dir_path['scratch'], human_exon.exon_id)
            output_fasta (fasta_fnm, sequences.keys(), sequences)

            # align
            afa_fnm  = "{0}/{1}.afa".format( cfg.dir_path['scratch'], human_exon.exon_id)
            mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm)
            ret      = commands.getoutput(mafftcmd)

            if (verbose): print 'almt to', afa_fnm

            # read in the alignment 
            inf = erropen(afa_fnm, "r")
            aligned_seqs = {}
            for record in SeqIO.parse(inf, "fasta"):
                aligned_seqs[record.id] = str(record.seq)
            inf.close()
            # split back the concatenated exons
            if concatenated: split_concatenated_exons (aligned_seqs, concatenated)

            human_seq_seen = False
            for seq_name, sequence in aligned_seqs.iteritems():
                # if this is one of the concatenated seqs, split them back to two

                ### store the alignment as bitstring
                # Generate the bitmap
                bs         = Bits(bin='0b' + re.sub("[^0]","1", sequence.replace('-','0')))
                # The returned value of tobytes() will be padded at the end 
                # with between zero and seven 0 bits to make it byte aligned.
                # I will end up with something that looks like extra alignment gaps, that I'll have to return
                msa_bitmap = bs.tobytes() 
                # Retrieve information on the cognate
                cognate_species, cognate_exon_id, cognate_exon_known = seq_name.split(':')
                if cognate_exon_known == '2':
                    source = 'sw_sharp'
                elif cognate_exon_known == '3':
                    source = 'usearch'
                else:
                    source = 'ensembl'
                if (cognate_species == 'homo_sapiens'):
                    human_seq_seen = True
                cognate_genome_db_id = species2genome_db_id(cursor, cognate_species) # moves the cursor
                switch_to_db(cursor, ensembl_db_name['homo_sapiens']) # so move it back to h**o sapiens
                # Write the bitmap to the database
                #if (cognate_species == 'homo_sapiens'):
                if verbose: # and (source=='sw_sharp' or source=='usearch'):
                    print "storing"
                    print human_exon.exon_id, human_exon.is_known
                    print cognate_species, cognate_genome_db_id, cognate_exon_id, cognate_exon_known, source
                    print sequence
                    if not msa_bitmap:
                        print "no msa_bitmap"
                        continue
                store_or_update(cursor, "exon_map",    {"cognate_genome_db_id":cognate_genome_db_id,
                   "cognate_exon_id":cognate_exon_id   ,"cognate_exon_known"  :cognate_exon_known,
                   "source": source, "exon_id" :human_exon.exon_id, "exon_known":human_exon.is_known},
                  {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)})
                 
            ok += 1
            commands.getoutput("rm "+afa_fnm+" "+fasta_fnm)

        if verbose: print " time: %8.3f\n" % (time()-start);

    print "tot: ", tot, "ok: ", ok
    print "no maps ",   no_pepseq
    print "no pepseq ", no_pepseq
    print "no orthologues  ", no_orthologues
    print
def main():

    parameter = {}
    # in case I ever have to handle multiple versions of ensembl
    # (but for now I don't have enough space)
    # note though that there are functions in el_utils/mysql.py that assume
    # that whatever ensembl stuff is available to the mysql server corresponds to the same release 
    release_number = '76'
    parameter['ensembl_release_number'] = release_number
    parameter['blastp_e_value']         = "1.e-10" # it will be used as a string  when fmting the blastp cmd
    parameter['min_accptbl_exon_sim']   = 0.33333 #minimum acceptable exon similarity

    dir_path = {}
    dir_path['ensembl_fasta'] = '/mnt/ensembl-mirror/release-'+release_number+'/fasta'
    # local juggling of data from one database base to the other
    dir_path['afs_dumps']     = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/'
    dir_path['afs_dumps']    += 'ExoLocator/results/dumpster'
    dir_path['resources']     = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/'
    dir_path['resources']    += 'pypeworks/exolocator/resources'
    dir_path['scratch']       = '/tmp'
    dir_path['maxentscan']    = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/'
    dir_path['maxentscan']   += 'pypeworks/exolocator/pl_utils/maxentscan'

    util_path = {}
    util_path['mafft']    = '/usr/bin/mafft'
    util_path['blastall'] = '/usr/bin/blastall'
    util_path['fastacmd'] = '/usr/bin/fastacmd'
    util_path['sw#']      = '/usr/bin/swsharp'
    util_path['usearch']  = '/usr/bin/usearch'
    util_path['score3']   = dir_path['maxentscan'] + '/score3.pl'
    util_path['score5']   = dir_path['maxentscan'] + '/score5.pl'

    if 1:
        # check if the paths are functioning (at this point at least)
        for util in util_path.values():
            if (not os.path.exists(util)):
                print util, " not found "
                sys.exit (1)

        for dir in dir_path.values():
            if (not os.path.exists(dir)):
                print dir, " not found "
                sys.exit (1)
            if (not os.path.isdir (dir)):
                print dir, " is not a directory "
                sys.exit (1)
            
    db     = connect_to_mysql()
    cursor = db.cursor()


    #######################################################
    # check if the config db exists -- if not, make it
    db_name   = "exolocator_config"
    qry  = "show databases like'%s'" % db_name
    rows = search_db (cursor, qry)
    if (not rows):
        print db_name, "database not found"
        qry = "create database %s " % db_name
        rows = search_db (cursor, qry)
        if (rows):
            print "some problem creating the database ..."
            rows = search_db (cursor, qry, verbose = True)
    else:
        print db_name, "database found"

    qry = "use %s " % db_name
    search_db (cursor, qry)
        
    # make tables
    for table in ['util_path', 'dir_path', 'parameter']:
        if ( check_table_exists (cursor, db_name, table)):
            print table, " found in ", db_name
        else:
            print table, " not found in ", db_name
            make_table (cursor, table)
   
    # fill util, dir and path tables 
    fixed_fields  = {}
    update_fields = {}
    for [name, path] in util_path.iteritems():
        fixed_fields['name']  = name
        update_fields['path'] = path
        store_or_update (cursor, 'util_path', fixed_fields, update_fields)

    fixed_fields  = {}
    update_fields = {}
    for [name, path] in dir_path.iteritems():
        fixed_fields['name'] = name
        update_fields['path'] = path
        store_or_update (cursor, 'dir_path', fixed_fields, update_fields)

    fixed_fields  = {}
    update_fields = {}
    for [name, value] in parameter.iteritems():
        fixed_fields['name']  = name
        update_fields['value'] = value
        store_or_update (cursor, 'parameter', fixed_fields, update_fields)

    #######################################################
    # add trivial names to ncbi_taxonomy.names
    [all_species, ensembl_db_name] = get_species (cursor)
    feed_trivial_names (cursor, all_species)

    #######################################################
    # add species shorthands (used in ENS* names formation)
    # though we will not needed unit the paralogue alignment reconstruction point)
    feed_name_shorthands (cursor, all_species)

    cursor.close()
    db.close()
示例#45
0
def dump_exons(species_list, db_info):

    [local_db, ensembl_db_name] = db_info
    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    out_path = "{0}/exons".format(cfg.get_path('afs_dumps'))
    if not os.path.exists(out_path):
        print out_path, "not found"
        exit(1)  # exit on failed output dir check

    for species in species_list:
        #if (not species=='homo_sapiens'):
        #    continue
        outfile = "{0}/{1}_exon_dump.txt".format(out_path, species)
        of = erropen(outfile, "w")
        if not of: continue
        switch_to_db(cursor, ensembl_db_name[species])

        if (species == 'homo_sapiens'):
            gene_ids = get_gene_ids(cursor,
                                    biotype='protein_coding',
                                    is_known=1,
                                    ref_only=True)
        else:
            gene_ids = get_gene_ids(cursor, biotype='protein_coding')

        source = get_analysis_dict(cursor)

        ct = 0
        for gene_id in gene_ids:
            ct += 1
            if (not ct % 1000):
                print species, ct, len(gene_ids)

            # get _all_ exons
            exons = gene2exon_list(cursor, gene_id)
            if (not exons):
                print 'no exons for ', gene_id
                continue

            for exon in exons:

                if exon.covering_exon > 0: continue
                # exons seqs are its aa translation, left_flank, right_flank, and dna_seq
                exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
                if (not exon_seqs):
                    continue
                # human readable string describing the source of annotation for this exon
                if exon.is_known == 2:
                    analysis = 'sw_sharp'
                elif exon.is_known == 3:
                    analysis = 'usearch'
                else:
                    analysis = source[exon.analysis_id]
                # the first field return by get_exon_seqs is the exon_seq_id, so get rid of it
                gene_stable_id = gene2stable(cursor, gene_id)
                if (exon.is_known == 1):
                    exon_stable_id = exon2stable(cursor, exon.exon_id)
                elif (exon.is_known == 2):
                    exon_stable_id = 'sw_sharp_' + str(exon.exon_id)
                elif (exon.is_known == 3):
                    exon_stable_id = 'usearch_' + str(exon.exon_id)
                else:
                    exon_stable_id = "anon"

                print >> of, exon_tabstring(exon, gene_stable_id,
                                            exon_stable_id, species, analysis,
                                            exon_seqs[1:])

        of.close()
        print species, "done"

    cursor.close()
    db.close()
示例#46
0
def main():

    db = connect_to_mysql()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species(cursor)

    for species in all_species:

        if not species == 'homo_sapiens': continue

        print
        print species

        switch_to_db(cursor, ensembl_db_name[species])

        if (species == 'homo_sapiens'):
            gene_ids = get_gene_ids(cursor,
                                    biotype='protein_coding',
                                    is_known=1)
        else:
            gene_ids = get_gene_ids(cursor, biotype='protein_coding')

        tot_exons = 0
        no_exon_seq = 0
        short_dna = 0
        pepseq_ok = 0
        mismatch = 0
        stored_incorrect = 0
        translation_fail = 0
        #####################################
        #for gene_id in [10092907]:
        for gene_id in gene_ids:
            #for tot in range(1000):
            #gene_id = choice(gene_ids)

            # get _all_ exons
            exons = gene2exon_list(cursor, gene_id)
            if (not exons):
                print 'no exons for gene', gene_id
                sys.exit(1)

            for exon in exons:

                #####################################
                if not exon.is_coding:
                    print exon.exon_id, " not coding "
                    continue
                if exon.covering_exon > 0:
                    print exon.exon_id, " is covered by ", exon.covering_exon
                    continue

                tot_exons += 1
                # exons seqs are its aa translation, left_flank, right_flank, and dna_seq
                exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
                if (not exon_seqs):
                    no_exon_seq += 1
                    print "no exon seqs for  ", gene_id, exon.exon_id
                    #exit(1)
                    continue

                [
                    exon_seq_id, pepseq, pepseq_transl_start,
                    pepseq_transl_end, left_flank, right_flank, dna_seq
                ] = exon_seqs

                if len(dna_seq) < 3:
                    short_dna += 1
                    print "short_dna:", dna_seq
                    continue

                if (
                        pepseq_transl_start == -10
                ):  # ??? what is this shit? adn what happens downstream if the pepseq_transl_start is None?
                    translation_fail += 1
                    print "pepseq_transl_start:", pepseq_transl_start
                    continue

                mitochondrial = is_mitochondrial(cursor, gene_id)
                dnaseq = Seq(dna_seq[pepseq_transl_start:pepseq_transl_end],
                             generic_dna)
                if (mitochondrial):
                    pepseq2 = dnaseq.translate(
                        table="Vertebrate Mitochondrial").tostring()
                else:
                    pepseq2 = dnaseq.translate().tostring()

                if True:
                    print exon.exon_id
                    print "pep stored:", pepseq
                    print "dna transl:", pepseq2
                    print "dna begin:", dna_seq[:12]
                    print "start:", pepseq_transl_start,
                    print "end:", pepseq_transl_end
                    print

                if (not pepseq == pepseq2):
                    stored_incorrect += 1
                else:
                    pepseq_ok += 1

        print "total coding exons ", tot_exons
        print "no exon seq info   ", no_exon_seq
        print "short dna          ", short_dna
        print "transl failure     ", translation_fail
        print "stored pepseq does not correspond to the translation of stored dna:   ", stored_incorrect
        print "pepseq ok          ", pepseq_ok

    cursor.close()
    db.close()