Python get_theme_ids示例，el_utils.special_gene_sets.get_theme_ids Python示例

示例#1

0

显示文件

文件： 48_count_unsequenced.py 项目： ivanamihalek/exolocator

def main():
    """Select a gene set and run ``find_missing_exons`` over it in parallel.

    Command line (either all three arguments or none)::

        <set name> <number of threads> <method>

    With no arguments the 'test' set, 10 threads and the 'usearch' method
    are used.  The set name is lower-cased; 'none' selects the genes
    returned by ``get_gene_ids(biotype='protein_coding', is_known=1)`` in
    the homo_sapiens database, 'complement' is resolved by
    ``get_complement_ids`` and any other name by ``get_theme_ids``.
    ``<method>`` must be 'usearch' or 'sw_sharp'; 'sw_sharp' is always run
    with a single thread.

    Exits with status 1 on a wrong argument count or an unknown method.
    Returns True once ``parallelize`` has returned.
    """
    special = 'test'
    no_threads = 10
    method = 'usearch'

    # Anything other than "no arguments" or "all three" is a usage error.
    # Extra arguments must not silently fall back to the defaults.
    if len(sys.argv) not in (1, 4):
        print("usage: %s <set name> <number of threads> <method>" %
              sys.argv[0])
        sys.exit(1)

    if len(sys.argv) == 4:
        special = sys.argv[1].lower()
        if special == 'none':
            special = None

        no_threads = int(sys.argv[2])

        method = sys.argv[3]
        if method not in ('usearch', 'sw_sharp'):
            print("unrecognized method: ", method)
            sys.exit(1)

    # sw_sharps chokes if there is only one graphics card
    if method == 'sw_sharp':
        no_threads = 1

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    try:
        # the first element (species list) is not needed here
        _, ensembl_db_name = get_species(cursor)

        print('=======================================')
        print(sys.argv[0])
        if special:
            print("using", special, "set")
            if special == 'complement':
                gene_list = get_complement_ids(cursor, ensembl_db_name, cfg)
            else:
                gene_list = get_theme_ids(cursor, ensembl_db_name, cfg,
                                          special)
        else:
            print("using all protein coding genes")
            switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
            gene_list = get_gene_ids(cursor, biotype='protein_coding',
                                     is_known=1)
    finally:
        # release the connection even when one of the lookups above fails
        cursor.close()
        db.close()

    # NOTE(review): local_db is not defined in this function; presumably a
    # module-level name -- confirm.
    parallelize(no_threads, find_missing_exons, gene_list,
                [local_db, ensembl_db_name, method])

    return True

示例#2

0

显示文件

文件： 48_count_unsequenced.py 项目： ivanamihalek/exolocator

def main():
    """Pick the genes to process and hand them to ``find_missing_exons``.

    Usage: ``<set name> <number of threads> <method>`` -- all three
    arguments, or none at all.  Defaults without arguments: set 'test',
    10 threads, method 'usearch'.

    * set name 'none': genes returned by
      ``get_gene_ids(biotype='protein_coding', is_known=1)`` in the
      homo_sapiens database;
    * set name 'complement': ids from ``get_complement_ids``;
    * any other set name: ids from ``get_theme_ids``.

    The method has to be 'usearch' or 'sw_sharp'; for 'sw_sharp' the thread
    count is forced to 1.  Exits with status 1 on a wrong argument count or
    an unknown method, returns True after ``parallelize`` returns.
    """
    # Every print below takes a single pre-formatted string, so the output is
    # the same whether print is a statement (Python 2) or a function.
    special = 'test'
    no_threads = 10
    method = 'usearch'

    argc = len(sys.argv)
    # Reject partial AND surplus argument lists; surplus arguments must not
    # silently fall back to the defaults.
    if argc not in (1, 4):
        print("usage: %s <set name> <number of threads> <method>" % sys.argv[0])
        sys.exit(1)

    if argc == 4:
        special = sys.argv[1].lower()
        if special == 'none':
            special = None

        no_threads = int(sys.argv[2])

        method = sys.argv[3]
        if method not in ('usearch', 'sw_sharp'):
            print("unrecognized method:  %s" % method)
            sys.exit(1)

    # sw_sharps chokes if there is only one graphics card
    if method == 'sw_sharp':
        no_threads = 1

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    try:
        # only the db-name mapping is needed; the species list is ignored
        _, ensembl_db_name = get_species(cursor)

        print('=======================================')
        print(sys.argv[0])
        if not special:
            print("using all protein coding genes")
            switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
            gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)
        else:
            print("using %s set" % special)
            if special == 'complement':
                gene_list = get_complement_ids(cursor, ensembl_db_name, cfg)
            else:
                gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
    finally:
        # close cursor and connection on the error path as well
        cursor.close()
        db.close()

    # NOTE(review): local_db is not defined in this function; presumably a
    # module-level name -- confirm.
    parallelize(no_threads, find_missing_exons, gene_list,
                [local_db, ensembl_db_name, method])

    return True

示例#3

0

显示文件

文件： 08_exon_pep_seqs.py 项目： ivanamihalek/exolocator

def main():
    """Choose the work list for the exon peptide sequence step and run it.

    Usage: ``<set name/species> <number of processes>`` -- both arguments or
    none; ``-h`` as first argument prints the usage line.  Without
    arguments one process is used and the loop runs over all species.

    The first argument is lower-cased and interpreted as follows:

    * 'none': same as no argument (``all_species_all_genes_loop`` over all
      species);
    * a name found in the species list returned by ``get_species``:
      ``one_species_all_genes_loop`` over that species' protein coding
      genes (for homo_sapiens additionally ``is_known=1, ref_only=True``);
    * anything else: ``ortologues_for_given_genes_loop`` over the ids
      returned by ``get_theme_ids``.

    Exits with status 1 after printing the usage line.
    """
    # Every print below takes a single pre-formatted string, so the output is
    # the same whether print is a statement (Python 2) or a function.
    no_threads = 1
    special = ''
    species = ''

    # Check the command line before opening the database, so that the usage
    # exit does not leave a connection behind.  Surplus arguments are a
    # usage error rather than being silently ignored.
    argc = len(sys.argv)
    wants_help = argc >= 2 and sys.argv[1] == "-h"
    if wants_help or argc not in (1, 3):
        print("usage: %s <set name/species> <number of processes>" % sys.argv[0])
        sys.exit(1)  # after usage statement

    if argc == 3:
        special = sys.argv[1].lower()
        if special == 'none':
            special = None
        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    try:
        [all_species, ensembl_db_name] = get_species(cursor)

        # a set name that is actually a species name selects that species
        if special and special in all_species:
            species = special

        print('=======================================')
        print(sys.argv[0])
        if species:
            print("%s only" % species)
            switch_to_db(cursor, ensembl_db_name[species])
            if species == 'homo_sapiens':
                gene_ids = get_gene_ids(cursor, biotype='protein_coding',
                                        is_known=1, ref_only=True)
            else:
                gene_ids = get_gene_ids(cursor, biotype='protein_coding')
            parallelize_args = [no_threads, one_species_all_genes_loop, gene_ids,
                                [local_db, ensembl_db_name, species]]
        elif special:
            print("using %s set" % special)
            gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
            parallelize_args = [no_threads, ortologues_for_given_genes_loop, gene_list,
                                [local_db, ensembl_db_name]]
        else:
            parallelize_args = [no_threads, all_species_all_genes_loop, all_species,
                                [local_db, ensembl_db_name]]
    finally:
        # close cursor and connection on the error path as well
        cursor.close()
        db.close()

    # NOTE(review): local_db is not defined in this function; presumably a
    # module-level name -- confirm.
    parallelize(*parallelize_args)

示例#4

0

显示文件

文件： 09_make_ortho_maps.py 项目： ivanamihalek/exolocator

def main():
    """Collect a gene list and run ``maps_for_gene_list`` over it in parallel.

    Usage: ``<set name> <number of threads>`` -- both arguments or none.
    Without arguments 10 threads are used and no set is selected.  The set
    name is lower-cased; 'none' means no set.  With a set, the gene ids come
    from ``get_theme_ids``; otherwise from
    ``get_gene_ids(biotype='protein_coding', is_known=1)`` in the
    homo_sapiens database.

    Exits with status 1 on a wrong argument count, returns True after
    ``parallelize`` returns.
    """
    # Every print below takes a single pre-formatted string, so the output is
    # the same whether print is a statement (Python 2) or a function.
    no_threads = 10
    special = None

    argc = len(sys.argv)
    # Surplus arguments are a usage error rather than being silently ignored.
    if argc not in (1, 3):
        print("usage: %s <set name> <number of threads> " % sys.argv[0])
        sys.exit(1)

    if argc == 3:
        special = sys.argv[1].lower()
        if special == 'none':
            special = None
        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    try:
        # only the db-name mapping is needed; the species list is ignored
        _, ensembl_db_name = get_species(cursor)

        print('=======================================')
        print(sys.argv[0])
        if special:
            print("using %s set" % special)
            gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
        else:
            print("using all protein coding genes")
            switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
            gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)
    finally:
        # close cursor and connection on the error path as well
        cursor.close()
        db.close()

    # NOTE(review): local_db is not defined in this function; presumably a
    # module-level name -- confirm.
    parallelize(no_threads, maps_for_gene_list, gene_list, [local_db, ensembl_db_name])

    return True

示例#5

0

显示文件

文件： 07_exon_dna_seqs.py 项目： ivanamihalek/exolocator

def main():
    """
    Main entry point; it only parses the command line, picks the work list
    and takes care of the parallelization.

    Usage: ``<set name> <number of threads>`` -- both arguments or none;
    ``-h`` as first argument prints the usage line.  Without arguments one
    thread is used and no set is selected.  The set name is lower-cased;
    'none' means no set.

    Without a set, ``store_exon_seqs`` is parallelized over the species list
    returned by ``get_species``.  With a set, ``store_exon_seqs_special`` is
    parallelized over the gene ids returned by ``get_theme_ids``.

    Exits with status 1 after printing the usage line.
    """
    # Every print below takes a single pre-formatted string, so the output is
    # the same whether print is a statement (Python 2) or a function.
    no_threads = 1
    special = ''

    argc = len(sys.argv)
    wants_help = argc >= 2 and sys.argv[1] == "-h"
    # Surplus arguments are a usage error rather than being silently ignored.
    if wants_help or argc not in (1, 3):
        print("usage: %s <set name> <number of threads>" % sys.argv[0])
        sys.exit(1)  # after usage statement

    if argc == 3:
        special = sys.argv[1].lower()
        if special == 'none':
            special = None
        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    gene_list = None
    try:
        [all_species, ensembl_db_name] = get_species(cursor)
        print('=======================================')
        print(sys.argv[0])
        if special:
            print("using %s set" % special)
            gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
    finally:
        # close cursor and connection on the error path as well
        cursor.close()
        db.close()

    # NOTE(review): local_db is not defined in this function; presumably a
    # module-level name -- confirm.
    # two versions of the main loop:
    if not special:
        # 1) over all species, and all genes in each species
        parallelize(no_threads, store_exon_seqs, all_species, [local_db, ensembl_db_name])
    else:
        # 2) over the genes of the requested set
        parallelize(no_threads, store_exon_seqs_special, gene_list, [local_db, ensembl_db_name])

示例#6

0

显示文件

文件： 12_novel_exons_inspector.py 项目： ivanamihalek/exolocator

def main():
    """Count the 'usearch' maps attached to canonical, known human exons.

    Takes no command line arguments.  ``special`` is hard-coded to None, so
    as written the genes come from
    ``get_gene_ids(biotype='protein_coding', is_known=1)`` in the
    homo_sapiens database; setting ``special`` switches to
    ``get_theme_ids``.

    For each gene, the exons with ``covering_exon < 0`` that are canonical
    and known are visited in ``start_in_gene`` order, and the maps returned
    by ``get_maps`` whose ``source`` is 'usearch' are counted.  The running
    totals (genes seen, usearch maps seen) are printed after every gene that
    has at least one such exon.
    """
    # Every print below takes a single pre-formatted string, so the output is
    # the same whether print is a statement (Python 2) or a function.
    special = None
    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    try:
        # only the db-name mapping is needed; the species list is ignored
        _, ensembl_db_name = get_species(cursor)
        human_db = ensembl_db_name['homo_sapiens']

        if special:
            print("using %s set" % special)
            gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
        else:
            print("using all protein coding genes")
            switch_to_db(cursor, human_db)
            gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

        # loop over all genes
        sw_count = 0   # number of 'usearch' maps seen so far
        tot_count = 0  # number of genes visited so far
        for human_gene_id in gene_list:

            switch_to_db(cursor, human_db)
            # NOTE(review): these two lookups are kept from the original, but
            # their results are not used anywhere in this function.
            human_stable = gene2stable(cursor, human_gene_id)
            human_description = get_description(cursor, human_gene_id)
            tot_count += 1

            human_exons = [e for e in gene2exon_list(cursor, human_gene_id, verbose=True)
                           if e.covering_exon < 0 and e.is_canonical and e.is_known]
            if not human_exons:
                # nothing to count, and no running total printed for this gene
                continue

            human_exons.sort(key=lambda exon: exon.start_in_gene)
            # loop over all exons in this gene
            for he in human_exons:
                he.stable_id = exon2stable(cursor, he.exon_id, human_db)
                he.pepseq = get_exon_pepseq(cursor, he, human_db)

                maps = get_maps(cursor, ensembl_db_name, he.exon_id, he.is_known)  # exon data
                if not maps:
                    continue

                # A real list (not a lazy filter object) keeps len() valid on
                # every Python version.
                usearch_maps = [m for m in maps if m.source == 'usearch']
                sw_count += len(usearch_maps)

            print("tot count:  %s" % (tot_count,))
            print("sw count:  %s" % (sw_count,))
    finally:
        # close cursor and connection on the error path as well
        cursor.close()
        db.close()

示例#7

0

显示文件

文件： 10_check_ortho_maps.py 项目： ivanamihalek/exolocator

def _print_map_details(cursor, ensembl_db_name, exon_map):
    """Print one ortholog map and the aligned peptide rebuilt from its bitmap.

    The exon on the other side of the map is looked up with ``map2exon`` and
    its peptide sequence with ``get_exon_pepseq``.  Nothing is printed when
    the map has no similarity value.  When a bitmap is present and its count
    of set bits equals the peptide length, the aligned sequence is rebuilt
    by emitting '-' for every '0' bit and the next residue for every other
    bit.
    """
    species = exon_map.species_2
    exon = map2exon(cursor, ensembl_db_name, exon_map)
    unaligned_sequence = get_exon_pepseq(cursor, exon, ensembl_db_name[species])
    if not exon_map.similarity:
        return

    print("\t%s %s %s %s" % (species, exon_map.source,
                             exon_map.exon_id_2, exon_map.exon_known_2))
    print("\tmaps to  %s %s" % (exon_map.exon_id_1, exon_map.exon_known_1))
    print("\tsim %s \tsource %s" % (exon_map.similarity, exon_map.source))
    print("\t%s" % (unaligned_sequence,))
    if not exon_map.bitmap:
        print("\t bitmap not assigned")
    else:
        bs = Bits(bytes=exon_map.bitmap)
        if bs.count(1) != len(unaligned_sequence):
            print("\talnd seq mismatch")
        else:
            residues = iter(unaligned_sequence)
            reconst_pepseq = ''.join(
                ['-' if bit == '0' else next(residues) for bit in bs.bin])
            print("\tbinary   :  %s" % (bs.bin,))
            print("\talnd seq:  %s" % (reconst_pepseq,))
    print("")


def main():
    """Spot-check the ortholog maps of 1000 randomly drawn human genes.

    Usage: ``<set name> <number of threads>`` -- both arguments or none.
    The set name is lower-cased; 'none' means no set.  With a set, the gene
    ids come from ``get_theme_ids``; otherwise from
    ``get_gene_ids(biotype='protein_coding', is_known=1)`` in the
    homo_sapiens database.

    Genes are drawn with ``choice`` (so the same gene can be drawn more than
    once).  For each drawn gene, every canonical coding exon is looked up
    with ``get_maps``; exons without maps are reported, and a gene with at
    least one such exon is counted as incomplete.  Per-map details are
    printed when ``verbose`` is true.

    Exits with status 1 on a wrong argument count.
    """
    # Every print below takes a single pre-formatted string, so the output is
    # the same whether print is a statement (Python 2) or a function.
    no_threads = 1
    special = None

    # Surplus arguments are a usage error rather than being silently ignored.
    if len(sys.argv) not in (1, 3):
        print("usage: %s <set name> <number of threads> " % sys.argv[0])
        sys.exit(1)

    if len(sys.argv) == 3:
        special = sys.argv[1].lower()
        if special == 'none':
            special = None
        # parsed from the command line, but not used further in this function
        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    tot = with_map = 0
    try:
        # find db ids and common names for each species db
        _, ensembl_db_name = get_species(cursor)
        human_db = ensembl_db_name['homo_sapiens']
        switch_to_db(cursor, human_db)

        if special:
            print("using %s set" % special)
            gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
        else:
            print("using all protein coding genes")
            switch_to_db(cursor, human_db)
            gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

        incomplete = 0
        genes_checked = 0
        # choice() raises IndexError on an empty sequence: sample nothing then
        sample_size = 1000 if gene_list else 0
        for _ in range(sample_size):

            gene_id = choice(gene_list)
            genes_checked += 1
            with_map = 0
            tot = 0
            switch_to_db(cursor, human_db)
            print("%s %s" % (gene2stable(cursor, gene_id),
                             get_description(cursor, gene_id)))

            # find all exons we are tracking in the database
            human_exons = gene2exon_list(cursor, gene_id)
            human_exons.sort(key=lambda exon: exon.start_in_gene)
            for human_exon in human_exons:
                if not (human_exon.is_canonical and human_exon.is_coding):
                    continue
                # NOTE(review): verbose is not defined in this function;
                # presumably a module-level flag -- confirm.
                if verbose:
                    print("")
                    print("\t human %s %s" % (human_exon.exon_id, human_exon.is_known))
                    print("\t  %s" % (get_exon_pepseq(cursor, human_exon, human_db),))
                    print("\t checking maps ...")
                maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id,
                                human_exon.is_known)
                tot += 1
                if not maps:
                    print("no maps for exon %s" % (human_exon.exon_id,))
                    continue
                with_map += 1
                if verbose:
                    for exon_map in maps:
                        _print_map_details(cursor, ensembl_db_name, exon_map)

            # tot > 0 here: with_map never exceeds tot, so they differ only
            # when at least one exon was counted
            if tot != with_map:
                print("####  gene id: %d   total exons: %d     with map:  %d   ( = %d%%) " %
                      (gene_id, tot, with_map, int(float(with_map) / tot * 100)))
                incomplete += 1

        print("genes checked: %d,  incomplete: %d" % (genes_checked, incomplete))
    finally:
        # close cursor and connection on the error path as well
        cursor.close()
        db.close()

    # counts for the last gene drawn
    print("%s %s" % (tot, with_map))

示例#8

0

显示文件

文件： 10_check_ortho_maps.py 项目： ivanamihalek/exolocator

def _print_map_details(cursor, ensembl_db_name, exon_map):
    """Report a single ortholog map, including its bitmap-aligned peptide.

    Looks up the mapped exon (``map2exon``) and its peptide sequence
    (``get_exon_pepseq``), then prints the map fields.  Maps without a
    similarity value produce no output.  If the map carries a bitmap whose
    number of set bits matches the peptide length, the aligned sequence is
    reconstructed: a '0' bit becomes '-', any other bit consumes the next
    residue.
    """
    species = exon_map.species_2
    exon = map2exon(cursor, ensembl_db_name, exon_map)
    unaligned_sequence = get_exon_pepseq(cursor, exon, ensembl_db_name[species])
    if not exon_map.similarity:
        return

    print("\t%s %s %s %s" % (species, exon_map.source,
                             exon_map.exon_id_2, exon_map.exon_known_2))
    print("\tmaps to  %s %s" % (exon_map.exon_id_1, exon_map.exon_known_1))
    print("\tsim %s \tsource %s" % (exon_map.similarity, exon_map.source))
    print("\t%s" % (unaligned_sequence,))
    if exon_map.bitmap:
        bs = Bits(bytes=exon_map.bitmap)
        if bs.count(1) == len(unaligned_sequence):
            residues = iter(unaligned_sequence)
            aligned = ['-' if bit == '0' else next(residues) for bit in bs.bin]
            print("\tbinary   :  %s" % (bs.bin,))
            print("\talnd seq:  %s" % (''.join(aligned),))
        else:
            print("\talnd seq mismatch")
    else:
        print("\t bitmap not assigned")
    print("")


def main():
    """Sample 1000 human genes at random and check their exon ortholog maps.

    Usage: ``<set name> <number of threads>`` -- both arguments or none.
    The set name is lower-cased and 'none' means no set.  Gene ids are taken
    from ``get_theme_ids`` when a set is given, otherwise from
    ``get_gene_ids(biotype='protein_coding', is_known=1)`` in the
    homo_sapiens database.

    Each draw uses ``choice``, so a gene may be checked more than once.  For
    every canonical coding exon of a drawn gene the maps are fetched with
    ``get_maps``; exons without maps are reported and make the gene count as
    incomplete.  Per-map details are printed when ``verbose`` is true.

    Exits with status 1 on a wrong argument count.
    """
    # Every print below takes a single pre-formatted string, so the output is
    # the same whether print is a statement (Python 2) or a function.
    no_threads = 1
    special = None

    argc = len(sys.argv)
    # Surplus arguments are a usage error rather than being silently ignored.
    if argc not in (1, 3):
        print("usage: %s <set name> <number of threads> " % sys.argv[0])
        sys.exit(1)

    if argc == 3:
        special = sys.argv[1].lower()
        if special == 'none':
            special = None
        # parsed from the command line, but not used further in this function
        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    tot = 0
    with_map = 0
    try:
        # find db ids and common names for each species db
        _, ensembl_db_name = get_species(cursor)
        human_db = ensembl_db_name['homo_sapiens']
        switch_to_db(cursor, human_db)

        if special:
            print("using %s set" % special)
            gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
        else:
            print("using all protein coding genes")
            switch_to_db(cursor, human_db)
            gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

        incomplete = 0
        genes_checked = 0
        # choice() raises IndexError on an empty sequence: sample nothing then
        draws = 1000 if gene_list else 0
        for _ in range(draws):

            gene_id = choice(gene_list)
            genes_checked += 1
            with_map = 0
            tot = 0
            switch_to_db(cursor, human_db)
            print("%s %s" % (gene2stable(cursor, gene_id),
                             get_description(cursor, gene_id)))

            # find all exons we are tracking in the database
            human_exons = gene2exon_list(cursor, gene_id)
            human_exons.sort(key=lambda exon: exon.start_in_gene)
            for human_exon in human_exons:
                if not human_exon.is_canonical or not human_exon.is_coding:
                    continue
                # NOTE(review): verbose is not defined in this function;
                # presumably a module-level flag -- confirm.
                if verbose:
                    print("")
                    print("\t human %s %s" % (human_exon.exon_id, human_exon.is_known))
                    print("\t  %s" % (get_exon_pepseq(cursor, human_exon, human_db),))
                    print("\t checking maps ...")
                maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id,
                                human_exon.is_known)
                tot += 1
                if not maps:
                    print("no maps for exon %s" % (human_exon.exon_id,))
                    continue
                with_map += 1
                if verbose:
                    for exon_map in maps:
                        _print_map_details(cursor, ensembl_db_name, exon_map)

            # tot > 0 here: with_map never exceeds tot, so they differ only
            # when at least one exon was counted
            if tot != with_map:
                print("####  gene id: %d   total exons: %d     with map:  %d   ( = %d%%) " %
                      (gene_id, tot, with_map, int(float(with_map) / tot * 100)))
                incomplete += 1

        print("genes checked: %d,  incomplete: %d" % (genes_checked, incomplete))
    finally:
        # close cursor and connection on the error path as well
        cursor.close()
        db.close()

    # counts for the last gene drawn
    print("%s %s" % (tot, with_map))