def main():
    """Parse the command line, build a gene list and run find_missing_exons on it.

    Usage: <script> [<set name> <number of threads> <method>]

    With no arguments the defaults are used: set 'test', 10 threads and
    method 'usearch'.  The set name is lower-cased; 'none' selects the genes
    returned by get_gene_ids(biotype='protein_coding', is_known=1) in the
    homo_sapiens database, 'complement' selects get_complement_ids(), and any
    other name is handed to get_theme_ids().  The method must be 'usearch' or
    'sw_sharp'; 'sw_sharp' forces a single thread.

    Any other argument count, or an unrecognized method, prints a message and
    exits with status 1 before a database connection is opened.

    Returns:
        True once parallelize() has returned.

    Raises:
        ValueError: if <number of threads> is not an integer.
    """
    special = 'test'
    no_threads = 10
    method = 'usearch'

    # Either no arguments (defaults above) or exactly three.  Extra arguments
    # used to be ignored silently, running the defaults instead of the request.
    if len(sys.argv) not in (1, 4):
        print("usage: %s <set name> <number of threads> <method>" % sys.argv[0])
        sys.exit(1)

    if len(sys.argv) == 4:
        special = sys.argv[1].lower()
        if special == 'none':
            special = None
        no_threads = int(sys.argv[2])
        method = sys.argv[3]
        if method not in ('usearch', 'sw_sharp'):
            print("unrecognized method: ", method)
            sys.exit(1)

    # sw_sharps chokes if there is only one graphics card
    if method == 'sw_sharp':
        no_threads = 1

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    try:
        # only the species -> database name mapping is needed here
        [_, ensembl_db_name] = get_species(cursor)

        print('=======================================')
        print(sys.argv[0])
        if special:
            print("using", special, "set")
            if special == 'complement':
                gene_list = get_complement_ids(cursor, ensembl_db_name, cfg)
            else:
                gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
        else:
            print("using all protein coding genes")
            switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
            gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)
    finally:
        # release the handles even when building the gene list fails
        cursor.close()
        db.close()

    # local_db is not defined in this function; it is expected to exist at
    # module level.
    parallelize(no_threads, find_missing_exons, gene_list,
                [local_db, ensembl_db_name, method])

    return True
def main():
    """Parse the command line, build a gene list and run find_missing_exons on it.

    Usage: <script> [<set name> <number of threads> <method>]

    With no arguments the defaults are used: set 'test', 10 threads and
    method 'usearch'.  The set name is lower-cased; 'none' selects the genes
    returned by get_gene_ids(biotype='protein_coding', is_known=1) in the
    homo_sapiens database, 'complement' selects get_complement_ids(), and any
    other name is handed to get_theme_ids().  The method must be 'usearch' or
    'sw_sharp'; 'sw_sharp' forces a single thread.

    Any other argument count, or an unrecognized method, prints a message and
    exits with status 1 before a database connection is opened.

    Returns:
        True once parallelize() has returned.

    Raises:
        ValueError: if <number of threads> is not an integer.
    """
    # NOTE: every print below takes exactly one pre-formatted string, so the
    # output is the same under the print statement and the print function.
    special = 'test'
    no_threads = 10
    method = 'usearch'

    # Either no arguments (defaults above) or exactly three.  Extra arguments
    # used to be ignored silently, running the defaults instead of the request.
    if len(sys.argv) not in (1, 4):
        print("usage: %s <set name> <number of threads> <method>" % sys.argv[0])
        sys.exit(1)

    if len(sys.argv) == 4:
        special = sys.argv[1].lower()
        if special == 'none':
            special = None
        no_threads = int(sys.argv[2])
        method = sys.argv[3]
        if method not in ('usearch', 'sw_sharp'):
            # two spaces: the literal's trailing blank plus the item separator
            print("unrecognized method:  %s" % method)
            sys.exit(1)

    # sw_sharps chokes if there is only one graphics card
    if method == 'sw_sharp':
        no_threads = 1

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    try:
        # only the species -> database name mapping is needed here
        [_, ensembl_db_name] = get_species(cursor)

        print('=======================================')
        print(sys.argv[0])
        if special:
            print("using %s set" % special)
            if special == 'complement':
                gene_list = get_complement_ids(cursor, ensembl_db_name, cfg)
            else:
                gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
        else:
            print("using all protein coding genes")
            switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
            gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)
    finally:
        # release the handles even when building the gene list fails
        cursor.close()
        db.close()

    # local_db is not defined in this function; it is expected to exist at
    # module level.
    parallelize(no_threads, find_missing_exons, gene_list,
                [local_db, ensembl_db_name, method])

    return True
def main():
    """Pick the work list from the command line and hand it to parallelize().

    Usage: <script> [<set name/species> <number of processes>]

    The first argument is lower-cased and interpreted as follows:
      * a name found in the species list returned by get_species(): the
        protein coding genes of that species are processed with
        one_species_all_genes_loop (for homo_sapiens the query additionally
        passes is_known=1 and ref_only=True);
      * 'none', or no arguments at all: all_species_all_genes_loop is run
        over the species list;
      * anything else: treated as a set name for get_theme_ids() and
        processed with ortologues_for_given_genes_loop.

    "-h" as first argument, or any argument count other than zero or two,
    prints the usage line and exits with status 1.  This check happens
    before the database connection is opened, so asking for help neither
    needs nor leaks a connection.

    Raises:
        ValueError: if <number of processes> is not an integer.
    """
    # NOTE: every print below takes exactly one pre-formatted string, so the
    # output is the same under the print statement and the print function.
    no_threads = 1
    special = ''
    species = ''

    asked_for_help = len(sys.argv) >= 2 and sys.argv[1] == "-h"
    if asked_for_help or len(sys.argv) not in (1, 3):
        print("usage: %s <set name/species> <number of processes>" % sys.argv[0])
        sys.exit(1)

    if len(sys.argv) == 3:
        special = sys.argv[1].lower()
        if special == 'none':
            special = None
        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    try:
        [all_species, ensembl_db_name] = get_species(cursor)
        # a species name takes precedence over a set name
        if special and special in all_species:
            species = special

        print('=======================================')
        print(sys.argv[0])

        if species:
            print("%s only" % species)
            switch_to_db(cursor, ensembl_db_name[species])
            if species == 'homo_sapiens':
                gene_ids = get_gene_ids(cursor, biotype='protein_coding',
                                        is_known=1, ref_only=True)
            else:
                gene_ids = get_gene_ids(cursor, biotype='protein_coding')
            parallelize_args = [no_threads, one_species_all_genes_loop, gene_ids,
                                [local_db, ensembl_db_name, species]]
        elif special:
            print("using %s set" % special)
            gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
            parallelize_args = [no_threads, ortologues_for_given_genes_loop, gene_list,
                                [local_db, ensembl_db_name]]
        else:
            parallelize_args = [no_threads, all_species_all_genes_loop, all_species,
                                [local_db, ensembl_db_name]]
    finally:
        # release the handles even when building the work list fails
        cursor.close()
        db.close()

    # local_db is not defined in this function; it is expected to exist at
    # module level.
    parallelize(*parallelize_args)
def main():
    """Parse the command line, build a gene list and run maps_for_gene_list on it.

    Usage: <script> [<set name> <number of threads>]

    With no arguments, 10 threads are used on the genes returned by
    get_gene_ids(biotype='protein_coding', is_known=1) in the homo_sapiens
    database.  The set name is lower-cased; 'none' selects that same default
    gene list, any other name is handed to get_theme_ids().

    Any argument count other than zero or two prints the usage line and
    exits with status 1 before a database connection is opened.

    Returns:
        True once parallelize() has returned.

    Raises:
        ValueError: if <number of threads> is not an integer.
    """
    # NOTE: every print below takes exactly one pre-formatted string, so the
    # output is the same under the print statement and the print function.
    no_threads = 10
    special = None

    # Extra arguments used to be ignored silently, running the defaults
    # instead of what was asked for.
    if len(sys.argv) not in (1, 3):
        print("usage: %s <set name> <number of threads> " % sys.argv[0])
        sys.exit(1)

    if len(sys.argv) == 3:
        special = sys.argv[1].lower()
        if special == 'none':
            special = None
        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    try:
        # only the species -> database name mapping is needed here
        [_, ensembl_db_name] = get_species(cursor)

        print('=======================================')
        print(sys.argv[0])
        if special:
            print("using %s set" % special)
            gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
        else:
            print("using all protein coding genes")
            switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
            gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)
    finally:
        # release the handles even when building the gene list fails
        cursor.close()
        db.close()

    # local_db is not defined in this function; it is expected to exist at
    # module level.
    parallelize(no_threads, maps_for_gene_list, gene_list,
                [local_db, ensembl_db_name])

    return True
def main():
    """Main entry point; does nothing except taking care of the parallelization.

    Usage: <script> [<set name> <number of threads>]

    Without a set name (no arguments, or the name 'none') the work is split
    per species: store_exon_seqs is run over the species list returned by
    get_species().  With a set name, the gene ids returned by get_theme_ids()
    are processed with store_exon_seqs_special instead.

    "-h" as first argument, or any argument count other than zero or two,
    prints the usage line and exits with status 1 before a database
    connection is opened.

    Raises:
        ValueError: if <number of threads> is not an integer.
    """
    # NOTE: every print below takes exactly one pre-formatted string, so the
    # output is the same under the print statement and the print function.
    no_threads = 1
    special = ''

    asked_for_help = len(sys.argv) >= 2 and sys.argv[1] == "-h"
    # Extra arguments used to be ignored silently, running the defaults
    # instead of what was asked for.
    if asked_for_help or len(sys.argv) not in (1, 3):
        print("usage: %s <set name> <number of threads>" % sys.argv[0])
        sys.exit(1)

    if len(sys.argv) == 3:
        special = sys.argv[1].lower()
        if special == 'none':
            special = None
        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    gene_list = None
    try:
        [all_species, ensembl_db_name] = get_species(cursor)

        print('=======================================')
        print(sys.argv[0])
        if special:
            print("using %s set" % special)
            gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
    finally:
        # release the handles even when building the gene list fails
        cursor.close()
        db.close()

    # two versions of the main loop:
    # 1) over all species, and all genes in each species
    # 2) over the genes of the requested set
    # local_db is not defined in this function; it is expected to exist at
    # module level.
    if not special:
        parallelize(no_threads, store_exon_seqs, all_species,
                    [local_db, ensembl_db_name])
    else:
        parallelize(no_threads, store_exon_seqs_special, gene_list,
                    [local_db, ensembl_db_name])
def main():
    """Count the 'usearch' maps attached to canonical, known human exons.

    For every gene in the gene list, the exons returned by gene2exon_list()
    are narrowed down to those with covering_exon < 0 that are canonical and
    known.  For each such exon the maps from get_maps() are filtered down to
    source == 'usearch', and their number is added to a running total.  The
    number of genes visited and the running map total are printed as the
    loop progresses.

    The set name is hard-wired to None, so the gene list is always the one
    returned by get_gene_ids(biotype='protein_coding', is_known=1) in the
    homo_sapiens database.
    """
    # NOTE: every print below takes exactly one pre-formatted string, so the
    # output is the same under the print statement and the print function.
    special = None
    no_threads = 1

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)

    if special:
        print("using %s set" % special)
        gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
    else:
        print("using all protein coding genes")
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

    # running totals over all genes
    sw_count = 0
    tot_count = 0
    for human_gene_id in gene_list:
        human_db = ensembl_db_name['homo_sapiens']
        switch_to_db(cursor, human_db)
        human_stable = gene2stable(cursor, human_gene_id)
        human_description = get_description(cursor, human_gene_id)
        tot_count += 1

        # keep only uncovered, canonical, known exons
        human_exons = []
        for candidate in gene2exon_list(cursor, human_gene_id, verbose=True):
            if candidate.covering_exon < 0 and candidate.is_canonical and candidate.is_known:
                human_exons.append(candidate)
        if not human_exons:
            continue
        human_exons.sort(key=lambda exon: exon.start_in_gene)

        # visit every exon of this gene
        maps_for_exon = {}
        for he in human_exons:
            he.stable_id = exon2stable(cursor, he.exon_id, ensembl_db_name['homo_sapiens'])
            he.pepseq = get_exon_pepseq(cursor, he, ensembl_db_name['homo_sapiens'])
            all_maps = get_maps(cursor, ensembl_db_name, he.exon_id, he.is_known)
            maps_for_exon[he] = all_maps
            if not all_maps:
                continue
            # only maps produced by usearch are counted
            usearch_maps = [m for m in all_maps if m.source == 'usearch']
            maps_for_exon[he] = usearch_maps
            if not usearch_maps:
                continue
            sw_count += len(usearch_maps)

        # NOTE(review): the original indentation was lost; these progress
        # lines are assumed to sit inside the gene loop (a commented-out copy
        # followed them, presumably the after-loop version) -- confirm.
        print("tot count:  %s" % tot_count)
        print("sw count:  %s" % sw_count)

    cursor.close()
    db.close()
def _print_map_details(cursor, ensembl_db_name, exon_map):
    """Print one exon map and the aligned sequence rebuilt from its bitmap.

    Nothing is printed when the map has no similarity value.  Otherwise the
    mapped exon, its source and similarity and its peptide sequence are
    printed, followed by the bitmap as a 0/1 string and the sequence with a
    '-' inserted for every 0 bit.  A bitmap whose number of set bits differs
    from the sequence length is reported as a mismatch and the rebuilt
    sequence is left empty.
    """
    other_species = exon_map.species_2
    other_exon = map2exon(cursor, ensembl_db_name, exon_map)
    unaligned_sequence = get_exon_pepseq(cursor, other_exon,
                                         ensembl_db_name[other_species])
    if not exon_map.similarity:
        return

    # The format strings reproduce the spacing of the original Python 2
    # print statements (no blank is inserted after an item ending in a tab).
    print("\t%s %s %s %s" % (other_species, exon_map.source,
                             exon_map.exon_id_2, exon_map.exon_known_2))
    print("\tmaps to  %s %s" % (exon_map.exon_id_1, exon_map.exon_known_1))
    print("\tsim %s \tsource %s" % (exon_map.similarity, exon_map.source))
    print("\t%s" % unaligned_sequence)

    if not exon_map.bitmap:
        print("\t bitmap not assigned")
    else:
        bs = Bits(bytes=exon_map.bitmap)
        reconst_pepseq = ''
        if bs.count(1) != len(unaligned_sequence):
            print("\talnd seq mismatch")
        else:
            # one residue per set bit, a gap for every cleared bit
            residues = iter(unaligned_sequence)
            reconst_pepseq = ''.join('-' if bit == '0' else next(residues)
                                     for bit in bs.bin)
        print("\tbinary :  %s" % bs.bin)
        print("\talnd seq:  %s" % reconst_pepseq)
    # NOTE(review): the original indentation was lost; the blank separator
    # line is assumed to follow each printed map record -- confirm.
    print("")


def main():
    """Spot-check that the canonical coding exons of human genes have maps.

    Usage: <script> [<set name> <number of threads>]

    The gene list is get_theme_ids() for the given set name, or the genes
    returned by get_gene_ids(biotype='protein_coding', is_known=1) in the
    homo_sapiens database when no set (or 'none') is given.  1000 genes are
    drawn from it with choice(); for each, every canonical coding exon is
    looked up with get_maps().  Exons without maps are reported, and genes
    with at least one such exon are counted as incomplete.  Detailed per-map
    output is printed when the module-level `verbose` flag is set.

    Any argument count other than zero or two prints the usage line and
    exits with status 1 before a database connection is opened.

    Raises:
        ValueError: if <number of threads> is not an integer.
    """
    # NOTE: every print below takes exactly one pre-formatted string, so the
    # output is the same under the print statement and the print function.
    no_threads = 1  # accepted on the command line but not used below
    special = None

    # Extra arguments used to be ignored silently, running the defaults
    # instead of what was asked for.
    if len(sys.argv) not in (1, 3):
        print("usage: %s <set name> <number of threads> " % sys.argv[0])
        sys.exit(1)

    if len(sys.argv) == 3:
        special = sys.argv[1].lower()
        if special == 'none':
            special = None
        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    try:
        # find db ids and common names for each species db
        [_, ensembl_db_name] = get_species(cursor)
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

        if special:
            print("using %s set" % special)
            gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
        else:
            print("using all protein coding genes")
            switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
            gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

        if not gene_list:
            # choice() raises IndexError on an empty sequence
            print("no genes found")
            return

        incomplete = 0
        genes_checked = 0
        for _sample in range(1000):
            gene_id = choice(gene_list)
            genes_checked += 1
            with_map = 0
            tot = 0
            switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
            print("%s %s" % (gene2stable(cursor, gene_id),
                             get_description(cursor, gene_id)))

            # find all exons we are tracking in the database
            human_exons = gene2exon_list(cursor, gene_id)
            human_exons.sort(key=lambda exon: exon.start_in_gene)

            for human_exon in human_exons:
                if not human_exon.is_canonical or not human_exon.is_coding:
                    continue
                if verbose:
                    print("")
                    print("\t human %s %s" % (human_exon.exon_id, human_exon.is_known))
                    print("\t  %s" % get_exon_pepseq(cursor, human_exon,
                                                     ensembl_db_name['homo_sapiens']))
                    print("\t checking maps ...")

                maps = get_maps(cursor, ensembl_db_name,
                                human_exon.exon_id, human_exon.is_known)
                tot += 1
                if not maps:
                    print("no maps for exon %s" % human_exon.exon_id)
                    continue
                with_map += 1

                if verbose:
                    for exon_map in maps:
                        _print_map_details(cursor, ensembl_db_name, exon_map)

            if tot != with_map:
                # tot > 0 here, since tot == with_map == 0 when no exon was checked
                print("#### gene id: %d total exons: %d with map: %d ( = %d%%) " %
                      (gene_id, tot, with_map, int(float(with_map) / tot * 100)))
                incomplete += 1

        print("genes checked: %d, incomplete: %d" % (genes_checked, incomplete))
    finally:
        # release the handles even when a query fails
        cursor.close()
        db.close()

    # totals of the last sampled gene
    print("%s %s" % (tot, with_map))
def _print_map_details(cursor, ensembl_db_name, exon_map):
    """Print one exon map and the aligned sequence rebuilt from its bitmap.

    Nothing is printed when the map has no similarity value.  Otherwise the
    mapped exon, its source and similarity and its peptide sequence are
    printed, followed by the bitmap as a 0/1 string and the sequence with a
    '-' inserted for every 0 bit.  A bitmap whose number of set bits differs
    from the sequence length is reported as a mismatch and the rebuilt
    sequence is left empty.
    """
    other_species = exon_map.species_2
    other_exon = map2exon(cursor, ensembl_db_name, exon_map)
    unaligned_sequence = get_exon_pepseq(cursor, other_exon,
                                         ensembl_db_name[other_species])
    if not exon_map.similarity:
        return

    # The format strings reproduce the spacing of the original Python 2
    # print statements (no blank is inserted after an item ending in a tab).
    print("\t%s %s %s %s" % (other_species, exon_map.source,
                             exon_map.exon_id_2, exon_map.exon_known_2))
    print("\tmaps to  %s %s" % (exon_map.exon_id_1, exon_map.exon_known_1))
    print("\tsim %s \tsource %s" % (exon_map.similarity, exon_map.source))
    print("\t%s" % unaligned_sequence)

    if not exon_map.bitmap:
        print("\t bitmap not assigned")
    else:
        bs = Bits(bytes=exon_map.bitmap)
        reconst_pepseq = ''
        if bs.count(1) != len(unaligned_sequence):
            print("\talnd seq mismatch")
        else:
            # one residue per set bit, a gap for every cleared bit
            residues = iter(unaligned_sequence)
            reconst_pepseq = ''.join('-' if bit == '0' else next(residues)
                                     for bit in bs.bin)
        print("\tbinary :  %s" % bs.bin)
        print("\talnd seq:  %s" % reconst_pepseq)
    # NOTE(review): the original indentation was lost; the blank separator
    # line is assumed to follow each printed map record -- confirm.
    print("")


def main():
    """Spot-check that the canonical coding exons of human genes have maps.

    Usage: <script> [<set name> <number of threads>]

    The gene list is get_theme_ids() for the given set name, or the genes
    returned by get_gene_ids(biotype='protein_coding', is_known=1) in the
    homo_sapiens database when no set (or 'none') is given.  1000 genes are
    drawn from it with choice(); for each, every canonical coding exon is
    looked up with get_maps().  Exons without maps are reported, and genes
    with at least one such exon are counted as incomplete.  Detailed per-map
    output is printed when the module-level `verbose` flag is set.

    Any argument count other than zero or two prints the usage line and
    exits with status 1 before a database connection is opened.

    Raises:
        ValueError: if <number of threads> is not an integer.
    """
    # NOTE: every print below takes exactly one pre-formatted string, so the
    # output is the same under the print statement and the print function.
    no_threads = 1  # accepted on the command line but not used below
    special = None

    # Extra arguments used to be ignored silently, running the defaults
    # instead of what was asked for.
    if len(sys.argv) not in (1, 3):
        print("usage: %s <set name> <number of threads> " % sys.argv[0])
        sys.exit(1)

    if len(sys.argv) == 3:
        special = sys.argv[1].lower()
        if special == 'none':
            special = None
        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()
    try:
        # find db ids and common names for each species db
        [_, ensembl_db_name] = get_species(cursor)
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

        if special:
            print("using %s set" % special)
            gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
        else:
            print("using all protein coding genes")
            switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
            gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

        if not gene_list:
            # choice() raises IndexError on an empty sequence
            print("no genes found")
            return

        incomplete = 0
        genes_checked = 0
        for _sample in range(1000):
            gene_id = choice(gene_list)
            genes_checked += 1
            with_map = 0
            tot = 0
            switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
            print("%s %s" % (gene2stable(cursor, gene_id),
                             get_description(cursor, gene_id)))

            # find all exons we are tracking in the database
            human_exons = gene2exon_list(cursor, gene_id)
            human_exons.sort(key=lambda exon: exon.start_in_gene)

            for human_exon in human_exons:
                if not human_exon.is_canonical or not human_exon.is_coding:
                    continue
                if verbose:
                    print("")
                    print("\t human %s %s" % (human_exon.exon_id, human_exon.is_known))
                    print("\t  %s" % get_exon_pepseq(cursor, human_exon,
                                                     ensembl_db_name['homo_sapiens']))
                    print("\t checking maps ...")

                maps = get_maps(cursor, ensembl_db_name,
                                human_exon.exon_id, human_exon.is_known)
                tot += 1
                if not maps:
                    print("no maps for exon %s" % human_exon.exon_id)
                    continue
                with_map += 1

                if verbose:
                    for exon_map in maps:
                        _print_map_details(cursor, ensembl_db_name, exon_map)

            if tot != with_map:
                # tot > 0 here, since tot == with_map == 0 when no exon was checked
                print("#### gene id: %d total exons: %d with map: %d ( = %d%%) " %
                      (gene_id, tot, with_map, int(float(with_map) / tot * 100)))
                incomplete += 1

        print("genes checked: %d, incomplete: %d" % (genes_checked, incomplete))
    finally:
        # release the handles even when a query fails
        cursor.close()
        db.close()

    # totals of the last sampled gene
    print("%s %s" % (tot, with_map))