def main(): db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species(cursor) print "using all protein coding genes" switch_to_db(cursor, ensembl_db_name['homo_sapiens']) min_similarity = cfg.get_value('min_accptbl_exon_sim') flank_length = 10 gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1, ref_only=True) new_afas = 0 old_afas = 0 ancient_afas = 0 failed_afas = [] for gene_id in gene_list: switch_to_db(cursor, ensembl_db_name['homo_sapiens']) stable_id = gene2stable(cursor, gene_id) if check_afa_age(cfg, stable_id, max_days=30) == "new": new_afas += 1 continue elif check_afa_age(cfg, stable_id, max_days=300) == "new": old_afas += 1 failed_afas.append(gene_id) continue elif check_afa_age(cfg, stable_id, max_days=1000) == "new": ancient_afas += 1 failed_afas.append(gene_id) continue no_exons = 0 cases_with_no_orthos = 0 no_exon_ids = [] for gene_id in failed_afas: if ((failed_afas.index(gene_id)) % 10 == 0): print failed_afas.index(gene_id), "out of ", len(failed_afas), print " no orthos: ", cases_with_no_orthos canonical_human_exons = get_canonical_coding_exons( cursor, gene_id, ensembl_db_name['homo_sapiens']) if not canonical_human_exons: no_exon_ids.append(gene_id) no_exons += 1 continue if False: # reconstruct per-exon alignments with orthologues mitochondrial = is_mitochondrial(cursor, gene_id) [alnmt_pep, alnmt_dna] = make_exon_alignments(cursor, ensembl_db_name, canonical_human_exons, mitochondrial, min_similarity, flank_length) no_orthos = True for human_exon, almt in alnmt_pep.iteritems(): if (type(almt) is str or len(almt.keys()) <= 1): continue no_orthos = False break if no_orthos: cases_with_no_orthos += 1 continue print print "total genes", len(gene_list) print "new afas", new_afas print "old afas", old_afas print "ancient afas", ancient_afas print print "failure cases" print "\t no exons", no_exons print "\t no orthologues ", cases_with_no_orthos print for gene_id in 
no_exon_ids: print gene_id for exon in gene2exon_list(cursor, gene_id): print "\t", exon.is_canonical, exon.is_coding cursor.close() db.close()
def find_missing_exons(human_gene_list, db_info):
    """Look in other species for counterparts of human exons that lack a usable map.

    For every gene in human_gene_list the canonical, known, non-covered human
    exons are collected and, per species, the most similar map above the
    configured minimum similarity is kept for each exon. Species are then
    visited in the order returned by species_sort(); in each one, exons
    without a warning-free map are handed to find_NNN() together with the
    sequence regions that bound the search. Progress and counters are
    printed; nothing is returned.

    Args:
        human_gene_list: human gene ids to process.
        db_info: not read by this function (see the note below).

    NOTE(review): the unpacking of db_info was commented out in the original,
    yet `method` is still passed to find_NNN(); `method`, `verbose` and
    `MAX_SEARCH_LENGTH` are therefore assumed to be module-level names --
    confirm.
    NOTE(review): `found` is printed in the progress line but never
    incremented anywhere in this function.
    """
    # [local_db, ensembl_db_name, method] = db_info
    db = connect_to_mysql()
    cfg = ConfigurationReader()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()
    try:
        # find db ids and common names for each species db
        all_species, ensembl_db_name = get_species(cursor)
        # minimal acceptable similarity between exons
        min_similarity = cfg.get_value('min_accptbl_exon_sim')
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

        gene_ct = 0
        found = 0
        sought = 0
        unsequenced = 0
        for human_gene_id in human_gene_list:
            switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
            # stable id and description are fetched for the debug print only
            human_stable = gene2stable(cursor, human_gene_id)
            human_description = get_description(cursor, human_gene_id)
            if verbose:
                print(human_gene_id, human_stable, human_description)

            # progress counter
            gene_ct += 1
            if not gene_ct % 10:
                print("processed ", gene_ct, " out of ", len(human_gene_list), "genes")
                print("exons found: ", found, " out of ", sought, "sought")

            # human exons of this gene that are tracked in the database
            human_exons = [
                e for e in gene2exon_list(cursor, human_gene_id)
                if e.covering_exon < 0 and e.is_canonical and e.is_known
            ]
            if not human_exons:
                print("\t\t no exons found")
                continue
            human_exons.sort(key=lambda exon: exon.start_in_gene)
            for he in human_exons:
                he.stable_id = exon2stable(cursor, he.exon_id)

            # map_table[species][human exon] is the best map, or None
            map_table = {}
            for species in all_species:
                map_table[species] = dict.fromkeys(human_exons)

            maps_for_exon = {}
            for he in human_exons:
                maps_for_exon[he] = get_maps(cursor, ensembl_db_name,
                                             he.exon_id, he.is_known)
                for m in maps_for_exon[he]:
                    if m.similarity < min_similarity:
                        continue
                    m_previous = map_table[m.species_2][he]
                    # keep whichever map is the more similar one
                    if m_previous and m_previous.similarity > m.similarity:
                        continue
                    map_table[m.species_2][he] = m

            # get rid of species that do not have the gene at all
            for species in all_species:
                if not any(map_table[species][he] for he in human_exons):
                    del map_table[species]

            # fill in the peptide sequence for each human exon, and drop
            # exons that are too short or appear in no species but human
            bad_he = []
            for he in human_exons:
                he.pepseq = get_exon_pepseq(cursor, he,
                                            ensembl_db_name['homo_sapiens'])
                # a missing peptide is treated like a too-short one instead
                # of crashing in len()
                if not he.pepseq or len(he.pepseq) < 3:
                    bad_he.append(he)
                    continue
                seen_elsewhere = any(map_table[other][he]
                                     for other in map_table
                                     if other != 'homo_sapiens')
                if not seen_elsewhere:
                    bad_he.append(he)
            human_exons = [he for he in human_exons if he not in bad_he]

            # nearest neighbours of each remaining human exon
            previous = {}
            following = {}
            prev = None
            for he in human_exons:
                previous[he] = prev
                if prev:
                    following[prev] = he
                prev = he
                following[he] = None

            if not map_table:
                continue

            # Sort the remaining species relative to human and skip the first
            # entry. The original passed the loop variable `species` left
            # over from the loops above, so the reference species depended on
            # which iteration happened to run last.
            species_sorted_from_human = species_sort(
                cursor, list(map_table.keys()), 'homo_sapiens')[1:]

            for species in species_sorted_from_human:
                print(species)
                # classify the unmapped exons by which neighbours are mapped
                no_left = []
                no_right = []
                has_both_neighbors = []
                one_existing_map = None
                for he in human_exons:
                    m = map_table[species][he]
                    if m and not m.warning:
                        # the one existing map should not be a problematic one
                        one_existing_map = m
                        continue
                    left = previous[he]
                    right = following[he]
                    left_mapped = left and map_table[species][left]
                    if left_mapped and right and map_table[species][right]:
                        has_both_neighbors.append(he)
                    elif not left_mapped:
                        no_left.append(he)
                    else:
                        # left neighbour is mapped, so the right one is not
                        no_right.append(he)

                if not one_existing_map:
                    continue  # this shouldn't happen
                if not has_both_neighbors and not no_left and not no_right:
                    continue

                # which gene in this species are we talking about?
                exon_id = one_existing_map.exon_id_2
                is_known = one_existing_map.exon_known_2
                gene_id = exon_id2gene_id(cursor, ensembl_db_name[species],
                                          exon_id, is_known)
                mitochondrial = is_mitochondrial(cursor, gene_id,
                                                 ensembl_db_name[species])
                # where is the gene origin (position on the sequence)
                gene_coords = get_gene_coordinates(cursor, gene_id,
                                                   ensembl_db_name[species])
                if not gene_coords:
                    continue
                [gene_seq_region_id, gene_start, gene_end, gene_strand] = gene_coords

                # exons with both neighbours: the neighbours bound the search
                for he in has_both_neighbors:
                    # template: known exon from the nearest species
                    template_info = get_template(cursor, ensembl_db_name,
                                                 map_table, species, he)
                    if not template_info:
                        continue
                    prev_seq_region = get_neighboring_region(
                        cursor, ensembl_db_name, map_table, species,
                        gene_coords, he, previous[he])
                    if not prev_seq_region:
                        continue
                    next_seq_region = get_neighboring_region(
                        cursor, ensembl_db_name, map_table, species,
                        gene_coords, he, following[he])
                    if not next_seq_region:
                        continue
                    sought += 1
                    reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                     maps_for_exon[he], species, gene_id,
                                     gene_coords, prev_seq_region,
                                     next_seq_region, template_info,
                                     mitochondrial, method)
                    if reply == 'NNN':
                        unsequenced += 1

                # work backwards, using the known region on the right as the
                # bound. NOTE(review): once set, next_seq_region is reused for
                # all remaining exons; it is not moved to a newly found exon.
                next_seq_region = None
                for he in reversed(no_left):
                    template_info = get_template(cursor, ensembl_db_name,
                                                 map_table, species, he)
                    if not template_info:
                        continue
                    if not next_seq_region:
                        next_seq_region = get_neighboring_region(
                            cursor, ensembl_db_name, map_table, species,
                            gene_coords, he, following[he])
                        if not next_seq_region:
                            continue
                    # the previous region is estimated from the next one;
                    # together they frame the search region
                    prev_seq_region = left_region(next_seq_region,
                                                  MAX_SEARCH_LENGTH)
                    sought += 1
                    reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                     maps_for_exon[he], species, gene_id,
                                     gene_coords, prev_seq_region,
                                     next_seq_region, template_info,
                                     mitochondrial, method)
                    if reply == 'NNN':
                        unsequenced += 1

                # same procedure on the right-hand side
                prev_seq_region = None
                for he in no_right:
                    template_info = get_template(cursor, ensembl_db_name,
                                                 map_table, species, he)
                    if not template_info:
                        continue
                    if not prev_seq_region:
                        prev_seq_region = get_neighboring_region(
                            cursor, ensembl_db_name, map_table, species,
                            gene_coords, he, previous[he])
                        if not prev_seq_region:
                            continue
                    # the following region is estimated from the previous one
                    next_seq_region = right_region(prev_seq_region,
                                                   MAX_SEARCH_LENGTH)
                    sought += 1
                    reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                     maps_for_exon[he], species, gene_id,
                                     gene_coords, prev_seq_region,
                                     next_seq_region, template_info,
                                     mitochondrial, method)
                    if reply == 'NNN':
                        unsequenced += 1

                print(species, "sought", sought, " unseq", unsequenced)
    finally:
        # the original never released the connection
        cursor.close()
        db.close()
def multiple_exon_alnmt(gene_list, db_info): print "process pid: %d, length of gene list: %d" % ( get_process_id(), len(gene_list)) [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() acg = AlignmentCommandGenerator() cursor = db.cursor() # find db ids adn common names for each species db [all_species, ensembl_db_name] = get_species (cursor) species = 'homo_sapiens' switch_to_db (cursor, ensembl_db_name[species]) gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1) # for each human gene gene_ct = 0 tot = 0 ok = 0 no_maps = 0 no_pepseq = 0 no_orthologues = 0 min_similarity = cfg.get_value('min_accptbl_exon_sim') #gene_list.reverse() for gene_id in gene_list: start = time() gene_ct += 1 if not gene_ct%10: print gene_ct, "genes out of", len(gene_list) switch_to_db (cursor, ensembl_db_name['homo_sapiens']) print gene_ct, len(gene_ids), gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id) human_exons = filter (lambda e: e.is_known==1 and e.is_coding and e.covering_exon<0, gene2exon_list(cursor, gene_id)) human_exons.sort(key=lambda exon: exon.start_in_gene) ################################################################## for human_exon in human_exons: tot += 1 # find all orthologous exons the human exon maps to maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known) if verbose: print "\texon no.", tot, " id", human_exon.exon_id, if not maps: print " no maps" print human_exon print if not maps: no_maps += 1 continue # human sequence to fasta: seqname = "{0}:{1}:{2}".format('homo_sapiens', human_exon.exon_id, human_exon.is_known) switch_to_db (cursor, ensembl_db_name['homo_sapiens']) [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq] = get_exon_seqs (cursor, human_exon.exon_id, human_exon.is_known) if (not pepseq): if verbose and human_exon.is_coding and human_exon.covering_exon <0: # this should be a master exon print "no 
pep seq for", human_exon.exon_id, "coding ", human_exon.is_coding, print "canonical: ", human_exon.is_canonical print "length of dna ", len(dna_seq) no_pepseq += 1 continue # collect seq from all maps, and output them in fasta format hassw = False headers = [] sequences = {} exons_per_species = {} for map in maps: switch_to_db (cursor, ensembl_db_name[map.species_2]) if map.similarity < min_similarity: continue exon = map2exon(cursor, ensembl_db_name, map) pepseq = get_exon_pepseq (cursor,exon) if (not pepseq): continue if map.source == 'sw_sharp': exon_known_code = 2 hassw = True elif map.source == 'usearch': exon_known_code = 3 hassw = True else: exon_known_code = map.exon_known_2 seqname = "{0}:{1}:{2}".format(map.species_2, map.exon_id_2, exon_known_code) headers.append(seqname) sequences[seqname] = pepseq # for split exon concatenation (see below) if not map.species_2 in exons_per_species.keys(): exons_per_species[map.species_2] = [] exons_per_species[map.species_2].append ([ map.exon_id_2, exon_known_code]); if (len(headers) <=1 ): if verbose: print "single species in the alignment" no_orthologues += 1 continue # concatenate exons from the same gene - the alignment program might go wrong otherwise concatenated = concatenate_exons (cursor, ensembl_db_name, sequences, exons_per_species) fasta_fnm = "{0}/{1}.fa".format( cfg.dir_path['scratch'], human_exon.exon_id) output_fasta (fasta_fnm, sequences.keys(), sequences) # align afa_fnm = "{0}/{1}.afa".format( cfg.dir_path['scratch'], human_exon.exon_id) mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm) ret = commands.getoutput(mafftcmd) if (verbose): print 'almt to', afa_fnm # read in the alignment inf = erropen(afa_fnm, "r") aligned_seqs = {} for record in SeqIO.parse(inf, "fasta"): aligned_seqs[record.id] = str(record.seq) inf.close() # split back the concatenated exons if concatenated: split_concatenated_exons (aligned_seqs, concatenated) human_seq_seen = False for seq_name, sequence in 
aligned_seqs.iteritems(): # if this is one of the concatenated seqs, split them back to two ### store the alignment as bitstring # Generate the bitmap bs = Bits(bin='0b' + re.sub("[^0]","1", sequence.replace('-','0'))) # The returned value of tobytes() will be padded at the end # with between zero and seven 0 bits to make it byte aligned. # I will end up with something that looks like extra alignment gaps, that I'll have to return msa_bitmap = bs.tobytes() # Retrieve information on the cognate cognate_species, cognate_exon_id, cognate_exon_known = seq_name.split(':') if cognate_exon_known == '2': source = 'sw_sharp' elif cognate_exon_known == '3': source = 'usearch' else: source = 'ensembl' if (cognate_species == 'homo_sapiens'): human_seq_seen = True cognate_genome_db_id = species2genome_db_id(cursor, cognate_species) # moves the cursor switch_to_db(cursor, ensembl_db_name['homo_sapiens']) # so move it back to h**o sapiens # Write the bitmap to the database #if (cognate_species == 'homo_sapiens'): if verbose: # and (source=='sw_sharp' or source=='usearch'): print "storing" print human_exon.exon_id, human_exon.is_known print cognate_species, cognate_genome_db_id, cognate_exon_id, cognate_exon_known, source print sequence if not msa_bitmap: print "no msa_bitmap" continue store_or_update(cursor, "exon_map", {"cognate_genome_db_id":cognate_genome_db_id, "cognate_exon_id":cognate_exon_id ,"cognate_exon_known" :cognate_exon_known, "source": source, "exon_id" :human_exon.exon_id, "exon_known":human_exon.is_known}, {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)}) ok += 1 commands.getoutput("rm "+afa_fnm+" "+fasta_fnm) if verbose: print " time: %8.3f\n" % (time()-start); print "tot: ", tot, "ok: ", ok print "no maps ", no_pepseq print "no pepseq ", no_pepseq print "no orthologues ", no_orthologues print
def find_missing_exons(human_gene_list, db_info): # [local_db, ensembl_db_name, method] = db_info db = connect_to_mysql() cfg = ConfigurationReader() acg = AlignmentCommandGenerator() cursor = db.cursor() # find db ids and common names for each species db all_species, ensembl_db_name = get_species (cursor) # minimal acceptable similarity between exons min_similarity = cfg.get_value('min_accptbl_exon_sim') switch_to_db (cursor, ensembl_db_name['homo_sapiens']) ################################################################################## # loop over human genes gene_ct = 0 found = 0 sought = 0 unsequenced = 0 #human_gene_list.reverse() for human_gene_id in human_gene_list: switch_to_db (cursor, ensembl_db_name['homo_sapiens']) # Get stable id and description of this gene -- DEBUG human_stable = gene2stable (cursor, human_gene_id) human_description = get_description(cursor, human_gene_id) if verbose: print human_gene_id, human_stable, human_description # progress counter gene_ct += 1 if (not gene_ct%10): print "processed ", gene_ct, " out of ", len(human_gene_list), "genes" print "exons found: ", found, " out of ", sought, "sought" # find all human exons for this gene that we are tracking in the database human_exons = [e for e in gene2exon_list(cursor, human_gene_id) if e.covering_exon < 0 and e.is_canonical and e.is_known] if not human_exons: print "\t\t no exons found" continue human_exons.sort(key=lambda exon: exon.start_in_gene) for he in human_exons: he.stable_id = exon2stable (cursor, he.exon_id) ################################################################################## ################################################################################## # make 'table' of maps, which is either pointer to the map if it exists, or None map_table = {} for species in all_species: map_table[species] = {} for he in human_exons: map_table[species][he] = None ################# maps_for_exon = {} for he in human_exons: maps_for_exon[he] = get_maps(cursor, 
ensembl_db_name, he.exon_id, he.is_known) # exon data for m in maps_for_exon[he]: #if m.source == 'usearch': continue #if m.source == 'sw_sharp': continue #if m.source == 'sw_sharp': # print 'sw_sharp' #if m.source == 'usearch': # print 'usearch', m.similarity, m.species_2, m.exon_id_1, m.exon_id_2 if m.similarity < min_similarity: continue m_previous = map_table[m.species_2][he] if m_previous and m_previous.similarity > m.similarity: continue map_table[m.species_2][he] = m # get rid of species that do not have the gene at all for species in all_species: one_exon_found = False for he in human_exons: if map_table[species][he]: one_exon_found = True break if not one_exon_found: del map_table[species] # fill in the peptide sequence field for each human exon # get rid of exons that appear in no other species but human (?) bad_he = [] for he in human_exons: one_species_found = False he.pepseq = get_exon_pepseq (cursor, he, ensembl_db_name['homo_sapiens']) if len (he.pepseq) < 3: # can I ever get rid of all the nonsense I find in Ensembl? 
bad_he.append(he) continue for species in map_table.keys(): if species =='homo_sapiens': continue if map_table[species][he]: one_species_found = True break if not one_species_found: bad_he.append(he) human_exons = filter (lambda he: not he in bad_he, human_exons) # keep track of nearest neighbors for each human exon previous = {} next = {} prev = None for he in human_exons: previous[he] = prev if prev: next[prev] = he prev = he next[he] = None # fill, starting from the species that are nearest to the human if not map_table.keys(): continue # whatever species_sorted_from_human = species_sort(cursor,map_table.keys(),species)[1:] for species in species_sorted_from_human: print species # see which exons have which neighbors #if verbose: print he.exon_id, species no_left = [] no_right = [] has_both_neighbors = [] one_existing_map = None for he in human_exons: m = map_table[species][he] if m and not m.warning: # the one existing map should not be a problematic one one_existing_map = m continue prev = previous[he] nxt = next[he] if prev and nxt and map_table[species][prev] and map_table[species][nxt]: has_both_neighbors.append(he) elif not prev or not map_table[species][prev]: no_left.append(he) elif not nxt or not map_table[species][nxt]: no_right.append(he) if not one_existing_map: continue # this shouldn't happen if not has_both_neighbors and not no_left and not no_right: continue # what is the gene that we are talking about? exon_id = one_existing_map.exon_id_2 is_known = one_existing_map.exon_known_2 gene_id = exon_id2gene_id (cursor, ensembl_db_name[species], exon_id, is_known) # is it mitochondrial? 
mitochondrial = is_mitochondrial(cursor, gene_id, ensembl_db_name[species]) # where is the gene origin (position on the sequence) gene_coords = get_gene_coordinates (cursor, gene_id, ensembl_db_name[species]) if not gene_coords: continue [gene_seq_region_id, gene_start, gene_end, gene_strand] = gene_coords # fill in exons that have both neighbors: # human exon functions as a coordinate here for he in has_both_neighbors: # get template (known exon from the nearest species) template_info = get_template (cursor, ensembl_db_name, map_table, species, he) if not template_info: continue # previous_ and next_seq_region are of the type Seq_Region defined on the top of the file # get previous region prev_seq_region = get_neighboring_region (cursor, ensembl_db_name, map_table, species, gene_coords, he, previous[he]) if not prev_seq_region: continue # get following region next_seq_region = get_neighboring_region (cursor, ensembl_db_name, map_table, species, gene_coords, he, next[he]) if not next_seq_region: continue sought += 1 reply = find_NNN (cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he], species, gene_id, gene_coords, prev_seq_region, next_seq_region, template_info, mitochondrial, method) if reply=='NNN': unsequenced += 1 # work backwards # use the last known region on the left as the bound no_left.reverse() next_seq_region = None for he in no_left: m = map_table[species][he] # check first if we haave already looked into this, and found incomplete region #if m and m.warning: continue # get template (known exon from the nearest species) template_info = get_template (cursor, ensembl_db_name, map_table, species, he) if not template_info: continue # get following region if not next_seq_region: next_seq_region = get_neighboring_region (cursor, ensembl_db_name, map_table, species, gene_coords, he, next[he]) if not next_seq_region: continue # otherwise it is the last thing we found # the previous region is eyeballed from the next on # the previous and the next region 
frame the search region prev_seq_region = left_region (next_seq_region, MAX_SEARCH_LENGTH) sought += 1 reply = find_NNN (cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he], species, gene_id, gene_coords, prev_seq_region, next_seq_region, template_info, mitochondrial, method) if reply=='NNN': unsequenced += 1 # repeat the whole procedure on the right prev_seq_region = None for he in no_right: m = map_table[species][he] # check first if we haave already looked into this, and found incomplete region #if m and m.warning: continue # get template (known exon from the nearest species) template_info = get_template (cursor, ensembl_db_name, map_table, species, he) if not template_info: continue # get following region if not prev_seq_region: prev_seq_region = get_neighboring_region (cursor, ensembl_db_name, map_table, species, gene_coords, he, previous[he]) if not prev_seq_region: continue # otherwise it is the last thing we found # the following region is eyeballed from the previous next_seq_region = right_region (prev_seq_region, MAX_SEARCH_LENGTH) sought += 1 reply = find_NNN (cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he], species, gene_id, gene_coords, prev_seq_region, next_seq_region, template_info, mitochondrial, method) if reply=='NNN': unsequenced += 1 print species, "sought", sought, " unseq", unsequenced
def main(): db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() [all_species, ensembl_db_name] = get_species (cursor) print "using all protein coding genes" switch_to_db (cursor, ensembl_db_name['homo_sapiens']) min_similarity = cfg.get_value('min_accptbl_exon_sim') flank_length = 10 gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1, ref_only=True) new_afas = 0 old_afas = 0 ancient_afas = 0 failed_afas = [] for gene_id in gene_list: switch_to_db (cursor, ensembl_db_name['homo_sapiens']) stable_id = gene2stable(cursor, gene_id) if check_afa_age (cfg, stable_id, max_days=30) == "new": new_afas += 1 continue elif check_afa_age (cfg, stable_id, max_days=300) == "new": old_afas += 1 failed_afas.append(gene_id) continue elif check_afa_age (cfg, stable_id, max_days=1000) == "new": ancient_afas += 1 failed_afas.append(gene_id) continue no_exons = 0 cases_with_no_orthos = 0 no_exon_ids = [] for gene_id in failed_afas: if ( (failed_afas.index(gene_id))%10 == 0 ): print failed_afas.index(gene_id), "out of ", len(failed_afas), print " no orthos: ", cases_with_no_orthos canonical_human_exons = get_canonical_coding_exons (cursor, gene_id, ensembl_db_name['homo_sapiens']) if not canonical_human_exons: no_exon_ids.append(gene_id) no_exons += 1 continue if False: # reconstruct per-exon alignments with orthologues mitochondrial = is_mitochondrial(cursor, gene_id) [alnmt_pep, alnmt_dna] = make_exon_alignments(cursor, ensembl_db_name, canonical_human_exons, mitochondrial, min_similarity, flank_length) no_orthos = True for human_exon, almt in alnmt_pep.iteritems(): if ( type(almt) is str or len(almt.keys()) <= 1): continue no_orthos = False break if no_orthos: cases_with_no_orthos += 1 continue print print "total genes", len(gene_list) print "new afas", new_afas print "old afas", old_afas print "ancient afas", ancient_afas print print "failure cases" print "\t no exons", no_exons print "\t no orthologues ", cases_with_no_orthos print for 
gene_id in no_exon_ids: print gene_id for exon in gene2exon_list(cursor, gene_id): print "\t", exon.is_canonical, exon.is_coding cursor.close() db.close()