Exemplos de AlignmentCommandGenerator em Python, exemplos de el_utils.almt_cmd_generator.AlignmentCommandGenerator em Python

Exemplo n.º 1

0

Exibir arquivo

def annotate(gene_list, db_info):
    #
    [local_db, all_species, ensembl_db_name, species] = db_info
    db = connect_to_mysql()
    cfg = ConfigurationReader()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()

    if verbose: print "thread %s annotating %s " % (get_thread_name(), species)

    if not species == 'oryctolagus_cuniculus':
        print 'The preferred list of species is hardcoded for the rabbit. Consider modifying.'
        exit(1)

    preferred_species = [
        species, 'mus_musculus', 'rattus_norvegicus', 'homo_sapiens'
    ]
    nearest_species_list = species_sort(cursor, all_species, species)
    species_list = preferred_species + filter(
        lambda x: x not in preferred_species, nearest_species_list)

    inf = erropen("temp_out.fasta", "w")

    for gene_id in gene_list:
        #for gene_id in [90020]:
        switch_to_db(cursor, ensembl_db_name[species])
        ####################
        # get stable id and description of this gene
        stable_id = gene2stable(cursor, gene_id)
        if not gene_list.index(gene_id) % 100:
            print gene_list.index(gene_id), "out of", len(gene_list)
        if verbose: print "============================================="
        if verbose: print gene_id, stable_id
        ####################
        # find the annotation from the preferred source organism
        [annot_source, orthology_type, annotation,
         ortho_stable_ids] = find_annotation(cursor, ensembl_db_name,
                                             species_list, gene_id)
        if verbose: print annot_source, "**", orthology_type, '**', annotation

        ###################
        # find splices (for now find the canonical splice)
        switch_to_db(cursor, ensembl_db_name[species])
        canonical_splice = get_canonical_transl(acg, cursor, gene_id, species)

        # output
        if orthology_type == 'self' or annotation == 'none':
            header = ">{0} {1}".format(stable_id, annotation)
        else:
            header = ">{0} {1} [by sim to {2}, {3}]".format(
                stable_id, annotation, annot_source, ortho_stable_ids)

        print >> inf, header
        print >> inf, canonical_splice

    cursor.close()
    db.close()

Exemplo n.º 2

0

Exibir arquivo

Arquivo: 19_alt_splice_almts.py Projeto: ivanamihalek/exolocator

def main():
    """Run alt_splice_almt for each species listed as having CCDS info."""
    db = connect_to_mysql()
    acg = AlignmentCommandGenerator()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    _all_species, ensembl_db_name = get_species(cursor)

    # human and mouse are the only two species that have CCDs info
    ccds_species = ('homo_sapiens', 'mus_musculus')
    for organism in ccds_species:
        alt_splice_almt(cursor, cfg, acg, organism, ensembl_db_name)

    cursor.close()
    db.close()

Exemplo n.º 3

0

Exibir arquivo

Arquivo: 11_paralogue_directory_cleanup.py Projeto: ivanamihalek/exolocator

def make_alignments (species_list, db_info):
    """Remove stale files from the "pep" directory of every given species.

    For each species the directory returned by check_directory is walked
    recursively and every file whose modification time is more than
    `max_days` (60) days old is deleted. A per-species summary with the
    number of removed and remaining files is printed.

    Args:
        species_list: species names to clean up.
        db_info: [local_db, ensembl_db_name]; neither value is used below
            (ensembl_db_name is re-read from the database).
    """
    [local_db, ensembl_db_name] = db_info

    max_days        = 60
    seconds_per_day = 60*60*24

    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    # kept from the original although unused here; constructor side effects
    # are not visible from this file
    acg    = AlignmentCommandGenerator()
    cursor = db.cursor()

    # find db ids and common names for each species db
    [all_species, ensembl_db_name] = get_species (cursor)

    for species in species_list:

        species_shorthand = get_species_shorthand(cursor, species)
        print(species, species_shorthand)

        directory = check_directory (cfg, species, species_shorthand, "pep")
        if not directory: continue

        removed   = 0
        remaining = 0
        for dirname, dirnames, filenames in os.walk(directory):
            for filename in filenames:
                full_name = os.path.join(dirname, filename)
                age_in_days = (time.time() - os.path.getmtime(full_name))/seconds_per_day
                if age_in_days > max_days:
                    os.remove(full_name)
                    # the counter was never updated, so the summary always said 0
                    removed += 1
                else:
                    remaining += 1
        print(species, "done, removed", removed, "files, remaining", remaining)

    cursor.close()
    db.close()

Exemplo n.º 4

0

Exibir arquivo

Arquivo: get_canonical_translations.py Projeto: ivanamihalek/exolocator

def main():

    db = connect_to_mysql()
    acg = AlignmentCommandGenerator()

    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species(cursor)
    species = 'homo_sapiens'
    switch_to_db(cursor, ensembl_db_name[species])
    gene_list = get_gene_ids(cursor, biotype='protein_coding')

    for gene_id in gene_list:
        # find stable
        stable_id = gene2stable(cursor, gene_id=gene_id)
        canonical = get_canonical_transl(acg,
                                         cursor,
                                         gene_id,
                                         species,
                                         strip_X=False)
        if canonical:
            print stable_id, canonical

    cursor.close()
    db.close()

Exemplo n.º 5

0

Exibir arquivo

Arquivo: 48_count_unsequenced.py Projeto: ivanamihalek/exolocator

def find_missing_exons(human_gene_list, db_info):
    """Count how many missing orthologous exons fall into unsequenced regions.

    For every human gene in human_gene_list:
      1. collect the human exons with covering_exon < 0 that are canonical
         and known;
      2. build map_table[species][human_exon], keeping for each pair the map
         with the highest similarity that is at least 'min_accptbl_exon_sim';
      3. drop species with no mapped exon at all, and human exons whose
         peptide is shorter than 3 or that map to no non-human species;
      4. for each remaining species, classify exons without a clean
         (warning-free) map by which mapped neighbors they have, and call
         find_NNN on the region framed by those neighbors.

    The counters `sought` (find_NNN calls) and `unsequenced` (replies equal
    to 'NNN') are printed; nothing is returned.

    Args:
        human_gene_list: human gene ids to process.
        db_info: [local_db, ensembl_db_name, method]; ensembl_db_name is
            overwritten by get_species below, method is passed to find_NNN.

    Relies on the module-level names `verbose` and `MAX_SEARCH_LENGTH`.
    """

    #
    [local_db, ensembl_db_name, method] = db_info
    db = connect_to_mysql()
    cfg = ConfigurationReader()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()
    # NOTE(review): cursor and db are never closed in this function.

    # find db ids and common names for each species db
    all_species, ensembl_db_name = get_species(cursor)
    # minimal acceptable similarity between exons
    min_similarity = cfg.get_value('min_accptbl_exon_sim')

    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

    ##################################################################################
    # loop over human genes
    gene_ct = 0
    found = 0  # never updated below, so the progress line always reports 0 found
    sought = 0
    unsequenced = 0
    #human_gene_list.reverse()
    for human_gene_id in human_gene_list:

        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

        # Get stable id and description of this gene -- DEBUG
        human_stable = gene2stable(cursor, human_gene_id)
        human_description = get_description(cursor, human_gene_id)
        if verbose: print(human_gene_id, human_stable, human_description)

        # progress counter
        gene_ct += 1
        if (not gene_ct % 10):
            print("processed ", gene_ct, " out of ", len(human_gene_list),
                  "genes")
            print("exons found: ", found, " out of ", sought, "sought")

        # find all human exons for this gene that we are tracking in the database
        human_exons = [
            e for e in gene2exon_list(cursor, human_gene_id)
            if e.covering_exon < 0 and e.is_canonical and e.is_known
        ]
        if not human_exons:
            print("\t\t no exons found")
            continue

        human_exons.sort(key=lambda exon: exon.start_in_gene)
        for he in human_exons:
            he.stable_id = exon2stable(cursor, he.exon_id)

        ##################################################################################
        ##################################################################################
        # make 'table' of maps, which is either pointer to the map if it exists, or None
        map_table = {}
        for species in all_species:
            map_table[species] = {}
            for he in human_exons:
                map_table[species][he] = None

        #################
        # for every (species, human exon) pair keep the acceptable map with
        # the highest similarity
        maps_for_exon = {}
        for he in human_exons:
            maps_for_exon[he] = get_maps(cursor, ensembl_db_name, he.exon_id,
                                         he.is_known)  # exon data
            for m in maps_for_exon[he]:
                #if m.source ==  'usearch': continue
                #if m.source == 'sw_sharp': continue
                #if m.source == 'sw_sharp':
                #    print 'sw_sharp'
                #if m.source == 'usearch':
                #    print 'usearch',  m.similarity, m.species_2, m.exon_id_1, m.exon_id_2
                if m.similarity < min_similarity: continue
                m_previous = map_table[m.species_2][he]
                if m_previous and m_previous.similarity > m.similarity:
                    continue
                map_table[m.species_2][he] = m

        # get rid of species that do not have the gene at all
        for species in all_species:
            one_exon_found = False
            for he in human_exons:
                if map_table[species][he]:
                    one_exon_found = True
                    break
            if not one_exon_found:
                del map_table[species]

        # fill in the peptide sequence field for each human exon
        # get rid of exons  that appear in no other species but human (?)
        bad_he = []
        for he in human_exons:
            one_species_found = False
            he.pepseq = get_exon_pepseq(cursor, he,
                                        ensembl_db_name['homo_sapiens'])
            if len(
                    he.pepseq
            ) < 3:  # can I ever get rid of all the nonsense I find in Ensembl?
                bad_he.append(he)
                continue
            for species in list(map_table.keys()):
                if species == 'homo_sapiens': continue
                if map_table[species][he]:
                    one_species_found = True
                    break
            if not one_species_found:
                bad_he.append(he)
        human_exons = [he for he in human_exons if not he in bad_he]

        # keep track of nearest neighbors for each human exon
        # (note: the local name `next` shadows the builtin inside this function)
        previous = {}
        next = {}
        prev = None
        for he in human_exons:
            previous[he] = prev
            if prev: next[prev] = he
            prev = he
        # `he` is the last exon visited by the loop above; if human_exons is
        # empty after filtering, it is whatever an earlier loop left behind
        next[he] = None

        # fill,  starting from the species that are nearest to the human
        if not list(map_table.keys()):
            continue  # whatever

        # NOTE(review): `species` here is whatever value the loops above left
        # behind, not necessarily 'homo_sapiens'; given the name of the result
        # and the [1:] slice, 'homo_sapiens' was presumably intended -- confirm
        # against species_sort.
        species_sorted_from_human = species_sort(cursor,
                                                 list(map_table.keys()),
                                                 species)[1:]

        for species in species_sorted_from_human:
            print(species)
            # see which exons have which neighbors
            #if verbose: print he.exon_id, species
            no_left = []
            no_right = []
            has_both_neighbors = []
            one_existing_map = None
            for he in human_exons:
                m = map_table[species][he]
                if m and not m.warning:  # the one existing map should not be a problematic one
                    one_existing_map = m
                    continue
                # from here on: the exon has no map, or its map carries a warning
                prev = previous[he]
                nxt = next[he]
                if prev and nxt and map_table[species][prev] and map_table[
                        species][nxt]:
                    has_both_neighbors.append(he)
                elif not prev or not map_table[species][prev]:
                    no_left.append(he)
                elif not nxt or not map_table[species][nxt]:
                    no_right.append(he)

            if not one_existing_map: continue  # this shouldn't happen
            if not has_both_neighbors and not no_left and not no_right:
                continue

            # what is the gene that we are talking about?
            exon_id = one_existing_map.exon_id_2
            is_known = one_existing_map.exon_known_2
            gene_id = exon_id2gene_id(cursor, ensembl_db_name[species],
                                      exon_id, is_known)
            # is it mitochondrial?
            mitochondrial = is_mitochondrial(cursor, gene_id,
                                             ensembl_db_name[species])
            # where is the gene origin (position on the sequence)
            gene_coords = get_gene_coordinates(cursor, gene_id,
                                               ensembl_db_name[species])
            if not gene_coords: continue
            [gene_seq_region_id, gene_start, gene_end,
             gene_strand] = gene_coords

            # fill in exons that have both neighbors:
            # human exon functions as a coordinate here
            for he in has_both_neighbors:

                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name,
                                             map_table, species, he)
                if not template_info: continue
                # previous_ and next_seq_region are of the type Seq_Region defined on the top of the file
                # get previous region
                prev_seq_region = get_neighboring_region(
                    cursor, ensembl_db_name, map_table, species, gene_coords,
                    he, previous[he])
                if not prev_seq_region: continue
                # get following  region
                next_seq_region = get_neighboring_region(
                    cursor, ensembl_db_name, map_table, species, gene_coords,
                    he, next[he])
                if not next_seq_region: continue
                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                 maps_for_exon[he], species, gene_id,
                                 gene_coords, prev_seq_region, next_seq_region,
                                 template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            # work backwards
            # use the last known region on the left as the bound
            no_left.reverse()
            next_seq_region = None
            for he in no_left:
                m = map_table[species][he]
                # check first if we have already looked into this, and found incomplete region
                #if m and m.warning: continue
                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name,
                                             map_table, species, he)
                if not template_info: continue

                # get following  region
                # (looked up only until one is found; afterwards it is reused)
                if not next_seq_region:
                    next_seq_region = get_neighboring_region(
                        cursor, ensembl_db_name, map_table, species,
                        gene_coords, he, next[he])
                if not next_seq_region: continue

                # otherwise it is the last thing we found
                # the previous region is eyeballed from the next one
                # the previous and the  next region frame the search region
                prev_seq_region = left_region(next_seq_region,
                                              MAX_SEARCH_LENGTH)
                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                 maps_for_exon[he], species, gene_id,
                                 gene_coords, prev_seq_region, next_seq_region,
                                 template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            # repeat the whole procedure on the right
            prev_seq_region = None
            for he in no_right:
                m = map_table[species][he]
                # check first if we have already looked into this, and found incomplete region
                #if  m and m.warning: continue
                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name,
                                             map_table, species, he)
                if not template_info: continue

                # get preceding region
                # (looked up only until one is found; afterwards it is reused)
                if not prev_seq_region:
                    prev_seq_region = get_neighboring_region(
                        cursor, ensembl_db_name, map_table, species,
                        gene_coords, he, previous[he])
                if not prev_seq_region: continue
                # otherwise it is the last thing we found

                # the following region is eyeballed from the previous
                next_seq_region = right_region(prev_seq_region,
                                               MAX_SEARCH_LENGTH)
                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                 maps_for_exon[he], species, gene_id,
                                 gene_coords, prev_seq_region, next_seq_region,
                                 template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            # NOTE(review): sought/unsequenced are cumulative over all genes
            # and species processed so far, not per-species totals.
            print(species, "sought", sought, " unseq", unsequenced)

Exemplo n.º 6

0

Exibir arquivo

def _translate_nucleotides(nucleotides, mitochondrial):
    """Translate a nucleotide string, with the vertebrate mitochondrial table if requested."""
    if mitochondrial:
        return Seq(nucleotides).translate(
            table="Vertebrate Mitochondrial").tostring()
    return Seq(nucleotides).translate().tostring()


def main():
    """Collect per-gene background data for mutation significance estimates.

    For every gene returned by get_gene_ids (protein coding, known, reference
    only) in the human database:
      * the coding cDNA is reassembled from the canonical coding exons,
        including codons split across exon boundaries;
      * its translation is compared with the deposited canonical translation;
        genes that cannot be reconciled are logged to "error.log" and skipped;
      * every possible single-nucleotide substitution is classified as
        silent / nonsense / missense per mutation category;
      * CpG positions, the per-category tallies and the codon-annotated
        sequence are written to "mut_significance_bg_data.txt".

    Relies on the module-level names `categories` and `category_dict`
    (set up by fill_category()) and on `mutate`.
    """
    db = connect_to_mysql()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()

    # find db ids and common names for each species db
    [all_species, ensembl_db_name] = get_species(cursor)

    logf = erropen("error.log", "w")
    if not logf: exit(1)

    outf = erropen("mut_significance_bg_data.txt", "w")
    if not outf:
        logf.close()
        exit(1)

    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
    gene_ids = get_gene_ids(cursor,
                            biotype='protein_coding',
                            is_known=1,
                            ref_only=True)

    # the categories of mutations for which we will be collecting statistics
    fill_category()
    # for each human gene
    #gene_ids = [10093176 ]
    for gene_id in gene_ids:

        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        stable_id = gene2stable(cursor, gene_id)

        # find all canonical coding  human exons
        # get_canonical_coding_exons also sorts exons by the start in the gene
        canonical_human_exons = get_canonical_coding_exons(
            cursor, gene_id, ensembl_db_name['homo_sapiens'])

        # bail out if there is a problem
        if not canonical_human_exons: continue

        full_reconstituted_cDNA = ""
        prev_codon_piece_plus_right_flank = ""
        for human_exon in canonical_human_exons:
            [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end,
             left_flank, right_flank, nucseq] = get_exon_seqs(
                 cursor, human_exon.exon_id, human_exon.is_known)
            # add the split codon
            phase = get_exon_phase(cursor, human_exon.exon_id,
                                   human_exon.is_known)
            left_flank_plus_codon_piece = left_flank + nucseq[:pepseq_transl_start]
            split_codon = ""
            if phase > 0 and prev_codon_piece_plus_right_flank and left_flank:
                offset = (3 - phase) % 3
                # hedge against the possibility that the translation starts
                # right at the start of the exon, but there is supposed to be a phase
                split_codon = (prev_codon_piece_plus_right_flank[:phase]
                               + left_flank_plus_codon_piece[-offset:])
            full_reconstituted_cDNA += (
                split_codon + nucseq[pepseq_transl_start:pepseq_transl_end])
            prev_codon_piece_plus_right_flank = (
                nucseq[pepseq_transl_end:] + right_flank)

        mitochondrial = is_mitochondrial(cursor, gene_id)
        full_reconstituted_seq = _translate_nucleotides(
            full_reconstituted_cDNA, mitochondrial)

        canonical = get_canonical_transl(acg,
                                         cursor,
                                         gene_id,
                                         'homo_sapiens',
                                         strip_X=False)
        if not canonical:
            # indexing an empty/missing translation used to crash the whole run
            print >> logf, "error", gene_id, stable_id, get_description(
                cursor, gene_id)
            print >> logf, "no canonical translation found"
            continue

        if canonical[0] == 'X':
            # apparently a wrong transcript is annotated as canonical
            print >> logf, "warning", gene_id, stable_id, get_description(
                cursor, gene_id)
            print >> logf, "the deposited canonical sequence starts with X - is there an alternative (?)"
            canonical = canonical[1:]

        # endswith() is safe on empty strings, unlike [-1]
        if full_reconstituted_seq.endswith('*') and not canonical.endswith('*'):
            canonical += '*'
        if full_reconstituted_seq != canonical:

            if (len(canonical) - len(full_reconstituted_seq) < 3
                    and full_reconstituted_seq in canonical):
                # go with it - there are not many such cases anyway
                print >> logf, "warning", gene_id, stable_id, get_description(
                    cursor, gene_id)
                print >> logf, "missing a couple of amino acids in beginning or in the end"
            else:
                print >> logf, "error", gene_id, stable_id, get_description(
                    cursor, gene_id)
                print >> logf, "error reassembling,  len(full_reconstituted_seq) != len(canonical) ", len(
                    full_reconstituted_seq), len(canonical)
                print >> logf, "canonical:"
                print >> logf, canonical
                print >> logf, "reconstituted:"
                print >> logf, full_reconstituted_seq
                continue

        # nucleotide stats; a C followed by G and a G preceded by C are CpG
        cdna_length = len(full_reconstituted_cDNA)
        count = {'A': 0, 'C': 0, 'C-CpG': 0, 'T': 0, 'G': 0, 'G-CpG': 0}
        is_CpG = {}
        for i, nt in enumerate(full_reconstituted_cDNA):
            is_CpG[i] = False
            if nt == 'A':
                count['A'] += 1
            elif nt == 'T':
                count['T'] += 1
            elif nt == 'C':
                if i + 1 < cdna_length and full_reconstituted_cDNA[i + 1] == 'G':
                    count['C-CpG'] += 1
                    is_CpG[i] = True
                else:
                    count['C'] += 1
            elif nt == 'G':
                if i > 0 and full_reconstituted_cDNA[i - 1] == 'C':
                    count['G-CpG'] += 1
                    is_CpG[i] = True
                else:
                    count['G'] += 1

        # in each category (AT transt, AT transv, CG trans, CG transv, Cpg trans, cpGtransv)
        # how many missense, how many nonsense, how many silent mutations are possible;
        # the zip idiom chops the cDNA into complete codons, dropping a trailing remainder
        codons = map(''.join, zip(*[iter(full_reconstituted_cDNA)] * 3))
        silent = {}
        missense = {}
        nonsense = {}
        for cg in categories:
            silent[cg] = 0
            missense[cg] = 0
            nonsense[cg] = 0
        for i, codon in enumerate(codons):
            aa = full_reconstituted_seq[i]
            for j in range(3):
                nt_position = i * 3 + j
                nt = full_reconstituted_cDNA[nt_position]
                for new_nt in ['A', 'C', 'T', 'G']:
                    if new_nt == nt: continue
                    mutated_codon = mutate(codon, j, new_nt)
                    mutated_aa = _translate_nucleotides(mutated_codon,
                                                        mitochondrial)
                    cg = category_dict[codon[j]][new_nt][is_CpG[nt_position]]
                    if not cg or not cg in categories:
                        print >> logf, "category problem in ", gene_id, stable_id, get_description(
                            cursor, gene_id)
                        print >> logf, codon, mutated_codon, j, codon[
                            j], new_nt, is_CpG[nt_position], cg
                        print >> logf, i, j, nt_position, nt
                        print >> logf, aa, mutated_aa
                        continue
                    if mutated_aa == aa:
                        silent[cg] += 1
                    elif mutated_aa == "*":
                        nonsense[cg] += 1
                    else:
                        missense[cg] += 1

        print >> outf, stable_id, get_description(cursor, gene_id)
        print >> outf, "# CpG nucleotides (format: cdna_position|nucleotide|codon|context; )"
        print >> outf, "# ('context' contains one nucleotide before and one after the CpG nucleotide)"

        cpg_entries = []
        for i in range(cdna_length):
            if not is_CpG[i]: continue
            context = ""
            if i > 0: context += full_reconstituted_cDNA[i - 1]
            context += full_reconstituted_cDNA[i]
            if i < cdna_length - 1:
                context += full_reconstituted_cDNA[i + 1]
            # slice the codon straight from the cDNA: identical to codons[i // 3]
            # for complete codons, but does not raise for a trailing partial one
            codon_start = 3 * (i // 3)
            enclosing_codon = full_reconstituted_cDNA[codon_start:codon_start + 3]
            cpg_entries.append("%d|%s|%s|%s;" % (
                i + 1, full_reconstituted_cDNA[i], enclosing_codon, context))
        print >> outf, "".join(cpg_entries)

        print >> outf, "# mutations possible (in principle)"
        print >> outf, "# %10s  %5s  %5s  %5s" % ("category", "silent",
                                                  "nonsense", "missense")
        for cg in categories:
            print >> outf, "%10s  %5d  %5d  %5d" % (cg, silent[cg],
                                                    nonsense[cg], missense[cg])

        print >> outf, "# canonical sequence (format: <amino_acid><position_on_peptide_chain><codon>;):"
        # the per-codon re-translation done here before was never used
        print >> outf, "".join(
            "%s%d%s;" % (full_reconstituted_seq[i], i + 1, codon)
            for i, codon in enumerate(codons))

        print >> outf, stable_id, "done"

    # the data file, cursor and connection were previously left open
    outf.close()
    logf.close()
    cursor.close()
    db.close()

Exemplo n.º 7

0

Exibir arquivo

Arquivo: 10_ortho_exon_map_to_msa.py Projeto: ivanamihalek/exolocator

def multiple_exon_alnmt(gene_list, db_info):


    print "process pid: %d, length of gene list: %d" % ( get_process_id(), len(gene_list))

    [local_db, ensembl_db_name] = db_info

    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    acg    = AlignmentCommandGenerator()
    cursor = db.cursor()

    # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species (cursor)
    

    species  = 'homo_sapiens'
    switch_to_db (cursor,  ensembl_db_name[species])
    gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)

    # for each human gene
    gene_ct = 0
    tot  = 0
    ok   = 0
    no_maps        = 0
    no_pepseq      = 0
    no_orthologues = 0
    min_similarity = cfg.get_value('min_accptbl_exon_sim')

    #gene_list.reverse()
    for gene_id in gene_list:

        start = time()
        gene_ct += 1
        if  not gene_ct%10: print gene_ct, "genes out of", len(gene_list)

        switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
        print gene_ct, len(gene_ids),  gene_id,  gene2stable(cursor, gene_id), get_description (cursor, gene_id)

        human_exons = filter (lambda e: e.is_known==1 and e.is_coding and e.covering_exon<0, gene2exon_list(cursor, gene_id))
        human_exons.sort(key=lambda exon: exon.start_in_gene)

        ##################################################################
        for human_exon in human_exons:
            
            tot += 1

            # find all orthologous exons the human exon  maps to
            maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known)
            if verbose: 
                print "\texon no.", tot, " id", human_exon.exon_id,
                if not maps: 
                    print " no maps"
                    print human_exon
                print 
            if not maps: 
                no_maps += 1
                continue

  
            # human sequence to fasta:
            seqname   = "{0}:{1}:{2}".format('homo_sapiens', human_exon.exon_id, human_exon.is_known)
            switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
            [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, 
             left_flank, right_flank, dna_seq] = get_exon_seqs (cursor, human_exon.exon_id, human_exon.is_known)
            if (not pepseq):
                if verbose and  human_exon.is_coding and  human_exon.covering_exon <0: # this should be a master exon
                    print "no pep seq for",  human_exon.exon_id, "coding ", human_exon.is_coding,
                    print "canonical: ",  human_exon.is_canonical
                    print "length of dna ", len(dna_seq)
                no_pepseq += 1
                continue

            # collect seq from all maps, and output them in fasta format
            hassw = False
            headers   = []
            sequences = {}
            exons_per_species = {}

            for map in maps:

                switch_to_db (cursor, ensembl_db_name[map.species_2])
                if map.similarity < min_similarity: continue
                exon    = map2exon(cursor, ensembl_db_name, map)
                pepseq  = get_exon_pepseq (cursor,exon)
                if (not pepseq):
                    continue
                if  map.source == 'sw_sharp':
                    exon_known_code = 2
                    hassw = True
                elif  map.source == 'usearch':
                    exon_known_code = 3
                    hassw = True
                else:
                    exon_known_code = map.exon_known_2
                seqname = "{0}:{1}:{2}".format(map.species_2, map.exon_id_2, exon_known_code)
                headers.append(seqname)
                sequences[seqname] = pepseq
                # for split exon concatenation (see below)
                if not map.species_2 in exons_per_species.keys():
                    exons_per_species[map.species_2] = []
                exons_per_species[map.species_2].append ([ map.exon_id_2, exon_known_code]);
                
                    
            if (len(headers) <=1 ):
                if verbose: print "single species in the alignment"
                no_orthologues += 1
                continue
            
            # concatenate exons from the same gene - the alignment program might go wrong otherwise
            concatenated = concatenate_exons (cursor, ensembl_db_name, sequences, exons_per_species)

            fasta_fnm = "{0}/{1}.fa".format( cfg.dir_path['scratch'], human_exon.exon_id)
            output_fasta (fasta_fnm, sequences.keys(), sequences)

            # align
            afa_fnm  = "{0}/{1}.afa".format( cfg.dir_path['scratch'], human_exon.exon_id)
            mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm)
            ret      = commands.getoutput(mafftcmd)

            if (verbose): print 'almt to', afa_fnm

            # read in the alignment 
            inf = erropen(afa_fnm, "r")
            aligned_seqs = {}
            for record in SeqIO.parse(inf, "fasta"):
                aligned_seqs[record.id] = str(record.seq)
            inf.close()
            # split back the concatenated exons
            if concatenated: split_concatenated_exons (aligned_seqs, concatenated)

            human_seq_seen = False
            for seq_name, sequence in aligned_seqs.iteritems():
                # if this is one of the concatenated seqs, split them back to two

                ### store the alignment as bitstring
                # Generate the bitmap
                bs         = Bits(bin='0b' + re.sub("[^0]","1", sequence.replace('-','0')))
                # The returned value of tobytes() will be padded at the end 
                # with between zero and seven 0 bits to make it byte aligned.
                # I will end up with something that looks like extra alignment gaps, that I'll have to return
                msa_bitmap = bs.tobytes() 
                # Retrieve information on the cognate
                cognate_species, cognate_exon_id, cognate_exon_known = seq_name.split(':')
                if cognate_exon_known == '2':
                    source = 'sw_sharp'
                elif cognate_exon_known == '3':
                    source = 'usearch'
                else:
                    source = 'ensembl'
                if (cognate_species == 'homo_sapiens'):
                    human_seq_seen = True
                cognate_genome_db_id = species2genome_db_id(cursor, cognate_species) # moves the cursor
                switch_to_db(cursor, ensembl_db_name['homo_sapiens']) # so move it back to h**o sapiens
                # Write the bitmap to the database
                #if (cognate_species == 'homo_sapiens'):
                if verbose: # and (source=='sw_sharp' or source=='usearch'):
                    print "storing"
                    print human_exon.exon_id, human_exon.is_known
                    print cognate_species, cognate_genome_db_id, cognate_exon_id, cognate_exon_known, source
                    print sequence
                    if not msa_bitmap:
                        print "no msa_bitmap"
                        continue
                store_or_update(cursor, "exon_map",    {"cognate_genome_db_id":cognate_genome_db_id,
                   "cognate_exon_id":cognate_exon_id   ,"cognate_exon_known"  :cognate_exon_known,
                   "source": source, "exon_id" :human_exon.exon_id, "exon_known":human_exon.is_known},
                  {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)})
                 
            ok += 1
            commands.getoutput("rm "+afa_fnm+" "+fasta_fnm)

        if verbose: print " time: %8.3f\n" % (time()-start);

    print "tot: ", tot, "ok: ", ok
    print "no maps ",   no_pepseq
    print "no pepseq ", no_pepseq
    print "no orthologues  ", no_orthologues
    print

Exemplo n.º 8

0

Exibir arquivo

Arquivo: 17_para_exon_map_to_msa.py Projeto: ivanamihalek/exolocator

def multiple_exon_alnmt(species_list, db_info):


    [local_db, ensembl_db_name] = db_info

    verbose  = False

    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    acg    = AlignmentCommandGenerator()
    cursor = db.cursor()


    for species in species_list:

        print
        print "############################"
        print  species

        switch_to_db (cursor,  ensembl_db_name[species])
        gene_ids = get_gene_ids (cursor, biotype='protein_coding')
        #gene_ids = get_theme_ids(cursor, cfg, 'wnt_pathway')
        if not gene_ids:
            print "no gene_ids"
            continue


        gene_ct       = 0
        tot           = 0
        ok            = 0
        no_maps       = 0
        no_pepseq     = 0
        no_paralogues = 0
        for gene_id in gene_ids:

            if verbose: start = time()
            gene_ct += 1
            if not gene_ct%100: print species, gene_ct, "genes out of", len(gene_ids)
            if verbose: 
                print
                print gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id)

            # get the paralogues - only the representative for  the family will have this 
            paralogues = get_paras (cursor, gene_id)  
            if not paralogues:
                if verbose:  print "\t not a template or no paralogues"
                continue

            if verbose:  print "paralogues: ", paralogues

            # get _all_ exons
            template_exons = gene2exon_list(cursor, gene_id)
            if (not template_exons):
                if verbose: print 'no exons for ', gene_id
                continue

            # find all template  exons we are tracking in the database
            for template_exon in template_exons:

                if verbose: print template_exon.exon_id
                maps = get_maps(cursor, ensembl_db_name, template_exon.exon_id,
                                template_exon.is_known, species=species, table='para_exon_map')

                if not maps:
                    no_maps += 1
                    continue

                # output to fasta:
                seqname        = "{0}:{1}:{2}".format('template', template_exon.exon_id, template_exon.is_known)
                exon_seqs_info =  get_exon_seqs (cursor, template_exon.exon_id, template_exon.is_known)
                if not exon_seqs_info: continue
                [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, 
                 left_flank, right_flank, dna_seq] = exon_seqs_info
                if (not pepseq):
                    if ( template_exon.is_coding and  template_exon.covering_exon <0): # this should be a master exon
                        print "no pep seq for",  template_exon.exon_id, "coding ", template_exon.is_coding,
                        print "canonical: ",  template_exon.is_canonical
                        print "length of dna ", len(dna_seq)
                        no_pepseq += 1
                    continue
                
                tot += 1

                sequences = {seqname:pepseq}
                headers   = [seqname]
                for map in maps:
                    exon    = map2exon(cursor, ensembl_db_name, map, paralogue=True)
                    pepseq  = get_exon_pepseq (cursor,exon)
                    if (not pepseq):
                        continue
                    seqname = "{0}:{1}:{2}".format('para', map.exon_id_2, map.exon_known_2)
                    headers.append(seqname)
                    sequences[seqname] = pepseq

                fasta_fnm = "{0}/{1}_{2}_{3}.fa".format( cfg.dir_path['scratch'], species, template_exon.exon_id, template_exon.is_known)
                output_fasta (fasta_fnm, headers, sequences)

                if (len(headers) <=1 ):
                    print "single species in the alignment (?)"
                    no_paralogues += 1
                    continue

                # align
                afa_fnm  = "{0}/{1}_{2}_{3}.afa".format( cfg.dir_path['scratch'], species, template_exon.exon_id, template_exon.is_known)
                mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm)
                ret      = commands.getoutput(mafftcmd)

                # read in the alignment
                inf = erropen(afa_fnm, "r")
                if not inf:
                    print gene_id
                    continue
                template_seq_seen = False
                for record in SeqIO.parse(inf, "fasta"):
                    ### store the alignment as bitstring
                    # Generate the bitmap
                    bs         = Bits(bin='0b' + re.sub("[^0]","1", str(record.seq).replace('-','0')))
                    msa_bitmap = bs.tobytes()
                    # Retrieve information on the cognate
                    label, cognate_exon_id, cognate_exon_known = record.id.split(':')
                    if (label == 'template'):
                        template_seq_seen = True
                    # Write the bitmap to the database
                    #print "updating: ", template_exon.exon_id
                    store_or_update(cursor, "para_exon_map", {"cognate_exon_id"    :cognate_exon_id,
                                                         "cognate_exon_known" :cognate_exon_known,
                                                         "exon_id"            :template_exon.exon_id,
                                                         "exon_known"         :template_exon.is_known},
                                    {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)})
                inf.close()
                ok += 1
                commands.getoutput("rm "+afa_fnm+" "+fasta_fnm)
            if verbose: print " time: %8.3f\n" % (time()-start);
 
        outstr  =  species + " done \n"
        outstr +=  "tot: %d   ok: %d  \n" % (tot,  ok)
        outstr +=  "no maps       %d  \n" % no_pepseq
        outstr +=  "no pepseq     %d  \n" % no_pepseq
        outstr +=  "no paralogues %d  \n" % no_paralogues
        outstr += "\n"
        print outstr

Exemplo n.º 9

0

Exibir arquivo

Arquivo: 22_novel_exon_cleanup.py Projeto: ivanamihalek/exolocator

def exon_cleanup(gene_list, db_info):

    [local_db, ensembl_db_name] = db_info
    db = connect_to_mysql()
    cfg = ConfigurationReader()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()
    # find db ids and common names for each species db
    all_species, ensembl_db_name = get_species(cursor)

    mammals = [
        'ailuropoda_melanoleuca', 'bos_taurus', 'callithrix_jacchus',
        'canis_familiaris', 'cavia_porcellus', 'choloepus_hoffmanni',
        'dasypus_novemcinctus', 'dipodomys_ordii', 'echinops_telfairi',
        'equus_caballus', 'erinaceus_europaeus', 'felis_catus',
        'gorilla_gorilla', 'ictidomys_tridecemlineatus', 'loxodonta_africana',
        'macaca_mulatta', 'macropus_eugenii', 'microcebus_murinus',
        'monodelphis_domestica', 'mus_musculus', 'mustela_putorius_furo',
        'myotis_lucifugus', 'nomascus_leucogenys', 'ochotona_princeps',
        'ornithorhynchus_anatinus', 'oryctolagus_cuniculus',
        'otolemur_garnettii', 'pan_troglodytes', 'papio_anubis',
        'pongo_abelii', 'procavia_capensis', 'pteropus_vampyrus',
        'rattus_norvegicus', 'sarcophilus_harrisii', 'sorex_araneus',
        'sus_scrofa', 'tarsius_syrichta', 'tupaia_belangeri',
        'tursiops_truncatus', 'vicugna_pacos'
    ]

    tot = 0
    tot_ok = 0
    for human_gene_id in gene_list:

        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        stable_id = gene2stable(cursor, human_gene_id)
        description = get_description(cursor, human_gene_id)

        mitochondrial = is_mitochondrial(cursor, human_gene_id)

        #print "#############################################"
        #print human_gene_id, stable_id, get_description (cursor, human_gene_id)

        human_exons = get_ok_human_exons(cursor, ensembl_db_name,
                                         human_gene_id)

        for human_exon in human_exons:
            [
                exon_seq_id, human_protein_seq, pepseq_transl_start,
                pepseq_transl_end, left_flank, right_flank, dna_seq
            ] = get_exon_seqs(cursor, human_exon.exon_id, 1,
                              ensembl_db_name['homo_sapiens'])
            human_exon_phase = get_exon_phase(cursor, human_exon.exon_id, 1)

            first_exon = (human_exons.index(human_exon) == 0)

            for species in mammals:  # maxentscan does not work for fish

                for table in ['sw_exon', 'usearch_exon']:
                    switch_to_db(cursor, ensembl_db_name[species])
                    qry = "select * from %s where maps_to_human_exon_id = %d " % (
                        table, human_exon.exon_id)
                    novel_exons = search_db(cursor, qry)

                    if not novel_exons:
                        #print  "human_exon: ", human_exon.exon_id, "no", table,  "for", species
                        continue
                    ct = 0
                    ok = 0
                    for novel_exon in novel_exons:

                        print "%s: novel exon found in table %s, mapping to human exon %s" % \
                        (species, table, exon2stable (cursor, human_exon.exon_id, ensembl_db_name['homo_sapiens']) )
                        ct += 1

                        has_stop = False
                        has_NNN = False

                        [
                            novel_exon_id, gene_id, start_in_gene, end_in_gene,
                            maps_to_human_exon_id, exon_seq_id,
                            template_exon_seq_id, template_species, strand,
                            phase, end_phase, has_NNN, has_stop, has_3p_ss,
                            has_5p_ss
                        ] = novel_exon

                        tot += 1

                        exon_seqs = get_exon_seq_by_db_id(
                            cursor, exon_seq_id, ensembl_db_name[species])
                        if not exon_seqs:
                            print "exon seqs not found"
                            continue

                        [
                            exon_seq_id, protein_seq, pepseq_transl_start,
                            pepseq_transl_end, left_flank, right_flank, dna_seq
                        ] = exon_seqs

                        len_ok = (pepseq_transl_end -
                                  pepseq_transl_start) == len(dna_seq)
                        if not len_ok:
                            # if it is not the case, then make it be so
                            left_flank += dna_seq[:pepseq_transl_start]
                            right_flank = dna_seq[
                                pepseq_transl_end:] + right_flank
                            dna_seq = dna_seq[
                                pepseq_transl_start:pepseq_transl_end]
                            pepseq_transl_start = 0
                            pepseq_transl_end = len(dna_seq)

                        phase_ok = (len(dna_seq) % 3 == 0)
                        if not phase_ok:
                            phase = len(dna_seq) % 3
                            cds = dna_seq[phase:]
                            pepseq_corrected = Seq(cds).translate().tostring()
                            if pepseq_corrected == protein_seq:
                                left_flank += dna_seq[:phase]
                                dna_seq = dna_seq[phase:]
                            else:
                                cds = dna_seq[:-phase]
                                pepseq_corrected = Seq(
                                    cds).translate().tostring()

                                if pepseq_corrected == protein_seq:
                                    right_flank += dna_seq[
                                        -phase:] + right_flank
                                    dna_seq = dna_seq[:-phase]
                                else:
                                    print "no match ..."
                                    continue  # don't want to shut-off the pipeline here

                            pepseq_transl_start = 0
                            pepseq_transl_end = len(dna_seq)

                        # retrieve the template
                        template_db_id = species2genome_db_id(
                            cursor, template_species)

                        [templ_exon_seq_id, templ_protein_seq, templ_pepseq_transl_start,
                         templ_pepseq_transl_end,  templ_left_flank, templ_right_flank, templ_dna_seq] \
                         = get_exon_seq_by_db_id (cursor, template_exon_seq_id, ensembl_db_name[template_species])

                        correction = 0
                        phase = 0
                        end_phase = 0

                        # if this is the first exon, check if we are starting from methionine
                        if first_exon:
                            [left_flank_ok, correction, phase] = \
                                check_translation_start (mitochondrial, left_flank, dna_seq, templ_dna_seq, templ_protein_seq)
                        # see if the left splice site is ok
                        else:
                            [left_flank_ok, correction, phase, max_score] = \
                                check_left_flank (acg, left_flank, dna_seq, templ_dna_seq)

                        ########################
                        #
                        # see if the right splice site is ok
                        [right_flank_ok, end_correction, end_phase, end_max_score] = \
                            check_right_flank(acg, right_flank, dna_seq, templ_dna_seq)

                        pepseq_corrected = ""
                        new_left_flank = ""
                        new_right_flank = ""
                        new_dna_seq = ""
                        if left_flank_ok:
                            offset = (3 - phase) % 3
                            if correction:
                                if correction > 0:
                                    new_dna_seq = dna_seq[correction:]
                                    new_left_flank = left_flank + dna_seq[:
                                                                          correction]
                                else:
                                    # correction is negative, therefore left_flank[correction:] is the tail of left_flank
                                    new_dna_seq = left_flank[
                                        correction:] + dna_seq
                                    new_left_flank = left_flank[:correction]
                            else:
                                new_left_flank = left_flank

                            pepseq_transl_start = offset

                        else:
                            new_left_flank = left_flank

                        if right_flank_ok:
                            if not new_dna_seq: new_dna_seq = dna_seq
                            if end_correction:
                                if end_correction < 0:
                                    new_right_flank = new_dna_seq[
                                        end_correction:] + right_flank
                                    new_dna_seq = new_dna_seq[:end_correction]
                                else:
                                    # correction is negative, therefore right_flank[correction:] is the tail of right_flank
                                    new_right_flank = right_flank[
                                        end_correction:]
                                    new_dna_seq += right_flank[:end_correction]
                            else:
                                new_right_flank = right_flank
                            pepseq_transl_end = len(new_dna_seq)
                            pepseq_transl_end -= end_phase
                        else:
                            new_right_flank = right_flank

                        # if only one flank is ok, use that side to decide if there is a phase on the other
                        if left_flank_ok and not right_flank_ok:
                            end_phase = (pepseq_transl_end -
                                         pepseq_transl_start) % 3
                            pepseq_transl_end -= end_phase

                        if right_flank_ok and not left_flank_ok:
                            phase = (pepseq_transl_end -
                                     pepseq_transl_start) % 3
                            pepseq_transl_start += phase

                        # check that the lengths match
                        has_stop = None
                        if new_dna_seq:
                            len_old = len(left_flank + dna_seq + right_flank)
                            len_new = len(new_left_flank + new_dna_seq +
                                          new_right_flank)
                            if not len_old == len_new:

                                print len_old, len_new
                                print correction, end_correction
                                print map(len,
                                          [left_flank, dna_seq, right_flank])
                                print map(len, [
                                    new_left_flank, new_dna_seq,
                                    new_right_flank
                                ])
                                continue
                            cds = new_dna_seq[
                                pepseq_transl_start:pepseq_transl_end]
                            if mitochondrial:
                                pepseq_corrected = Seq(cds).translate(
                                    table="Vertebrate Mitochondrial").tostring(
                                    )
                            else:
                                pepseq_corrected = Seq(
                                    cds).translate().tostring()
                            if '*' in pepseq_corrected:
                                has_stop = 1
                            else:
                                has_stop = 0

                        if has_stop and not '*' in protein_seq:
                            continue  # abort, abort

                        if True:
                            print "#############################################"
                            print human_gene_id, stable_id, "exo no:", human_exons.index(
                                human_exon), "      ", description
                            print species, table

                            print "\t  template", template_exon_seq_id, template_species, template_db_id
                            print "\t  template left flank", templ_left_flank, templ_dna_seq[
                                0:3]
                            print "\t           left flank", left_flank, dna_seq[
                                0:3]
                            print "\t          ", left_flank_ok, correction, phase,
                            if not first_exon:
                                print max_score
                            else:
                                print
                            print "\t  template right flank", templ_dna_seq[
                                -3:], templ_right_flank
                            print "\t           right flank", dna_seq[
                                -3:], right_flank
                            print "\t          ", right_flank_ok, end_correction, end_phase, end_max_score

                            print "\t     human", human_protein_seq, human_exon.exon_id, human_exon_phase
                            print "\t  template", templ_protein_seq
                            print "\t deposited", protein_seq
                            if pepseq_corrected:
                                print "\t corrected", pepseq_corrected

                        if new_dna_seq:
                            if (pepseq_transl_end - pepseq_transl_start) % 3:
                                print "length not divisible by 3 "
                                print pepseq_transl_start, pepseq_transl_end
                                print phase, end_phase
                                print len(new_dna_seq)
                                print "%%%%% "
                                continue
                        else:
                            new_dna_seq = dna_seq

                        #########################################################
                        # 18_find_exons is sometimes messing up the coordinates
                        # I do not know why
                        ret = check_coordinates_in_the_gene(
                            cursor, cfg, acg, ensembl_db_name, species,
                            novel_exon, new_dna_seq)
                        if not ret:
                            print "\t coordinate check failed"
                            continue
                        [start_in_gene_corrected, end_in_gene_corrected] = ret

                        #########################################################
                        # update the *_exon and exon_seq tables accordingly
                        switch_to_db(cursor, ensembl_db_name[species])

                        qry = "update %s set " % table

                        set_fields = ""
                        if not start_in_gene_corrected == start_in_gene:
                            if set_fields: set_fields += ", "
                            set_fields += " start_in_gene = %d  " % start_in_gene_corrected

                        if not end_in_gene_corrected == end_in_gene:
                            if set_fields: set_fields += ", "
                            set_fields += " end_in_gene = %d  " % end_in_gene_corrected

                        if not has_stop is None:
                            if set_fields: set_fields += ", "
                            set_fields += " has_stop  = %d" % has_stop

                        if left_flank_ok:
                            if set_fields: set_fields += ", "
                            set_fields += " phase = %d,  " % phase
                            if first_exon:
                                set_fields += " has_3p_ss = '%s' " % (
                                    "first exon; starts with M")
                            else:
                                set_fields += " has_3p_ss = '%s' " % (
                                    "me_score=" + str(max_score))

                        if right_flank_ok:
                            if set_fields: set_fields += ", "
                            set_fields += " end_phase = %d,  " % end_phase
                            set_fields += " has_5p_ss = '%s' " % (
                                "me_score=" + str(end_max_score))

                        qry += set_fields + " where exon_id=%d" % novel_exon_id

                        if set_fields:
                            search_db(cursor, qry)

                        # update exon sequence
                        if pepseq_corrected:
                            # we might have changed our mind as to what is the cDNA seq, and what is flanking
                            qry = "update exon_seq set "
                            qry += " protein_seq = '%s', " % pepseq_corrected
                            qry += " dna_seq = '%s',     " % new_dna_seq
                            qry += " left_flank = '%s',  " % new_left_flank
                            qry += " right_flank = '%s',       " % new_right_flank
                            qry += " pepseq_transl_start = %d, " % pepseq_transl_start
                            qry += " pepseq_transl_end   = %d  " % pepseq_transl_end
                            table_id = 2 if table == 'novel_exon' else 3
                            qry += " where exon_id=%d and is_known=%d" % (
                                novel_exon_id, table_id)
                            search_db(cursor, qry)

                        # gene2exon --> have to go back to 07_gene2exon for that
                        tot_ok += 1

    print "gene list done"

    cursor.close()
    db.close()

Exemplo n.º 10

0

Exibir arquivo

Arquivo: 06_gene2exon_check.py Projeto: ivanamihalek/exolocator

def main():

    db = connect_to_mysql()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()

    [all_species, ensembl_db_name] = get_species(cursor)

    if len(sys.argv) > 1:
        species_list = sys.argv[1:]
    else:
        species_list = all_species

    ############################
    for species in species_list:
        print
        print "############################"
        print species

        switch_to_db(cursor, ensembl_db_name[species])

        if (species == 'homo_sapiens'):
            gene_ids = get_gene_ids(cursor,
                                    biotype='protein_coding',
                                    is_known=1)
        else:
            gene_ids = get_gene_ids(cursor, biotype='protein_coding')

        ct = 0
        tot = 0

        for tot in range(1000):
            #for gene_id in gene_ids:
            #tot += 1
            gene_id = choice(gene_ids)
            # find all canonical coding exons associated with the gene id
            exons = get_canonical_coding_exons(cursor, gene_id)
            if (not exons):
                ct += 1
                print gene_id, gene2stable(
                    cursor, gene_id=gene_id), " no exons found ", ct, tot

            if not tot % 100:
                print species, tot, ct

            # add up the coding length of the canonical exons
            exons.sort(key=lambda exon: exon.start_in_gene)

            inside_the_coding_range = False
            start_properly_marked = False
            length = 0
            for exon in exons:

                if not exon.canon_transl_start is None:
                    start_properly_marked = True  # if it is not propermy marked, we'll never start reading
                    inside_the_coding_range = True
                    length -= exon.canon_transl_start - 1

                if not exon.canon_transl_end is None:
                    inside_the_coding_range = False
                    length += exon.canon_transl_end

                if inside_the_coding_range:
                    length += exon.end_in_gene - exon.start_in_gene + 1

            # take that all exons are coding full length if there is no start and end annotation
            # (this I believe is the case for predicted transcripts)
            if not start_properly_marked:
                length = 0
                for exon in exons:
                    length += exon.end_in_gene - exon.start_in_gene + 1

            if (not length):
                print gene2stable(
                    cursor, gene_id=gene_id), " no exons marked as canonical"
                continue

            # what is the length of the canonical transcript according to Ensembl
            canonical_translation = get_canonical_transl(acg,
                                                         cursor,
                                                         gene_id,
                                                         species,
                                                         strip_X=False)
            if (not canonical_translation):
                print "no canonical transl found for ", gene_id
                continue

            if (abs(length / 3 - len(canonical_translation)) > 3):
                ct += 1
                print gene_id, gene2stable(cursor, gene_id), get_description(
                    cursor, gene_id)
                print "(length of all exons)/3 ", length / 3,
                print " does not match reported canonical transl len ", len(
                    canonical_translation)
                if False:
                    # print out all exons
                    print "exons:"
                    inspect(exons)
                    print
                    print 'canonical sequence'
                    print re.sub(
                        "(.{50})", "\\1\n", canonical_translation
                    )  # print canonical sequence with \n stuck in every 50 positions
                    print
                    # print out exons more carefully filtered to belong to the canonical version of the translation
                    print
                    get_translated_region_talkative(cursor, gene_id, species)
                    all_exons = gene2exon_list(cursor, gene_id)
                    print "all exons:"
                    inspect(all_exons)
                    print
                    compare_seqs(canonical_translation,
                                 translated_seq,
                                 verbose=False)
                    exit(1)

        print species, "checked a sample of ", tot + 1, "genes;  problematic:", ct

    cursor.close()
    db.close()
    #
    #    print 'Note: some problems could not have be resolved up to this point,'
    #    print 'becasue we have not really looged at the exons seqs yet.'
    #    print 'For example, for MP furo the, start fo the cannonical translation'
    #    print 'is sometimes given in the middle of NNNNN region.'
    #
    return True