Пример #1
0
def store_paralogues (cursor_species,  gene_id, orthos):
    """Store one row in the 'paralogue' table for every entry of orthos.

    cursor_species -- database cursor handed to stable2gene and store_or_update
    gene_id        -- value written to the 'gene_id' column of every row
    orthos         -- iterable of [stable_id, species, cognate_genome_db_id]
                      triples; the species element is not used here
    """
    for ortho_stable, species, cognate_genome_db_id in orthos:
        # resolve the stable id of the cognate gene through stable2gene
        cognate_gene_id = stable2gene (cursor_species, ortho_stable)

        # columns passed as the "fixed" (row-identifying) part
        key_columns = {'gene_id'             : gene_id,
                       'cognate_genome_db_id': cognate_genome_db_id,
                       'cognate_gene_id'     : cognate_gene_id}
        # columns passed as the "update" part
        value_columns = {'source': 'ensembl'}

        store_or_update (cursor_species, 'paralogue', key_columns, value_columns)
Пример #2
0
def store_seq_filenames (cursor, name, file_names):
    """Store file_names in the 'file_name' column of the 'seq_region' row keyed by name.

    Returns whatever store_or_update returns.
    """
    return store_or_update (cursor, "seq_region",
                            {'name': name}, {'file_name': file_names})
Пример #3
0
def store_orthologues (cursor_human, ortho_table, cursor, all_species,
                       ensembl_db_name,  gene_id, orthos):
    """Store one row in ortho_table for every entry of orthos whose species is tracked.

    cursor_human    -- cursor handed to store_or_update
    ortho_table     -- name of the table to write to
    cursor          -- cursor handed to stable2gene
    all_species     -- container of species names; entries of orthos whose
                       species is not in it are skipped
    ensembl_db_name -- mapping from species name to the value passed as the
                       third argument of stable2gene
    gene_id         -- value written to the 'gene_id' column
    orthos          -- iterable of [stable_id, species, cognate_genome_db_id]
    """
    for ortho_stable, species, cognate_genome_db_id in orthos:
        if species not in all_species:
            continue
        cognate_gene_id = stable2gene (cursor, ortho_stable, ensembl_db_name[species])

        key_columns   = {'gene_id'             : gene_id,
                         'cognate_genome_db_id': cognate_genome_db_id}
        value_columns = {'source': 'ensembl'}

        # for the 'orthologue' table the cognate gene id goes with the updatable
        # columns; for any other table it is part of the fixed columns
        target = value_columns if ortho_table == 'orthologue' else key_columns
        target['cognate_gene_id'] = cognate_gene_id

        store_or_update (cursor_human, ortho_table, key_columns, value_columns)
Пример #4
0
def store(cursor, infile):

    inf = erropen(infile, "r")

    total        = 0
    id_not_found = 0
    for line in inf:
        line.rstrip()
        total += 1
        if not total%1000: print "\t", total
        if ( len(line.split()) !=  2 or not 'ENS' in line):
            continue
        [stable_id1, stable_id2] = line.split()
        fixed_fields    = {}
        update_fields   = {}
        
        fixed_fields['gene_id1'] = stable_id1
        fixed_fields['gene_id2'] = stable_id2

        store_or_update (cursor, 'paralog', fixed_fields, update_fields)

    print "done with ", infile, "total ",  total

    inf.close ()
Пример #5
0
def store(cursor, infile):

    inf = erropen(infile, "r")

    total = 0
    id_not_found = 0
    for line in inf:
        line.rstrip()
        total += 1
        if not total % 1000: print "\t", total
        if (len(line.split()) != 2 or not 'ENS' in line):
            continue
        [stable_id1, stable_id2] = line.split()
        fixed_fields = {}
        update_fields = {}

        fixed_fields['gene_id1'] = stable_id1
        fixed_fields['gene_id2'] = stable_id2

        store_or_update(cursor, 'paralog', fixed_fields, update_fields)

    print "done with ", infile, "total ", total

    inf.close()
Пример #6
0
def main():

    parameter = {}
    # in case I ever have to handle multiple versions of ensembl
    # (but for now I don't have enough space)
    # note though that there are functions in el_utils/mysql.py that assume
    # that whatever ensembl stuff is available to the mysql server corresponds to the same release 
    release_number = '76'
    parameter['ensembl_release_number'] = release_number
    parameter['blastp_e_value']         = "1.e-10" # it will be used as a string  when fmting the blastp cmd
    parameter['min_accptbl_exon_sim']   = 0.33333 #minimum acceptable exon similarity

    dir_path = {}
    dir_path['ensembl_fasta'] = '/mnt/ensembl-mirror/release-'+release_number+'/fasta'
    # local juggling of data from one database base to the other
    dir_path['afs_dumps']     = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/'
    dir_path['afs_dumps']    += 'ExoLocator/results/dumpster'
    dir_path['resources']     = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/'
    dir_path['resources']    += 'pypeworks/exolocator/resources'
    dir_path['scratch']       = '/tmp'
    dir_path['maxentscan']    = '/afs/bii.a-star.edu.sg/dept/biomodel_design/Group/ivana/'
    dir_path['maxentscan']   += 'pypeworks/exolocator/pl_utils/maxentscan'

    util_path = {}
    util_path['mafft']    = '/usr/bin/mafft'
    util_path['blastall'] = '/usr/bin/blastall'
    util_path['fastacmd'] = '/usr/bin/fastacmd'
    util_path['sw#']      = '/usr/bin/swsharp'
    util_path['usearch']  = '/usr/bin/usearch'
    util_path['score3']   = dir_path['maxentscan'] + '/score3.pl'
    util_path['score5']   = dir_path['maxentscan'] + '/score5.pl'

    if 1:
        # check if the paths are functioning (at this point at least)
        for util in util_path.values():
            if (not os.path.exists(util)):
                print util, " not found "
                sys.exit (1)

        for dir in dir_path.values():
            if (not os.path.exists(dir)):
                print dir, " not found "
                sys.exit (1)
            if (not os.path.isdir (dir)):
                print dir, " is not a directory "
                sys.exit (1)
            
    db     = connect_to_mysql()
    cursor = db.cursor()


    #######################################################
    # check if the config db exists -- if not, make it
    db_name   = "exolocator_config"
    qry  = "show databases like'%s'" % db_name
    rows = search_db (cursor, qry)
    if (not rows):
        print db_name, "database not found"
        qry = "create database %s " % db_name
        rows = search_db (cursor, qry)
        if (rows):
            print "some problem creating the database ..."
            rows = search_db (cursor, qry, verbose = True)
    else:
        print db_name, "database found"

    qry = "use %s " % db_name
    search_db (cursor, qry)
        
    # make tables
    for table in ['util_path', 'dir_path', 'parameter']:
        if ( check_table_exists (cursor, db_name, table)):
            print table, " found in ", db_name
        else:
            print table, " not found in ", db_name
            make_table (cursor, table)
   
    # fill util, dir and path tables 
    fixed_fields  = {}
    update_fields = {}
    for [name, path] in util_path.iteritems():
        fixed_fields['name']  = name
        update_fields['path'] = path
        store_or_update (cursor, 'util_path', fixed_fields, update_fields)

    fixed_fields  = {}
    update_fields = {}
    for [name, path] in dir_path.iteritems():
        fixed_fields['name'] = name
        update_fields['path'] = path
        store_or_update (cursor, 'dir_path', fixed_fields, update_fields)

    fixed_fields  = {}
    update_fields = {}
    for [name, value] in parameter.iteritems():
        fixed_fields['name']  = name
        update_fields['value'] = value
        store_or_update (cursor, 'parameter', fixed_fields, update_fields)

    #######################################################
    # add trivial names to ncbi_taxonomy.names
    [all_species, ensembl_db_name] = get_species (cursor)
    feed_trivial_names (cursor, all_species)

    #######################################################
    # add species shorthands (used in ENS* names formation)
    # though we will not needed unit the paralogue alignment reconstruction point)
    feed_name_shorthands (cursor, all_species)

    cursor.close()
    db.close()
Пример #7
0
def  feed_name_shorthands (cursor, all_species):

    short = {}
    short['ailuropoda_melanoleuca'] = 'AME'
    short['anas_platyrhynchos']     = 'APL'
    short['anolis_carolinensis']    = 'ACA'
    short['astyanax_mexicanus']     = 'AMX'
    short['bos_taurus']             = 'BTA'
    short['callithrix_jacchus'] = 'CJA'
    short['canis_familiaris'] = 'CAF'
    short['cavia_porcellus'] = 'CPO'
    short['choloepus_hoffmanni'] = 'CHO'
    short['danio_rerio'] = 'DAR'
    short['dasypus_novemcinctus'] = 'DNO'
    short['dipodomys_ordii'] = 'DOR'
    short['echinops_telfairi'] = 'ETE'
    short['equus_caballus'] = 'ECA'
    short['erinaceus_europaeus'] = 'EEU'
    short['felis_catus'] = 'FCA'
    short['ficedula_albicollis'] = 'FAL'
    short['gadus_morhua'] = 'GMO'
    short['gallus_gallus'] = 'GAL'
    short['gasterosteus_aculeatus'] = 'GAC'
    short['gorilla_gorilla'] = 'GGO'
    short['homo_sapiens'] = ''
    short['ictidomys_tridecemlineatus'] = 'STO'
    short['latimeria_chalumnae'] = 'LAC'
    short['lepisosteus_oculatus'] = 'LOC'
    short['loxodonta_africana'] = 'LAF'
    short['macaca_mulatta'] = 'MMU'
    short['macropus_eugenii'] = 'MEU'
    short['meleagris_gallopavo'] = 'MGA'
    short['microcebus_murinus'] = 'MIC'
    short['monodelphis_domestica'] = 'MOD'
    short['mus_musculus'] = 'MUS'
    short['mustela_putorius_furo'] = 'MPU'
    short['myotis_lucifugus'] = 'MLU'
    short['nomascus_leucogenys'] = 'NLE'
    short['ochotona_princeps'] = 'OPR'
    short['oreochromis_niloticus'] = 'ONI'
    short['ornithorhynchus_anatinus'] = 'OAN'
    short['oryctolagus_cuniculus'] = 'OCU'
    short['oryzias_latipes'] = 'ORL'
    short['ovis_aries'] = 'OAR'
    short['otolemur_garnettii'] = 'OGA'
    short['pan_troglodytes'] = 'PTR'
    short['papio_anubis'] = 'PAN'
    short['poecilia_formosa'] = 'PFO'
    short['pelodiscus_sinensis'] = 'PSI'
    short['petromyzon_marinus'] = 'PMA'
    short['pongo_abelii'] = 'PPY'
    short['procavia_capensis'] = 'PCA'
    short['pteropus_vampyrus'] = 'PVA'
    short['rattus_norvegicus'] = 'RNO'
    short['sarcophilus_harrisii'] = 'SHA'
    short['sorex_araneus'] = 'SAR'
    short['sus_scrofa'] = 'SSC'
    short['taeniopygia_guttata'] = 'TGU'
    short['takifugu_rubripes'] = 'TRU'
    short['tarsius_syrichta'] = 'TSY'
    short['tetraodon_nigroviridis'] = 'TNI'
    short['tupaia_belangeri'] = 'TBE'
    short['tursiops_truncatus'] = 'TTR'
    short['vicugna_pacos'] = 'VPA'
    short['xenopus_tropicalis'] = 'XET'
    short['xiphophorus_maculatus'] = 'XMA'


    db_name = get_compara_name (cursor)
    qry = "use %s " % db_name
    search_db (cursor, qry)

    table = 'species_name_shorthands'
    # if the table does not exist, make it
    if not check_table_exists (cursor, db_name, table):
        qry  = "CREATE TABLE " + table + "  (id INT(10) PRIMARY KEY AUTO_INCREMENT)"
        rows = search_db (cursor, qry)
        if (rows): return False

        qry = "ALTER TABLE %s  ADD %s VARCHAR(100)" % (table, 'species')
        rows = search_db (cursor, qry)
        if (rows): return False
        qry = "ALTER TABLE %s  ADD %s VARCHAR(10)" % (table, 'shorthand')
        rows = search_db (cursor, qry)
        if (rows): return False


    for species in all_species:
        if short.has_key(species):
            fixed_fields  = {}
            update_fields = {}
            fixed_fields  ['species']   = species
            update_fields ['shorthand'] = short[species]
            store_or_update (cursor, table, fixed_fields, update_fields)
        else:
            print "short for ", species, " not found "
            short[species] = ""
Пример #8
0
def  feed_trivial_names (cursor, all_species):

    tax_id  = {}
    trivial = {}

    trivial['ailuropoda_melanoleuca'] = 'panda' 
    trivial['anas_platyrhynchos']     = 'duck'
    trivial['anolis_carolinensis']    = 'anole_lizard' 
    trivial['astyanax_mexicanus']     = 'blind_cavefish'
    trivial['bos_taurus']             = 'cow' 
    trivial['callithrix_jacchus']     = 'marmoset' 
    trivial['canis_familiaris']       = 'dog' 
    trivial['cavia_porcellus']        = 'guinea_pig' 
    trivial['choloepus_hoffmanni']    = 'sloth' 
    trivial['danio_rerio']            = 'zebrafish' 
    trivial['dasypus_novemcinctus']   = 'armadillo' 
    trivial['dipodomys_ordii']        = 'kangaroo_rat' 
    trivial['echinops_telfairi']      = 'madagascar_hedgehog' 
    trivial['equus_caballus']         = 'horse' 
    trivial['erinaceus_europaeus']    = 'european_hedgehog' 
    trivial['felis_catus']            = 'cat' 
    trivial['ficedula_albicollis']    = 'flycatcher'
    trivial['gadus_morhua']           = 'cod' 
    trivial['gallus_gallus']          = 'chicken' 
    trivial['gasterosteus_aculeatus'] = 'stickleback' 
    trivial['gorilla_gorilla']        = 'gorilla' 
    trivial['homo_sapiens']           = 'human' 
    trivial['ictidomys_tridecemlineatus']  = 'squirrel' 
    trivial['latimeria_chalumnae']         = 'coelacanth' 
    trivial['lepisosteus_oculatus']        = 'spotted_gar'
    trivial['loxodonta_africana']          = 'elephant' 
    trivial['macaca_mulatta']              = 'macaque' 
    trivial['macropus_eugenii']            = 'wallaby' 
    trivial['meleagris_gallopavo']         = 'turkey' 
    trivial['microcebus_murinus']          = 'lemur' 
    trivial['monodelphis_domestica']       = 'opossum' 
    trivial['mus_musculus']                = 'mouse' 
    trivial['mustela_putorius_furo']       = 'ferret' 
    trivial['myotis_lucifugus']            = 'bat' 
    trivial['nomascus_leucogenys']         = 'gibbon' 
    trivial['ochotona_princeps']           = 'pika' 
    trivial['oreochromis_niloticus']       = 'tilapia' 
    trivial['ornithorhynchus_anatinus']    = 'platypus' 
    trivial['oryctolagus_cuniculus']       = 'rabbit' 
    trivial['oryzias_latipes']             = 'medaka' 
    trivial['otolemur_garnettii']          = 'galago_lemur' 
    trivial['ovis_aries']                  = 'sheep'
    trivial['pan_troglodytes']             = 'chimpanzee' 
    trivial['papio_anubis']                = 'baboon' 
    trivial['pelodiscus_sinensis']         = 'turtle' 
    trivial['petromyzon_marinus']          = 'lamprey' 
    trivial['poecilia_formosa']            = 'amazon_molly'
    trivial['pongo_abelii']                = 'orangutan' 
    trivial['procavia_capensis']           = 'hyrax' 
    trivial['pteropus_vampyrus']           = 'flying_fox' 
    trivial['rattus_norvegicus']           = 'rat' 
    trivial['sarcophilus_harrisii']        = 'tasmanian_devil' 
    trivial['sorex_araneus']               = 'european_shrew' 
    trivial['sus_scrofa']                  = 'pig' 
    trivial['taeniopygia_guttata']         = 'zebra_finch' 
    trivial['takifugu_rubripes']           = 'fugu' 
    trivial['tarsius_syrichta']            = 'tarsier' 
    trivial['tetraodon_nigroviridis']      = 'pufferfish' 
    trivial['tupaia_belangeri']            = 'tree_shrew' 
    trivial['tursiops_truncatus']          = 'dolphin' 
    trivial['vicugna_pacos']               = 'alpaca' 
    trivial['xenopus_tropicalis']          = 'xenopus' 
    trivial['xiphophorus_maculatus']       = 'platyfish' 



    db_name = get_compara_name (cursor)
    if (not db_name):
        print "compara db not found"
        exit(1)
    qry = "use %s " % db_name
    search_db (cursor, qry)
    for species in all_species:
        tax_id[species] = species2taxid (cursor, species)

    # switch to ncbi taxonomy database
    db_name = get_ncbi_tax_name (cursor)
    if (not db_name):
        print "ncbi taxonomy db not found"
        exit(1)

    qry = "use %s " % db_name
    search_db (cursor, qry)
    for species in all_species:
        if trivial.has_key(species):
            fixed_fields  = {}
            update_fields = {}
            fixed_fields ['tax_id']     = tax_id[species]
            fixed_fields ['name_class'] = 'trivial'
            update_fields['name_txt']   = trivial[species]
            store_or_update (cursor, 'names', fixed_fields, update_fields)
        else:
            print "trivial for ", species, " not found "
            trivial[species] = ""

    return True
def multiple_exon_alnmt(gene_list, db_info):


    print "process pid: %d, length of gene list: %d" % ( get_process_id(), len(gene_list))

    [local_db, ensembl_db_name] = db_info

    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    acg    = AlignmentCommandGenerator()
    cursor = db.cursor()

    # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species (cursor)
    

    species  = 'homo_sapiens'
    switch_to_db (cursor,  ensembl_db_name[species])
    gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)

    # for each human gene
    gene_ct = 0
    tot  = 0
    ok   = 0
    no_maps        = 0
    no_pepseq      = 0
    no_orthologues = 0
    min_similarity = cfg.get_value('min_accptbl_exon_sim')

    #gene_list.reverse()
    for gene_id in gene_list:

        start = time()
        gene_ct += 1
        if  not gene_ct%10: print gene_ct, "genes out of", len(gene_list)

        switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
        print gene_ct, len(gene_ids),  gene_id,  gene2stable(cursor, gene_id), get_description (cursor, gene_id)

        human_exons = filter (lambda e: e.is_known==1 and e.is_coding and e.covering_exon<0, gene2exon_list(cursor, gene_id))
        human_exons.sort(key=lambda exon: exon.start_in_gene)

        ##################################################################
        for human_exon in human_exons:
            
            tot += 1

            # find all orthologous exons the human exon  maps to
            maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known)
            if verbose: 
                print "\texon no.", tot, " id", human_exon.exon_id,
                if not maps: 
                    print " no maps"
                    print human_exon
                print 
            if not maps: 
                no_maps += 1
                continue

  
            # human sequence to fasta:
            seqname   = "{0}:{1}:{2}".format('homo_sapiens', human_exon.exon_id, human_exon.is_known)
            switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
            [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, 
             left_flank, right_flank, dna_seq] = get_exon_seqs (cursor, human_exon.exon_id, human_exon.is_known)
            if (not pepseq):
                if verbose and  human_exon.is_coding and  human_exon.covering_exon <0: # this should be a master exon
                    print "no pep seq for",  human_exon.exon_id, "coding ", human_exon.is_coding,
                    print "canonical: ",  human_exon.is_canonical
                    print "length of dna ", len(dna_seq)
                no_pepseq += 1
                continue

            # collect seq from all maps, and output them in fasta format
            hassw = False
            headers   = []
            sequences = {}
            exons_per_species = {}

            for map in maps:

                switch_to_db (cursor, ensembl_db_name[map.species_2])
                if map.similarity < min_similarity: continue
                exon    = map2exon(cursor, ensembl_db_name, map)
                pepseq  = get_exon_pepseq (cursor,exon)
                if (not pepseq):
                    continue
                if  map.source == 'sw_sharp':
                    exon_known_code = 2
                    hassw = True
                elif  map.source == 'usearch':
                    exon_known_code = 3
                    hassw = True
                else:
                    exon_known_code = map.exon_known_2
                seqname = "{0}:{1}:{2}".format(map.species_2, map.exon_id_2, exon_known_code)
                headers.append(seqname)
                sequences[seqname] = pepseq
                # for split exon concatenation (see below)
                if not map.species_2 in exons_per_species.keys():
                    exons_per_species[map.species_2] = []
                exons_per_species[map.species_2].append ([ map.exon_id_2, exon_known_code]);
                
                    
            if (len(headers) <=1 ):
                if verbose: print "single species in the alignment"
                no_orthologues += 1
                continue
            
            # concatenate exons from the same gene - the alignment program might go wrong otherwise
            concatenated = concatenate_exons (cursor, ensembl_db_name, sequences, exons_per_species)

            fasta_fnm = "{0}/{1}.fa".format( cfg.dir_path['scratch'], human_exon.exon_id)
            output_fasta (fasta_fnm, sequences.keys(), sequences)

            # align
            afa_fnm  = "{0}/{1}.afa".format( cfg.dir_path['scratch'], human_exon.exon_id)
            mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm)
            ret      = commands.getoutput(mafftcmd)

            if (verbose): print 'almt to', afa_fnm

            # read in the alignment 
            inf = erropen(afa_fnm, "r")
            aligned_seqs = {}
            for record in SeqIO.parse(inf, "fasta"):
                aligned_seqs[record.id] = str(record.seq)
            inf.close()
            # split back the concatenated exons
            if concatenated: split_concatenated_exons (aligned_seqs, concatenated)

            human_seq_seen = False
            for seq_name, sequence in aligned_seqs.iteritems():
                # if this is one of the concatenated seqs, split them back to two

                ### store the alignment as bitstring
                # Generate the bitmap
                bs         = Bits(bin='0b' + re.sub("[^0]","1", sequence.replace('-','0')))
                # The returned value of tobytes() will be padded at the end 
                # with between zero and seven 0 bits to make it byte aligned.
                # I will end up with something that looks like extra alignment gaps, that I'll have to return
                msa_bitmap = bs.tobytes() 
                # Retrieve information on the cognate
                cognate_species, cognate_exon_id, cognate_exon_known = seq_name.split(':')
                if cognate_exon_known == '2':
                    source = 'sw_sharp'
                elif cognate_exon_known == '3':
                    source = 'usearch'
                else:
                    source = 'ensembl'
                if (cognate_species == 'homo_sapiens'):
                    human_seq_seen = True
                cognate_genome_db_id = species2genome_db_id(cursor, cognate_species) # moves the cursor
                switch_to_db(cursor, ensembl_db_name['homo_sapiens']) # so move it back to h**o sapiens
                # Write the bitmap to the database
                #if (cognate_species == 'homo_sapiens'):
                if verbose: # and (source=='sw_sharp' or source=='usearch'):
                    print "storing"
                    print human_exon.exon_id, human_exon.is_known
                    print cognate_species, cognate_genome_db_id, cognate_exon_id, cognate_exon_known, source
                    print sequence
                    if not msa_bitmap:
                        print "no msa_bitmap"
                        continue
                store_or_update(cursor, "exon_map",    {"cognate_genome_db_id":cognate_genome_db_id,
                   "cognate_exon_id":cognate_exon_id   ,"cognate_exon_known"  :cognate_exon_known,
                   "source": source, "exon_id" :human_exon.exon_id, "exon_known":human_exon.is_known},
                  {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)})
                 
            ok += 1
            commands.getoutput("rm "+afa_fnm+" "+fasta_fnm)

        if verbose: print " time: %8.3f\n" % (time()-start);

    print "tot: ", tot, "ok: ", ok
    print "no maps ",   no_pepseq
    print "no pepseq ", no_pepseq
    print "no orthologues  ", no_orthologues
    print
def multiple_exon_alnmt(species_list, db_info):


    [local_db, ensembl_db_name] = db_info

    verbose  = False

    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    acg    = AlignmentCommandGenerator()
    cursor = db.cursor()


    for species in species_list:

        print
        print "############################"
        print  species

        switch_to_db (cursor,  ensembl_db_name[species])
        gene_ids = get_gene_ids (cursor, biotype='protein_coding')
        #gene_ids = get_theme_ids(cursor, cfg, 'wnt_pathway')
        if not gene_ids:
            print "no gene_ids"
            continue


        gene_ct       = 0
        tot           = 0
        ok            = 0
        no_maps       = 0
        no_pepseq     = 0
        no_paralogues = 0
        for gene_id in gene_ids:

            if verbose: start = time()
            gene_ct += 1
            if not gene_ct%100: print species, gene_ct, "genes out of", len(gene_ids)
            if verbose: 
                print
                print gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id)

            # get the paralogues - only the representative for  the family will have this 
            paralogues = get_paras (cursor, gene_id)  
            if not paralogues:
                if verbose:  print "\t not a template or no paralogues"
                continue

            if verbose:  print "paralogues: ", paralogues

            # get _all_ exons
            template_exons = gene2exon_list(cursor, gene_id)
            if (not template_exons):
                if verbose: print 'no exons for ', gene_id
                continue

            # find all template  exons we are tracking in the database
            for template_exon in template_exons:

                if verbose: print template_exon.exon_id
                maps = get_maps(cursor, ensembl_db_name, template_exon.exon_id,
                                template_exon.is_known, species=species, table='para_exon_map')

                if not maps:
                    no_maps += 1
                    continue

                # output to fasta:
                seqname        = "{0}:{1}:{2}".format('template', template_exon.exon_id, template_exon.is_known)
                exon_seqs_info =  get_exon_seqs (cursor, template_exon.exon_id, template_exon.is_known)
                if not exon_seqs_info: continue
                [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, 
                 left_flank, right_flank, dna_seq] = exon_seqs_info
                if (not pepseq):
                    if ( template_exon.is_coding and  template_exon.covering_exon <0): # this should be a master exon
                        print "no pep seq for",  template_exon.exon_id, "coding ", template_exon.is_coding,
                        print "canonical: ",  template_exon.is_canonical
                        print "length of dna ", len(dna_seq)
                        no_pepseq += 1
                    continue
                
                tot += 1

                sequences = {seqname:pepseq}
                headers   = [seqname]
                for map in maps:
                    exon    = map2exon(cursor, ensembl_db_name, map, paralogue=True)
                    pepseq  = get_exon_pepseq (cursor,exon)
                    if (not pepseq):
                        continue
                    seqname = "{0}:{1}:{2}".format('para', map.exon_id_2, map.exon_known_2)
                    headers.append(seqname)
                    sequences[seqname] = pepseq

                fasta_fnm = "{0}/{1}_{2}_{3}.fa".format( cfg.dir_path['scratch'], species, template_exon.exon_id, template_exon.is_known)
                output_fasta (fasta_fnm, headers, sequences)

                if (len(headers) <=1 ):
                    print "single species in the alignment (?)"
                    no_paralogues += 1
                    continue

                # align
                afa_fnm  = "{0}/{1}_{2}_{3}.afa".format( cfg.dir_path['scratch'], species, template_exon.exon_id, template_exon.is_known)
                mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm)
                ret      = commands.getoutput(mafftcmd)

                # read in the alignment
                inf = erropen(afa_fnm, "r")
                if not inf:
                    print gene_id
                    continue
                template_seq_seen = False
                for record in SeqIO.parse(inf, "fasta"):
                    ### store the alignment as bitstring
                    # Generate the bitmap
                    bs         = Bits(bin='0b' + re.sub("[^0]","1", str(record.seq).replace('-','0')))
                    msa_bitmap = bs.tobytes()
                    # Retrieve information on the cognate
                    label, cognate_exon_id, cognate_exon_known = record.id.split(':')
                    if (label == 'template'):
                        template_seq_seen = True
                    # Write the bitmap to the database
                    #print "updating: ", template_exon.exon_id
                    store_or_update(cursor, "para_exon_map", {"cognate_exon_id"    :cognate_exon_id,
                                                         "cognate_exon_known" :cognate_exon_known,
                                                         "exon_id"            :template_exon.exon_id,
                                                         "exon_known"         :template_exon.is_known},
                                    {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)})
                inf.close()
                ok += 1
                commands.getoutput("rm "+afa_fnm+" "+fasta_fnm)
            if verbose: print " time: %8.3f\n" % (time()-start);
 
        outstr  =  species + " done \n"
        outstr +=  "tot: %d   ok: %d  \n" % (tot,  ok)
        outstr +=  "no maps       %d  \n" % no_pepseq
        outstr +=  "no pepseq     %d  \n" % no_pepseq
        outstr +=  "no paralogues %d  \n" % no_paralogues
        outstr += "\n"
        print outstr