Python MultipleSeqAlignment.extend примеры использования

Язык программирования: Python

Пространство имен/Пакет: Bio.Align

Класс/Тип: MultipleSeqAlignment

Метод/Функция: extend

Примеров на hotexamples.com: 18

Python MultipleSeqAlignment.extend - 18 примеров найдено. Это лучшие примеры Python кода для Bio.Align.MultipleSeqAlignment.extend, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

append(30)

MultipleSeqAlignment(30)

get_alignment_length(29)

extend(13)

add_sequence(12)

format(9)

_annotations(7)

__init__(4)

sort(2)

ref_ungapped(1)

ref(1)

pd(1)

description(1)

molecule_type(1)

__add__(1)

_version(1)

_star_info(1)

__getitem__(1)

values(1)

Пример #1

Показать файл

 def check_matching(self) -> tuple:
     """Link every terminal of ``self.tree`` to its aligned sequence record.

     For each tip (in ``get_terminals()`` order) the record stored under the
     tip's name in ``self.aligns_as_seqs`` is wrapped, together with the
     root-to-tip path, in a ``TipSeqLinker``.

     Returns:
         A 4-tuple ``(init_clstr, dichord_list, converted_names, sites)``:
         ``init_clstr`` maps each tip to a list holding its linker,
         ``dichord_list`` holds the linkers in tip order,
         ``converted_names`` maps every tip name to ``'seq<n>'`` and every
         ``'seq<n>'`` back to the tip name, and ``sites`` is
         ``tuple(range(alignment_length))``. For a tree without terminals
         the containers are empty and ``sites`` is ``()``.

     Raises:
         TipNotMatchedError: when a ``KeyError`` occurs while fetching the
             tip's record or building its linker.
     """
     init_clstr = defaultdict(list)
     dichord_list = []
     converted_names = {}
     # Start from an empty alignment: a tree with no terminals then reports
     # a length of 0 instead of failing on a None alignment after the loop.
     related_aligns = MultipleSeqAlignment([])
     for n, tip in enumerate(self.tree.get_terminals()):
         tip_name = tip.name
         try:
             seq_record = self.aligns_as_seqs[tip_name]
             dichord = TipSeqLinker(
                 seq_record,
                 (self.tree.root, *self.tree.get_path(tip))
             )
         except KeyError as err:
             # Keep the original KeyError attached as the explicit cause.
             raise TipNotMatchedError(tip) from err
         init_clstr[tip].append(dichord)
         dichord_list.append(dichord)
         # Register the mapping in both directions in the same dict.
         new_seq_id = 'seq{}'.format(n)
         converted_names[tip_name] = new_seq_id
         converted_names[new_seq_id] = tip_name
         # extend() checks that each added record matches the alignment length.
         related_aligns.extend([seq_record])
     return (
         init_clstr, dichord_list, converted_names,
         tuple(range(related_aligns.get_alignment_length()))
     )

Пример #2

Показать файл

Файл: seqalign.py Проект: schrodinger/pymol-open-source

def needle_alignment(s1, s2):
    '''
DESCRIPTION

    Does a Needleman-Wunsch Alignment of sequence s1 and s2 and
    returns a Bio.Align.MultipleSeqAlignment object.

ARGUMENTS

    s1, s2 = sequences handed to Bio.pairwise2.align.globalcs

NOTES

    Residue pairs are scored with BLOSUM62. A pair that is missing from
    the matrix (in either orientation) scores 1 for identical characters
    and -4 otherwise. Gap open / extend penalties are -10 and -0.5.
    The two rows of the result carry the ids "s1" and "s2".
    '''
    from Bio import pairwise2
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    try:
        from Bio.Align import substitution_matrices
    except ImportError:
        # Older Biopython: BLOSUM62 is a plain dict keyed by residue pairs.
        from Bio.SubsMat.MatrixInfo import blosum62
    else:
        blosum62 = substitution_matrices.load("BLOSUM62")

    def match_callback(c1, c2):
        # The legacy MatrixInfo dict stores each residue pair in a single
        # orientation, so try the swapped key before using the fallback.
        for pair in ((c1, c2), (c2, c1)):
            score = blosum62.get(pair)
            if score is not None:
                return score
        return 1 if c1 == c2 else -4

    alns = pairwise2.align.globalcs(s1, s2,
            match_callback, -10., -.5,
            one_alignment_only=True)

    # alns[0] is the single requested alignment; items 0 and 1 are the
    # gapped versions of s1 and s2.
    best = alns[0]
    return MultipleSeqAlignment([
        SeqRecord(best[0], id="s1"),
        SeqRecord(best[1], id="s2"),
    ])

Пример #3

Показать файл

Файл: test.py Проект: Leena01/computational_biology

def main():
    """Build and draw neighbor-joining and UPGMA trees for a FASTA file.

    Loads the records of ``data/coding.fa`` into a gapped-DNA alignment,
    derives an 'identity' distance matrix from it, then times each tree
    construction and draws the resulting tree.
    """
    fasta_path = "data/coding.fa"
    # fasta_path = "data/cons_noncode.fa"

    msa = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    # extend() consumes the parser record by record.
    msa.extend(SeqIO.parse(fasta_path, "fasta"))

    print("Number of characters in alignment:", len(msa[0]))

    # ---- Neighbor joining ----
    distances = DistanceCalculator('identity').get_distance(msa)

    builder = DistanceTreeConstructor()
    began = time.time()
    nj_tree = builder.nj(distances)
    finished = time.time()
    print("Neighbor joining ran in {} seconds.".format(finished - began))
    Phylo.draw(nj_tree, label_func=get_label)

    # ---- UPGMA (reuses the same distance matrix) ----
    began = time.time()
    upgma_tree = builder.upgma(distances)
    finished = time.time()
    print("UPGMA ran in {} seconds.".format(finished - began))
    Phylo.draw(upgma_tree, label_func=get_label)

Пример #4

Показать файл

def main():
    """Train a SOTA network on the first five alignment rows and draw it.

    The alignment is read from ``data/coding.fa``; only ``alignment[:5]`` is
    passed to ``SOTA``. The training time is printed before drawing.
    """
    fasta_path = "data/coding.fa"
    # fasta_path = "data/cons_noncode.fa"
    msa = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    msa.extend(SeqIO.parse(fasta_path, "fasta"))

    network = SOTA(msa[:5])
    began = time.time()
    network.train()
    finished = time.time()
    message = "Self-organizing tree network ran in {} seconds."
    print(message.format(finished - began))
    network.draw_tree()

Пример #5

Показать файл

 def to_alignment(self):
     """Build a MultipleSeqAlignment from this tree's aligned sequences.

     Elements are collected with a preorder ``_filter_search``; if none
     qualifies, an empty alignment is returned. Otherwise the first match
     supplies the alphabet and the remaining matches are appended.
     """
     def _aligned(node):
         # Qualifies only if it is a Sequence whose mol_seq is flagged aligned.
         return bool(isinstance(node, Sequence) and node.mol_seq.is_aligned)

     matches = self._filter_search(_aligned, 'preorder', True)
     head = next(matches, None)
     if head is None:
         # No aligned sequences were found --> empty MSA
         return MultipleSeqAlignment([])
     alignment = MultipleSeqAlignment([head.to_seqrecord()],
                                      head.get_alphabet())
     alignment.extend(item.to_seqrecord() for item in matches)
     return alignment

Пример #6

Показать файл

Файл: main.py Проект: Leena01/computational_biology

def main():
    """Time Distance_Calculator tree building in three configurations.

    Reads ``data/coding.fa`` into a gapped-DNA alignment, then for the
    default calculator (reported as neighbor joining), ``mode="UPGMA"`` and
    ``mode="WPGMA"``: creates a distance matrix, times ``build_tree`` and
    draws the tree. The first distance matrix is also written to
    ``animals.csv``.
    """
    fasta_path = "data/coding.fa"
    # fasta_path = "data/cons_noncode.fa"
    msa = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    msa.extend(SeqIO.parse(fasta_path, "fasta"))

    # ---- Neighbor joining (default calculator) ----
    calc = Distance_Calculator()
    matrix = calc.create_distance_matrix(msa)
    matrix.data.to_csv("animals.csv")

    began = time.time()
    calc.build_tree(matrix)
    finished = time.time()
    print("Neighbor joining ran in {} seconds.".format(finished - began))
    calc.draw_tree()

    # ---- UPGMA ----
    calc = Distance_Calculator(mode="UPGMA")
    matrix = calc.create_distance_matrix(msa)

    began = time.time()
    calc.build_tree(matrix)
    finished = time.time()
    print("UPGMA ran in {} seconds.".format(finished - began))
    calc.draw_tree()

    # ---- WPGMA ----
    calc = Distance_Calculator(mode="WPGMA")
    matrix = calc.create_distance_matrix(msa)

    began = time.time()
    calc.build_tree(matrix)
    finished = time.time()
    print("WPGMA ran in {} seconds.".format(finished - began))
    calc.draw_tree()

Пример #7

Показать файл

Файл: L_hist_aln.py Проект: tgorkovets/2015_histone_seq_analysis

def annotate_hist_msa(msa, htype, variant=None):
    """Return a new MSA whose first row is a feature line from features.json.

    The consensus of ``msa`` is aligned (via ``muscle_aln``) against the
    "General<htype>" template sequence from ``inp_data/features.json``; the
    template's "feature1" string is then gapped to follow the MSA columns
    and placed in front of the rows of ``msa``.

    ``variant`` is accepted but not used by this function.
    """
    # read json
    with open("inp_data/features.json") as handle:
        all_features = json.load(handle)
    type_features = all_features[htype]
    template_seq = type_features["General" + htype]["sequence"]
    template_feature = type_features["General" + htype]["feature1"]

    consensus = SummaryInfo(msa).dumb_consensus(threshold=0.1, ambiguous="X")
    consensus_record = SeqRecord(id="consensus", seq=consensus)
    template_record = SeqRecord(id="template", seq=Seq(template_seq))
    pair_aln = muscle_aln([consensus_record, template_record])
    pair_aln.sort()

    # After sorting, row 0 is read as the consensus and row 1 as the template.
    gapped_template = str(pair_aln[1].seq)
    gapped_cons = str(pair_aln[0].seq)

    # Keep only the template characters at columns where the consensus has
    # a residue (zip stops at the shorter of the two strings).
    template_on_msa = "".join(
        tmpl_char
        for cons_char, tmpl_char in zip(gapped_cons, gapped_template)
        if cons_char != "-"
    )

    # now we need to gap feature
    feature_chars = []
    consumed = 0  # feature characters used so far (index into the feature)
    for tmpl_char in template_on_msa:
        if tmpl_char == "-":
            feature_chars.append("-")
        else:
            feature_chars.append(template_feature[consumed])
            consumed += 1
    gapped_feature = "".join(feature_chars)

    feature_row = SeqRecord(id="gi|features|id", description=htype, seq=Seq(gapped_feature))
    annotated = MultipleSeqAlignment([feature_row])
    annotated.extend(msa)
    # print annotated
    return annotated

Пример #8

Показать файл

def main():
    """Run exact and heuristic maximum-parsimony searches and time them.

    Reads ``data/cons_noncode.fa`` into a gapped-DNA alignment, runs
    ``ParsimonyExact`` (with ``bnb=True``) and ``ParsimonyHeuristics`` (with
    ``seed=0``), prints how long each ``run`` took and draws both trees.
    """
    # fasta_path = "data/coding.fa"
    fasta_path = "data/cons_noncode.fa"
    # fasta_path = "data/test.fa"
    msa = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    msa.extend(SeqIO.parse(fasta_path, "fasta"))

    exact = ParsimonyExact(msa, bnb=True)
    began = time.time()
    exact.run(print_best=True)
    finished = time.time()
    print("Maximum parsimony (exact) ran in {} seconds.".format(finished - began))
    exact.draw_tree(show_scores=True)

    print("------------------------------------------------------------------")

    heuristic = ParsimonyHeuristics(msa, seed=0)
    began = time.time()
    heuristic.run(print_best=True)
    finished = time.time()
    print("Maximum parsimony (with heuristics) ran in {} seconds.".format(finished - began))
    heuristic.draw_tree(show_scores=True)

Пример #9

Показать файл

Файл: 1_example_aln_H2A.py Проект: tgorkovets/MYSOFT

def main():
    """Filter histone H2A sequences, align them and render the results.

    Steps, in order:
      1. Load the sequence table 'int_data/seqs_rs.csv' and the pickled
         dict 'int_data/fasta_dict.p' (looked up below by str(gi)).
      2. Keep rows whose hist_var is canonical_H2A or H2A.X with
         partial == False and non_st_aa == False, restrict them to the
         descendants of taxid 131567 minus taxids 5782/5690 and their
         descendants, add the 'ident' and 'degen' support columns, drop
         weakly supported rows and keep one gi per group returned by
         group_taxids(rank='genus').
      3. Relabel the remaining records and align them with muscle_aln.
      4. Write 'int_data/example_msa.fasta' and
         'int_data/example_msa_annot.fasta', 'example_h2a.html',
         'example_motifs_H2A.svg' and four conservation plots.

    NOTE(review): this is Python 2 code (print statements, dict.iteritems)
    and it calls DataFrame.sort, which recent pandas releases no longer
    provide - confirm the targeted interpreter / pandas versions.
    """
    title=''
    #1. Getting data
    ########################################################
    ########################################################
    # df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info #Does not really seem that we need to redefine variants based on best score.
    df=pd.read_csv('int_data/seqs_rs.csv') #Histone types info
    # NOTE(review): pickle.load can execute arbitrary code from the file - only
    # use it on trusted data. The file object is never closed explicitly.
    fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences
    
    #2. Filtering - filter initial dataset by type, variant and other parameters
    ########################################################
    ########################################################

    #2.1. Narrow by variant/type
    ########################################################
    title+='H2A'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    f_df=df[((df['hist_var']=='canonical_H2A')|(df['hist_var']=='H2A.X'))&(df['partial']==False)&(df['non_st_aa']==False)]
    # f_df=df[((df['hist_var']=='H2A.Z'))&(df['partial']==False)&(df['non_st_aa']==False)]

    # f_df=df[(df['hist_type']=='H2A')]

    print "Number of seqs after narrowing by hist type/var:", len(f_df)
    

    #2.2. Filter by list of taxonomy clades - restrict sequences to certain taxonomic clades
    #########################################################
    title+=' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    #5782 - dictostelium
    #This is akin to manual removal of bad species
    del_nodes=[5782,5690]

    print "Selecting taxonomic subset for taxids: ",parent_nodes
    print "while removing taxonomic subset for taxids: ",del_nodes

    # Allowed taxids = parent nodes plus all their descendants ...
    taxids=set(parent_nodes)
    for i in parent_nodes:
        taxids.update(ncbi.get_descendant_taxa(i,intermediate_nodes=True))
    # ... minus every excluded node and its descendants.
    for i in del_nodes:
        taxids=taxids.difference(set([i]))
        taxids=taxids.difference(set(ncbi.get_descendant_taxa(i,intermediate_nodes=True)))

    f_df=f_df[f_df['taxid'].isin(taxids)]
    print "Number of seq after taxonomic subset: ",len(f_df)
    


    #2.3.0 Marking number of identical sequence within each species and subspecies.
    #This will simplify further analysis of sequence filtering on similarity
    #We know that all refseqs are duplicated for instance.
    ################################################
    ident=dict()
    new_gis=list()
    tids=set(list(f_df['taxid']))
    for i in tids:
        # print i.name, i.sci_name
        temp_df=f_df[(f_df['taxid']==i)]
        gis=list(temp_df['gi']) #this is to limit exec time
        # print gis
        if(len(gis)>1):
            res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=1.00)
            ident.update(res)
        else:
            # A taxid with a single sequence gets a support count of 1.
            ident.update({gis[0]:1})

    # NOTE(review): f_df is a filtered selection of df, so this column
    # assignment may trigger pandas' SettingWithCopyWarning - confirm.
    f_df['ident']=[ident.get(k,1) for k in f_df['gi']]
    #where ident - number of identical sequences for current species/subspecies.
    print "Identity of sequence inside each taxid determined"

    #2.3.1. Calculate number of similar seqs for every seq in tax group
    #########################################################
    # Use powerful method, to get rid of random errors is to identify identical sequences
    # if a sequence is supported by two or more entries - this is good.
    # Here we add a degen column to our data set - showing how many similar sequences are found
    # for a given sequence in its taxonomic clade (the loop below uses rank='family')

    #We will traverse the species tree by species, genus or family, and determine degeneracy level
    degen=dict()
    new_gis=list()
    tids=list(f_df['taxid']) 
    t = ncbi.get_topology(tids,intermediate_nodes=True)
    for i in t.search_nodes(rank='family'):
        # print i.name, i.sci_name
        # Collect the taxids of every node under this family node.
        nodeset=list()
        for k in i.traverse():
            nodeset.append(int(k.name))
        temp_df=f_df[(f_df['taxid'].isin(nodeset))]
        gis=list(temp_df['gi']) #this is to limit exec time
        # print gis
        res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=1.00)
        degen.update(res)

    # print degen
    f_df['degen']=[degen.get(k,1) for k in f_df['gi']]

    #2.3.2. Remove seqs that do not have support outside their species
    # if they are not curated or RefSeq NP.
    ###########################################################

    f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates
    # Keep a row when degen exceeds ident, or it is curated, or RefSeq == 2.
    f_df=f_df[(f_df['degen']>f_df['ident'])|(f_df['curated']==True)|(f_df['RefSeq']==2)]
    print "After removing mined seqs with no support in neighboring species: ",len(f_df)

    #2.3.3. Shuffle sequences, so that upon further selection, RefSeq and high degeneracy get priority
    ###########################################################
    #RefSeq and degenerate sequence get priority
    # title+=' 1ptax'
    f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates
    # print f_df[0:10]
    # f_df=f_df.drop_duplicates(['taxid','hist_var'])


    #2.4 Take one best representative per specific taxonomic rank (e.g. genus)
    ############################################################
    pruningrank='genus'
    print "Pruning taxonomy by ", pruningrank
    
    title+=' , one seq. per %s'%pruningrank
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids=list(f_df['taxid']) #old list
    grouped_taxids=group_taxids(seqtaxids,rank=pruningrank)
    # print seqtaxids
    # print grouped_taxids
    #Now we need to take best representative
    #refseq NP, curated, or the one with largest degeneracy
    new_gis=list()
    for tids in grouped_taxids:
        t_df=f_df[f_df['taxid'].isin(tids)]
        #try take curated first
        if(len(t_df[t_df['curated']==True])>0):
            new_gis.append(t_df.loc[t_df.curated==True,'gi'].values[0])
            continue
        #try take NP records next
        #RefSeq 2 means NP, 1 means XP
        if(len(t_df[t_df['RefSeq']==2])>0):
            new_gis.append(t_df.loc[t_df.RefSeq==2,'gi'].values[0])
            continue
        # take best degenerate otherwise
        else:
            t_df=t_df.sort(['degen','RefSeq'],ascending=False) 
            new_gis.append(t_df['gi'].iloc[0])

    f_df=f_df[f_df['gi'].isin(new_gis)]

    print "After pruning taxonomy we have: ",len(f_df)


    #2.5. Check seq for sanity - needs to be checked!
    ##############################################
    # title+=' seqQC '

    # print "Checking sequence quality"
    # newgis=list()
    # for i,row in f_df.iterrows():
    #     gi=row['gi']
    #     seq=fasta_dict[str(gi)].seq
    #     hist_type=row['hist_type']
    #     hist_var=row['hist_var']
    #     if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)):
    #         newgis.append(gi)
    # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    # print len(f_df)

    #3. Make a list of seq with good ids and descriptions
    ##############################################

    # fasta_dict keys are converted with int() before comparing them to the gi column.
    f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])}
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    # New description = "<hist_var> <taxid2name entry for the row's taxid>".
    f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa=muscle_aln(f_fasta_dict.values(),gapopen=float(-20))
    AlignIO.write(msa, "int_data/example_msa.fasta", "fasta")

    # Second output: the same alignment with an 'annotation' row placed first;
    # spaces in the annotation string are replaced by gap characters.
    msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta")

    # Copy the relabelled descriptions onto the aligned records (with
    # 'canonical' shortened to 'ca'), then order the rows by description.
    for i in range(len(msa)):
        gi=msa[i].id
        msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca')
    msa.sort(key=lambda x: x.description)


    #5. Visualize MSA############
    aln2html(msa,'example_h2a.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A alignment",description=True,field1w=10,field2w=35)

    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    # title+=' gaptrim'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr=trim_hist_aln_to_core(msa)
    # msa_tr=msa
    # print get_hist_ss_in_aln_for_shade(msa_tr,below=True)

    # exit()

    #7. Visualize MSA with ete2.##########
    # Lookup tables used by layout(): taxid -> gi and gi -> variant.
    # If several rows share a taxid, the last gi iterated wins in taxid2gi.
    taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])}
    gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])}

    msa_dict={i.id:i.seq for i in msa_tr}
    t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False)
    # Extra child node named 'annotation'; layout() uses it for the header row.
    a=t.add_child(name='annotation')
    a.add_feature('sci_name','annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()
    def layout(node):
        # Layout callback assigned to ts.layout_fn below; it attaches faces
        # to the node it is given.
        # print node.rank
        # print node.sci_name
        if getattr(node, "rank", None):
            if(node.rank in ['order','class','phylum','kingdom']):   
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        if node.is_leaf() and not node.name=='annotation':
            # Sequence leaf: trimmed aligned sequence plus gi, taxid and variant columns.
            s=str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi=taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('      '+str(int(node.name))+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('      '+str(gi2variant[gi])+' '),node,column=3, position = "aligned")

        if node.is_leaf() and node.name=='annotation':
            # Annotation leaf: annotation string plus the column headers.
            s=get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('       '+'NCBI_TAXID'+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('       '+'Variant'+'       '),node,column=3, position = "aligned")



    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts)

    #10. Conservation############
    #############################
    features=get_hist_ss_in_aln_for_shade(msa_tr,below=True)
    # [-2:-1] keeps only the second-to-last item of add_consensus' result.
    cn=add_consensus(msa_tr,threshold=0.5)[-2:-1]
    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    # The two entropy-based profiles are shifted by log(20) before plotting.
    plot_prof4seq('example_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation')
    plot_prof4seq('example_cons_ent_unw_norm',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0,norm="T"))),cn,features,axis='conservation')
    
    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2,m=0)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation')

Пример #10

Показать файл

Файл: 1_example_aln_H2A.py Проект: tgorkovets/2015_histone_seq_analysis

def main():
    """Filter cenH3 histone sequences, align them and render the results.

    Steps, in order:
      1. Load 'int_data/seqs_rs_redef.csv' and the pickled dict
         'int_data/fasta_dict.p' (looked up below by str(gi)).
      2. Keep rows with hist_var == 'cenH3' and partial == False that fall
         under taxid 131567, add a 'degen' support column computed per
         genus node, and keep one gi per group returned by
         group_taxids(rank='species').
      3. Relabel the remaining records and align them with muscle_aln.
      4. Write 'int_data/example_msa.fasta' and
         'int_data/example_msa_annot.fasta', 'example_h2a.html',
         'example_motifs_H2A.svg' and four conservation plots.

    NOTE(review): this is Python 2 code (print statements, dict.iteritems)
    and it calls DataFrame.sort, which recent pandas releases no longer
    provide - confirm the targeted interpreter / pandas versions.
    NOTE(review): output file names, the HTML title and the 'H2A' argument
    still refer to H2A although the data is filtered to cenH3 - possibly
    left over from a copied script; confirm.
    """
    title=''
    #1. Getting data
    df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info
    # NOTE(review): pickle.load can execute arbitrary code from the file - only
    # use it on trusted data. The file object is never closed explicitly.
    fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences
    # exit()
    
    #2. Filtering
    ##########
    #2.1. Narrow by variant/type
    title+='CenH3'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    f_df=df[((df['hist_var']=='cenH3'))&(df['partial']==False)]
    # f_df=df[(df['hist_type']=='H2A')]
    # exit()

    print len(f_df)
    


    #2.2. Filter by list of taxonomy clades   
    ################
    title+=' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    #5782 - dictostelium
    print "Selecting taxonomic subset"
    # Allowed taxids = parent nodes plus all their descendants.
    taxids=list(parent_nodes)
    for i in parent_nodes:
        taxids.extend(ncbi.get_descendant_taxa(i,intermediate_nodes=True))
    f_df=f_df[f_df['taxid'].isin(taxids)]
    print len(f_df)
    # exit()
    
    #2.3*. Alternative powerful method, to get rid of random errors in seqs
    #We need to cluster seqs, and select only if we have support by two or more similar seqs 

    #We will traverse the species tree by species, genus or family, and determine degeneracy level
    degen=dict()
    new_gis=list()
    tids=list(f_df['taxid']) 
    t = ncbi.get_topology(tids,intermediate_nodes=True)
    for i in t.search_nodes(rank='genus'):
        # print i.name, i.sci_name
        # Collect the taxids of every node under this genus node.
        nodeset=list()
        for k in i.traverse():
            nodeset.append(int(k.name))
        temp_df=f_df[(f_df['taxid'].isin(nodeset))]
        gis=list(temp_df['gi']) #this is to limit exec time
        # print gis
        res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=0.95)
        degen.update(res)
        # exit()
        # for k,v in res.iteritems():
            # if v>2.0:
                # new_gis.append(k)
    # f_df=f_df[f_df['gi'].isin(new_gis)]
    print degen
    # NOTE(review): f_df is a filtered selection of df, so this column
    # assignment may trigger pandas' SettingWithCopyWarning - confirm.
    f_df['degen']=[degen.get(k,1) for k in f_df['gi']]

    #2.4. #####select one variant per taxid, priority to RefSeq
    # title+=' 1ptax'
    f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates
    print f_df[0:10]
    # f_df=f_df.drop_duplicates(['taxid','hist_var'])


    #2.4 Take one best representative per specific taxonomic rank.
    ################
    title+=' , one seq. per genus, trimmed'
    print "Pruning taxonomy"
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids=list(f_df['taxid']) #old list
    # NOTE(review): the title above says "per genus" but the grouping rank
    # passed here is 'species' - confirm which one is intended.
    grouped_taxids=group_taxids(seqtaxids,rank='species')
    print seqtaxids
    print grouped_taxids
    #Now we need to take best representative
    #refseq NP, or the one with largest degeneracy
    new_gis=list()
    for tids in grouped_taxids:
        t_df=f_df[f_df['taxid'].isin(tids)]
        #take NP if we have it
        if(len(t_df[t_df['RefSeq']==2])>0):
            new_gis.append(t_df.loc[t_df.RefSeq==2,'gi'].values[0])
            continue
        else: # take best degenerate
            t_df=t_df.sort(['degen','RefSeq'],ascending=False) # so that RefSeq record get priority on removing duplicates
            # Without a RefSeq == 2 record, the group contributes a sequence
            # only when its highest degen value exceeds 100.
            if(t_df['degen'].iloc[0]>100):
                new_gis.append(t_df['gi'].iloc[0])

    f_df=f_df[f_df['gi'].isin(new_gis)]

    # new_seqtaxids=subsample_taxids(seqtaxids,rank='species') #new subsampled list
    # f_df=f_df[f_df['taxid'].isin(new_seqtaxids)] #remake the dataframe
    # print "---"

    # exit()


    #2.5. Check seq for sanity
    ################
    # title+=' seqQC '

    # print "Checking sequence quality"
    # newgis=list()
    # for i,row in f_df.iterrows():
    #     gi=row['gi']
    #     seq=fasta_dict[str(gi)].seq
    #     hist_type=row['hist_type']
    #     hist_var=row['hist_var']
    #     if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)):
    #         newgis.append(gi)
    # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    # print len(f_df)

    #3. Make a list of seq with good ids and descriptions
    ####################
    # fasta_dict keys are converted with int() before comparing them to the gi column.
    f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])}
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    # New description = "<hist_var> <taxid2name entry for the row's taxid>".
    f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa=muscle_aln(f_fasta_dict.values(),gapopen=float(-20))
    AlignIO.write(msa, "int_data/example_msa.fasta", "fasta")

    # Second output: the same alignment with an 'annotation' row placed first;
    # spaces in the annotation string are replaced by gap characters.
    msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta")

    # Copy the relabelled descriptions onto the aligned records (with
    # 'canonical' shortened to 'ca'), then order the rows by description.
    for i in range(len(msa)):
        gi=msa[i].id
        msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca')
    msa.sort(key=lambda x: x.description)


    #5. Visualize MSA
    aln2html(msa,'example_h2a.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A alignment",description=True,field1w=10,field2w=35)

    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    # title+=' gaptrim'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr=trim_hist_aln_to_core(msa)
    # msa_tr=msa
    # print get_hist_ss_in_aln_for_shade(msa_tr,below=True)

    # exit()
    #7. Visualize MSA with ete2.
    # Lookup tables used by layout(): taxid -> gi and gi -> variant.
    # If several rows share a taxid, the last gi iterated wins in taxid2gi.
    taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])}
    gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])}

    msa_dict={i.id:i.seq for i in msa_tr}
    print taxid2gi
    t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False)
    # Extra child node named 'annotation'; layout() uses it for the header row.
    a=t.add_child(name='annotation')
    a.add_feature('sci_name','annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()
    def layout(node):
        # Layout callback assigned to ts.layout_fn below; it attaches faces
        # to the node it is given.
        # print node.rank
        # print node.sci_name
        if getattr(node, "rank", None):
            if(node.rank in ['order','class','phylum','kingdom']):   
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        if node.is_leaf() and not node.name=='annotation':
            # Sequence leaf: trimmed aligned sequence plus gi, taxid and variant columns.
            s=str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi=taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('      '+str(int(node.name))+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('      '+str(gi2variant[gi])+' '),node,column=3, position = "aligned")

        if node.is_leaf() and node.name=='annotation':
            # Annotation leaf: annotation string plus the column headers.
            s=get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('       '+'NCBI_TAXID'+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('       '+'Variant'+'       '),node,column=3, position = "aligned")



    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts)


    #10. Conservation
    features=get_hist_ss_in_aln_for_shade(msa_tr,below=True)
    # [-2:-1] keeps only the second-to-last item of add_consensus' result.
    cn=add_consensus(msa_tr,threshold=0.5)[-2:-1]
    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    # The entropy-based profile is shifted by log(20) before plotting.
    plot_prof4seq('example_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation')
    
    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2,m=0)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation')

Пример #11

Показать файл

Файл: 2_H2A_ca_cellular.py Проект: tgorkovets/2015_histone_seq_analysis

def main():
    """Run the canonical-H2A analysis across cellular organisms.

    The function takes no arguments and returns nothing; it is a linear
    script whose results are the files it writes:

    * ``results/h2a_ca_cellular.fasta``        - the MSA returned by ``muscle_aln``
    * ``results/h2a_ca_cellular_annot.fasta``  - the same MSA preceded by an
      'annotation' record built from ``get_hist_ss_in_aln_as_string``
    * ``results/h2a_ca_cellular.html``         - HTML rendering via ``aln2html``
    * ``results/h2a_ca_cellular.svg``          - taxonomy tree with aligned sequences
    * three conservation plots written through ``plot_prof4seq``

    Inputs are read from ``int_data/seqs_rs_redef.csv`` and the pickled
    dictionary ``int_data/fasta_dict.p``.  The number of remaining rows is
    printed after every filtering step.
    """
    title=''
    #1. Getting data
    df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info
    # NOTE(review): the file object handed to pickle.load is never closed
    # explicitly, and unpickling is only safe for trusted input files.
    fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences
    # exit()

    #2. Filtering
    ##########
    #2.1. Narrow by variant/type
    title+='Canonical H2A'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    # Keep rows whose variant is either 'canonical_H2A' or 'H2A.1'.
    f_df=df[(df['hist_var']=='canonical_H2A')|(df['hist_var']=='H2A.1')]
    # f_df=df[(df['hist_type']=='H2A')]
    # exit()
    print len(f_df)
    #2.2. Select one variant per taxid
    # title+=' 1ptax'
    # NOTE(review): DataFrame.sort is the legacy pandas spelling; newer pandas
    # releases only provide sort_values - confirm the pinned pandas version.
    f_df=f_df.sort(['RefSeq'],ascending=False) # so that RefSeq records get priority when duplicates are removed
    f_df=f_df.drop_duplicates(['taxid','hist_var'])


    # exit()
    #2.3. Filter by list of taxonomy clades
    ################
    title+=' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    print "Selecting taxonomic subset"
    # Start from the parent nodes themselves and add every descendant taxid.
    taxids=list(parent_nodes)
    for i in parent_nodes:
        taxids.extend(ncbi.get_descendant_taxa(i,intermediate_nodes=True))
    f_df=f_df[f_df['taxid'].isin(taxids)]
    print len(f_df)
    # exit()

    #2.4 Take one representative per specific taxonomic rank.
    ################
    title+=', one sequence per order'
    print "Pruning taxonomy"
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids=list(f_df['taxid']) #old list
    new_seqtaxids=subsample_taxids(seqtaxids,rank='order') #new subsampled list
    f_df=f_df[f_df['taxid'].isin(new_seqtaxids)] #remake the dataframe
    # print "---"
    print len(f_df)
    # exit()


    #2.5. Check seq for sanity
    ################
    # title+=' seqQC '

    print "Checkig sequence quality"
    # Collect the gi of every row whose sequence passes both length checks.
    newgis=list()
    for i,row in f_df.iterrows():
        gi=row['gi']
        seq=fasta_dict[str(gi)].seq
        hist_type=row['hist_type']
        hist_var=row['hist_var']
        # NOTE(review): '&' is a bitwise AND, so both checks are always
        # evaluated; this presumably relies on the helpers returning bools.
        if(check_hist_length(seq,hist_type,hist_var,1)&check_hist_core_length(seq,hist_type,1)):
            newgis.append(gi)
    f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    print len(f_df)
    # print list(f_df['gi'])
    # exit()

    #3. Make a list of seq with good ids and descriptions
    ####################
    # Keys of fasta_dict are converted with int() before being compared with
    # the 'gi' column; the column list is rebuilt for every key.
    f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])}
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    # Each record keeps its key as id; the description becomes
    # "<hist_var> <name looked up in taxid2name>".
    f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa=muscle_aln(f_fasta_dict.values())
    AlignIO.write(msa, "results/h2a_ca_cellular.fasta", "fasta")

    # Build a one-record alignment holding the annotation string (spaces
    # replaced by '-') and append all aligned sequences after it.
    msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "results/h2a_ca_cellular_annot.fasta", "fasta")

    # Copy the descriptions back onto the aligned records (shortening
    # 'canonical' to 'ca') and order the alignment by description.
    for i in range(len(msa)):
        gi=msa[i].id
        msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca')
    msa.sort(key=lambda x: x.description)


    #5. Visualize MSA
    aln2html(msa,'results/h2a_ca_cellular.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A in cellular organisms",description=True,field1w=10,field2w=35)


    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    title+=', gaps removed'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr=trim_hist_aln_to_core(msa)


    #7. Visualize MSA with ete2.
    # Lookup tables used by the layout function below.  If several gi values
    # share a taxid, the last one processed wins in taxid2gi.
    taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])}
    gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])}

    msa_dict={i.id:i.seq for i in msa_tr}
    print taxid2gi
    t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False)
    # Extra leaf named 'annotation' that carries the column headers and the
    # annotation track in the rendered tree.
    a=t.add_child(name='annotation')
    a.add_feature('sci_name','annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()
    def layout(node):
        """Attach faces to *node*; installed as ``ts.layout_fn`` below."""
        # print node.rank
        # print node.sci_name
        # Label nodes of the listed ranks with their scientific name.
        if getattr(node, "rank", None):
            if(node.rank in ['order','class','phylum','kingdom']):   
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        # Sequence leaves: node.name is converted with int() and used as a
        # taxid to find the gi, whose trimmed sequence is drawn in column 0.
        if node.is_leaf() and not node.name=='annotation':
            s=str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi=taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('      '+str(int(node.name))+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('      '+str(gi2variant[gi])+' '),node,column=3, position = "aligned")

        # Annotation leaf: annotation string plus the column headers.
        if node.is_leaf() and node.name=='annotation':
            s=get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('       '+'NCBI_TAXID'+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('       '+'Variant'+'       '),node,column=3, position = "aligned")



    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("results/h2a_ca_cellular.svg", w=6000, dpi=300, tree_style=ts)


    #10. Conservation
    features=get_hist_ss_in_aln_for_shade(msa_tr,below=True)
    # The [-2:-1] slice keeps only the second-to-last record returned by
    # add_consensus, as a one-element slice.
    cn=add_consensus(msa_tr,threshold=0.5)[-2:-1]

    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    # Each profile value from cons_prof(f=0,c=0) is shifted by log(20) before plotting.
    plot_prof4seq('results/h2a_ca_cellular_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms')

    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('results/h2a_ca_cellular_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms')
    plot_prof4seq('results/h2a_ca_cellular_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms')

Пример #12

Показать файл

Файл: fasta_to_nexus_mb.py Проект: abigail-Moore/baits-analysis

# Help text; it is appended to the error message when the argument count is wrong.
Usage = '''
fasta_to_nexus_mb.py is a script to convert a fasta alignment to a nexus
alignment (hopefully) without messing up the names too badly.
It also makes a separate Mr.Bayes script..
fasta_to_nexus.py [name of fasta alignment, expects an extension of .fa]
'''

# The script takes exactly one argument: the fasta alignment to convert.
if len(sys.argv) != 2:
	sys.exit("ERROR! This script expects one additional argument, and you gave it %d arguments!  %s" % (len(sys.argv), Usage))
InFileName = sys.argv[1]

sys.stderr.write("Alignment %s will be processed.\n" % InFileName)

# Load the fasta alignment into a DNA alignment object.
MyAlignment = MultipleSeqAlignment([], generic_dna)
MyAlignment.extend(AlignIO.read(InFileName, 'fasta'))

# Every one of these characters in a sequence name is replaced by '-'.
BadNameChars = (':', ',', '(', ')', '[', ']', "'", "=")
for record in MyAlignment:
	SeqNameTemp = ''.join(
		'-' if NameChar in BadNameChars else NameChar
		for NameChar in record.id
	)
	record.id = SeqNameTemp

# Drop the last two characters of the input name (the "fa" of ".fa") and
# append "nex", so "x.fa" becomes "x.nex".
OutFileName = "%snex" % InFileName[:-2]

AlignIO.write(MyAlignment, OutFileName, 'nexus')

# First chunk of the MrBayes script: load the nexus file and set the model.
Line = "".join([
	"set autoclose=yes nowarn=yes\nexecute ", OutFileName, "\n",
	"lset nst=6 rates=invgamma\nunlink statefreq=(all) revmat=(all) shape=(all) pinvar=(all)\n",
])
OutList = [Line]

Пример #13

Показать файл

def _outgroup_column_is_suspect(column_bases, iupac_bases):
    """Tell whether one alignment column counts as a suspicious polymorphic site.

    :param column_bases: iterable with the character of every sequence at
        this column.
    :param iupac_bases: collection of IUPAC ambiguity codes.
    :return: True when the column should be recorded, False otherwise.

    Rules (by number of distinct characters in the column):

    * 1 state: never suspicious.
    * 2 states: suspicious only when neither state is a gap nor an
      ambiguity code.
    * 3 states: suspicious, unless one state is a gap and exactly one of
      the others is an ambiguity code.
    * any other count: always suspicious.
    """
    states = {str(base) for base in column_bases}
    n_states = len(states)
    if n_states == 1:
        return False
    if n_states not in (2, 3):
        return True

    # A gap anywhere in the column counts.  The previous loop overwrote its
    # flag on every state, so it only noticed a gap that happened to be the
    # least frequent character.
    has_gap = "-" in states
    n_iupac = sum(1 for base in states if base in iupac_bases)

    if n_states == 2:
        return (not has_gap) and n_iupac == 0
    if has_gap:
        return n_iupac != 1
    return True


def replace_outgroup_with_gap(seq_directory, outgroup_path, window_size = 20, Max_p_sites_o = 8):
    """Mask densely polymorphic columns with gaps in the outgroup sequences.

    For every alignment in ``<s7 dir>/s1_rm_polymorphism_sites/`` the columns
    are scanned in the groups yielded by ``window(range(length), window_size)``
    (``window`` is a helper defined elsewhere; presumably a sliding window).
    When a group holds more than ``Max_p_sites_o`` suspicious columns, those
    columns are remembered.  Records whose id is in the outgroup list then get
    '-' at every remembered column; all other records are copied unchanged.
    The result is written as fasta, under the same file name, to
    ``<s7 dir>/s2_rm_polymorphism_in_outgroups/``.

    :param seq_directory: path containing ``s1_Gene/``; that component is
        replaced by ``s7_well_trimal/`` to locate the input.
    :param outgroup_path: passed to ``input_outgroup`` to obtain the outgroup ids.
    :param window_size: number of columns handed to ``window`` per group.
    :param Max_p_sites_o: a group is flagged when it has more suspicious
        columns than this value (converted with ``float``).
    :return: None; the results are the files written.
    """
    ### define iupac
    iupac_bases = ['m', 'r', 'w', 's', 'y', 'k', 'M', 'R', 'W', 'S', 'Y', 'K', "v", "h", "d", "b", "V", "H",
                   "D", "B"]

    ### input directory from s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")

    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)

    output_directory_1 = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    output_directory_2 = output_directory_1.replace("/s1_rm_polymorphism_sites/","/s2_rm_polymorphism_in_outgroups/")

    if not os.path.isdir(output_directory_2):
        os.makedirs(output_directory_2)

    ### iterate each gene
    for file_name in os.listdir(output_directory_1):
        if file_name == ".DS_Store":
            continue

        output_directory_file = output_directory_2 + file_name
        fasta_name = output_directory_1 + file_name

        ### read each alignment
        for sequence in glob(fasta_name):
            print("sequence: " + sequence)

            alignment = AlignIO.read(sequence, 'fasta')

            ### one row per record, one column per alignment position
            align_array_outgroup = np.array([list(rec) for rec in alignment])
            total_length = alignment.get_alignment_length()

            ### collect the columns of every group that is too polymorphic
            wrong_sites = set()
            for each in window(range(total_length), window_size):
                column_position_outgroup = [
                    column for column in each
                    if _outgroup_column_is_suspect(align_array_outgroup[:, column], iupac_bases)
                ]

                ### more than Max_p_sites_o suspicious sites in this group
                if len(column_position_outgroup) > float(Max_p_sites_o):
                    print(column_position_outgroup)
                    wrong_sites.update(column_position_outgroup)

            unique_wrong_sites_ougroup = sorted(wrong_sites)
            print(unique_wrong_sites_ougroup)
            print("outgroup")

            ### rebuild the alignment, masking the flagged columns in outgroups
            new_records = []
            for record in alignment:
                if record.id in outgroups:
                    print(record.seq)
                    # Set lookup plus a single join keeps this linear in the
                    # alignment length.
                    new_seq = "".join(
                        "-" if position in wrong_sites else base
                        for position, base in enumerate(str(record.seq))
                    )
                else:
                    new_seq = str(record.seq)
                new_records.append(SeqRecord(Seq(new_seq), id=str(record.id)))

            align_2 = MultipleSeqAlignment(new_records)
            print(align_2)

            AlignIO.write(align_2, output_directory_file, "fasta")

Пример #14

Показать файл

def _ingroup_column_is_suspect(column_bases, iupac_bases):
    """Tell whether one alignment column counts as a suspicious polymorphic site.

    :param column_bases: iterable with the character of every sequence at
        this column.
    :param iupac_bases: collection of IUPAC ambiguity codes.
    :return: True when the column should be recorded, False otherwise.

    Rules (by number of distinct characters in the column):

    * 1 state: never suspicious.
    * 2 states: suspicious only when neither state is a gap nor an
      ambiguity code.
    * 3 states: suspicious, unless one state is a gap and exactly one of
      the others is an ambiguity code.
    * any other count: always suspicious.
    """
    states = {str(base) for base in column_bases}
    n_states = len(states)
    if n_states == 1:
        return False
    if n_states not in (2, 3):
        return True

    # A gap anywhere in the column counts.  The previous loop overwrote its
    # flag on every state, so it only noticed a gap that happened to be the
    # least frequent character.
    has_gap = "-" in states
    n_iupac = sum(1 for base in states if base in iupac_bases)

    if n_states == 2:
        return (not has_gap) and n_iupac == 0
    if has_gap:
        return n_iupac != 1
    return True


def rm_wrong_polymorphism_sites(seq_directory, outgroup_path, window_size = 20, Max_p_sites = 4):
    """Remove densely polymorphic columns from every alignment with trimal.

    For every alignment in the ``s6_trimal/`` directory the outgroup records
    are set aside and the remaining columns are scanned in the groups yielded
    by ``window(range(length), window_size)`` (``window`` is a helper defined
    elsewhere; presumably a sliding window).  When a group holds more than
    ``Max_p_sites`` suspicious columns, those columns are remembered.  The
    first ten and the last ten column indices are always added.  ``trimal
    -selectcols`` is then run on the original file (outgroups included) and
    writes to ``<s7 dir>/s1_rm_polymorphism_sites/`` under the same name.

    :param seq_directory: path containing ``s1_Gene/``; that component is
        replaced by ``s6_trimal/`` (input) and ``s7_well_trimal/`` (output).
    :param outgroup_path: passed to ``input_outgroup`` to obtain the outgroup ids.
    :param window_size: number of columns handed to ``window`` per group.
    :param Max_p_sites: a group is flagged when it has more suspicious
        columns than this value (converted with ``float``).
    :return: None; the results are the files written by trimal / cp.
    """
    ### define iupac
    iupac_bases = ['m', 'r', 'w', 's', 'y', 'k', 'M', 'R', 'W', 'S', 'Y', 'K', "v", "h", "d", "b", "V", "H", "D", "B"]

    ### input files are from s6
    genes_result_s6 = seq_directory.replace("s1_Gene/", "s6_trimal/")

    ### output directory for s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")

    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)

    output_directory = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    ### iterate each gene
    for file_name in os.listdir(genes_result_s6):
        if file_name == ".DS_Store":
            continue

        output_directory_file = output_directory + file_name
        fasta_name = genes_result_s6 + file_name

        ### read each alignment
        for sequence in glob(fasta_name):
            print("sequence: " + sequence)

            alignment = AlignIO.read(sequence, 'fasta')

            ### new alignment without the outgroup records
            align = MultipleSeqAlignment([
                SeqRecord(Seq(str(record.seq)), id=str(record.id))
                for record in alignment
                if record.id not in outgroups
            ])
            print(align)

            ### one row per record, one column per alignment position
            align_array = np.array([list(rec) for rec in align])
            total_length = align.get_alignment_length()

            ### collect the columns of every group that is too polymorphic
            wrong_sites = set()
            for each in window(range(total_length), window_size):
                column_position = [
                    column for column in each
                    if _ingroup_column_is_suspect(align_array[:, column], iupac_bases)
                ]

                ### more than Max_p_sites suspicious sites in this group
                if len(column_position) > float(Max_p_sites):
                    print(column_position)
                    wrong_sites.update(column_position)

            ### the first and last ten columns are always removed
            wrong_sites.update(range(10))
            wrong_sites.update(range(total_length - 10, total_length))

            ### sorted, de-duplicated column indices
            unique_wrong_sites = sorted(wrong_sites)
            print(len(unique_wrong_sites))

            ### Because range(10) is always added, the list is never empty
            ### here; the copy branch is kept as a fallback.
            if unique_wrong_sites:
                # Format the indices explicitly as plain integers.  str() of a
                # list of NumPy scalars yields "np.int64(3)" on NumPy >= 2,
                # which corrupted the trimal argument.  The backslashes keep
                # the shell from interpreting the braces.
                cmd_selected_col = (
                    "\\{ "
                    + ",".join(str(int(site)) for site in unique_wrong_sites)
                    + " \\}"
                )
                print(cmd_selected_col)

                # NOTE(review): the paths are pasted unquoted into a shell
                # command, so file names containing spaces will break it.
                cmd = "trimal -in " + fasta_name + " -out " + output_directory_file + " -selectcols " + cmd_selected_col

                print(cmd)
                os.system(cmd)

            else:
                cmd_2 = "cp " + fasta_name + " " + output_directory_file
                print(cmd_2)
                os.system(cmd_2)

Пример #15

Показать файл

Файл: 1_example_seq_sel.py Проект: tgorkovets/MYSOFT

def main():
    title = ''
    #1. Getting data
    ########################################################
    ########################################################
    # df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info #Does not really seem that we need to redefine variants based on best score.
    df = pd.read_csv('int_data/seqs_rs.csv')  #Histone types info
    fasta_dict = pickle.load(open("int_data/fasta_dict.p", "rb"))  #Sequences

    #2. Filtering - filter initial dataset by type, variant and other parameters
    ########################################################
    ########################################################

    #2.1. Narrow by variant/type
    ########################################################
    title += 'H2A'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    f_df = df[(
        (df['hist_var'] == 'canonical_H2A') | (df['hist_var'] == 'H2A.X'))
              & (df['partial'] == False) & (df['non_st_aa'] == False)]
    # f_df=df[((df['hist_var']=='H2A.Z'))&(df['partial']==False)&(df['non_st_aa']==False)]

    # f_df=df[(df['hist_type']=='H2A')]

    print "Number of seqs after narrowing by hist type/var:", len(f_df)

    #2.2. Filter by list of taxonomy clades - restrict sequences to certain taxonomic clades
    #########################################################
    title += ' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes = [
        131567
    ]  #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    #5782 - dictostelium
    #This is akin manual removal of bad species
    del_nodes = [5782, 5690]

    print "Selecting taxonomic subset for taxids: ", parent_nodes
    print "while removing taxonomic subset for taxids: ", del_nodes

    taxids = set(parent_nodes)
    for i in parent_nodes:
        taxids.update(ncbi.get_descendant_taxa(i, intermediate_nodes=True))
    for i in del_nodes:
        taxids = taxids.difference(set([i]))
        taxids = taxids.difference(
            set(ncbi.get_descendant_taxa(i, intermediate_nodes=True)))

    f_df = f_df[f_df['taxid'].isin(taxids)]
    print "Number of seq after taxonomic subset: ", len(f_df)

    #2.3.0 Marking number of identical sequence within each species and subspecies.
    #This will simplify further analysis of sequence filtering on similarity
    #We know that all refseqs are duplicated for instance.
    ################################################
    ident = dict()
    new_gis = list()
    tids = set(list(f_df['taxid']))
    for i in tids:
        # print i.name, i.sci_name
        temp_df = f_df[(f_df['taxid'] == i)]
        gis = list(temp_df['gi'])  #this is to limit exec time
        # print gis
        if (len(gis) > 1):
            res = cluster_seq_support({gi: fasta_dict[str(gi)]
                                       for gi in gis},
                                      ident_thresh=1.00)
            ident.update(res)
        else:
            ident.update({gis[0]: 1})

    f_df['ident'] = [ident.get(k, 1) for k in f_df['gi']]
    #where ident - number of identical sequnces for current sepecies/subspecies.
    print "Identity of sequence inside each taxid determined"

    #2.3.1. Calculate number of similar seqs for every seq in tax group
    #########################################################
    # Use powerful method, to get rid of random errors is to identify identical sequences
    # if a sequence is supported by two or more entires - this is good.
    # Here we add a degen column to our data set - showing how many similar sequences are found
    # for a given sequence in its taxonomic clade (genus currently)

    #We will traverse the species tree by species, genus or family, and determine degeneracy level
    degen = dict()
    new_gis = list()
    tids = list(f_df['taxid'])
    t = ncbi.get_topology(tids, intermediate_nodes=True)
    for i in t.search_nodes(rank='family'):
        # print i.name, i.sci_name
        nodeset = list()
        for k in i.traverse():
            nodeset.append(int(k.name))
        temp_df = f_df[(f_df['taxid'].isin(nodeset))]
        gis = list(temp_df['gi'])  #this is to limit exec time
        # print gis
        res = cluster_seq_support({gi: fasta_dict[str(gi)]
                                   for gi in gis},
                                  ident_thresh=1.00)
        degen.update(res)

    # print degen
    f_df['degen'] = [degen.get(k, 1) for k in f_df['gi']]

    #2.3.2. Remove seqs that do not have support outside their species
    # if they are not curated or RefSeq NP.
    ###########################################################

    f_df = f_df.sort(
        ['RefSeq', 'degen'], ascending=False
    )  # so that RefSeq record get priority on removing duplicates
    f_df = f_df[(f_df['degen'] > f_df['ident']) | (f_df['curated'] == True) |
                (f_df['RefSeq'] == 2)]
    print "After removing mined seqs with no support in neighboring species: ", len(
        f_df)

    #2.3.3. Shuffle sequnces, so that upon further selection, RefSeq and high degeneracy get priority
    ###########################################################
    #RefSeq and degenerate sequence get priority
    # title+=' 1ptax'
    f_df = f_df.sort(
        ['RefSeq', 'degen'], ascending=False
    )  # so that RefSeq record get priority on removing duplicates
    # print f_df[0:10]
    # f_df=f_df.drop_duplicates(['taxid','hist_var'])

    #2.4 Take one best representative per specific taxonomic rank (e.g. genus)
    ############################################################
    pruningrank = 'genus'
    print "Pruning taxonomy by ", pruningrank

    title += ' , one seq. per %s' % pruningrank
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids = list(f_df['taxid'])  #old list
    grouped_taxids = group_taxids(seqtaxids, rank=pruningrank)
    # print seqtaxids
    # print grouped_taxids
    #Now we need to take best representative
    #refseq NP, curated, or the one with largest degeneracy
    new_gis = list()
    for tids in grouped_taxids:
        t_df = f_df[f_df['taxid'].isin(tids)]
        #try take curated first
        if (len(t_df[t_df['curated'] == True]) > 0):
            new_gis.append(t_df.loc[t_df.curated == True, 'gi'].values[0])
            continue
        #try take NP records nest
        #RefSeq 2 means NP, 1 means XP
        if (len(t_df[t_df['RefSeq'] == 2]) > 0):
            new_gis.append(t_df.loc[t_df.RefSeq == 2, 'gi'].values[0])
            continue
        # take best degenerate otherwise
        else:
            t_df = t_df.sort(['degen', 'RefSeq'], ascending=False)
            new_gis.append(t_df['gi'].iloc[0])

    f_df = f_df[f_df['gi'].isin(new_gis)]

    print "After pruning taxonomy we have: ", len(f_df)

    #2.5. Check seq for sanity - needs to be checked!
    ##############################################
    # title+=' seqQC '

    # print "Checkig sequence quality"
    # newgis=list()
    # for i,row in f_df.iterrows():
    #     gi=row['gi']
    #     seq=fasta_dict[str(gi)].seq
    #     hist_type=row['hist_type']
    #     hist_var=row['hist_var']
    #     if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)):
    #         newgis.append(gi)
    # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    # print len(f_df)

    #3. Make a list of seq with good ids and descriptions
    ##############################################

    f_fasta_dict = {
        key: value
        for (key, value) in fasta_dict.iteritems()
        if int(key) in list(f_df['gi'])
    }
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    f_fasta_dict = {
        key: SeqRecord(
            id=key,
            description=f_df.loc[f_df.gi == int(key), 'hist_var'].values[0] +
            ' ' + taxid2name[f_df.loc[f_df.gi == int(key), 'taxid'].values[0]],
            seq=value.seq)
        for (key, value) in f_fasta_dict.iteritems()
    }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa = muscle_aln(f_fasta_dict.values(), gapopen=float(-20))
    AlignIO.write(msa, "int_data/example_msa.fasta", "fasta")

    msa_annot = MultipleSeqAlignment([
        SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(
            ' ', '-')),
                  id='annotation',
                  name='')
    ])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta")

    for i in range(len(msa)):
        gi = msa[i].id
        msa[i].description = f_fasta_dict[gi].description.replace(
            'canonical', 'ca')
    msa.sort(key=lambda x: x.description)

    #5. Visualize MSA############
    aln2html(msa,
             'example_h2a.html',
             features=get_hist_ss_in_aln_for_html(msa, 'H2A', 0),
             title="canonical H2A alignment",
             description=True,
             field1w=10,
             field2w=35)

    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    # title+=' gaptrim'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr = trim_hist_aln_to_core(msa)
    # msa_tr=msa
    # print get_hist_ss_in_aln_for_shade(msa_tr,below=True)

    # exit()

    #7. Vizualize MSA with ete2.##########
    taxid2gi = {
        f_df.loc[f_df.gi == int(gi), 'taxid'].values[0]: gi
        for gi in list(f_df['gi'])
    }
    gi2variant = {
        gi: f_df.loc[f_df.gi == int(gi), 'hist_var'].values[0]
        for gi in list(f_df['gi'])
    }

    msa_dict = {i.id: i.seq for i in msa_tr}
    t = ncbi.get_topology(list(f_df['taxid']), intermediate_nodes=False)
    a = t.add_child(name='annotation')
    a.add_feature('sci_name', 'annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()

    def layout(node):
        # print node.rank
        # print node.sci_name
        if getattr(node, "rank", None):
            if (node.rank in ['order', 'class', 'phylum', 'kingdom']):
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        if node.is_leaf() and not node.name == 'annotation':
            s = str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(
                s, [[0, len(s), "seq", 10, 10, None, None, None]],
                scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi = taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' ' + str(gi) + ' '),
                             node,
                             column=1,
                             position="aligned")
            add_face_to_node(TextFace('      ' + str(int(node.name)) + ' '),
                             node,
                             column=2,
                             position="aligned")
            add_face_to_node(TextFace('      ' + str(gi2variant[gi]) + ' '),
                             node,
                             column=3,
                             position="aligned")

        if node.is_leaf() and node.name == 'annotation':
            s = get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(
                s, [[0, len(s), "seq", 10, 10, None, None, None]],
                scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' ' + 'NCBI_GI' + ' '),
                             node,
                             column=1,
                             position="aligned")
            add_face_to_node(TextFace('       ' + 'NCBI_TAXID' + ' '),
                             node,
                             column=2,
                             position="aligned")
            add_face_to_node(TextFace('       ' + 'Variant' + '       '),
                             node,
                             column=3,
                             position="aligned")

    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts)

    #10. Conservation############
    #############################
    features = get_hist_ss_in_aln_for_shade(msa_tr, below=True)
    cn = add_consensus(msa_tr, threshold=0.5)[-2:-1]
    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_ent_unw',
                  map(lambda x: log(20) + x,
                      map(float, cons_prof(msa_tr, f=0, c=0))),
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_ent_unw_norm',
                  map(lambda x: log(20) + x,
                      map(float, cons_prof(msa_tr, f=0, c=0, norm="T"))),
                  cn,
                  features,
                  axis='conservation')

    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw_renorm1',
                  map(float, cons_prof(msa_tr, f=0, c=2, m=1)),
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_sofp_unw',
                  map(float, cons_prof(msa_tr, f=0, c=2, m=0)),
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_sofp_psic_renorm1',
                  map(float, cons_prof(msa_tr, f=2, c=2, m=1)),
                  cn,
                  features,
                  axis='conservation')

Пример #16

Показать файл

Файл: semitreep.py Проект: abube002/SemiTree---analysis-of-phylogenetic-trees

        ''' PhyChem '''
        myclade.calculate_phchem_conservation(
        )  #find frequency for each column

        myclade.find_phychm_conserved_regions(min_conserved_length, consensus)
        ''' Rate4Site '''
        myclade.run_rate4site()
        ''' OutPut '''
        myclade.print_clade_analysis()  # display

        myclade.write_clade_to_files()  # default: cladename.tre, clade_name.fa

        #cladedict[cladename] = myclade
        cladelist.append(myclade)

        MSA_everycalade.extend(MSA)
        MSAodo_everycalade.extend(ODO)
# Aggregate analysis over every selected clade at once.
# NOTE: the bare triple-quoted strings in this section are no-op expression
# statements that the author uses as section banners.
''' +++ ALL_CLADES +++ 
ALL SELECTED CLADES IN ONE (it might be the case when not all clades are seleced in tree)
'''

# One Clade built from the alignments accumulated in MSA_everycalade and
# MSAodo_everycalade.
all_clades = Clade(fileheader, "ALL_CLADES", tree, allfasta, MSA_everycalade,
                   MSAodo_everycalade, entropy_gap_weight)
''' Oder/DisOrder '''
all_clades.calculate_odo_conservation()  # find O/D frequency for each column
''' AA '''
all_clades.calculate_aa_conservation()  # find AA frequency for each column

# Uses the same min_conserved_length / consensus settings as the per-clade call.
all_clades.find_aa_conserved_regions(min_conserved_length, consensus)
''' AA entropy '''
all_clades.calculate_aa_entropy()

Пример #17

Показать файл

# Prettify labels
def get_label(leaf):
    """Return the text to display for a tree node.

    Nodes whose name starts with ``"Inner"`` and nodes without a name
    (``None`` or empty) get an empty label; for every other node the
    underscores in the name are replaced by spaces.

    Args:
        leaf: any object exposing a ``name`` attribute (string or ``None``).

    Returns:
        The prettified label as a string.
    """
    name = leaf.name
    # Guard against unnamed nodes: calling .startswith on None would raise.
    if not name or name.startswith("Inner"):
        return ""
    return name.replace("_", " ")


# Read the sequences and align
# No aligner is run here: every FASTA record is added to the alignment as-is.
# NOTE(review): presumably the records in data/coding.fa already have equal
# length, as an alignment object is expected to reject rows of differing
# length -- confirm against the input file.
# NOTE(review): Gapped / IUPAC look like they come from Bio.Alphabet, which
# recent Biopython releases no longer ship -- confirm the pinned version.

aln = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
for seq_record in SeqIO.parse("data/coding.fa", "fasta"):
    # for seq_record in SeqIO.parse("data/cons_noncode.fa", "fasta"):
    # Echo id, sequence repr and length of each record as it is loaded.
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))
    aln.extend([seq_record])

# Print the alignment
print(aln)

# Calculate the distance matrix
# 'identity' selects the scoring model passed to DistanceCalculator.
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(aln)

# Print the distance Matrix
print('\nDistance Matrix\n===================')
print(dm)

# Construct the phylogenetic tree using UPGMA algorithm
# (only the constructor is created in this part of the script)
constructor = DistanceTreeConstructor()

Пример #18

Показать файл

Файл: fasta_to_nexus_mb.py Проект: abigail-Moore/baits-analysis

fasta_to_nexus_mb.py is a script to convert a fasta alignment to a nexus
alignment (hopefully) without messing up the names too badly.
It also makes a separate Mr.Bayes script..
fasta_to_nexus.py [name of fasta alignment, expects an extension of .fa]
'''

# Exactly one command-line argument (the FASTA alignment) is required.
# sys.argv[0] is the script itself, so the number of arguments the user
# actually supplied is len(sys.argv) - 1.
if len(sys.argv) != 2:
    sys.exit(
        "ERROR! This script expects one additional argument, and you gave it %d arguments!  %s"
        % (len(sys.argv) - 1, Usage))
InFileName = sys.argv[1]

sys.stderr.write("Alignment %s will be processed.\n" % (InFileName))

MyAlignment = MultipleSeqAlignment([], generic_dna)
MyAlignment.extend(AlignIO.read(InFileName, 'fasta'))

# Replace every occurrence of these characters in the sequence ids with
# '-' before writing the nexus file.
# NOTE(review): presumably these are the characters that clash with
# Nexus/Newick syntax -- confirm.
for record in MyAlignment:
    SeqNameTemp = record.id
    for Char in (':', ',', '(', ')', '[', ']', "'", "="):
        SeqNameTemp = SeqNameTemp.replace(Char, '-')
    record.id = SeqNameTemp

# "name.fa" -> "name.nex". For any other name, append ".nex" instead of
# chopping two characters off an unrelated suffix.
if InFileName.endswith(".fa"):
    OutFileName = InFileName[:-2] + "nex"
else:
    OutFileName = InFileName + ".nex"

AlignIO.write(MyAlignment, OutFileName, 'nexus')

# First lines of the Mr.Bayes script: run non-interactively and load the
# nexus file written above.
OutList = []
Line = "set autoclose=yes nowarn=yes\nexecute " + OutFileName + "\n"
#Line += "lset nst=6 rates=invgamma\nunlink statefreq=(all) revmat=(all) shape=(all) pinvar=(all)\n"
#OutList.append(Line)