Пример #1
0
 def check_matching(self) -> tuple:
     """Link every tree tip to its aligned sequence and collect bookkeeping.

     Each terminal of ``self.tree`` is looked up by name in
     ``self.aligns_as_seqs`` and wrapped, together with the root and the
     path returned by ``self.tree.get_path(tip)``, in a ``TipSeqLinker``.

     Returns:
         A 4-tuple ``(init_clstr, dichord_list, converted_names, sites)``:

         * ``init_clstr`` -- ``defaultdict(list)`` keyed by tip, each value
           a list holding the linker created for that tip.
         * ``dichord_list`` -- the linkers in terminal order.
         * ``converted_names`` -- two-way mapping between each tip name and
           its generated id ``'seq<n>'``.
         * ``sites`` -- ``tuple(range(alignment_length))`` for the alignment
           assembled from the matched records (empty when the tree yields
           no terminals).

     Raises:
         TipNotMatchedError: a tip name has no entry in
             ``self.aligns_as_seqs``.
     """
     init_clstr = defaultdict(list)
     dichord_list = []
     converted_names = {}
     # Start from an empty alignment so that a tree yielding no terminals
     # produces an empty site tuple instead of failing on a None placeholder.
     related_aligns = MultipleSeqAlignment([])
     for n, tip in enumerate(self.tree.get_terminals()):
         tip_name = tip.name
         # Guard only the lookup: a KeyError raised while building the
         # linker must not be misreported as an unmatched tip.
         try:
             seq_record = self.aligns_as_seqs[tip_name]
         except KeyError as err:
             raise TipNotMatchedError(tip) from err
         dichord = TipSeqLinker(
             seq_record,
             (self.tree.root, *self.tree.get_path(tip))
         )
         init_clstr[tip].append(dichord)
         dichord_list.append(dichord)
         new_seq_id = 'seq{}'.format(n)
         converted_names[tip_name] = new_seq_id
         converted_names[new_seq_id] = tip_name
         related_aligns.extend([seq_record])
     return (
         init_clstr, dichord_list, converted_names,
         tuple(range(related_aligns.get_alignment_length()))
     )
Пример #2
0
def needle_alignment(s1, s2):
    '''
DESCRIPTION

    Does a Needleman-Wunsch Alignment of sequence s1 and s2 and
    returns a Bio.Align.MultipleSeqAlignment object.

    Substitution scores are taken from BLOSUM62. Character pairs missing
    from the matrix score 1 when identical and -4 otherwise. The gap open
    and gap extension penalties are -10 and -0.5. Only one alignment is
    requested from pairwise2.

ARGUMENTS

    s1, s2 = sequences as accepted by Bio.pairwise2.align.globalcs

RETURNS

    MultipleSeqAlignment with two records whose ids are "s1" and "s2".
    '''
    from Bio import pairwise2
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    try:
        from Bio.Align import substitution_matrices
    except ImportError:
        # Older Biopython: a dict keyed by residue pairs (half matrix).
        from Bio.SubsMat.MatrixInfo import blosum62
    else:
        blosum62 = substitution_matrices.load("BLOSUM62")

    def match_callback(c1, c2):
        # The legacy dict stores each unordered pair only once, so a lookup
        # in a single order would wrongly fall through to the -4 default.
        for pair in ((c1, c2), (c2, c1)):
            score = blosum62.get(pair, None)
            if score is not None:
                return score
        return 1 if c1 == c2 else -4

    alns = pairwise2.align.globalcs(s1, s2,
            match_callback, -10., -.5,
            one_alignment_only=True)

    # alns[0] is the single requested alignment; items 0 and 1 are the two
    # gapped sequences.
    best = alns[0]
    a = MultipleSeqAlignment([])
    a.extend([SeqRecord(best[0], id="s1"), SeqRecord(best[1], id="s2")])
    return a
Пример #3
0
def main():
    """Time neighbor joining and UPGMA on one FASTA alignment and draw both.

    The records of ``data/coding.fa`` are collected into a gapped DNA
    alignment, an 'identity' distance matrix is computed once, and each
    tree-building method is timed, reported and drawn in turn.
    """
    file_name = "data/coding.fa"
    # Alternative input: "data/cons_noncode.fa"

    alignment = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    alignment.extend(SeqIO.parse(file_name, "fasta"))

    print("Number of characters in alignment:", len(alignment[0]))

    distance_matrix = DistanceCalculator('identity').get_distance(alignment)
    constructor = DistanceTreeConstructor()

    # Both methods share the same distance matrix; only the builder differs.
    methods = (
        ("Neighbor joining ran in {} seconds.", constructor.nj),
        ("UPGMA ran in {} seconds.", constructor.upgma),
    )
    for message, build_tree in methods:
        started = time.time()
        tree = build_tree(distance_matrix)
        finished = time.time()
        print(message.format(finished - started))
        Phylo.draw(tree, label_func=get_label)
Пример #4
0
def main():
    """Train a SOTA model on a slice of the alignment, time it and draw it.

    Records are read from ``data/coding.fa`` into a gapped DNA alignment;
    only ``alignment[:5]`` is handed to ``SOTA``.
    """
    file_name = "data/coding.fa"
    # Alternative input: "data/cons_noncode.fa"
    alignment = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    alignment.extend(SeqIO.parse(file_name, "fasta"))

    model = SOTA(alignment[:5])
    started = time.time()
    model.train()
    finished = time.time()
    message = "Self-organizing tree network ran in {} seconds."
    print(message.format(finished - started))
    model.draw_tree()
Пример #5
0
 def to_alignment(self):
     """Build a MultipleSeqAlignment from this tree's aligned sequences.

     The tree is searched in 'preorder' for ``Sequence`` elements whose
     ``mol_seq.is_aligned`` flag is set. When none is found an empty
     alignment is returned; otherwise the first hit supplies the alphabet
     and every hit is converted with ``to_seqrecord()``.
     """
     def is_aligned_seq(elem):
         return bool(isinstance(elem, Sequence) and elem.mol_seq.is_aligned)

     aligned = self._filter_search(is_aligned_seq, 'preorder', True)
     exhausted = object()
     head = next(aligned, exhausted)
     if head is exhausted:
         # No aligned sequences were found --> empty MSA
         return MultipleSeqAlignment([])
     result = MultipleSeqAlignment([head.to_seqrecord()],
                                   head.get_alphabet())
     result.extend(item.to_seqrecord() for item in aligned)
     return result
Пример #6
0
def main():
    """Time and draw trees from ``Distance_Calculator`` in three modes.

    The alignment is read from ``data/coding.fa``. The default calculator is
    reported as neighbor joining and its distance matrix is also written to
    ``animals.csv``; the "UPGMA" and "WPGMA" modes follow.
    """
    file_name = "data/coding.fa"
    # Alternative input: "data/cons_noncode.fa"
    alignment = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    alignment.extend(SeqIO.parse(file_name, "fasta"))

    def build_and_draw(calculator, message, csv_path=None):
        # One full run: distance matrix, optional CSV dump, timed build, draw.
        matrix = calculator.create_distance_matrix(alignment)
        if csv_path is not None:
            matrix.data.to_csv(csv_path)
        started = time.time()
        calculator.build_tree(matrix)
        finished = time.time()
        print(message.format(finished - started))
        calculator.draw_tree()

    # Neighbor joining (default mode); the only run that exports its matrix.
    build_and_draw(Distance_Calculator(),
                   "Neighbor joining ran in {} seconds.",
                   csv_path="animals.csv")

    # UPGMA
    build_and_draw(Distance_Calculator(mode="UPGMA"),
                   "UPGMA ran in {} seconds.")

    # WPGMA
    build_and_draw(Distance_Calculator(mode="WPGMA"),
                   "WPGMA ran in {} seconds.")
def annotate_hist_msa(msa, htype, variant=None):
    """Return ``msa`` preceded by a feature row taken from features.json.

    The template sequence and its per-residue feature string are read from
    ``inp_data/features.json`` under ``[htype]["General" + htype]`` (keys
    ``"sequence"`` and ``"feature1"``). The template is aligned to a
    consensus of ``msa`` with ``muscle_aln`` and the features are projected
    onto the consensus columns: a column where the template has a gap gets
    ``"-"``, and template residues falling in consensus gaps are dropped.

    Args:
        msa: alignment to annotate; used to build the consensus and appended
            after the feature row.
        htype: key selecting the entry in the JSON file; also stored as the
            description of the feature record.
        variant: unused; kept so existing callers keep working.

    Returns:
        A new ``MultipleSeqAlignment`` whose first record (id
        ``"gi|features|id"``) carries the gapped feature string, followed by
        the records of ``msa``.
    """
    # read json
    with open("inp_data/features.json") as ff:
        general = json.load(ff)[htype]["General" + htype]
    genseq = general["sequence"]
    genf = general["feature1"]

    cons = SummaryInfo(msa).dumb_consensus(threshold=0.1, ambiguous="X")
    sr_c = SeqRecord(id="consensus", seq=cons)
    sr_genseq = SeqRecord(id="template", seq=Seq(genseq))
    auxmsa = muscle_aln([sr_c, sr_genseq])
    # Default sort is by record id, which puts "consensus" before "template".
    auxmsa.sort()

    gapped_cons = str(auxmsa[0].seq)
    gapped_template = str(auxmsa[1].seq)

    # Walk both rows together. The feature index must advance for EVERY
    # template residue, including those in columns dropped because the
    # consensus has a gap there; counting only the kept columns would shift
    # all following features after such an insertion.
    feature_chars = []
    template_pos = 0
    for cons_char, template_char in zip(gapped_cons, gapped_template):
        template_has_residue = template_char != "-"
        if cons_char != "-":
            if template_has_residue:
                feature_chars.append(genf[template_pos])
            else:
                feature_chars.append("-")
        if template_has_residue:
            template_pos += 1
    gapped_genf = "".join(feature_chars)

    newmsa = MultipleSeqAlignment([SeqRecord(id="gi|features|id", description=htype, seq=Seq(gapped_genf))])
    newmsa.extend(msa)
    return newmsa
Пример #8
0
def main():
    """Time exact and heuristic maximum parsimony on one alignment.

    Records from ``data/cons_noncode.fa`` form a gapped DNA alignment.
    ``ParsimonyExact`` (with ``bnb=True``) runs first, then
    ``ParsimonyHeuristics`` (with ``seed=0``); each run is timed, reported
    and drawn with scores shown.
    """
    # Alternative inputs: "data/coding.fa", "data/test.fa"
    file_name = "data/cons_noncode.fa"
    alignment = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    alignment.extend(SeqIO.parse(file_name, "fasta"))

    def run_and_report(solver, message):
        # Time only the search itself; drawing happens after the report.
        started = time.time()
        solver.run(print_best=True)
        finished = time.time()
        print(message.format(finished - started))
        solver.draw_tree(show_scores=True)

    run_and_report(ParsimonyExact(alignment, bnb=True),
                   "Maximum parsimony (exact) ran in {} seconds.")

    print("------------------------------------------------------------------")

    run_and_report(ParsimonyHeuristics(alignment, seed=0),
                   "Maximum parsimony (with heuristics) ran in {} seconds.")
Пример #9
0
def main():
    """Run the hard-coded H2A histone alignment and visualisation pipeline.

    Written for Python 2 (``print`` statements, ``dict.iteritems``) and uses
    ``DataFrame.sort``. All inputs and outputs are fixed paths relative to
    the working directory.

    Stages, numbered as in the inline comments:
      1.  Load the sequence table (CSV) and the pickled sequence dict.
      2.  Filter rows by variant, taxonomy and sequence support, then keep
          one gi per taxonomic group.
      3.  Relabel the selected sequences.
      4.  Align them and write two FASTA files.
      5.  Write an HTML view of the alignment.
      6.  Trim the alignment to the histone core.
      7.  Render the taxonomy tree with aligned sequences to SVG.
      10. Plot conservation profiles.
    """
    title=''
    #1. Getting data
    ########################################################
    ########################################################
    # df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info #Does not really seem that we need to redefine variants based on best score.
    df=pd.read_csv('int_data/seqs_rs.csv') #Histone types info
    # NOTE(review): the file object passed to pickle.load is never closed, and
    # pickle.load must only be used on trusted files.
    fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences
    
    #2. Filtering - filter initial dataset by type, variant and other parameters
    ########################################################
    ########################################################

    #2.1. Narrow by variant/type
    ########################################################
    title+='H2A'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    # Keep canonical_H2A and H2A.X rows that are neither partial nor flagged non_st_aa.
    f_df=df[((df['hist_var']=='canonical_H2A')|(df['hist_var']=='H2A.X'))&(df['partial']==False)&(df['non_st_aa']==False)]
    # f_df=df[((df['hist_var']=='H2A.Z'))&(df['partial']==False)&(df['non_st_aa']==False)]

    # f_df=df[(df['hist_type']=='H2A')]

    print "Number of seqs after narrowing by hist type/var:", len(f_df)
    

    #2.2. Filter by list of taxonomy clades - restrict sequences to certain taxonomic clades
    #########################################################
    title+=' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    #5782 - dictostelium
    #This is akin to manual removal of bad species
    del_nodes=[5782,5690]

    print "Selecting taxonomic subset for taxids: ",parent_nodes
    print "while removing taxonomic subset for taxids: ",del_nodes

    # taxids = parent nodes plus their descendants, minus each del node and its descendants.
    taxids=set(parent_nodes)
    for i in parent_nodes:
        taxids.update(ncbi.get_descendant_taxa(i,intermediate_nodes=True))
    for i in del_nodes:
        taxids=taxids.difference(set([i]))
        taxids=taxids.difference(set(ncbi.get_descendant_taxa(i,intermediate_nodes=True)))

    f_df=f_df[f_df['taxid'].isin(taxids)]
    print "Number of seq after taxonomic subset: ",len(f_df)
    


    #2.3.0 Marking number of identical sequences within each species and subspecies.
    #This will simplify further analysis of sequence filtering on similarity
    #We know that all refseqs are duplicated for instance.
    ################################################
    ident=dict()
    new_gis=list() # not used in this step; re-initialised before its real use in 2.4
    tids=set(list(f_df['taxid']))
    for i in tids:
        # print i.name, i.sci_name
        temp_df=f_df[(f_df['taxid']==i)]
        gis=list(temp_df['gi']) #this is to limit exec time
        # print gis
        # A taxid with a single gi gets ident 1 without calling cluster_seq_support.
        if(len(gis)>1):
            res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=1.00)
            ident.update(res)
        else:
            ident.update({gis[0]:1})

    f_df['ident']=[ident.get(k,1) for k in f_df['gi']]
    #where ident - number of identical sequences for current species/subspecies.
    print "Identity of sequence inside each taxid determined"

    #2.3.1. Calculate number of similar seqs for every seq in tax group
    #########################################################
    # A powerful method to get rid of random errors is to identify identical sequences:
    # if a sequence is supported by two or more entries - this is good.
    # Here we add a degen column to our data set - showing how many similar sequences are found
    # for a given sequence in its taxonomic clade.
    # NOTE(review): the original comment said "genus currently", but the loop below searches rank='family'.

    #We will traverse the species tree by species, genus or family, and determine degeneracy level
    degen=dict()
    new_gis=list()
    tids=list(f_df['taxid']) 
    t = ncbi.get_topology(tids,intermediate_nodes=True)
    for i in t.search_nodes(rank='family'):
        # print i.name, i.sci_name
        # Collect the taxids of every node under this family node.
        nodeset=list()
        for k in i.traverse():
            nodeset.append(int(k.name))
        temp_df=f_df[(f_df['taxid'].isin(nodeset))]
        gis=list(temp_df['gi']) #this is to limit exec time
        # print gis
        res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=1.00)
        degen.update(res)

    # print degen
    # gis not covered by any family node default to degen 1.
    f_df['degen']=[degen.get(k,1) for k in f_df['gi']]

    #2.3.2. Remove seqs that do not have support outside their species
    # if they are not curated or RefSeq NP.
    ###########################################################

    f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates
    # Keep a row when degen exceeds ident, or it is curated, or RefSeq == 2.
    f_df=f_df[(f_df['degen']>f_df['ident'])|(f_df['curated']==True)|(f_df['RefSeq']==2)]
    print "After removing mined seqs with no support in neighboring species: ",len(f_df)

    #2.3.3. Order sequences, so that upon further selection, RefSeq and high degeneracy get priority
    ###########################################################
    #RefSeq and degenerate sequence get priority
    # title+=' 1ptax'
    # NOTE(review): this repeats the sort already applied in 2.3.2 - confirm it is still needed.
    f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates
    # print f_df[0:10]
    # f_df=f_df.drop_duplicates(['taxid','hist_var'])


    #2.4 Take one best representative per specific taxonomic rank (e.g. genus)
    ############################################################
    pruningrank='genus'
    print "Pruning taxonomy by ", pruningrank
    
    title+=' , one seq. per %s'%pruningrank
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids=list(f_df['taxid']) #old list
    grouped_taxids=group_taxids(seqtaxids,rank=pruningrank)
    # print seqtaxids
    # print grouped_taxids
    #Now we need to take best representative
    #refseq NP, curated, or the one with largest degeneracy
    new_gis=list()
    for tids in grouped_taxids:
        t_df=f_df[f_df['taxid'].isin(tids)]
        #try to take curated first
        if(len(t_df[t_df['curated']==True])>0):
            new_gis.append(t_df.loc[t_df.curated==True,'gi'].values[0])
            continue
        #try to take NP records next
        #RefSeq 2 means NP, 1 means XP
        if(len(t_df[t_df['RefSeq']==2])>0):
            new_gis.append(t_df.loc[t_df.RefSeq==2,'gi'].values[0])
            continue
        # take best degenerate otherwise
        else:
            t_df=t_df.sort(['degen','RefSeq'],ascending=False) 
            new_gis.append(t_df['gi'].iloc[0])

    f_df=f_df[f_df['gi'].isin(new_gis)]

    print "After pruning taxonomy we have: ",len(f_df)


    #2.5. Check seq for sanity - needs to be checked!
    ##############################################
    # title+=' seqQC '

    # print "Checkig sequence quality"
    # newgis=list()
    # for i,row in f_df.iterrows():
    #     gi=row['gi']
    #     seq=fasta_dict[str(gi)].seq
    #     hist_type=row['hist_type']
    #     hist_var=row['hist_var']
    #     if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)):
    #         newgis.append(gi)
    # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    # print len(f_df)

    #3. Make a list of seq with good ids and descriptions
    ##############################################

    # fasta_dict keys are converted with int() to match the 'gi' column.
    f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])}
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    # The description becomes "<hist_var> <taxon name>"; the id stays the gi key.
    f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa=muscle_aln(f_fasta_dict.values(),gapopen=float(-20))
    AlignIO.write(msa, "int_data/example_msa.fasta", "fasta")

    # Second FASTA file: an 'annotation' record (spaces replaced by '-') followed by the alignment.
    msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta")

    # Copy the descriptions back onto the aligned records (shortening 'canonical' to 'ca')
    # and order the alignment by description.
    for i in range(len(msa)):
        gi=msa[i].id
        msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca')
    msa.sort(key=lambda x: x.description)


    #5. Visualize MSA############
    aln2html(msa,'example_h2a.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A alignment",description=True,field1w=10,field2w=35)

    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    # title+=' gaptrim'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr=trim_hist_aln_to_core(msa)
    # msa_tr=msa
    # print get_hist_ss_in_aln_for_shade(msa_tr,below=True)

    # exit()

    #7. Visualize MSA with ete2.##########
    # Lookup tables used by layout(): taxid -> gi and gi -> variant.
    # If several gis share a taxid, the last one in f_df wins in taxid2gi.
    taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])}
    gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])}

    msa_dict={i.id:i.seq for i in msa_tr}
    t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False)
    # Extra leaf named 'annotation' that carries the column headers and the feature string.
    a=t.add_child(name='annotation')
    a.add_feature('sci_name','annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()
    def layout(node):
        # Layout callback assigned to ts.layout_fn: adds faces to each node.
        # print node.rank
        # print node.sci_name
        # Nodes of selected higher ranks get their scientific name above the branch.
        if getattr(node, "rank", None):
            if(node.rank in ['order','class','phylum','kingdom']):   
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        # Taxon leaves: trimmed sequence in column 0, then gi, taxid and variant.
        if node.is_leaf() and not node.name=='annotation':
            s=str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi=taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('      '+str(int(node.name))+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('      '+str(gi2variant[gi])+' '),node,column=3, position = "aligned")

        # The 'annotation' leaf: feature string in column 0 and header labels for the other columns.
        if node.is_leaf() and node.name=='annotation':
            s=get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('       '+'NCBI_TAXID'+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('       '+'Variant'+'       '),node,column=3, position = "aligned")



    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts)

    #10. Conservation############
    #############################
    features=get_hist_ss_in_aln_for_shade(msa_tr,below=True)
    # Only the second-to-last record of what add_consensus returns is kept.
    cn=add_consensus(msa_tr,threshold=0.5)[-2:-1]
    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    # The two entropy-based profiles are shifted by log(20) before plotting.
    plot_prof4seq('example_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation')
    plot_prof4seq('example_cons_ent_unw_norm',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0,norm="T"))),cn,features,axis='conservation')
    
    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2,m=0)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation')
def main():
    """Run the hard-coded cenH3 histone alignment and visualisation pipeline.

    Written for Python 2 (``print`` statements, ``dict.iteritems``) and uses
    ``DataFrame.sort``. All inputs and outputs are fixed paths relative to
    the working directory.

    Stages, numbered as in the inline comments:
      1.  Load the sequence table (CSV) and the pickled sequence dict.
      2.  Filter rows by variant and taxonomy, compute a 'degen' support
          value per gi and keep at most one gi per taxonomic group.
      3.  Relabel the selected sequences.
      4.  Align them and write two FASTA files.
      5.  Write an HTML view of the alignment.
      6.  Trim the alignment to the histone core.
      7.  Render the taxonomy tree with aligned sequences to SVG.
      10. Plot conservation profiles.

    NOTE(review): although the rows are filtered to cenH3, the output file
    names, the HTML title and the 'H2A' feature argument still refer to H2A -
    confirm whether that is intended.
    """
    title=''
    #1. Getting data
    df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info
    # NOTE(review): the file object passed to pickle.load is never closed, and
    # pickle.load must only be used on trusted files.
    fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences
    # exit()
    
    #2. Filtering
    ##########
    #2.1. Narrow by variant/type
    title+='CenH3'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    # Keep cenH3 rows that are not flagged partial.
    f_df=df[((df['hist_var']=='cenH3'))&(df['partial']==False)]
    # f_df=df[(df['hist_type']=='H2A')]
    # exit()

    print len(f_df)
    


    #2.2. Filter by list of taxonomy clades   
    ################
    title+=' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    #5782 - dictostelium
    print "Selecting taxonomic subset"
    # taxids = parent nodes plus all their descendants.
    taxids=list(parent_nodes)
    for i in parent_nodes:
        taxids.extend(ncbi.get_descendant_taxa(i,intermediate_nodes=True))
    f_df=f_df[f_df['taxid'].isin(taxids)]
    print len(f_df)
    # exit()
    
    #2.3*. Alternative powerful method, to get rid of random errors in seqs
    #We need to cluster seqs, and select only if we have support by two or more similar seqs 

    #We will traverse the species tree by species, genus or family, and determine degeneracy level
    degen=dict()
    new_gis=list() # not used in this step; re-initialised before its real use in 2.4
    tids=list(f_df['taxid']) 
    t = ncbi.get_topology(tids,intermediate_nodes=True)
    for i in t.search_nodes(rank='genus'):
        # print i.name, i.sci_name
        # Collect the taxids of every node under this genus node.
        nodeset=list()
        for k in i.traverse():
            nodeset.append(int(k.name))
        temp_df=f_df[(f_df['taxid'].isin(nodeset))]
        gis=list(temp_df['gi']) #this is to limit exec time
        # print gis
        res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=0.95)
        degen.update(res)
        # exit()
        # for k,v in res.iteritems():
            # if v>2.0:
                # new_gis.append(k)
    # f_df=f_df[f_df['gi'].isin(new_gis)]
    print degen
    # gis not covered by any genus node default to degen 1.
    f_df['degen']=[degen.get(k,1) for k in f_df['gi']]

    #2.4. #####select one variant per taxid, priority to RefSeq
    # title+=' 1ptax'
    f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates
    print f_df[0:10]
    # f_df=f_df.drop_duplicates(['taxid','hist_var'])


    #2.4 Take one best representative per specific taxonomic rank.
    ################
    # NOTE(review): the title says "per genus" but group_taxids below is called with rank='species' - confirm.
    title+=' , one seq. per genus, trimmed'
    print "Pruning taxonomy"
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids=list(f_df['taxid']) #old list
    grouped_taxids=group_taxids(seqtaxids,rank='species')
    print seqtaxids
    print grouped_taxids
    #Now we need to take best representative
    #refseq NP, or the one with largest degeneracy
    new_gis=list()
    for tids in grouped_taxids:
        t_df=f_df[f_df['taxid'].isin(tids)]
        #take NP if we have it
        if(len(t_df[t_df['RefSeq']==2])>0):
            new_gis.append(t_df.loc[t_df.RefSeq==2,'gi'].values[0])
            continue
        else: # take best degenerate
            t_df=t_df.sort(['degen','RefSeq'],ascending=False) # so that RefSeq record get priority on removing duplicates
            # NOTE(review): a group without a RefSeq==2 row contributes a gi only when its top
            # degen exceeds 100; otherwise the group is dropped - confirm the threshold.
            if(t_df['degen'].iloc[0]>100):
                new_gis.append(t_df['gi'].iloc[0])

    f_df=f_df[f_df['gi'].isin(new_gis)]

    # new_seqtaxids=subsample_taxids(seqtaxids,rank='species') #new subsampled list
    # f_df=f_df[f_df['taxid'].isin(new_seqtaxids)] #remake the dataframe
    # print "---"

    # exit()


    #2.5. Check seq for sanity
    ################
    # title+=' seqQC '

    # print "Checkig sequence quality"
    # newgis=list()
    # for i,row in f_df.iterrows():
    #     gi=row['gi']
    #     seq=fasta_dict[str(gi)].seq
    #     hist_type=row['hist_type']
    #     hist_var=row['hist_var']
    #     if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)):
    #         newgis.append(gi)
    # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    # print len(f_df)

    #3. Make a list of seq with good ids and descriptions
    ####################
    # fasta_dict keys are converted with int() to match the 'gi' column.
    f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])}
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    # The description becomes "<hist_var> <taxon name>"; the id stays the gi key.
    f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa=muscle_aln(f_fasta_dict.values(),gapopen=float(-20))
    AlignIO.write(msa, "int_data/example_msa.fasta", "fasta")

    # Second FASTA file: an 'annotation' record (spaces replaced by '-') followed by the alignment.
    msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta")

    # Copy the descriptions back onto the aligned records (shortening 'canonical' to 'ca')
    # and order the alignment by description.
    for i in range(len(msa)):
        gi=msa[i].id
        msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca')
    msa.sort(key=lambda x: x.description)


    #5. Visualize MSA
    aln2html(msa,'example_h2a.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A alignment",description=True,field1w=10,field2w=35)

    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    # title+=' gaptrim'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr=trim_hist_aln_to_core(msa)
    # msa_tr=msa
    # print get_hist_ss_in_aln_for_shade(msa_tr,below=True)

    # exit()
    #7. Visualize MSA with ete2.
    # Lookup tables used by layout(): taxid -> gi and gi -> variant.
    # If several gis share a taxid, the last one in f_df wins in taxid2gi.
    taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])}
    gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])}

    msa_dict={i.id:i.seq for i in msa_tr}
    print taxid2gi
    t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False)
    # Extra leaf named 'annotation' that carries the column headers and the feature string.
    a=t.add_child(name='annotation')
    a.add_feature('sci_name','annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()
    def layout(node):
        # Layout callback assigned to ts.layout_fn: adds faces to each node.
        # print node.rank
        # print node.sci_name
        # Nodes of selected higher ranks get their scientific name above the branch.
        if getattr(node, "rank", None):
            if(node.rank in ['order','class','phylum','kingdom']):   
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        # Taxon leaves: trimmed sequence in column 0, then gi, taxid and variant.
        if node.is_leaf() and not node.name=='annotation':
            s=str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi=taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('      '+str(int(node.name))+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('      '+str(gi2variant[gi])+' '),node,column=3, position = "aligned")

        # The 'annotation' leaf: feature string in column 0 and header labels for the other columns.
        if node.is_leaf() and node.name=='annotation':
            s=get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('       '+'NCBI_TAXID'+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('       '+'Variant'+'       '),node,column=3, position = "aligned")



    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts)


    #10. Conservation
    features=get_hist_ss_in_aln_for_shade(msa_tr,below=True)
    # Only the second-to-last record of what add_consensus returns is kept.
    cn=add_consensus(msa_tr,threshold=0.5)[-2:-1]
    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    # The entropy-based profile is shifted by log(20) before plotting.
    plot_prof4seq('example_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation')
    
    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2,m=0)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation')
def main():
    """Build, annotate and visualize an alignment of canonical H2A histones.

    The steps follow the numbered comments in the body:

    1.  Load the histone table (CSV) and the pickled dictionary of sequences
        from ``int_data/``.
    2.  Filter the table: keep the canonical H2A / H2A.1 variants, drop
        duplicates per (taxid, variant), keep only descendants of the chosen
        NCBI taxonomy nodes, subsample taxids by rank and run the length
        checks on every remaining sequence.
    3.  Relabel the surviving sequences with variant and organism name.
    4.  Align them with ``muscle_aln`` and write the plain and the annotated
        alignment to ``results/``.
    5.  Write an HTML view of the alignment.
    6.  Trim the alignment to the histone core.
    7.  Render the taxonomy tree with the aligned sequences to SVG.
    10. Plot conservation profiles.

    The function takes no arguments and returns nothing; all results are
    written to files under ``results/``.  It relies on helpers and objects
    defined elsewhere in the module (``ncbi``, ``muscle_aln``,
    ``subsample_taxids``, ``plot_prof4seq`` and others).
    """
    title=''

    #1. Getting data
    df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info
    # Use a context manager so the file handle is closed instead of leaked.
    # NOTE: unpickling runs arbitrary code - only load trusted files here.
    with open("int_data/fasta_dict.p", "rb") as fasta_file:
        fasta_dict=pickle.load(fasta_file) #Sequences

    #2. Filtering
    ##########
    #2.1. Narrow by variant/type
    title+='Canonical H2A'
    f_df=df[(df['hist_var']=='canonical_H2A')|(df['hist_var']=='H2A.1')]
    print(len(f_df))

    #2.2. Select one variant per taxid
    # Sort first so that RefSeq records get priority when duplicates are removed.
    # NOTE(review): DataFrame.sort only exists in old pandas releases; newer
    # ones call it sort_values - confirm which pandas this script targets.
    f_df=f_df.sort(['RefSeq'],ascending=False)
    f_df=f_df.drop_duplicates(['taxid','hist_var'])

    #2.3. Filter by list of taxonomy clades
    ################
    title+=' across cellular organisms'
    #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    parent_nodes=[131567]
    print("Selecting taxonomic subset")
    taxids=list(parent_nodes)
    for i in parent_nodes:
        taxids.extend(ncbi.get_descendant_taxa(i,intermediate_nodes=True))
    f_df=f_df[f_df['taxid'].isin(taxids)]
    print(len(f_df))

    #2.4 Take one representative per specific taxonomic rank.
    ################
    title+=', one sequence per order'
    print("Pruning taxonomy")
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids=list(f_df['taxid']) #old list
    new_seqtaxids=subsample_taxids(seqtaxids,rank='order') #new subsampled list
    f_df=f_df[f_df['taxid'].isin(new_seqtaxids)] #remake the dataframe
    print(len(f_df))

    #2.5. Check seq for sanity
    ################
    print("Checkig sequence quality")
    newgis=list()
    for i,row in f_df.iterrows():
        gi=row['gi']
        seq=fasta_dict[str(gi)].seq
        hist_type=row['hist_type']
        hist_var=row['hist_var']
        # '&' (not 'and') is kept on purpose: both checks are always evaluated.
        if(check_hist_length(seq,hist_type,hist_var,1)&check_hist_core_length(seq,hist_type,1)):
            newgis.append(gi)
    f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    print(len(f_df))

    #3. Make a list of seq with good ids and descriptions
    ####################
    # Build the set of selected GIs once; the previous code rebuilt a list and
    # scanned it linearly for every key of fasta_dict.
    selected_gis=set(f_df['gi'])
    f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in selected_gis}
    print(len(f_fasta_dict))
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    f_fasta_dict={
        key: SeqRecord(
            id=key,
            description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],
            seq=value.seq)
        for (key,value) in f_fasta_dict.iteritems()
    }

    #4. Make MSA
    #################
    #Here we construct MSA
    msa=muscle_aln(f_fasta_dict.values())
    AlignIO.write(msa, "results/h2a_ca_cellular.fasta", "fasta")

    # Same alignment with an extra first record holding the annotation string
    # (spaces turned into gap characters so it is a valid alignment row).
    msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "results/h2a_ca_cellular_annot.fasta", "fasta")

    # Restore the descriptions on the aligned records and order them by it.
    for record in msa:
        record.description=f_fasta_dict[record.id].description.replace('canonical','ca')
    msa.sort(key=lambda x: x.description)

    #5. Visualize MSA
    aln2html(msa,'results/h2a_ca_cellular.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A in cellular organisms",description=True,field1w=10,field2w=35)

    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    # NOTE(review): the title still says "gaps removed" although the gap
    # trimming call below is disabled - confirm which one is intended.
    title+=', gaps removed'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr=trim_hist_aln_to_core(msa)

    #7. Vizualize MSA with ete2.
    taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])}
    gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])}

    msa_dict={i.id:i.seq for i in msa_tr}
    print(taxid2gi)
    t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False)
    # Extra leaf that carries the annotation row and the column headers.
    a=t.add_child(name='annotation')
    a.add_feature('sci_name','annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()
    def layout(node):
        """Attach name, sequence and text faces to a tree node."""
        # Label nodes of the major ranks on top of their branch.
        if getattr(node, "rank", None):
            if(node.rank in ['order','class','phylum','kingdom']):
                rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred")
                node.add_face(rank_face, column=0, position="branch-top")
        if node.is_leaf():
            sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue")
            node.add_face(sciname_face, column=0, position="branch-right")
        # Sequence leaves: node.name is the taxid, mapped to its GI and sequence.
        if node.is_leaf() and not node.name=='annotation':
            s=str(msa_dict[str(taxid2gi[int(node.name)])])
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            gi=taxid2gi[int(node.name)]
            add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('      '+str(int(node.name))+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('      '+str(gi2variant[gi])+' '),node,column=3, position = "aligned")

        # Annotation leaf: annotation string plus the column headers.
        if node.is_leaf() and node.name=='annotation':
            s=get_hist_ss_in_aln_as_string(msa_tr)
            seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1)
            add_face_to_node(seqFace, node, 0, position="aligned")
            add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned")
            add_face_to_node(TextFace('       '+'NCBI_TAXID'+' '),node,column=2, position = "aligned")
            add_face_to_node(TextFace('       '+'Variant'+'       '),node,column=3, position = "aligned")

    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("results/h2a_ca_cellular.svg", w=6000, dpi=300, tree_style=ts)

    #10. Conservation
    features=get_hist_ss_in_aln_for_shade(msa_tr,below=True)
    cn=add_consensus(msa_tr,threshold=0.5)[-2:-1]

    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    plot_prof4seq('results/h2a_ca_cellular_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms')

    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('results/h2a_ca_cellular_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms')
    plot_prof4seq('results/h2a_ca_cellular_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms')
# Script section: convert a FASTA alignment given on the command line to
# NEXUS and start assembling the commands of a MrBayes batch script.
import os  # for os.path.splitext below; harmless if already imported

Usage = '''
fasta_to_nexus_mb.py is a script to convert a fasta alignment to a nexus
alignment (hopefully) without messing up the names too badly.
It also makes a separate Mr.Bayes script..
fasta_to_nexus.py [name of fasta alignment, expects an extension of .fa]
'''

if len(sys.argv) != 2:
	# sys.argv[0] is the script name, so the user supplied len(sys.argv) - 1 arguments.
	sys.exit("ERROR! This script expects one additional argument, and you gave it %d arguments!  %s" % (len(sys.argv) - 1, Usage))
InFileName = sys.argv[1]

sys.stderr.write("Alignment %s will be processed.\n" % (InFileName))

MyAlignment = MultipleSeqAlignment([], generic_dna)
MyAlignment.extend(AlignIO.read(InFileName, 'fasta'))
# Replace every character of this set in the sequence names with '-'.
for record in MyAlignment:
	SeqNameTemp = record.id
	for Char in ":,()[]'=":
		SeqNameTemp = SeqNameTemp.replace(Char, '-')
	record.id = SeqNameTemp

# Swap the extension for ".nex". For a ".fa" input this gives the same name
# as before; other extensions no longer produce names such as "x.fasnex".
OutFileName = os.path.splitext(InFileName)[0] + ".nex"

AlignIO.write(MyAlignment,OutFileName,'nexus')

# First commands of the MrBayes script: load the alignment, set the model.
OutList = [ ]
Line = "set autoclose=yes nowarn=yes\nexecute "+OutFileName+"\n"
Line += "lset nst=6 rates=invgamma\nunlink statefreq=(all) revmat=(all) shape=(all) pinvar=(all)\n"
OutList.append(Line)
Пример #13
0
def replace_outgroup_with_gap(seq_directory, outgroup_path, window_size = 20, Max_p_sites_o = 8):
    """Mask densely polymorphic windows in the outgroup sequences with gaps.

    Every alignment found in ``<s7>/s1_rm_polymorphism_sites/`` is read, where
    ``<s7>`` is ``seq_directory`` with ``s1_Gene/`` replaced by
    ``s7_well_trimal/``.  Each column of the whole alignment is classified as
    suspect or not, the windows produced by the project helper ``window`` are
    scanned, and the suspect columns of every window holding more than
    ``Max_p_sites_o`` of them are collected.  In the records whose id is in
    the outgroup list those columns are replaced by ``-``; all other records
    are copied unchanged.  The result is written, under the same file name,
    to ``<s7>/s2_rm_polymorphism_in_outgroups/`` (created when missing).

    Args:
        seq_directory: path containing ``s1_Gene/``; used to derive the
            input and output folders.
        outgroup_path: passed to ``input_outgroup`` to obtain the outgroup ids.
        window_size: window length handed to ``window``.
        Max_p_sites_o: a window is masked when it holds more than this many
            suspect columns.

    Returns:
        None.  Alignments are written to disk and progress is printed.
    """
    ### define iupac (the ambiguity codes that are tolerated in a column)
    iupac_bases = frozenset("mrwsykMRWSYKvhdbVHDB")

    ### input directory from s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")

    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)

    output_directory_1 = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    output_directory_2 = output_directory_1.replace("/s1_rm_polymorphism_sites/","/s2_rm_polymorphism_in_outgroups/")

    if not os.path.isdir(output_directory_2):
        os.makedirs(output_directory_2)

    ### iterate each gene
    for file_name in os.listdir(output_directory_1):
        if file_name == ".DS_Store":
            continue

        output_directory_file = output_directory_2 + file_name
        fasta_name = output_directory_1 + file_name

        ### read each alignment
        for sequence in glob(fasta_name):
            print("sequence: " + sequence)

            alignment = AlignIO.read(sequence, 'fasta')

            ### change alignment to an array (rows = records, columns = sites)
            align_array_outgroup = np.array([list(rec) for rec in alignment])

            ### calculate the whole length of the alignment
            total_length = alignment.get_alignment_length()

            ### classify every column once; the windows below only look the
            ### result up instead of re-counting each column per window
            suspect_column = [
                _column_is_suspect_for_outgroup(
                    align_array_outgroup[:, column].tolist(), iupac_bases)
                for column in range(total_length)
            ]

            total_wrong_poly_sites_outgroup = set()
            for each in window(range(total_length), window_size):
                column_position_outgroup = [
                    column for column in each if suspect_column[column]
                ]

                ### if there are more than Max_p_sites_o suspect sites in the
                ### window, select those sites positions.
                if len(column_position_outgroup) > float(Max_p_sites_o):
                    print(column_position_outgroup)
                    total_wrong_poly_sites_outgroup.update(column_position_outgroup)

            unique_wrong_sites_outgroup = sorted(total_wrong_poly_sites_outgroup)
            print(unique_wrong_sites_outgroup)
            print("outgroup")

            align_2 = MultipleSeqAlignment([])
            for record in alignment:
                if record.id in outgroups:
                    print(record.seq)
                    ### gap out the selected columns in the outgroup only
                    new_seq = "".join(
                        "-" if position in total_wrong_poly_sites_outgroup else base
                        for position, base in enumerate(str(record.seq))
                    )
                else:
                    new_seq = str(record.seq)

                align_2.extend([SeqRecord(Seq(new_seq), id=str(record.id))])

            print(align_2)

            AlignIO.write(align_2, output_directory_file, "fasta")


def _column_is_suspect_for_outgroup(symbols, iupac_bases):
    """Tell whether one alignment column counts as a suspect polymorphic site.

    ``symbols`` holds the characters of the column, ``iupac_bases`` the
    tolerated ambiguity codes.  With d distinct symbols in the column:

    * d == 1: never suspect;
    * d == 2: suspect only when neither symbol is a gap or an IUPAC code;
    * d == 3: suspect unless the column is a gap plus exactly one IUPAC code
      (plus one other symbol);
    * otherwise: always suspect.

    The gap test looks at every symbol.  The earlier loop reset its flag on
    each iteration, so a gap was only noticed when it was the least frequent
    symbol of the column.
    """
    counts = Counter(symbols)
    distinct = len(counts)
    if distinct == 1:
        return False

    has_gap = "-" in counts
    iupac_count = sum(1 for symbol in counts if symbol in iupac_bases)

    if distinct == 2:
        return not has_gap and iupac_count == 0
    if distinct == 3:
        return not (has_gap and iupac_count == 1)
    return True
Пример #14
0
def rm_wrong_polymorphism_sites(seq_directory, outgroup_path, window_size = 20, Max_p_sites = 4):
    """Remove densely polymorphic windows and the alignment ends with trimal.

    Every alignment in the ``s6_trimal/`` folder (``seq_directory`` with
    ``s1_Gene/`` replaced by ``s6_trimal/``) is read.  The records whose id is
    not in the outgroup list are used to classify each column as suspect or
    not; the windows produced by the project helper ``window`` are scanned and
    the suspect columns of every window holding more than ``Max_p_sites`` of
    them are collected.  The first and last 10 columns are always added.
    ``trimal -selectcols`` is then run on the original file to drop those
    columns, writing to ``<s7>/s1_rm_polymorphism_sites/`` (created when
    missing).  If no column is selected the file is copied with ``cp``.

    Args:
        seq_directory: path containing ``s1_Gene/``; used to derive the
            input and output folders.
        outgroup_path: passed to ``input_outgroup`` to obtain the outgroup ids.
        window_size: window length handed to ``window``.
        Max_p_sites: a window is selected when it holds more than this many
            suspect columns.

    Returns:
        None.  Files are written by the external commands and progress is
        printed.

    Raises:
        OSError: if the ``trimal`` (or ``cp``) executable cannot be started.
    """
    # Local import so the block does not depend on the file's import section.
    import subprocess

    ### number of columns always removed at each end of the alignment
    edge_columns = 10

    ### define iupac (the ambiguity codes that are tolerated in a column)
    iupac_bases = frozenset("mrwsykMRWSYKvhdbVHDB")

    ### input files are from s6
    genes_result_s6 = seq_directory.replace("s1_Gene/", "s6_trimal/")

    ### mkdir output directory for s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")

    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)

    output_directory = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    ### iterate each gene
    for file_name in os.listdir(genes_result_s6):
        if file_name == ".DS_Store":
            continue

        output_directory_file = output_directory + file_name
        fasta_name = genes_result_s6 + file_name

        ### read each alignment
        for sequence in glob(fasta_name):
            print("sequence: " +sequence)

            alignment = AlignIO.read(sequence, 'fasta')

            ### generate a new alignment without outgroups.
            align = MultipleSeqAlignment([])
            for record in alignment:
                if record.id not in outgroups:
                    align.extend([SeqRecord(Seq(str(record.seq)), id=str(record.id))])

            print(align)

            ### change alignment to an array (rows = records, columns = sites)
            align_array = np.array([list(rec) for rec in align])

            ### calculate the whole length of the alignment
            total_length = align.get_alignment_length()

            ### classify every column once; the windows below only look the
            ### result up instead of re-counting each column per window
            suspect_column = [
                _column_is_wrong_polymorphism(
                    align_array[:, column].tolist(), iupac_bases)
                for column in range(total_length)
            ]

            total_wrong_poly_sites = set()
            for each in window(range(total_length), window_size):
                column_position = [
                    column for column in each if suspect_column[column]
                ]

                ### if there are more than Max_p_sites suspect sites in the
                ### window, select those sites positions.
                if len(column_position) > float(Max_p_sites):
                    print(column_position)
                    total_wrong_poly_sites.update(column_position)

            ### always drop the alignment ends; clamp to the real column range
            ### so short alignments never yield negative or too-large indices
            total_wrong_poly_sites.update(range(min(edge_columns, total_length)))
            total_wrong_poly_sites.update(
                range(max(total_length - edge_columns, 0), total_length))

            ### unique, sorted positions as plain ints (numpy integers would
            ### not format as bare numbers under every NumPy version)
            unique_wrong_sites = sorted(int(column) for column in total_wrong_poly_sites)
            print(len(unique_wrong_sites))

            ### operating: if any column was selected, use trimal to remove
            ### those columns. otherwise, copy the gene to the new folder.
            ### Because of the end trimming the copy branch is only reached
            ### for an alignment without columns.
            if len(unique_wrong_sites) > 0:
                cmd_selected_col = ",".join(str(column) for column in unique_wrong_sites)
                print("{ " + cmd_selected_col + " }")

                ### argument list instead of a shell string: paths with spaces
                ### or shell metacharacters are passed through unchanged
                cmd = ["trimal", "-in", fasta_name, "-out", output_directory_file,
                       "-selectcols", "{", cmd_selected_col, "}"]

                print(" ".join(cmd))
                subprocess.call(cmd)

            else:
                cmd_2 = ["cp", fasta_name, output_directory_file]
                print(" ".join(cmd_2))
                subprocess.call(cmd_2)


def _column_is_wrong_polymorphism(symbols, iupac_bases):
    """Tell whether one alignment column counts as a suspect polymorphic site.

    ``symbols`` holds the characters of the column, ``iupac_bases`` the
    tolerated ambiguity codes.  With d distinct symbols in the column:

    * d == 1: never suspect;
    * d == 2: suspect only when neither symbol is a gap or an IUPAC code;
    * d == 3: suspect unless the column is a gap plus exactly one IUPAC code
      (plus one other symbol);
    * otherwise: always suspect.

    The gap test looks at every symbol.  The earlier loop reset its flag on
    each iteration, so a gap was only noticed when it was the least frequent
    symbol of the column.
    """
    counts = Counter(symbols)
    distinct = len(counts)
    if distinct == 1:
        return False

    has_gap = "-" in counts
    iupac_count = sum(1 for symbol in counts if symbol in iupac_bases)

    if distinct == 2:
        return not has_gap and iupac_count == 0
    if distinct == 3:
        return not (has_gap and iupac_count == 1)
    return True
Пример #15
0
def main():
    title = ''
    #1. Getting data
    ########################################################
    ########################################################
    # df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info #Does not really seem that we need to redefine variants based on best score.
    df = pd.read_csv('int_data/seqs_rs.csv')  #Histone types info
    fasta_dict = pickle.load(open("int_data/fasta_dict.p", "rb"))  #Sequences

    #2. Filtering - filter initial dataset by type, variant and other parameters
    ########################################################
    ########################################################

    #2.1. Narrow by variant/type
    ########################################################
    title += 'H2A'
    # f_df=df[(df['hist_var']=='canonical_H4')]
    # f_df['hist_var']='canonical_H4'
    f_df = df[(
        (df['hist_var'] == 'canonical_H2A') | (df['hist_var'] == 'H2A.X'))
              & (df['partial'] == False) & (df['non_st_aa'] == False)]
    # f_df=df[((df['hist_var']=='H2A.Z'))&(df['partial']==False)&(df['non_st_aa']==False)]

    # f_df=df[(df['hist_type']=='H2A')]

    print "Number of seqs after narrowing by hist type/var:", len(f_df)

    #2.2. Filter by list of taxonomy clades - restrict sequences to certain taxonomic clades
    #########################################################
    title += ' across cellular organisms'
    # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    parent_nodes = [
        131567
    ]  #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates
    #33682 - euglenozoa
    #6656 - arthropods
    # 4751 - fungi
    #5782 - dictostelium
    #This is akin manual removal of bad species
    del_nodes = [5782, 5690]

    print "Selecting taxonomic subset for taxids: ", parent_nodes
    print "while removing taxonomic subset for taxids: ", del_nodes

    taxids = set(parent_nodes)
    for i in parent_nodes:
        taxids.update(ncbi.get_descendant_taxa(i, intermediate_nodes=True))
    for i in del_nodes:
        taxids = taxids.difference(set([i]))
        taxids = taxids.difference(
            set(ncbi.get_descendant_taxa(i, intermediate_nodes=True)))

    f_df = f_df[f_df['taxid'].isin(taxids)]
    print "Number of seq after taxonomic subset: ", len(f_df)

    #2.3.0 Marking number of identical sequence within each species and subspecies.
    #This will simplify further analysis of sequence filtering on similarity
    #We know that all refseqs are duplicated for instance.
    ################################################
    ident = dict()
    new_gis = list()
    tids = set(list(f_df['taxid']))
    for i in tids:
        # print i.name, i.sci_name
        temp_df = f_df[(f_df['taxid'] == i)]
        gis = list(temp_df['gi'])  #this is to limit exec time
        # print gis
        if (len(gis) > 1):
            res = cluster_seq_support({gi: fasta_dict[str(gi)]
                                       for gi in gis},
                                      ident_thresh=1.00)
            ident.update(res)
        else:
            ident.update({gis[0]: 1})

    f_df['ident'] = [ident.get(k, 1) for k in f_df['gi']]
    #where ident - number of identical sequnces for current sepecies/subspecies.
    print "Identity of sequence inside each taxid determined"

    #2.3.1. Calculate number of similar seqs for every seq in tax group
    #########################################################
    # Use powerful method, to get rid of random errors is to identify identical sequences
    # if a sequence is supported by two or more entires - this is good.
    # Here we add a degen column to our data set - showing how many similar sequences are found
    # for a given sequence in its taxonomic clade (genus currently)

    #We will traverse the species tree by species, genus or family, and determine degeneracy level
    degen = dict()
    new_gis = list()
    tids = list(f_df['taxid'])
    t = ncbi.get_topology(tids, intermediate_nodes=True)
    for i in t.search_nodes(rank='family'):
        # print i.name, i.sci_name
        nodeset = list()
        for k in i.traverse():
            nodeset.append(int(k.name))
        temp_df = f_df[(f_df['taxid'].isin(nodeset))]
        gis = list(temp_df['gi'])  #this is to limit exec time
        # print gis
        res = cluster_seq_support({gi: fasta_dict[str(gi)]
                                   for gi in gis},
                                  ident_thresh=1.00)
        degen.update(res)

    # print degen
    f_df['degen'] = [degen.get(k, 1) for k in f_df['gi']]

    #2.3.2. Remove seqs that do not have support outside their species
    # if they are not curated or RefSeq NP.
    ###########################################################

    f_df = f_df.sort(
        ['RefSeq', 'degen'], ascending=False
    )  # so that RefSeq record get priority on removing duplicates
    f_df = f_df[(f_df['degen'] > f_df['ident']) | (f_df['curated'] == True) |
                (f_df['RefSeq'] == 2)]
    print "After removing mined seqs with no support in neighboring species: ", len(
        f_df)

    #2.3.3. Shuffle sequnces, so that upon further selection, RefSeq and high degeneracy get priority
    ###########################################################
    #RefSeq and degenerate sequence get priority
    # title+=' 1ptax'
    f_df = f_df.sort(
        ['RefSeq', 'degen'], ascending=False
    )  # so that RefSeq record get priority on removing duplicates
    # print f_df[0:10]
    # f_df=f_df.drop_duplicates(['taxid','hist_var'])

    #2.4 Take one best representative per specific taxonomic rank (e.g. genus)
    ############################################################
    pruningrank = 'genus'
    print "Pruning taxonomy by ", pruningrank

    title += ' , one seq. per %s' % pruningrank
    #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies
    seqtaxids = list(f_df['taxid'])  #old list
    grouped_taxids = group_taxids(seqtaxids, rank=pruningrank)
    # print seqtaxids
    # print grouped_taxids
    #Now we need to take best representative
    #refseq NP, curated, or the one with largest degeneracy
    new_gis = list()
    for tids in grouped_taxids:
        t_df = f_df[f_df['taxid'].isin(tids)]
        #try take curated first
        if (len(t_df[t_df['curated'] == True]) > 0):
            new_gis.append(t_df.loc[t_df.curated == True, 'gi'].values[0])
            continue
        #try take NP records nest
        #RefSeq 2 means NP, 1 means XP
        if (len(t_df[t_df['RefSeq'] == 2]) > 0):
            new_gis.append(t_df.loc[t_df.RefSeq == 2, 'gi'].values[0])
            continue
        # take best degenerate otherwise
        else:
            t_df = t_df.sort(['degen', 'RefSeq'], ascending=False)
            new_gis.append(t_df['gi'].iloc[0])

    f_df = f_df[f_df['gi'].isin(new_gis)]

    print "After pruning taxonomy we have: ", len(f_df)

    #2.5. Check seq for sanity - needs to be checked!
    ##############################################
    # title+=' seqQC '

    # print "Checkig sequence quality"
    # newgis=list()
    # for i,row in f_df.iterrows():
    #     gi=row['gi']
    #     seq=fasta_dict[str(gi)].seq
    #     hist_type=row['hist_type']
    #     hist_var=row['hist_var']
    #     if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)):
    #         newgis.append(gi)
    # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe
    # print len(f_df)

    #3. Make a list of seq with good ids and descriptions
    ##############################################

    f_fasta_dict = {
        key: value
        for (key, value) in fasta_dict.iteritems()
        if int(key) in list(f_df['gi'])
    }
    print len(f_fasta_dict)
    taxid2name = ncbi.get_taxid_translator(list(f_df['taxid']))
    #Relabel sequences gi=> type and organism
    f_fasta_dict = {
        key: SeqRecord(
            id=key,
            description=f_df.loc[f_df.gi == int(key), 'hist_var'].values[0] +
            ' ' + taxid2name[f_df.loc[f_df.gi == int(key), 'taxid'].values[0]],
            seq=value.seq)
        for (key, value) in f_fasta_dict.iteritems()
    }
    #with arbitrary index
    # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) }
    # exit()

    #4. Make MSA
    #################
    #Here we construct MSA
    msa = muscle_aln(f_fasta_dict.values(), gapopen=float(-20))
    AlignIO.write(msa, "int_data/example_msa.fasta", "fasta")

    msa_annot = MultipleSeqAlignment([
        SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(
            ' ', '-')),
                  id='annotation',
                  name='')
    ])
    msa_annot.extend(msa)
    AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta")

    for i in range(len(msa)):
        gi = msa[i].id
        msa[i].description = f_fasta_dict[gi].description.replace(
            'canonical', 'ca')
    msa.sort(key=lambda x: x.description)

    #5. Visualize MSA############
    aln2html(msa,
             'example_h2a.html',
             features=get_hist_ss_in_aln_for_html(msa, 'H2A', 0),
             title="canonical H2A alignment",
             description=True,
             field1w=10,
             field2w=35)

    #6. Trim alignment - this is optional
    #6.1. Trim gaps
    # title+=' gaptrim'
    # msa_tr=trim_aln_gaps(msa,threshold=0.8)

    #6.2. Trim to histone core sequence
    msa_tr = trim_hist_aln_to_core(msa)
    # msa_tr=msa
    # print get_hist_ss_in_aln_for_shade(msa_tr,below=True)

    # exit()

    #7. Visualize MSA with ete2.##########
    taxid2gi = {
        f_df.loc[f_df.gi == int(gi), 'taxid'].values[0]: gi
        for gi in list(f_df['gi'])
    }
    gi2variant = {
        gi: f_df.loc[f_df.gi == int(gi), 'hist_var'].values[0]
        for gi in list(f_df['gi'])
    }

    msa_dict = {i.id: i.seq for i in msa_tr}
    t = ncbi.get_topology(list(f_df['taxid']), intermediate_nodes=False)
    a = t.add_child(name='annotation')
    a.add_feature('sci_name', 'annotation')
    t.sort_descendants(attr='sci_name')
    ts = TreeStyle()

    def layout(node):
        """Decorate one tree node; installed as the tree style's layout function.

        * A node whose ``rank`` is order, class, phylum or kingdom gets its
          ``sci_name`` attribute drawn above the branch.
        * Every leaf gets its ``sci_name`` drawn to the right of the branch.
        * Every leaf also gets an aligned row: a sequence face in column 0
          and three text faces in columns 1-3.  For the leaf named
          'annotation' the row is the annotation string of ``msa_tr`` plus
          column headings; for any other leaf the node name is read as a
          taxid and the row shows its trimmed sequence, GI, taxid and
          variant.
        """

        def add_aligned_row(sequence, labels):
            # Column 0: one SeqMotifFace covering the whole sequence.
            motif_face = SeqMotifFace(
                sequence,
                [[0, len(sequence), "seq", 10, 10, None, None, None]],
                scale_factor=1)
            add_face_to_node(motif_face, node, 0, position="aligned")
            # Columns 1..3: the text labels, in the order given.
            for col, text in enumerate(labels, 1):
                add_face_to_node(TextFace(text),
                                 node,
                                 column=col,
                                 position="aligned")

        if getattr(node, "rank", None) in ('order', 'class', 'phylum',
                                           'kingdom'):
            node.add_face(
                AttrFace("sci_name", fsize=7, fgcolor="indianred"),
                column=0,
                position="branch-top")

        # Everything below applies to leaves only.
        if not node.is_leaf():
            return

        node.add_face(
            AttrFace("sci_name", fsize=9, fgcolor="steelblue"),
            column=0,
            position="branch-right")

        if node.name == 'annotation':
            # Header row attached to the artificial 'annotation' leaf.
            add_aligned_row(
                get_hist_ss_in_aln_as_string(msa_tr),
                (' ' + 'NCBI_GI' + ' ',
                 '       ' + 'NCBI_TAXID' + ' ',
                 '       ' + 'Variant' + '       '))
        else:
            taxid = int(node.name)
            gi = taxid2gi[taxid]
            add_aligned_row(
                str(msa_dict[str(gi)]),
                (' ' + str(gi) + ' ',
                 '      ' + str(taxid) + ' ',
                 '      ' + str(gi2variant[gi]) + ' '))

    ts.layout_fn = layout
    ts.show_leaf_name = False
    ts.title.add_face(TextFace(title, fsize=20), column=0)
    t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts)

    #10. Conservation############
    #############################
    features = get_hist_ss_in_aln_for_shade(msa_tr, below=True)
    cn = add_consensus(msa_tr, threshold=0.5)[-2:-1]
    # Below are three methods that we find useful.
    # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_ent_unw',
                  map(lambda x: log(20) + x,
                      map(float, cons_prof(msa_tr, f=0, c=0))),
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_ent_unw_norm',
                  map(lambda x: log(20) + x,
                      map(float, cons_prof(msa_tr, f=0, c=0, norm="T"))),
                  cn,
                  features,
                  axis='conservation')

    # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation')
    plot_prof4seq('example_cons_sofp_unw_renorm1',
                  map(float, cons_prof(msa_tr, f=0, c=2, m=1)),
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_sofp_unw',
                  map(float, cons_prof(msa_tr, f=0, c=2, m=0)),
                  cn,
                  features,
                  axis='conservation')
    plot_prof4seq('example_cons_sofp_psic_renorm1',
                  map(float, cons_prof(msa_tr, f=2, c=2, m=1)),
                  cn,
                  features,
                  axis='conservation')
        ''' PhyChem '''
        myclade.calculate_phchem_conservation(
        )  #find frequency for each column

        myclade.find_phychm_conserved_regions(min_conserved_length, consensus)
        ''' Rate4Site '''
        myclade.run_rate4site()
        ''' OutPut '''
        myclade.print_clade_analysis()  # display

        myclade.write_clade_to_files()  # default: cladename.tre, clade_name.fa

        #cladedict[cladename] = myclade
        cladelist.append(myclade)

        MSA_everycalade.extend(MSA)
        MSAodo_everycalade.extend(ODO)
''' +++ ALL_CLADES +++ 
ALL SELECTED CLADES IN ONE (it might be the case when not all clades are seleced in tree)
'''

# Build one Clade labelled "ALL_CLADES" from the two accumulators that the
# per-clade loop above extends (MSA_everycalade and MSAodo_everycalade).
all_clades = Clade(fileheader, "ALL_CLADES", tree, allfasta, MSA_everycalade,
                   MSAodo_everycalade, entropy_gap_weight)
''' Oder/DisOrder '''
all_clades.calculate_odo_conservation()  # find O/D frequency for each column
''' AA '''
all_clades.calculate_aa_conservation()  # find AA frequency for each column

# NOTE(review): presumably reports conserved runs of at least
# min_conserved_length columns against `consensus` -- confirm in Clade.
all_clades.find_aa_conserved_regions(min_conserved_length, consensus)
''' AA entropy '''
all_clades.calculate_aa_entropy()
Пример #17
0
# Prettify labels
def get_label(leaf):
    """Return the display label for a tree node.

    Nodes whose name starts with "Inner" get an empty label; any other
    name is returned with underscores replaced by spaces.
    """
    name = leaf.name
    return "" if name.startswith("Inner") else name.replace("_", " ")


# Read the sequences and align
# NOTE(review): no aligner is invoked in this section; records are added to
# the alignment exactly as read, so the input presumably already holds
# equal-length (pre-aligned) sequences -- confirm.

# Start from an empty alignment over gapped unambiguous DNA ("-" as gap).
aln = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
for seq_record in SeqIO.parse("data/coding.fa", "fasta"):
    # for seq_record in SeqIO.parse("data/cons_noncode.fa", "fasta"):
    # Echo each record's id, sequence repr and length, then add it.
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))
    aln.extend([seq_record])

# Print the alignment
print(aln)

# Calculate the distance matrix using the 'identity' model
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(aln)

# Print the distance Matrix
print('\nDistance Matrix\n===================')
print(dm)

# Construct the phylogenetic tree using UPGMA algorithm
# (only the constructor object is created here; the call that actually
# builds the tree is not part of this section)
constructor = DistanceTreeConstructor()
fasta_to_nexus_mb.py is a script to convert a fasta alignment to a nexus
alignment (hopefully) without messing up the names too badly.
It also makes a separate Mr.Bayes script..
fasta_to_nexus.py [name of fasta alignment, expects an extension of .fa]
'''

# Exactly one command-line argument (the FASTA alignment) is required.
# sys.argv[0] is the script itself, so the number of arguments the user
# actually supplied is len(sys.argv) - 1.
if len(sys.argv) != 2:
    sys.exit(
        "ERROR! This script expects one additional argument, and you gave it %d arguments!  %s"
        % (len(sys.argv) - 1, Usage))
InFileName = sys.argv[1]

sys.stderr.write("Alignment %s will be processed.\n" % (InFileName))

# Load the FASTA alignment into a fresh DNA alignment object.
MyAlignment = MultipleSeqAlignment([], generic_dna)
MyAlignment.extend(AlignIO.read(InFileName, 'fasta'))

# Characters replaced by '-' in every record id before the Nexus file is
# written (presumably because they carry meaning in Nexus/MrBayes syntax --
# confirm against the target tools).
NexusUnsafeChars = (':', ',', '(', ')', '[', ']', "'", "=")
for record in MyAlignment:
    SeqNameTemp = record.id
    for Char in NexusUnsafeChars:
        SeqNameTemp = SeqNameTemp.replace(Char, '-')
    record.id = SeqNameTemp

# Derive the output name by swapping the real extension for ".nex".  For the
# documented ".fa" input this yields the same name as before; other
# extensions (e.g. ".fasta") no longer produce names like "x.fasnex".
import os.path
OutFileName = os.path.splitext(InFileName)[0] + ".nex"

AlignIO.write(MyAlignment, OutFileName, 'nexus')

OutList = []
Line = "set autoclose=yes nowarn=yes\nexecute " + OutFileName + "\n"
#Line += "lset nst=6 rates=invgamma\nunlink statefreq=(all) revmat=(all) shape=(all) pinvar=(all)\n"
#OutList.append(Line)