def check_matching(self) -> tuple:
    """Pair every terminal of ``self.tree`` with its aligned sequence.

    For each tip the sequence record is looked up by tip name in
    ``self.aligns_as_seqs`` and wrapped in a ``TipSeqLinker`` together with
    the root-to-tip path.  Each tip is also given a surrogate name
    ``seq<N>`` (N is its position among the terminals).

    Returns:
        A 4-tuple ``(init_clstr, dichord_list, converted_names, sites)``:

        * ``init_clstr`` -- ``defaultdict(list)`` mapping each tip object to
          a one-element list holding its linker;
        * ``dichord_list`` -- all linkers in terminal order;
        * ``converted_names`` -- two-way mapping between tip names and
          their ``seq<N>`` surrogates;
        * ``sites`` -- ``tuple(range(alignment_length))``; empty when the
          tree has no terminals.

    Raises:
        TipNotMatchedError: a tip name is missing from
            ``self.aligns_as_seqs``.
    """
    init_clstr = defaultdict(list)
    dichord_list = []
    converted_names = {}
    related_aligns = None
    for n, tip in enumerate(self.tree.get_terminals()):
        tip_name = tip.name
        # Only the lookup may legitimately signal "tip not matched"; a
        # KeyError raised while building the linker is a different problem
        # and must not be relabelled.
        try:
            seq_record = self.aligns_as_seqs[tip_name]
        except KeyError as err:
            raise TipNotMatchedError(tip) from err
        dichord = TipSeqLinker(
            seq_record,
            (self.tree.root, *self.tree.get_path(tip))
        )
        init_clstr[tip].append(dichord)
        dichord_list.append(dichord)
        new_seq_id = 'seq{}'.format(n)
        converted_names[tip_name] = new_seq_id
        converted_names[new_seq_id] = tip_name
        # The alignment is built only to obtain the common sequence length.
        if related_aligns is None:
            related_aligns = MultipleSeqAlignment([seq_record])
        else:
            related_aligns.extend([seq_record])
    # A tree without terminals leaves no alignment to measure: report zero
    # sites instead of failing on ``None``.
    if related_aligns is None:
        sites = ()
    else:
        sites = tuple(range(related_aligns.get_alignment_length()))
    return (init_clstr, dichord_list, converted_names, sites)
def needle_alignment(s1, s2):
    '''
DESCRIPTION

    Does a Needleman-Wunsch Alignment of sequence s1 and s2 and
    returns a Bio.Align.MultipleSeqAlignment object.

    Scoring uses BLOSUM62 with gap open -10 and gap extend -0.5.  Residue
    pairs the matrix does not know score 1 for identity and -4 otherwise.
    The returned alignment holds two records with ids "s1" and "s2".
    '''
    from Bio import pairwise2
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord

    try:
        from Bio.Align import substitution_matrices
    except ImportError:
        # Legacy Biopython: a plain dict that stores each unordered pair
        # under ONE key order only.
        from Bio.SubsMat.MatrixInfo import blosum62
    else:
        blosum62 = substitution_matrices.load("BLOSUM62")

    def match_callback(c1, c2):
        # Try both key orders so the half-matrix legacy dict scores (a, b)
        # and (b, a) identically instead of falling through to -4.
        for pair in ((c1, c2), (c2, c1)):
            score = blosum62.get(pair)
            if score is not None:
                return score
        return 1 if c1 == c2 else -4

    alns = pairwise2.align.globalcs(s1, s2, match_callback, -10., -.5,
                                    one_alignment_only=True)

    # alns[0] is the single requested alignment; items 0 and 1 are the two
    # gapped sequences.
    rec1 = SeqRecord(alns[0][0], id="s1")
    rec2 = SeqRecord(alns[0][1], id="s2")
    a = MultipleSeqAlignment([])
    a.extend([rec1, rec2])
    return a
def main():
    """Time neighbor joining and UPGMA on one FASTA alignment and draw both.

    Reads ``data/coding.fa`` into a gapped unambiguous-DNA alignment, prints
    the length of its first record, computes one 'identity' distance matrix
    and then, for each method in turn, builds the tree, prints the elapsed
    wall-clock time and draws the tree with ``get_label`` as label function.
    """
    fasta_path = "data/coding.fa"
    # Alternative data set: "data/cons_noncode.fa"

    alignment = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    alignment.extend(SeqIO.parse(fasta_path, "fasta"))

    print("Number of characters in alignment:", len(alignment[0]))

    distances = DistanceCalculator('identity').get_distance(alignment)
    constructor = DistanceTreeConstructor()

    # (report template, tree builder) -- neighbor joining first, then UPGMA.
    methods = (
        ("Neighbor joining ran in {} seconds.", constructor.nj),
        ("UPGMA ran in {} seconds.", constructor.upgma),
    )
    for report, build in methods:
        began = time.time()
        tree = build(distances)
        finished = time.time()
        print(report.format(finished - began))
        Phylo.draw(tree, label_func=get_label)
def main():
    """Train a SOTA network on the first five sequences and draw its tree.

    Loads ``data/coding.fa`` into a gapped unambiguous-DNA alignment, passes
    the first five rows to ``SOTA``, times ``train()`` and prints the elapsed
    wall-clock seconds before drawing the resulting tree.
    """
    fasta_path = "data/coding.fa"
    # Alternative data set: "data/cons_noncode.fa"

    alignment = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    alignment.extend(SeqIO.parse(fasta_path, "fasta"))

    network = SOTA(alignment[:5])

    began = time.time()
    network.train()
    finished = time.time()
    print("Self-organizing tree network ran in {} seconds.".format(finished - began))

    network.draw_tree()
def to_alignment(self):
    """Collect the aligned sequences of this tree into one alignment.

    The tree is searched in preorder for ``Sequence`` elements whose
    ``mol_seq.is_aligned`` is truthy.  The first hit supplies the alphabet
    of the returned ``MultipleSeqAlignment``; an empty alignment is
    returned when there is no hit at all.
    """
    def is_aligned_seq(elem):
        return bool(isinstance(elem, Sequence) and elem.mol_seq.is_aligned)

    aligned = self._filter_search(is_aligned_seq, 'preorder', True)

    nothing = object()
    head = next(aligned, nothing)
    if head is nothing:
        # No aligned sequences were found --> empty MSA
        return MultipleSeqAlignment([])

    head_record = head.to_seqrecord()
    msa = MultipleSeqAlignment([head_record], head.get_alphabet())
    # The iterator has already advanced past the first hit.
    msa.extend(other.to_seqrecord() for other in aligned)
    return msa
def main():
    """Time tree building with neighbor joining, UPGMA and WPGMA.

    Loads ``data/coding.fa`` into a gapped unambiguous-DNA alignment.  For
    each method a fresh ``Distance_Calculator`` creates its distance matrix,
    the tree build is timed and reported, and the tree is drawn.  The
    matrix of the first (default-mode) run is also saved as ``animals.csv``.
    """
    fasta_path = "data/coding.fa"
    # Alternative data set: "data/cons_noncode.fa"

    alignment = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    alignment.extend(SeqIO.parse(fasta_path, "fasta"))

    def build_and_draw(calculator, matrix, report):
        # Time only the tree construction, then report and draw.
        began = time.time()
        calculator.build_tree(matrix)
        finished = time.time()
        print(report.format(finished - began))
        calculator.draw_tree()

    # Neighbor joining: the calculator's default mode.
    nj_calculator = Distance_Calculator()
    nj_matrix = nj_calculator.create_distance_matrix(alignment)
    nj_matrix.data.to_csv("animals.csv")
    build_and_draw(nj_calculator, nj_matrix, "Neighbor joining ran in {} seconds.")

    # UPGMA, then WPGMA.
    for mode, report in (
        ("UPGMA", "UPGMA ran in {} seconds."),
        ("WPGMA", "WPGMA ran in {} seconds."),
    ):
        calculator = Distance_Calculator(mode=mode)
        matrix = calculator.create_distance_matrix(alignment)
        build_and_draw(calculator, matrix, report)
def annotate_hist_msa(msa, htype, variant=None):
    """Return a copy of *msa* preceded by a feature-annotation row.

    The template sequence and its feature string for ``"General" + htype``
    are read from ``inp_data/features.json``.  The template is aligned to
    the consensus of *msa* (via ``muscle_aln``) and its feature string is
    projected onto the columns of *msa*: a column gets the feature character
    of the template residue aligned to it, or ``"-"`` where the template has
    a gap.

    Args:
        msa: alignment to annotate; it is not modified.
        htype: histone type key used to look up the template in the JSON.
        variant: accepted for interface compatibility; currently unused.

    Returns:
        A new ``MultipleSeqAlignment`` whose first record (id
        ``"gi|features|id"``, description *htype*) is the projected feature
        row, followed by the records of *msa*.
    """
    with open("inp_data/features.json") as ff:
        f = json.load(ff)
    general = f[htype]["General" + htype]
    genseq = general["sequence"]
    genf = general["feature1"]

    cons = SummaryInfo(msa).dumb_consensus(threshold=0.1, ambiguous="X")
    sr_c = SeqRecord(id="consensus", seq=cons)
    sr_genseq = SeqRecord(id="template", seq=Seq(genseq))
    auxmsa = muscle_aln([sr_c, sr_genseq])
    # After sorting, row 0 is read as the consensus and row 1 as the
    # template -- presumably the default sort orders by record id; confirm
    # against muscle_aln's return type.
    auxmsa.sort()
    gapped_cons = str(auxmsa[0].seq)
    gapped_template = str(auxmsa[1].seq)

    # Walk the pairwise alignment column by column.  ``template_pos`` counts
    # template residues seen so far and indexes the ungapped feature string.
    # It must advance on EVERY template residue, including those opposite a
    # consensus gap (columns that are dropped); otherwise all later feature
    # characters are shifted to the left.
    gapped_genf = []
    template_pos = 0
    for cons_char, template_char in zip(gapped_cons, gapped_template):
        has_residue = template_char != "-"
        if cons_char != "-":
            gapped_genf.append(genf[template_pos] if has_residue else "-")
        if has_residue:
            template_pos += 1
    gapped_genf = "".join(gapped_genf)

    newmsa = MultipleSeqAlignment(
        [SeqRecord(id="gi|features|id", description=htype, seq=Seq(gapped_genf))]
    )
    newmsa.extend(msa)
    return newmsa
def main():
    """Run exact and heuristic maximum parsimony on one alignment.

    Loads ``data/cons_noncode.fa`` into a gapped unambiguous-DNA alignment.
    ``ParsimonyExact`` (``bnb=True``) runs first, then
    ``ParsimonyHeuristics`` (``seed=0``), with a separator line printed
    between them.  Each solver is timed, its elapsed wall-clock seconds are
    printed and its tree is drawn with scores.
    """
    # Other data sets: "data/coding.fa", "data/test.fa"
    fasta_path = "data/cons_noncode.fa"

    alignment = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    alignment.extend(SeqIO.parse(fasta_path, "fasta"))

    def run_and_draw(solver, report):
        # Time only the search itself, then report and draw.
        began = time.time()
        solver.run(print_best=True)
        finished = time.time()
        print(report.format(finished - began))
        solver.draw_tree(show_scores=True)

    run_and_draw(
        ParsimonyExact(alignment, bnb=True),
        "Maximum parsimony (exact) ran in {} seconds.",
    )

    print("------------------------------------------------------------------")

    run_and_draw(
        ParsimonyHeuristics(alignment, seed=0),
        "Maximum parsimony (with heuristics) ran in {} seconds.",
    )
def main(): title='' #1. Getting data ######################################################## ######################################################## # df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info #Does not really seem that we need to redefine variants based on best score. df=pd.read_csv('int_data/seqs_rs.csv') #Histone types info fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences #2. Filtering - filter initial dataset by type, variant and other parameters ######################################################## ######################################################## #2.1. Narrow by variant/type ######################################################## title+='H2A' # f_df=df[(df['hist_var']=='canonical_H4')] # f_df['hist_var']='canonical_H4' f_df=df[((df['hist_var']=='canonical_H2A')|(df['hist_var']=='H2A.X'))&(df['partial']==False)&(df['non_st_aa']==False)] # f_df=df[((df['hist_var']=='H2A.Z'))&(df['partial']==False)&(df['non_st_aa']==False)] # f_df=df[(df['hist_type']=='H2A')] print "Number of seqs after narrowing by hist type/var:", len(f_df) #2.2. 
Filter by list of taxonomy clades - restrict sequences to certain taxonomic clades ######################################################### title+=' across cellular organisms' # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates #33682 - euglenozoa #6656 - arthropods # 4751 - fungi #5782 - dictostelium #This is akin manual removal of bad species del_nodes=[5782,5690] print "Selecting taxonomic subset for taxids: ",parent_nodes print "while removing taxonomic subset for taxids: ",del_nodes taxids=set(parent_nodes) for i in parent_nodes: taxids.update(ncbi.get_descendant_taxa(i,intermediate_nodes=True)) for i in del_nodes: taxids=taxids.difference(set([i])) taxids=taxids.difference(set(ncbi.get_descendant_taxa(i,intermediate_nodes=True))) f_df=f_df[f_df['taxid'].isin(taxids)] print "Number of seq after taxonomic subset: ",len(f_df) #2.3.0 Marking number of identical sequence within each species and subspecies. #This will simplify further analysis of sequence filtering on similarity #We know that all refseqs are duplicated for instance. ################################################ ident=dict() new_gis=list() tids=set(list(f_df['taxid'])) for i in tids: # print i.name, i.sci_name temp_df=f_df[(f_df['taxid']==i)] gis=list(temp_df['gi']) #this is to limit exec time # print gis if(len(gis)>1): res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=1.00) ident.update(res) else: ident.update({gis[0]:1}) f_df['ident']=[ident.get(k,1) for k in f_df['gi']] #where ident - number of identical sequnces for current sepecies/subspecies. print "Identity of sequence inside each taxid determined" #2.3.1. 
Calculate number of similar seqs for every seq in tax group ######################################################### # Use powerful method, to get rid of random errors is to identify identical sequences # if a sequence is supported by two or more entires - this is good. # Here we add a degen column to our data set - showing how many similar sequences are found # for a given sequence in its taxonomic clade (genus currently) #We will traverse the species tree by species, genus or family, and determine degeneracy level degen=dict() new_gis=list() tids=list(f_df['taxid']) t = ncbi.get_topology(tids,intermediate_nodes=True) for i in t.search_nodes(rank='family'): # print i.name, i.sci_name nodeset=list() for k in i.traverse(): nodeset.append(int(k.name)) temp_df=f_df[(f_df['taxid'].isin(nodeset))] gis=list(temp_df['gi']) #this is to limit exec time # print gis res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=1.00) degen.update(res) # print degen f_df['degen']=[degen.get(k,1) for k in f_df['gi']] #2.3.2. Remove seqs that do not have support outside their species # if they are not curated or RefSeq NP. ########################################################### f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates f_df=f_df[(f_df['degen']>f_df['ident'])|(f_df['curated']==True)|(f_df['RefSeq']==2)] print "After removing mined seqs with no support in neighboring species: ",len(f_df) #2.3.3. Shuffle sequnces, so that upon further selection, RefSeq and high degeneracy get priority ########################################################### #RefSeq and degenerate sequence get priority # title+=' 1ptax' f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates # print f_df[0:10] # f_df=f_df.drop_duplicates(['taxid','hist_var']) #2.4 Take one best representative per specific taxonomic rank (e.g. 
genus) ############################################################ pruningrank='genus' print "Pruning taxonomy by ", pruningrank title+=' , one seq. per %s'%pruningrank #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies seqtaxids=list(f_df['taxid']) #old list grouped_taxids=group_taxids(seqtaxids,rank=pruningrank) # print seqtaxids # print grouped_taxids #Now we need to take best representative #refseq NP, curated, or the one with largest degeneracy new_gis=list() for tids in grouped_taxids: t_df=f_df[f_df['taxid'].isin(tids)] #try take curated first if(len(t_df[t_df['curated']==True])>0): new_gis.append(t_df.loc[t_df.curated==True,'gi'].values[0]) continue #try take NP records nest #RefSeq 2 means NP, 1 means XP if(len(t_df[t_df['RefSeq']==2])>0): new_gis.append(t_df.loc[t_df.RefSeq==2,'gi'].values[0]) continue # take best degenerate otherwise else: t_df=t_df.sort(['degen','RefSeq'],ascending=False) new_gis.append(t_df['gi'].iloc[0]) f_df=f_df[f_df['gi'].isin(new_gis)] print "After pruning taxonomy we have: ",len(f_df) #2.5. Check seq for sanity - needs to be checked! ############################################## # title+=' seqQC ' # print "Checkig sequence quality" # newgis=list() # for i,row in f_df.iterrows(): # gi=row['gi'] # seq=fasta_dict[str(gi)].seq # hist_type=row['hist_type'] # hist_var=row['hist_var'] # if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)): # newgis.append(gi) # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe # print len(f_df) #3. 
Make a list of seq with good ids and descriptions ############################################## f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])} print len(f_fasta_dict) taxid2name = ncbi.get_taxid_translator(list(f_df['taxid'])) #Relabel sequences gi=> type and organism f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() } #with arbitrary index # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) } # exit() #4. Make MSA ################# #Here we construct MSA msa=muscle_aln(f_fasta_dict.values(),gapopen=float(-20)) AlignIO.write(msa, "int_data/example_msa.fasta", "fasta") msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')]) msa_annot.extend(msa) AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta") for i in range(len(msa)): gi=msa[i].id msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca') msa.sort(key=lambda x: x.description) #5. Visualize MSA############ aln2html(msa,'example_h2a.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A alignment",description=True,field1w=10,field2w=35) #6. Trim alignment - this is optional #6.1. Trim gaps # title+=' gaptrim' # msa_tr=trim_aln_gaps(msa,threshold=0.8) #6.2. Trim to histone core sequence msa_tr=trim_hist_aln_to_core(msa) # msa_tr=msa # print get_hist_ss_in_aln_for_shade(msa_tr,below=True) # exit() #7. 
Vizualize MSA with ete2.########## taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])} gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])} msa_dict={i.id:i.seq for i in msa_tr} t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False) a=t.add_child(name='annotation') a.add_feature('sci_name','annotation') t.sort_descendants(attr='sci_name') ts = TreeStyle() def layout(node): # print node.rank # print node.sci_name if getattr(node, "rank", None): if(node.rank in ['order','class','phylum','kingdom']): rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred") node.add_face(rank_face, column=0, position="branch-top") if node.is_leaf(): sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue") node.add_face(sciname_face, column=0, position="branch-right") if node.is_leaf() and not node.name=='annotation': s=str(msa_dict[str(taxid2gi[int(node.name)])]) seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1) add_face_to_node(seqFace, node, 0, position="aligned") gi=taxid2gi[int(node.name)] add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned") add_face_to_node(TextFace(' '+str(int(node.name))+' '),node,column=2, position = "aligned") add_face_to_node(TextFace(' '+str(gi2variant[gi])+' '),node,column=3, position = "aligned") if node.is_leaf() and node.name=='annotation': s=get_hist_ss_in_aln_as_string(msa_tr) seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1) add_face_to_node(seqFace, node, 0, position="aligned") add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned") add_face_to_node(TextFace(' '+'NCBI_TAXID'+' '),node,column=2, position = "aligned") add_face_to_node(TextFace(' '+'Variant'+' '),node,column=3, position = "aligned") ts.layout_fn = layout ts.show_leaf_name = False ts.title.add_face(TextFace(title, fsize=20), column=0) 
t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts) #10. Conservation############ ############################# features=get_hist_ss_in_aln_for_shade(msa_tr,below=True) cn=add_consensus(msa_tr,threshold=0.5)[-2:-1] # Below are three methods that we find useful. # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation') plot_prof4seq('example_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation') plot_prof4seq('example_cons_ent_unw_norm',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0,norm="T"))),cn,features,axis='conservation') # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation') plot_prof4seq('example_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation') plot_prof4seq('example_cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2,m=0)),cn,features,axis='conservation') plot_prof4seq('example_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation')
def main(): title='' #1. Getting data df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences # exit() #2. Filtering ########## #2.1. Narrow by variant/type title+='CenH3' # f_df=df[(df['hist_var']=='canonical_H4')] # f_df['hist_var']='canonical_H4' f_df=df[((df['hist_var']=='cenH3'))&(df['partial']==False)] # f_df=df[(df['hist_type']=='H2A')] # exit() print len(f_df) #2.2. Filter by list of taxonomy clades ################ title+=' across cellular organisms' # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates #33682 - euglenozoa #6656 - arthropods # 4751 - fungi #5782 - dictostelium print "Selecting taxonomic subset" taxids=list(parent_nodes) for i in parent_nodes: taxids.extend(ncbi.get_descendant_taxa(i,intermediate_nodes=True)) f_df=f_df[f_df['taxid'].isin(taxids)] print len(f_df) # exit() #2.3*. Alternative powerful method, to get rid of random errors in seqs #We need to cluster seqs, and select only if we have support by two or more similar seqs #We will traverse the species tree by species, genus or family, and determine degeneracy level degen=dict() new_gis=list() tids=list(f_df['taxid']) t = ncbi.get_topology(tids,intermediate_nodes=True) for i in t.search_nodes(rank='genus'): # print i.name, i.sci_name nodeset=list() for k in i.traverse(): nodeset.append(int(k.name)) temp_df=f_df[(f_df['taxid'].isin(nodeset))] gis=list(temp_df['gi']) #this is to limit exec time # print gis res=cluster_seq_support({gi:fasta_dict[str(gi)] for gi in gis},ident_thresh=0.95) degen.update(res) # exit() # for k,v in res.iteritems(): # if v>2.0: # new_gis.append(k) # f_df=f_df[f_df['gi'].isin(new_gis)] print degen f_df['degen']=[degen.get(k,1) for k in f_df['gi']] #2.4. 
#####select one variant per taxid, priority to RefSeq # title+=' 1ptax' f_df=f_df.sort(['RefSeq','degen'],ascending=False) # so that RefSeq record get priority on removing duplicates print f_df[0:10] # f_df=f_df.drop_duplicates(['taxid','hist_var']) #2.4 Take one best representative per specific taxonomic rank. ################ title+=' , one seq. per genus, trimmed' print "Pruning taxonomy" #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies seqtaxids=list(f_df['taxid']) #old list grouped_taxids=group_taxids(seqtaxids,rank='species') print seqtaxids print grouped_taxids #Now we need to take best representative #refseq NP, or the one with larges degeneracy new_gis=list() for tids in grouped_taxids: t_df=f_df[f_df['taxid'].isin(tids)] #take NP if we have it if(len(t_df[t_df['RefSeq']==2])>0): new_gis.append(t_df.loc[t_df.RefSeq==2,'gi'].values[0]) continue else: # take best degenerate t_df=t_df.sort(['degen','RefSeq'],ascending=False) # so that RefSeq record get priority on removing duplicates if(t_df['degen'].iloc[0]>100): new_gis.append(t_df['gi'].iloc[0]) f_df=f_df[f_df['gi'].isin(new_gis)] # new_seqtaxids=subsample_taxids(seqtaxids,rank='species') #new subsampled list # f_df=f_df[f_df['taxid'].isin(new_seqtaxids)] #remake the dataframe # print "---" # exit() #2.5. Check seq for sanity ################ # title+=' seqQC ' # print "Checkig sequence quality" # newgis=list() # for i,row in f_df.iterrows(): # gi=row['gi'] # seq=fasta_dict[str(gi)].seq # hist_type=row['hist_type'] # hist_var=row['hist_var'] # if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)): # newgis.append(gi) # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe # print len(f_df) #3. 
Make a list of seq with good ids and descriptions #################### f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])} print len(f_fasta_dict) taxid2name = ncbi.get_taxid_translator(list(f_df['taxid'])) #Relabel sequences gi=> type and organism f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() } #with arbitrary index # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) } # exit() #4. Make MSA ################# #Here we construct MSA msa=muscle_aln(f_fasta_dict.values(),gapopen=float(-20)) AlignIO.write(msa, "int_data/example_msa.fasta", "fasta") msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')]) msa_annot.extend(msa) AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta") for i in range(len(msa)): gi=msa[i].id msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca') msa.sort(key=lambda x: x.description) #5. Visualize MSA aln2html(msa,'example_h2a.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A alignment",description=True,field1w=10,field2w=35) #6. Trim alignment - this is optional #6.1. Trim gaps # title+=' gaptrim' # msa_tr=trim_aln_gaps(msa,threshold=0.8) #6.2. Trim to histone core sequence msa_tr=trim_hist_aln_to_core(msa) # msa_tr=msa # print get_hist_ss_in_aln_for_shade(msa_tr,below=True) # exit() #7. Vizualize MSA with ete2. 
taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])} gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])} msa_dict={i.id:i.seq for i in msa_tr} print taxid2gi t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False) a=t.add_child(name='annotation') a.add_feature('sci_name','annotation') t.sort_descendants(attr='sci_name') ts = TreeStyle() def layout(node): # print node.rank # print node.sci_name if getattr(node, "rank", None): if(node.rank in ['order','class','phylum','kingdom']): rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred") node.add_face(rank_face, column=0, position="branch-top") if node.is_leaf(): sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue") node.add_face(sciname_face, column=0, position="branch-right") if node.is_leaf() and not node.name=='annotation': s=str(msa_dict[str(taxid2gi[int(node.name)])]) seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1) add_face_to_node(seqFace, node, 0, position="aligned") gi=taxid2gi[int(node.name)] add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned") add_face_to_node(TextFace(' '+str(int(node.name))+' '),node,column=2, position = "aligned") add_face_to_node(TextFace(' '+str(gi2variant[gi])+' '),node,column=3, position = "aligned") if node.is_leaf() and node.name=='annotation': s=get_hist_ss_in_aln_as_string(msa_tr) seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1) add_face_to_node(seqFace, node, 0, position="aligned") add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned") add_face_to_node(TextFace(' '+'NCBI_TAXID'+' '),node,column=2, position = "aligned") add_face_to_node(TextFace(' '+'Variant'+' '),node,column=3, position = "aligned") ts.layout_fn = layout ts.show_leaf_name = False ts.title.add_face(TextFace(title, fsize=20), column=0) t.render("example_motifs_H2A.svg", w=6000, 
dpi=300, tree_style=ts) #10. Conservation features=get_hist_ss_in_aln_for_shade(msa_tr,below=True) cn=add_consensus(msa_tr,threshold=0.5)[-2:-1] # Below are three methods that we find useful. # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation') plot_prof4seq('example_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation') # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation') plot_prof4seq('example_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation') plot_prof4seq('example_cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2,m=0)),cn,features,axis='conservation') plot_prof4seq('example_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation')
def main(): title='' #1. Getting data df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info fasta_dict=pickle.load( open( "int_data/fasta_dict.p", "rb" )) #Sequences # exit() #2. Filtering ########## #2.1. Narrow by variant/type title+='Canonical H2A' # f_df=df[(df['hist_var']=='canonical_H4')] # f_df['hist_var']='canonical_H4' f_df=df[(df['hist_var']=='canonical_H2A')|(df['hist_var']=='H2A.1')] # f_df=df[(df['hist_type']=='H2A')] # exit() print len(f_df) #2.2. #####select one variant per taxid # title+=' 1ptax' f_df=f_df.sort(['RefSeq'],ascending=False) # so that RefSeq record get priority on removing duplicates f_df=f_df.drop_duplicates(['taxid','hist_var']) # exit() #2.3. Filter by list of taxonomy clades ################ title+=' across cellular organisms' # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates parent_nodes=[131567] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates #33682 - euglenozoa #6656 - arthropods # 4751 - fungi print "Selecting taxonomic subset" taxids=list(parent_nodes) for i in parent_nodes: taxids.extend(ncbi.get_descendant_taxa(i,intermediate_nodes=True)) f_df=f_df[f_df['taxid'].isin(taxids)] print len(f_df) # exit() #2.4 Take one representative per specific taxonomic rank. ################ title+=', one sequence per order' print "Pruning taxonomy" #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies seqtaxids=list(f_df['taxid']) #old list new_seqtaxids=subsample_taxids(seqtaxids,rank='order') #new subsampled list f_df=f_df[f_df['taxid'].isin(new_seqtaxids)] #remake the dataframe # print "---" print len(f_df) # exit() #2.5. 
Check seq for sanity ################ # title+=' seqQC ' print "Checkig sequence quality" newgis=list() for i,row in f_df.iterrows(): gi=row['gi'] seq=fasta_dict[str(gi)].seq hist_type=row['hist_type'] hist_var=row['hist_var'] if(check_hist_length(seq,hist_type,hist_var,1)&check_hist_core_length(seq,hist_type,1)): newgis.append(gi) f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe print len(f_df) # print list(f_df['gi']) # exit() #3. Make a list of seq with good ids and descriptions #################### f_fasta_dict={key: value for (key,value) in fasta_dict.iteritems() if int(key) in list(f_df['gi'])} print len(f_fasta_dict) taxid2name = ncbi.get_taxid_translator(list(f_df['taxid'])) #Relabel sequences gi=> type and organism f_fasta_dict={key: SeqRecord(id=key, description=f_df.loc[f_df.gi==int(key),'hist_var'].values[0]+' '+taxid2name[f_df.loc[f_df.gi==int(key),'taxid'].values[0]],seq=value.seq) for (key,value) in f_fasta_dict.iteritems() } #with arbitrary index # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) } # exit() #4. Make MSA ################# #Here we construct MSA msa=muscle_aln(f_fasta_dict.values()) AlignIO.write(msa, "results/h2a_ca_cellular.fasta", "fasta") msa_annot=MultipleSeqAlignment([SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace(' ','-')),id='annotation',name='')]) msa_annot.extend(msa) AlignIO.write(msa_annot, "results/h2a_ca_cellular_annot.fasta", "fasta") for i in range(len(msa)): gi=msa[i].id msa[i].description=f_fasta_dict[gi].description.replace('canonical','ca') msa.sort(key=lambda x: x.description) #5. Visualize MSA aln2html(msa,'results/h2a_ca_cellular.html',features=get_hist_ss_in_aln_for_html(msa,'H2A',0),title="canonical H2A in cellular organisms",description=True,field1w=10,field2w=35) #6. 
Trim alignment - this is optional #6.1. Trim gaps title+=', gaps removed' # msa_tr=trim_aln_gaps(msa,threshold=0.8) #6.2. Trim to histone core sequence msa_tr=trim_hist_aln_to_core(msa) #7. Vizualize MSA with ete2. taxid2gi={f_df.loc[f_df.gi==int(gi),'taxid'].values[0]:gi for gi in list(f_df['gi'])} gi2variant={gi:f_df.loc[f_df.gi==int(gi),'hist_var'].values[0] for gi in list(f_df['gi'])} msa_dict={i.id:i.seq for i in msa_tr} print taxid2gi t = ncbi.get_topology(list(f_df['taxid']),intermediate_nodes=False) a=t.add_child(name='annotation') a.add_feature('sci_name','annotation') t.sort_descendants(attr='sci_name') ts = TreeStyle() def layout(node): # print node.rank # print node.sci_name if getattr(node, "rank", None): if(node.rank in ['order','class','phylum','kingdom']): rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred") node.add_face(rank_face, column=0, position="branch-top") if node.is_leaf(): sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue") node.add_face(sciname_face, column=0, position="branch-right") if node.is_leaf() and not node.name=='annotation': s=str(msa_dict[str(taxid2gi[int(node.name)])]) seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1) add_face_to_node(seqFace, node, 0, position="aligned") gi=taxid2gi[int(node.name)] add_face_to_node(TextFace(' '+str(gi)+' '),node,column=1, position = "aligned") add_face_to_node(TextFace(' '+str(int(node.name))+' '),node,column=2, position = "aligned") add_face_to_node(TextFace(' '+str(gi2variant[gi])+' '),node,column=3, position = "aligned") if node.is_leaf() and node.name=='annotation': s=get_hist_ss_in_aln_as_string(msa_tr) seqFace = SeqMotifFace(s,[[0,len(s), "seq", 10, 10, None, None, None]],scale_factor=1) add_face_to_node(seqFace, node, 0, position="aligned") add_face_to_node(TextFace(' '+'NCBI_GI'+' '),node,column=1, position = "aligned") add_face_to_node(TextFace(' '+'NCBI_TAXID'+' '),node,column=2, position = "aligned") 
add_face_to_node(TextFace(' '+'Variant'+' '),node,column=3, position = "aligned") ts.layout_fn = layout ts.show_leaf_name = False ts.title.add_face(TextFace(title, fsize=20), column=0) t.render("results/h2a_ca_cellular.svg", w=6000, dpi=300, tree_style=ts) #10. Conservation features=get_hist_ss_in_aln_for_shade(msa_tr,below=True) cn=add_consensus(msa_tr,threshold=0.5)[-2:-1] # Below are three methods that we find useful. # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation') plot_prof4seq('results/h2a_ca_cellular_cons_ent_unw',map(lambda x:log(20)+x,map(float,cons_prof(msa_tr,f=0,c=0))),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms') # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation') plot_prof4seq('results/h2a_ca_cellular_cons_sofp_unw_renorm1',map(float,cons_prof(msa_tr,f=0,c=2,m=1)),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms') plot_prof4seq('results/h2a_ca_cellular_cons_sofp_psic_renorm1',map(float,cons_prof(msa_tr,f=2,c=2,m=1)),cn,features,axis='conservation',title='Conservation, canonical H2A cellular organisms')
# Convert a FASTA alignment given on the command line into a NEXUS file with
# sanitized sequence names, and start assembling the lines of a MrBayes
# command script in ``OutList``.
import os

Usage = '''
fasta_to_nexus_mb.py is a script to convert a fasta alignment to a nexus
alignment (hopefully) without messing up the names too badly.
It also makes a separate Mr.Bayes script..
fasta_to_nexus.py [name of fasta alignment, expects an extension of .fa]
'''

if len(sys.argv) != 2:
    # sys.argv[0] is the script itself, so the number of arguments the user
    # actually gave is one less than len(sys.argv).
    sys.exit("ERROR! This script expects one additional argument, and you gave it %d arguments! %s" % (len(sys.argv) - 1, Usage))

InFileName = sys.argv[1]
sys.stderr.write("Alignment %s will be processed.\n" % (InFileName))

MyAlignment = MultipleSeqAlignment([], generic_dna)
MyAlignment.extend(AlignIO.read(InFileName, 'fasta'))

# Replace every character listed here with '-' in each record id.
for record in MyAlignment:
    SeqNameTemp = record.id
    for Char in ":,()[]'=":
        SeqNameTemp = SeqNameTemp.replace(Char, '-')
    record.id = SeqNameTemp

# Swap whatever extension the input has for ".nex".  For the documented
# ".fa" input this yields the same name as before; other extensions
# (e.g. ".fasta") no longer produce a mangled file name.
OutFileName = os.path.splitext(InFileName)[0] + ".nex"
AlignIO.write(MyAlignment, OutFileName, 'nexus')

# First lines of the MrBayes command script.
OutList = []
Line = "set autoclose=yes nowarn=yes\nexecute "+OutFileName+"\n"
Line += "lset nst=6 rates=invgamma\nunlink statefreq=(all) revmat=(all) shape=(all) pinvar=(all)\n"
OutList.append(Line)
def replace_outgroup_with_gap(seq_directory, outgroup_path, window_size = 20, Max_p_sites_o = 8):
    """Gap out densely polymorphic windows in the outgroup sequences.

    Every alignment found in ``<s7>/s1_rm_polymorphism_sites/`` is scanned
    with the windows produced by ``window(range(length), window_size)``.
    Polymorphism is computed over every sequence in the alignment.  When a
    window holds more than ``Max_p_sites_o`` flagged columns, those columns
    are replaced by ``-`` in each record whose id is in the outgroup list;
    other records are copied unchanged.  The result is written as fasta to
    ``<s7>/s2_rm_polymorphism_in_outgroups/`` under the same file name.

    Parameters
    ----------
    seq_directory : str
        Path containing ``s1_Gene/``; that component is swapped for
        ``s7_well_trimal/`` to locate the input and output directories.
    outgroup_path : str
        Passed to ``input_outgroup`` to obtain the outgroup ids.
    window_size : int
        Window length handed to ``window``.
    Max_p_sites_o : int or float
        A window is masked only when it has strictly more flagged columns.
    """
    ### define iupac (ambiguity codes, both cases)
    iupac_bases = frozenset("mrwsykMRWSYKvhdbVHDB")

    def is_flagged_column(column_bases):
        """Return True when a column is polymorphic beyond gap/IUPAC noise."""
        counts = Counter(column_bases)
        n_states = len(counts)
        if n_states == 1:
            return False
        if n_states > 3:
            return True
        # A gap anywhere in the column counts.  The previous loop kept only
        # the flag of the *least frequent* state, so gap-dominated columns
        # were wrongly reported as polymorphic.
        has_gap = "-" in counts
        n_iupac = sum(1 for base in counts if base in iupac_bases)
        if n_states == 2:
            # base + gap, or base + ambiguity code, is not a real polymorphism
            return not has_gap and n_iupac == 0
        # three states: only "gap + base + exactly one ambiguity code" is excused
        return not (has_gap and n_iupac == 1)

    ### input directory from s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")
    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)
    output_directory_1 = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    output_directory_2 = output_directory_1.replace("/s1_rm_polymorphism_sites/", "/s2_rm_polymorphism_in_outgroups/")
    if not os.path.isdir(output_directory_2):
        os.makedirs(output_directory_2)

    ### iterate each gene
    for file_name in os.listdir(output_directory_1):
        if file_name == ".DS_Store":
            continue
        output_directory_file = output_directory_2 + file_name
        fasta_name = output_directory_1 + file_name
        ### read each alignment
        for sequence in glob(fasta_name):
            print("sequence: " + sequence)
            alignment = AlignIO.read(sequence, 'fasta')
            ### one row per record, one column per site
            align_array_outgroup = np.array([list(rec) for rec in alignment])
            total_length = alignment.get_alignment_length()

            ### collect the flagged columns of every window over the threshold
            masked_sites = set()
            for each in window(range(total_length), window_size):
                column_position_outgroup = [
                    column for column in each
                    if is_flagged_column(align_array_outgroup[:, column])
                ]
                if len(column_position_outgroup) > float(Max_p_sites_o):
                    print(column_position_outgroup)
                    masked_sites.update(column_position_outgroup)

            unique_wrong_sites_ougroup = sorted(masked_sites)
            print(unique_wrong_sites_ougroup)
            print("outgroup")

            ### rebuild the alignment, masking only the outgroup records
            new_records = []
            for record in alignment:
                if record.id in outgroups:
                    print(record.seq)
                    new_seq = "".join(
                        "-" if position in masked_sites else base
                        for position, base in enumerate(str(record.seq))
                    )
                else:
                    new_seq = str(record.seq)
                new_records.append(SeqRecord(Seq(new_seq), id=str(record.id)))
            align_2 = MultipleSeqAlignment(new_records)
            print(align_2)
            AlignIO.write(align_2, output_directory_file, "fasta")
def rm_wrong_polymorphism_sites(seq_directory, outgroup_path, window_size = 20, Max_p_sites = 4):
    """Remove densely polymorphic windows and alignment edges with trimal.

    For every alignment in the ``s6_trimal/`` directory the outgroup
    records are set aside and the remaining records are scanned with the
    windows produced by ``window(range(length), window_size)``.  Columns of
    any window holding more than ``Max_p_sites`` flagged sites are
    collected, together with the first and last 10 columns of the
    alignment.  ``trimal -selectcols`` is then run on the original file
    (outgroups included) and the result is written to
    ``<s7>/s1_rm_polymorphism_sites/``.  When no column is selected the
    file is copied unchanged.

    Parameters
    ----------
    seq_directory : str
        Path containing ``s1_Gene/``; that component is swapped for
        ``s6_trimal/`` (input) and ``s7_well_trimal/`` (output).
    outgroup_path : str
        Passed to ``input_outgroup`` to obtain the outgroup ids.
    window_size : int
        Window length handed to ``window``.
    Max_p_sites : int or float
        A window is selected only when it has strictly more flagged columns.
    """
    # Function-scope imports keep this block self-contained.
    import shutil
    import subprocess

    ### define iupac (ambiguity codes, both cases)
    iupac_bases = frozenset("mrwsykMRWSYKvhdbVHDB")
    ### number of columns always trimmed at each end of the alignment
    edge_trim = 10

    def is_flagged_column(column_bases):
        """Return True when a column is polymorphic beyond gap/IUPAC noise."""
        counts = Counter(column_bases)
        n_states = len(counts)
        if n_states == 1:
            return False
        if n_states > 3:
            return True
        # A gap anywhere in the column counts.  The previous loop kept only
        # the flag of the *least frequent* state, so gap-dominated columns
        # were wrongly reported as polymorphic.
        has_gap = "-" in counts
        n_iupac = sum(1 for base in counts if base in iupac_bases)
        if n_states == 2:
            # base + gap, or base + ambiguity code, is not a real polymorphism
            return not has_gap and n_iupac == 0
        # three states: only "gap + base + exactly one ambiguity code" is excused
        return not (has_gap and n_iupac == 1)

    ### input files are from s6
    genes_result_s6 = seq_directory.replace("s1_Gene/", "s6_trimal/")
    ### output directory for s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")
    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)
    output_directory = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    ### iterate each gene
    for file_name in os.listdir(genes_result_s6):
        if file_name == ".DS_Store":
            continue
        output_directory_file = output_directory + file_name
        fasta_name = genes_result_s6 + file_name
        ### read each alignment
        for sequence in glob(fasta_name):
            print("sequence: " + sequence)
            alignment = AlignIO.read(sequence, 'fasta')

            ### new alignment without the outgroups
            align = MultipleSeqAlignment([
                SeqRecord(Seq(str(record.seq)), id=str(record.id))
                for record in alignment
                if record.id not in outgroups
            ])
            print(align)

            ### one row per record, one column per site
            align_array = np.array([list(rec) for rec in align])
            total_length = align.get_alignment_length()

            ### collect the flagged columns of every window over the threshold
            wrong_sites = set()
            for each in window(range(total_length), window_size):
                column_position = [
                    column for column in each
                    if is_flagged_column(align_array[:, column])
                ]
                if len(column_position) > float(Max_p_sites):
                    print(column_position)
                    wrong_sites.update(column_position)

            ### always drop both alignment edges; clamp so that alignments
            ### shorter than the trim width never yield negative or
            ### out-of-range column numbers
            wrong_sites.update(range(min(edge_trim, total_length)))
            wrong_sites.update(range(max(0, total_length - edge_trim), total_length))

            unique_wrong_sites = sorted(wrong_sites)
            print(len(unique_wrong_sites))

            if unique_wrong_sites:
                # Plain ints joined by commas: str() of a list of numpy
                # integers is not stable across NumPy versions.
                cmd_selected_col = ",".join(str(int(site)) for site in unique_wrong_sites)
                # Argument list, no shell: paths need no quoting and the
                # braces need no backslash escapes.
                cmd = ["trimal", "-in", fasta_name, "-out", output_directory_file,
                       "-selectcols", "{", cmd_selected_col, "}"]
                print(" ".join(cmd))
                try:
                    status = subprocess.call(cmd)
                except OSError as err:
                    # keep processing the remaining genes, as os.system did
                    print("ERROR: could not run trimal: " + str(err))
                else:
                    if status != 0:
                        print("WARNING: trimal exited with status " + str(status) + " for " + fasta_name)
            else:
                ### nothing to remove: copy the gene to the new folder
                print("cp " + fasta_name + " " + output_directory_file)
                shutil.copy(fasta_name, output_directory_file)
def main(): title = '' #1. Getting data ######################################################## ######################################################## # df=pd.read_csv('int_data/seqs_rs_redef.csv') #Histone types info #Does not really seem that we need to redefine variants based on best score. df = pd.read_csv('int_data/seqs_rs.csv') #Histone types info fasta_dict = pickle.load(open("int_data/fasta_dict.p", "rb")) #Sequences #2. Filtering - filter initial dataset by type, variant and other parameters ######################################################## ######################################################## #2.1. Narrow by variant/type ######################################################## title += 'H2A' # f_df=df[(df['hist_var']=='canonical_H4')] # f_df['hist_var']='canonical_H4' f_df = df[( (df['hist_var'] == 'canonical_H2A') | (df['hist_var'] == 'H2A.X')) & (df['partial'] == False) & (df['non_st_aa'] == False)] # f_df=df[((df['hist_var']=='H2A.Z'))&(df['partial']==False)&(df['non_st_aa']==False)] # f_df=df[(df['hist_type']=='H2A')] print "Number of seqs after narrowing by hist type/var:", len(f_df) #2.2. 
Filter by list of taxonomy clades - restrict sequences to certain taxonomic clades ######################################################### title += ' across cellular organisms' # parent_nodes=[9443] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates parent_nodes = [ 131567 ] #131567 - cellular organisms, 7215 4930 Drosophila and yeast, 9443 - primates #33682 - euglenozoa #6656 - arthropods # 4751 - fungi #5782 - dictostelium #This is akin manual removal of bad species del_nodes = [5782, 5690] print "Selecting taxonomic subset for taxids: ", parent_nodes print "while removing taxonomic subset for taxids: ", del_nodes taxids = set(parent_nodes) for i in parent_nodes: taxids.update(ncbi.get_descendant_taxa(i, intermediate_nodes=True)) for i in del_nodes: taxids = taxids.difference(set([i])) taxids = taxids.difference( set(ncbi.get_descendant_taxa(i, intermediate_nodes=True))) f_df = f_df[f_df['taxid'].isin(taxids)] print "Number of seq after taxonomic subset: ", len(f_df) #2.3.0 Marking number of identical sequence within each species and subspecies. #This will simplify further analysis of sequence filtering on similarity #We know that all refseqs are duplicated for instance. ################################################ ident = dict() new_gis = list() tids = set(list(f_df['taxid'])) for i in tids: # print i.name, i.sci_name temp_df = f_df[(f_df['taxid'] == i)] gis = list(temp_df['gi']) #this is to limit exec time # print gis if (len(gis) > 1): res = cluster_seq_support({gi: fasta_dict[str(gi)] for gi in gis}, ident_thresh=1.00) ident.update(res) else: ident.update({gis[0]: 1}) f_df['ident'] = [ident.get(k, 1) for k in f_df['gi']] #where ident - number of identical sequnces for current sepecies/subspecies. print "Identity of sequence inside each taxid determined" #2.3.1. 
Calculate number of similar seqs for every seq in tax group ######################################################### # Use powerful method, to get rid of random errors is to identify identical sequences # if a sequence is supported by two or more entires - this is good. # Here we add a degen column to our data set - showing how many similar sequences are found # for a given sequence in its taxonomic clade (genus currently) #We will traverse the species tree by species, genus or family, and determine degeneracy level degen = dict() new_gis = list() tids = list(f_df['taxid']) t = ncbi.get_topology(tids, intermediate_nodes=True) for i in t.search_nodes(rank='family'): # print i.name, i.sci_name nodeset = list() for k in i.traverse(): nodeset.append(int(k.name)) temp_df = f_df[(f_df['taxid'].isin(nodeset))] gis = list(temp_df['gi']) #this is to limit exec time # print gis res = cluster_seq_support({gi: fasta_dict[str(gi)] for gi in gis}, ident_thresh=1.00) degen.update(res) # print degen f_df['degen'] = [degen.get(k, 1) for k in f_df['gi']] #2.3.2. Remove seqs that do not have support outside their species # if they are not curated or RefSeq NP. ########################################################### f_df = f_df.sort( ['RefSeq', 'degen'], ascending=False ) # so that RefSeq record get priority on removing duplicates f_df = f_df[(f_df['degen'] > f_df['ident']) | (f_df['curated'] == True) | (f_df['RefSeq'] == 2)] print "After removing mined seqs with no support in neighboring species: ", len( f_df) #2.3.3. Shuffle sequnces, so that upon further selection, RefSeq and high degeneracy get priority ########################################################### #RefSeq and degenerate sequence get priority # title+=' 1ptax' f_df = f_df.sort( ['RefSeq', 'degen'], ascending=False ) # so that RefSeq record get priority on removing duplicates # print f_df[0:10] # f_df=f_df.drop_duplicates(['taxid','hist_var']) #2.4 Take one best representative per specific taxonomic rank (e.g. 
genus) ############################################################ pruningrank = 'genus' print "Pruning taxonomy by ", pruningrank title += ' , one seq. per %s' % pruningrank #Common ranks: superorder-order-suborder-infraorder-parvorder-superfamily-family-subfamily-genus-species-subspecies seqtaxids = list(f_df['taxid']) #old list grouped_taxids = group_taxids(seqtaxids, rank=pruningrank) # print seqtaxids # print grouped_taxids #Now we need to take best representative #refseq NP, curated, or the one with largest degeneracy new_gis = list() for tids in grouped_taxids: t_df = f_df[f_df['taxid'].isin(tids)] #try take curated first if (len(t_df[t_df['curated'] == True]) > 0): new_gis.append(t_df.loc[t_df.curated == True, 'gi'].values[0]) continue #try take NP records nest #RefSeq 2 means NP, 1 means XP if (len(t_df[t_df['RefSeq'] == 2]) > 0): new_gis.append(t_df.loc[t_df.RefSeq == 2, 'gi'].values[0]) continue # take best degenerate otherwise else: t_df = t_df.sort(['degen', 'RefSeq'], ascending=False) new_gis.append(t_df['gi'].iloc[0]) f_df = f_df[f_df['gi'].isin(new_gis)] print "After pruning taxonomy we have: ", len(f_df) #2.5. Check seq for sanity - needs to be checked! ############################################## # title+=' seqQC ' # print "Checkig sequence quality" # newgis=list() # for i,row in f_df.iterrows(): # gi=row['gi'] # seq=fasta_dict[str(gi)].seq # hist_type=row['hist_type'] # hist_var=row['hist_var'] # if(check_hist_length(seq,hist_type,hist_var,5)&check_hist_core_length(seq,hist_type,5)): # newgis.append(gi) # f_df=f_df[f_df['gi'].isin(newgis)] #remake the dataframe # print len(f_df) #3. 
Make a list of seq with good ids and descriptions ############################################## f_fasta_dict = { key: value for (key, value) in fasta_dict.iteritems() if int(key) in list(f_df['gi']) } print len(f_fasta_dict) taxid2name = ncbi.get_taxid_translator(list(f_df['taxid'])) #Relabel sequences gi=> type and organism f_fasta_dict = { key: SeqRecord( id=key, description=f_df.loc[f_df.gi == int(key), 'hist_var'].values[0] + ' ' + taxid2name[f_df.loc[f_df.gi == int(key), 'taxid'].values[0]], seq=value.seq) for (key, value) in f_fasta_dict.iteritems() } #with arbitrary index # f_fasta_dict_rel={key: SeqRecord(id=str(index), description=f_hist_df.loc[f_hist_df.gi==key,'hist_var'].values[0]+' '+taxid2names[f_hist_df.loc[f_hist_df.gi==key,'taxid'].values[0]],seq=f_fasta_dict[key].seq) for (index,key) in enumerate(f_fasta_dict) } # exit() #4. Make MSA ################# #Here we construct MSA msa = muscle_aln(f_fasta_dict.values(), gapopen=float(-20)) AlignIO.write(msa, "int_data/example_msa.fasta", "fasta") msa_annot = MultipleSeqAlignment([ SeqRecord(Seq(''.join(get_hist_ss_in_aln_as_string(msa)).replace( ' ', '-')), id='annotation', name='') ]) msa_annot.extend(msa) AlignIO.write(msa_annot, "int_data/example_msa_annot.fasta", "fasta") for i in range(len(msa)): gi = msa[i].id msa[i].description = f_fasta_dict[gi].description.replace( 'canonical', 'ca') msa.sort(key=lambda x: x.description) #5. Visualize MSA############ aln2html(msa, 'example_h2a.html', features=get_hist_ss_in_aln_for_html(msa, 'H2A', 0), title="canonical H2A alignment", description=True, field1w=10, field2w=35) #6. Trim alignment - this is optional #6.1. Trim gaps # title+=' gaptrim' # msa_tr=trim_aln_gaps(msa,threshold=0.8) #6.2. Trim to histone core sequence msa_tr = trim_hist_aln_to_core(msa) # msa_tr=msa # print get_hist_ss_in_aln_for_shade(msa_tr,below=True) # exit() #7. 
Vizualize MSA with ete2.########## taxid2gi = { f_df.loc[f_df.gi == int(gi), 'taxid'].values[0]: gi for gi in list(f_df['gi']) } gi2variant = { gi: f_df.loc[f_df.gi == int(gi), 'hist_var'].values[0] for gi in list(f_df['gi']) } msa_dict = {i.id: i.seq for i in msa_tr} t = ncbi.get_topology(list(f_df['taxid']), intermediate_nodes=False) a = t.add_child(name='annotation') a.add_feature('sci_name', 'annotation') t.sort_descendants(attr='sci_name') ts = TreeStyle() def layout(node): # print node.rank # print node.sci_name if getattr(node, "rank", None): if (node.rank in ['order', 'class', 'phylum', 'kingdom']): rank_face = AttrFace("sci_name", fsize=7, fgcolor="indianred") node.add_face(rank_face, column=0, position="branch-top") if node.is_leaf(): sciname_face = AttrFace("sci_name", fsize=9, fgcolor="steelblue") node.add_face(sciname_face, column=0, position="branch-right") if node.is_leaf() and not node.name == 'annotation': s = str(msa_dict[str(taxid2gi[int(node.name)])]) seqFace = SeqMotifFace( s, [[0, len(s), "seq", 10, 10, None, None, None]], scale_factor=1) add_face_to_node(seqFace, node, 0, position="aligned") gi = taxid2gi[int(node.name)] add_face_to_node(TextFace(' ' + str(gi) + ' '), node, column=1, position="aligned") add_face_to_node(TextFace(' ' + str(int(node.name)) + ' '), node, column=2, position="aligned") add_face_to_node(TextFace(' ' + str(gi2variant[gi]) + ' '), node, column=3, position="aligned") if node.is_leaf() and node.name == 'annotation': s = get_hist_ss_in_aln_as_string(msa_tr) seqFace = SeqMotifFace( s, [[0, len(s), "seq", 10, 10, None, None, None]], scale_factor=1) add_face_to_node(seqFace, node, 0, position="aligned") add_face_to_node(TextFace(' ' + 'NCBI_GI' + ' '), node, column=1, position="aligned") add_face_to_node(TextFace(' ' + 'NCBI_TAXID' + ' '), node, column=2, position="aligned") add_face_to_node(TextFace(' ' + 'Variant' + ' '), node, column=3, position="aligned") ts.layout_fn = layout ts.show_leaf_name = False 
ts.title.add_face(TextFace(title, fsize=20), column=0) t.render("example_motifs_H2A.svg", w=6000, dpi=300, tree_style=ts) #10. Conservation############ ############################# features = get_hist_ss_in_aln_for_shade(msa_tr, below=True) cn = add_consensus(msa_tr, threshold=0.5)[-2:-1] # Below are three methods that we find useful. # plot_prof4seq('cons_sofp_psic',map(float,cons_prof(msa_tr,f=2,c=2)),cn,features,axis='conservation') plot_prof4seq('example_cons_ent_unw', map(lambda x: log(20) + x, map(float, cons_prof(msa_tr, f=0, c=0))), cn, features, axis='conservation') plot_prof4seq('example_cons_ent_unw_norm', map(lambda x: log(20) + x, map(float, cons_prof(msa_tr, f=0, c=0, norm="T"))), cn, features, axis='conservation') # plot_prof4seq('cons_sofp_unw',map(float,cons_prof(msa_tr,f=0,c=2)),cn,features,axis='conservation') plot_prof4seq('example_cons_sofp_unw_renorm1', map(float, cons_prof(msa_tr, f=0, c=2, m=1)), cn, features, axis='conservation') plot_prof4seq('example_cons_sofp_unw', map(float, cons_prof(msa_tr, f=0, c=2, m=0)), cn, features, axis='conservation') plot_prof4seq('example_cons_sofp_psic_renorm1', map(float, cons_prof(msa_tr, f=2, c=2, m=1)), cn, features, axis='conservation')
''' PhyChem ''' myclade.calculate_phchem_conservation( ) #find frequency for each column myclade.find_phychm_conserved_regions(min_conserved_length, consensus) ''' Rate4Site ''' myclade.run_rate4site() ''' OutPut ''' myclade.print_clade_analysis() # display myclade.write_clade_to_files() # default: cladename.tre, clade_name.fa #cladedict[cladename] = myclade cladelist.append(myclade) MSA_everycalade.extend(MSA) MSAodo_everycalade.extend(ODO) ''' +++ ALL_CLADES +++ ALL SELECTED CLADES IN ONE (it might be the case when not all clades are seleced in tree) ''' all_clades = Clade(fileheader, "ALL_CLADES", tree, allfasta, MSA_everycalade, MSAodo_everycalade, entropy_gap_weight) ''' Oder/DisOrder ''' all_clades.calculate_odo_conservation() #find O/D frequency for each column ''' AA ''' all_clades.calculate_aa_conservation() #findAA frequency for each column all_clades.find_aa_conserved_regions(min_conserved_length, consensus) ''' AA entropy ''' all_clades.calculate_aa_entropy()
# Prettify labels
def get_label(leaf):
    """Return the text drawn for a tree node.

    Nodes whose name starts with "Inner" get an empty label; every other
    node gets its name with underscores turned into spaces.
    """
    node_name = leaf.name
    return "" if node_name.startswith("Inner") else node_name.replace("_", " ")


# Collect the fasta records into one gapped-DNA alignment, echoing each record
aln = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
# alternative input: "data/cons_noncode.fa"
for seq_record in SeqIO.parse("data/coding.fa", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))
    aln.extend([seq_record])

# Show the assembled alignment
print(aln)

# Pairwise distances under the 'identity' model
calculator = DistanceCalculator('identity')
dm = calculator.get_distance(aln)

# Show the distance matrix
print('\nDistance Matrix\n===================')
print(dm)

# Tree constructor used for the distance-based tree below
constructor = DistanceTreeConstructor()
fasta_to_nexus_mb.py is a script to convert a fasta alignment to a nexus alignment (hopefully) without messing up the names too badly. It also makes a separate Mr.Bayes script.. fasta_to_nexus.py [name of fasta alignment, expects an extension of .fa] ''' if len(sys.argv) != 2: sys.exit( "ERROR! This script expects one additional argument, and you gave it %d arguments! %s" % (len(sys.argv), Usage)) InFileName = sys.argv[1] sys.stderr.write("Alignment %s will be processed.\n" % (InFileName)) MyAlignment = MultipleSeqAlignment([], generic_dna) MyAlignment.extend(AlignIO.read(InFileName, 'fasta')) for record in MyAlignment: SeqNameTemp = record.id for Char in SeqNameTemp: if (Char in [':', ',', '(', ')', ':', '[', ']', "'", "="]): SeqNameTemp = SeqNameTemp.replace(Char, '-') record.id = SeqNameTemp OutFileName = InFileName[:-2] + "nex" AlignIO.write(MyAlignment, OutFileName, 'nexus') OutList = [] Line = "set autoclose=yes nowarn=yes\nexecute " + OutFileName + "\n" #Line += "lset nst=6 rates=invgamma\nunlink statefreq=(all) revmat=(all) shape=(all) pinvar=(all)\n" #OutList.append(Line)