def needle_alignment(s1, s2):
    """Globally align two sequences with Needleman-Wunsch.

    Scoring: BLOSUM62 where the pair is defined, otherwise +1 for an
    exact match and -4 for a mismatch; gap open -10, gap extend -0.5.

    Returns a Bio.Align.MultipleSeqAlignment with the two aligned
    sequences stored under the ids "s1" and "s2".
    """
    from Bio import pairwise2
    from Bio.Align import MultipleSeqAlignment
    from Bio.SubsMat.MatrixInfo import blosum62

    def score(a, b):
        # Fall back to +1/-4 for residue pairs absent from BLOSUM62.
        if (a, b) in blosum62:
            return blosum62[(a, b)]
        return 1 if a == b else -4

    best = pairwise2.align.globalcs(
        s1, s2, score, -10., -.5, one_alignment_only=True)[0]
    result = MultipleSeqAlignment([])
    result.add_sequence("s1", best[0])
    result.add_sequence("s2", best[1])
    return result
def Create_SNP_alignment(alignment, SNPlocations):
    """Build a new alignment containing only the columns in SNPlocations.

    Gap characters ("-") at the selected positions are recoded as "?".

    alignment    -- source MultipleSeqAlignment
    SNPlocations -- iterable of 0-based column indices to keep

    Returns a new MultipleSeqAlignment with one row per input record.
    """
    alphabet = Gapped(IUPAC.unambiguous_dna)
    # BUG FIX: MultipleSeqAlignment takes the record list as its first
    # argument and the alphabet second; the old single-argument
    # (alphabet-only) form is deprecated/removed.  The other call sites
    # in this file already use MultipleSeqAlignment([], Gapped(...)).
    SNPalignment = MultipleSeqAlignment([], alphabet)
    for record in alignment:
        SNPsequenceObject = ""
        for base in SNPlocations:
            # record.seq[base] is a single character; recode a gap as "?"
            SNPsequenceObject = SNPsequenceObject + record.seq[base].replace("-", "?")
        SNPalignment.add_sequence(record.id, SNPsequenceObject)
    return SNPalignment
def SubsetAlnCols(args):
    """Extract a 1-based inclusive column range from a FASTA alignment.

    args keys:
      'in'  -- input alignment file (FASTA)
      'r'   -- column range as "start-end", 1-based inclusive
      'pct' -- optional minimum percentage of non-gap columns a sequence
               must retain to be kept; None keeps every sequence
      'out' -- output file path (FASTA)
    """
    aln_in_fasta = args['in']
    col_range = args['r']
    min_non_empty_pct = args['pct']
    align_subset_out = args['out']
    # get range (parse "start-end" once instead of splitting twice)
    range_parts = col_range.split('-')
    col_start = int(range_parts[0])
    col_end = int(range_parts[1])
    align_subset_len = col_end - col_start + 1
    # subset (convert the 1-based inclusive range to 0-based slicing)
    align_in = AlignIO.read(aln_in_fasta, 'fasta')
    align_in_subset = align_in[:, (col_start - 1):col_end]
    if min_non_empty_pct is None:
        # write out; `with` closes the handle even if AlignIO.write raises
        with open(align_subset_out, 'w') as align_file_out_handle:
            AlignIO.write(align_in_subset, align_file_out_handle, 'fasta')
    else:
        # filter: keep only sequences with enough non-gap columns
        align_in_subset_filtered = MultipleSeqAlignment(
            [], Gapped(IUPAC.unambiguous_dna, "-"))
        for seq_record in align_in_subset:
            non_empty_col_pct = (align_subset_len - seq_record.seq.count('-')
                                 ) * 100 / align_subset_len
            if non_empty_col_pct >= float(min_non_empty_pct):
                align_in_subset_filtered.add_sequence(seq_record.id,
                                                      str(seq_record.seq))
        # write out
        with open(align_subset_out, 'w') as align_file_out_handle:
            AlignIO.write(align_in_subset_filtered, align_file_out_handle,
                          'fasta')
def align_sequences(multiple_sequences, ms_path, msa_path, cluster_number):
    """Write sequences to FASTA, align them with mafft, and load the result.

    multiple_sequences -- list of raw sequence strings
    ms_path            -- path for the unaligned FASTA written here
    msa_path           -- path mafft writes the alignment to
    cluster_number     -- label used only for logging

    Returns (aln, seq_list): the MultipleSeqAlignment and a list of the
    sequence strings (msa_str split on whitespace).
    """
    # BUG FIX: MultipleSeqAlignment takes the record list first; the old
    # alphabet-only constructor form is deprecated/removed.
    aln = MultipleSeqAlignment([], Gapped(IUPAC.protein, '-'))
    # Records are named by their index in the input list.
    fasta_text = '\n'.join('>' + str(i) + '\n' + multiple_sequences[i]
                           for i in range(len(multiple_sequences)))
    # write ms to file
    with open(ms_path, 'w') as f:
        f.write(fasta_text)
    if len(multiple_sequences) > 1:
        # align file
        logger.info('running mafft:')
        mafft_cmd = f'mafft --quiet {ms_path} > {msa_path}'
        logger.info(mafft_cmd)
        os.system(mafft_cmd)
        # avoid removing sparse columns!! causes a bug when trying to grep
        # from annotations!!
        # remove columns with more than 20% gaps
        # remove_sparse_columns(msa_path, msa_path, 0.2)
    else:
        logger.info(
            'Only 1 sequence in {}. Skipping mafft and copying ms file as is.'.
            format(cluster_number))
        with open(msa_path, 'w') as f:
            f.write(fasta_text)
    # NOTE(review): this parses ms_path (the pre-mafft input), not
    # msa_path (the aligned output) — confirm which file is intended.
    for record in SeqIO.parse(ms_path, 'fasta'):
        aln.add_sequence(record.id, str(record.seq))
    logger.debug('Alignment is:')
    msa_str = ''
    for record in aln:
        # FIX: use the public str(record.seq) instead of the private
        # Seq._data attribute (which is bytes in Biopython >= 1.78 and
        # cannot be concatenated with str).
        msa_str += str(record.seq) + '\n'
    logger.debug(msa_str)
    return aln, msa_str.split()
def compute_consensus(self):
    """Compute a gap-aware consensus of self.msa and store it.

    Builds a MultipleSeqAlignment from the sequences in self.msa (rows
    named by index) and stores the gap_consensus (threshold 0, ambiguity
    character "-") in self.consensus.
    """
    # BUG FIX: MultipleSeqAlignment takes the record list first; the old
    # alphabet-only constructor form is deprecated/removed.
    align = MultipleSeqAlignment([], Gapped(IUPAC.extended_protein, "-"))
    for i, seq in enumerate(self.msa):
        align.add_sequence(str(i), str(seq))
    summary_align = AlignInfo.SummaryInfo(align)
    self.consensus = summary_align.gap_consensus(threshold=0, ambiguous="-")
def next(self):
    """Parse and return the next alignment from an EMBOSS alignment
    report (blocks delimited by "#====..." header lines), or None at
    end of file.

    NOTE(review): Python 2 code (bare ``print line`` statement below);
    needs print() and next()->__next__ to run on Python 3.
    """
    handle = self.handle
    try:
        #Header we saved from when we were parsing
        #the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return None
    # Skip forward to the next "#====..." alignment header.
    while line.rstrip() != "#=======================================":
        line = handle.readline()
        if not line:
            return None
    length_of_seqs = None
    number_of_seqs = None
    ids = []
    seqs = []
    while line[0] == "#":
        #Read in the rest of this alignment header,
        #try and discover the number of records expected
        #and their length
        parts = line[1:].split(":",1)
        key = parts[0].lower().strip()
        if key == "aligned_sequences":
            number_of_seqs = int(parts[1].strip())
            assert len(ids) == 0
            # Should now expect the record identifiers...
            # (one "# N: identifier" line per sequence)
            for i in range(number_of_seqs):
                line = handle.readline()
                parts = line[1:].strip().split(":",1)
                assert i+1 == int(parts[0].strip())
                ids.append(parts[1].strip())
            assert len(ids) == number_of_seqs
        if key == "length":
            length_of_seqs = int(parts[1].strip())
        #And read in another line...
        line = handle.readline()
    if number_of_seqs is None:
        raise ValueError("Number of sequences missing!")
    if length_of_seqs is None:
        raise ValueError("Length of sequences missing!")
    if self.records_per_alignment is not None \
    and self.records_per_alignment != number_of_seqs:
        raise ValueError("Found %i records in this alignment, told to expect %i" \
                         % (number_of_seqs, self.records_per_alignment))
    seqs = ["" for id in ids]
    seq_starts = []
    index = 0
    #Parse the seqs
    while line:
        if len(line) > 21:
            # Fixed-width layout: columns 0-20 hold "id start",
            # the rest holds "seq end".
            id_start = line[:21].strip().split(None, 1)
            seq_end = line[21:].strip().split(None, 1)
            if len(id_start) == 2 and len(seq_end) == 2:
                #identifier, seq start position, seq, seq end position
                #(an aligned seq is broken up into multiple lines)
                id, start = id_start
                seq, end = seq_end
                if start==end:
                    #Special case, either a single letter is present,
                    #or no letters at all.
                    if seq.replace("-","") == "":
                        start = int(start)
                        end = int(end)
                    else:
                        start = int(start) - 1
                        end = int(end)
                else:
                    assert seq.replace("-","") != ""
                    start = int(start)-1 #python counting
                    end = int(end)
                #The identifier is truncated...
                assert 0 <= index and index < number_of_seqs, \
                       "Expected index %i in range [0,%i)" \
                       % (index, number_of_seqs)
                assert id==ids[index] or id == ids[index][:len(id)]
                if len(seq_starts) == index:
                    #Record the start
                    seq_starts.append(start)
                #Check the start...
                if start == end:
                    assert seq.replace("-","") == "", line
                else:
                    assert start - seq_starts[index] == len(seqs[index].replace("-","")), \
                    "Found %i chars so far for sequence %i (%s, %s), line says start %i:\n%s" \
                        % (len(seqs[index].replace("-","")), index, id, repr(seqs[index]), start, line)
                seqs[index] += seq
                #Check the end ...
                assert end == seq_starts[index] + len(seqs[index].replace("-","")), \
                    "Found %i chars so far for sequence %i (%s, %s, start=%i), file says end %i:\n%s" \
                    % (len(seqs[index].replace("-","")), index, id, repr(seqs[index]), seq_starts[index], end, line)
                index += 1
                if index >= number_of_seqs:
                    index = 0
            else:
                #just a start value, this is just alignment annotation (?)
                #print "Skipping: " + line.rstrip()
                pass
        elif line.strip() == "":
            #Just a spacer?
            pass
        else:
            # NOTE(review): Python 2 print statement.
            print line
            assert False
        line = handle.readline()
        if line.rstrip() == "#---------------------------------------" \
        or line.rstrip() == "#=======================================":
            #End of alignment; stash the header for the next call.
            self._header = line
            break
    assert index == 0
    if self.records_per_alignment is not None \
    and self.records_per_alignment != len(ids):
        raise ValueError("Found %i records in this alignment, told to expect %i" \
                         % (len(ids), self.records_per_alignment))
    # NOTE(review): old-style single-argument (alphabet-only)
    # MultipleSeqAlignment constructor; deprecated — newer Biopython
    # expects MultipleSeqAlignment([], self.alphabet).
    alignment = MultipleSeqAlignment(self.alphabet)
    for id, seq in zip(ids, seqs):
        if len(seq) != length_of_seqs:
            #EMBOSS 2.9.0 is known to use spaces instead of minus signs
            #for leading gaps, and thus fails to parse.  This old version
            #is still used as of Dec 2008 behind the EBI SOAP webservice:
            #http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl
            raise ValueError("Error parsing alignment - sequences of "
                             "different length? You could be using an "
                             "old version of EMBOSS.")
        alignment.add_sequence(id, seq)
    return alignment
# Script fragment: build a consensus sequence for every FASTA alignment
# matching `files`.  Relies on names defined outside this chunk (files,
# include_missing_data, identity, name, outdir, alpha, nPass).
# NOTE(review): chunk appears truncated — `f` is opened at the end but no
# write/close is visible here; confirm the remainder exists upstream.
if len(glob.glob(files)) > 0:
    nPass = 0
    for fasta in glob.glob(files):
        # get seqs
        fastaname = fasta.split('/')[-1]  # get fasta name without path
        aln = AlignIO.read(fasta, "fasta")  # extract alignment from fasta
        # nSeq1 = len(aln)
        if include_missing_data == False:
            gap_aln = MultipleSeqAlignment([])
            for current_seq in aln:  # for each sequence
                seq = str(current_seq.seq)
                seq = str.replace(
                    seq, "N", "-"
                )  # change Ns to gaps to make them invisible to dumb_consensus
                gap_aln.add_sequence(current_seq.id, seq)
            aln = gap_aln
        summary_align = AlignInfo.SummaryInfo(aln)
        consensus_unamb = summary_align.dumb_consensus(
            threshold=identity,
            ambiguous="N",
            consensus_alpha=alpha.IUPACUnambiguousDNA,
            require_multiple=0)
        consensus_unamb = str(consensus_unamb)
        # Restore the "N" convention (dumb_consensus emitted lowercase "n").
        consensus_unamb = str.replace(consensus_unamb, "n", "N")
        outseq = MultipleSeqAlignment([])
        outseq.add_sequence(name, consensus_unamb)
        outpath = '{0}{1}{2}{3}{4}'.format(outdir, "cons_sim", identity, "_",
                                           fastaname)
        f = open(outpath, 'w')
def plot_bar(data,nuclstr,column='value',factor=None,ymin=None,ymax=None,stat='identity',dpi=300,features=None,feature_types=['all'],add_features=[],funcgroups=None,shading_modes=['charge_functional'],usd=False,right_overhang_fix=None,debug=False,startnumber=1,cropseq=(0,None),aspect_ratio=None,reverse_seq=False,double_seq=False,transparent=True,fill_params=None,bar_position='stack',title=None):
    """
    A wrapper function to make a plot of data with bars along the sequnce
    input should be a dataframe with resid, segid column and 'value'
    This one is inspired by seqplot/seqplot/pdb_plot.py

    Returns a plotnine ggplot object (bars over a shaded sequence image).
    """
    # All rows are assumed to share one segid; the first row defines it.
    segid=data['segid'].values[0]
    if title is None:
        title="Segid: %s, Type: %s"%(segid,nuclstr.components[segid]['type'])
    # NOTE(review): `x is 'DNA' or 'histone' or 'protein'` is always truthy
    # ('histone' is a non-empty string), so generic_protein is always
    # selected — presumably `in ('DNA','histone','protein')` or similar was
    # intended; confirm before changing behavior.
    seq=Seq(str(nuclstr.seqs[segid]['fullseq']),generic_protein \
            if nuclstr.components[segid]['entity'] is 'DNA' or 'histone' or 'protein' else generic_dna)
    msar=MultipleSeqAlignment([SeqRecord(seq=seq,id=nuclstr.components[segid]['type']+':'+segid,\
                                         name=nuclstr.components[segid]['type']+':'+segid)])
    if(reverse_seq):
        logger.info("Experimental feature will reverse the sequence")
        msar[0].seq=msar[0].seq[::-1]
    if double_seq:
        msar.add_sequence('reverse',str(msar[0].seq[::-1]))
    # Crop the alignment columns to the requested window.
    msar=msar[:,cropseq[0]:cropseq[1]]
    # print("Seq to plot:",msar)
    #We need to get starting residue, currently for DNA chains only cifseq gets it correctly
    resid_start=nuclstr.seqs[segid]['resid_start']
    logger.debug("Starting resid",resid_start)
    overhang=nuclstr.seqs[segid]['overhangL']
    # Shift resids so they index into the (cropped) sequence image.
    datafixed=data.copy()
    datafixed.loc[:,'resid']=datafixed.loc[:,'resid']-resid_start+overhang+1-cropseq[0]
    sl=len(msar[0].seq)
    # fn=shade.seqfeat2shadefeat(msar,feature_types=feature_types,force_feature_pos='bottom',debug=debug)
    if features is None:
        fn=nuclstr.shading_features[segid]
    else:
        fn=features
    # Keep only features whose style was requested, then add extras.
    fn2=[]
    for i in fn:
        if (i['style'] in feature_types) or ('all' in feature_types) :
            fn2.append(i)
    fn2.extend(add_features)
    if usd:
        ruler='top'
    else:
        ruler=None
    shaded=ipyshade.shadedmsa4plot(msar,features=fn2,shading_modes=shading_modes,debug=debug,startnumber=startnumber,setends=[startnumber-2,sl+startnumber+2],funcgroups=funcgroups,ruler=ruler,density=200)
    #If sl%10=10 se will have a ruler number hanging beyond the sequence image, and we need to correct for that.
    if right_overhang_fix is None:
        if sl%10==0:
            if sl<100:
                rof= 0.1
            else:
                rof=0.5
        else:
            rof=0
    else:
        rof=right_overhang_fix
    if (not aspect_ratio is None ):
        ar=aspect_ratio
    else:
        ar=0.2*100./sl
    # print(datafixed)
    plot=(ggplot(data=datafixed,mapping=aes(x='resid', y=column))
          # + geom_point(size=0.1)
          # +geom_bar(stat='identity',width=0.5,mapping=aes(fill=factor))
          + scale_x_continuous(limits=(0.5,sl+0.5+rof),expand=(0,0.2),name='',breaks=[])
          # + scale_y_continuous(breaks=[0,0.5,1.0])
          + theme_light()+theme(aspect_ratio=ar,dpi=dpi,plot_margin=0,text=element_text(size=6), legend_key_size=5 ,legend_position='bottom',legend_direction='horizontal'))
    #+ facet_wrap('~ segid',dir='v') +guides(color=guide_legend(ncol=10))
    if factor is None:
        plot=plot+geom_bar(stat=stat,width=0.5)
    else:
        plot=plot+geom_bar(stat=stat,width=0.5,mapping=aes(fill=factor),position=bar_position)
    if fill_params is not None:
        plot=plot+scale_fill_manual(**fill_params)
    if not usd:
        if (ymax is not None) :
            plot=plot+scale_y_continuous(limits=(None,ymax))
    else:
        if (ymin is not None) :
            plot=plot+scale_y_continuous(limits=(ymin,None))
    if ymax is None:
        ymax=data[column].max()
    if ymin is None:
        ymin=data[column].min()
    # print(ymax)
    plot = plot + geom_seq_x(seqimg=shaded.img,\
                             xlim=(1,sl+rof),ylim=(ymin,ymax),usd=usd,aspect_ratio=ar,transparent=transparent)+ggtitle(title)
    return plot
def next(self):
    """Parse and return the next alignment from a sequential/interleaved
    PHYLIP-style file (header line "num_seqs length", then 10-character
    ids followed by sequence blocks).  Returns None (bare return) at EOF.

    NOTE(review): Python 2 code — `filter(None, ...)` must return a list
    for `len(parts)` to work; under Python 3 this would raise TypeError.
    """
    handle = self.handle
    try:
        #Header we saved from when we were parsing
        #the previous alignment.
        line = self._header
        del self._header
    except AttributeError:
        line = handle.readline()
    if not line:
        return
    line = line.strip()
    parts = filter(None, line.split())
    if len(parts)!=2:
        raise ValueError("First line should have two integers")
    try:
        number_of_seqs = int(parts[0])
        length_of_seqs = int(parts[1])
    except ValueError:
        raise ValueError("First line should have two integers")
    assert self._is_header(line)
    if self.records_per_alignment is not None \
    and self.records_per_alignment != number_of_seqs:
        raise ValueError("Found %i records in this alignment, told to expect %i" \
                         % (number_of_seqs, self.records_per_alignment))
    ids = []
    seqs = []
    #Expects STRICT truncation/padding to 10 characters
    #Does not require any white space between name and seq.
    for i in range(0,number_of_seqs):
        line = handle.readline().rstrip()
        ids.append(line[:10].strip()) #first ten characters
        seqs.append([line[10:].strip().replace(" ","")])
    #Look for further blocks
    line=""
    while True:
        #Skip any blank lines between blocks...
        while ""==line.strip():
            line = handle.readline()
            if not line : break #end of file
        if not line : break #end of file
        if self._is_header(line):
            #Looks like the start of a concatenated alignment;
            #stash the header line for the next call.
            self._header = line
            break
        #print "New block..."
        for i in range(0,number_of_seqs):
            seqs[i].append(line.strip().replace(" ",""))
            line = handle.readline()
            if (not line) and i+1 < number_of_seqs:
                raise ValueError("End of file mid-block")
        if not line : break #end of file
    # NOTE(review): old-style single-argument (alphabet-only)
    # MultipleSeqAlignment constructor; deprecated — newer Biopython
    # expects MultipleSeqAlignment([], self.alphabet).
    alignment = MultipleSeqAlignment(self.alphabet)
    for i in range(0,number_of_seqs):
        seq = "".join(seqs[i])
        if len(seq)!=length_of_seqs:
            raise ValueError("Sequence %i length %i, expected length %i" \
                             % (i+1, len(seq), length_of_seqs))
        alignment.add_sequence(ids[i], seq)
        # add_sequence may have truncated/altered the id; force the
        # original 10-character id back onto the record.
        record = alignment[-1]
        assert ids[i] == record.id or ids[i] == record.description
        record.id = ids[i]
        record.name = ids[i]
        record.description = ids[i]
    return alignment
def replace_outgroup_with_gap(seq_directory, outgroup_path, window_size = 20, Max_p_sites_o = 8):
    """For each gene alignment in .../s7_well_trimal/s1_rm_polymorphism_sites/,
    mask (with "-") positions in the OUTGROUP sequences that fall inside
    20 bp windows containing more than Max_p_sites_o polymorphic sites,
    and write the result to .../s2_rm_polymorphism_in_outgroups/.

    seq_directory  -- path containing "s1_Gene/" (used to derive s7 paths)
    outgroup_path  -- file listing outgroup ids (read by input_outgroup)
    window_size    -- sliding window width in columns (default 20)
    Max_p_sites_o  -- max tolerated polymorphic sites per window (default 8)
    """
    ### define iupac (ambiguity codes; columns explained by one IUPAC
    ### base are not counted as "wrong" polymorphisms)
    iupac_bases = ['m', 'r', 'w', 's', 'y', 'k', 'M', 'R', 'W', 'S', 'Y', 'K', "v", "h", "d", "b", "V", "H", "D", "B"]
    ### input directory from s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")
    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)
    output_directory_1 = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    output_directory_2 = output_directory_1.replace("/s1_rm_polymorphism_sites/","/s2_rm_polymorphism_in_outgroups/")
    if os.path.isdir(output_directory_2) == False:
        os.makedirs(output_directory_2)
    ### iterate each gene
    for file in os.listdir(output_directory_1):
        if file != ".DS_Store":
            output_directory_file = output_directory_2 + file
            fasta_name = output_directory_1 + file
            sequences = glob(fasta_name)
            ### read each alignment sequences
            for sequence in sequences:
                print("sequence: " + sequence)
                alignment = AlignIO.read(sequence, 'fasta')
                ### calculate the polymorphism in outgroup
                ### change alignment to an array.
                total_wrong_poly_sites_outgroup = []
                align_array_outgroup = np.array([list(rec) for rec in alignment])  ### , np.character
                # print(align_array)
                ### calculate the whole length of the alignment
                total_length = alignment.get_alignment_length()
                # alignment = AlignIO.read(sequence, 'fasta')
                for each in window(range(total_length), window_size):
                    # print(list(each))
                    poly_site_no_iupac = 0
                    poly_site_number = 0
                    column_position_outgroup = []
                    ### for each block calculate the polymorphism sites number.
                    for column in each:
                        ### calculate each site (each column).
                        counter = Counter(align_array_outgroup[:, column])
                        ### sorted by frequency
                        sorted_bases = counter.most_common()
                        # print(counter)
                        # print(sorted_bases)
                        # print(len(counter))
                        ### count the sites with different situations.
                        # NOTE(review): gap_yes reflects only the LAST entry
                        # of sorted_bases after this loop (each iteration
                        # overwrites it) — confirm this is intended.
                        gap_yes = 0
                        if len(counter) ==1:
                            # monomorphic column: nothing to count
                            poly_site_number = poly_site_number + 0
                            poly_site_no_iupac = poly_site_no_iupac + 0
                        elif len(counter) == 2:
                            for i in sorted_bases:
                                if i[0] == "-":
                                    gap_yes = 1
                                else:
                                    gap_yes = 0
                            # print("gap is 1 or 0:" + str(gap_yes))
                            if gap_yes == 1:
                                # print counter
                                poly_site_number = poly_site_number + 0
                                poly_site_no_iupac = poly_site_no_iupac + 0
                            else:
                                iupac_in_alignment = [ele for ele in sorted_bases if (ele[0] in iupac_bases)]
                                # print(iupac_in_alignment)
                                if len(iupac_in_alignment) == 1:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 0
                                if len(iupac_in_alignment) == 0:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 1
                                    # print(column)
                                    column_position_outgroup.append(column)
                        elif len(counter) == 3:
                            for i in sorted_bases:
                                if i[0] == "-":
                                    gap_yes = 1
                                else:
                                    gap_yes = 0
                            # print("gap is 1 or 0:" + str(gap_yes))
                            if gap_yes == 1:
                                # print counter
                                iupac_in_alignment = [ele for ele in sorted_bases if (ele[0] in iupac_bases)]
                                # print(iupac_in_alignment)
                                if len(iupac_in_alignment) == 1:
                                    # poly_site_no_iupac = poly_site_no_iupac + 1
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 0
                                else:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 1
                                    # print(column)
                                    column_position_outgroup.append(column)
                            else:
                                poly_site_number = poly_site_number + 1
                                poly_site_no_iupac = poly_site_no_iupac + 1
                                # print(column)
                                column_position_outgroup.append(column)
                        else:
                            poly_site_number = poly_site_number + 1
                            poly_site_no_iupac = poly_site_no_iupac + 1
                            # print(column)
                            column_position_outgroup.append(column)
                    # print("column_position: " + str(column_position))
                    # print(len(column_position))
                    ### if there are more than 8 polymorphic sites in 20 base pairs, select those sites positions.
                    if len(column_position_outgroup) > float(Max_p_sites_o):
                        print(column_position_outgroup)
                        total_wrong_poly_sites_outgroup = total_wrong_poly_sites_outgroup + column_position_outgroup
                # Deduplicate the flagged column indices across all windows.
                unique_wrong_sites_ougroup = list(np.unique(total_wrong_poly_sites_outgroup))
                print(unique_wrong_sites_ougroup)
                print("outgroup")
                # Rebuild the alignment: outgroup rows get "-" at flagged
                # columns, all other rows are copied unchanged.
                align_2 = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
                for record in alignment:
                    new_seq = ""
                    if record.id in outgroups:
                        print(record.seq)
                        for i in range(total_length):
                            if i in unique_wrong_sites_ougroup:
                                new_seq = new_seq + "-"
                            else:
                                new_seq = new_seq + str(record.seq[i])
                        align_2.add_sequence(str(record.id), str(new_seq))
                    else:
                        align_2.add_sequence(str(record.id), str(record.seq))
                print(align_2)
                AlignIO.write(align_2, output_directory_file, "fasta")
def rm_wrong_polymorphism_sites(seq_directory, outgroup_path, window_size = 20, Max_p_sites = 4):
    """For each gene alignment in .../s6_trimal/, find 20 bp windows with
    more than Max_p_sites polymorphic sites (ingroup rows only), then use
    trimal to strip those columns (plus the first/last 10 columns) and
    write the result to .../s7_well_trimal/s1_rm_polymorphism_sites/.

    seq_directory -- path containing "s1_Gene/" (used to derive s6/s7 paths)
    outgroup_path -- file listing outgroup ids (read by input_outgroup)
    window_size   -- sliding window width in columns (default 20)
    Max_p_sites   -- max tolerated polymorphic sites per window (default 4)
    """
    ### define iupac (ambiguity codes; columns explained by one IUPAC
    ### base are not counted as "wrong" polymorphisms)
    iupac_bases = ['m', 'r', 'w', 's', 'y', 'k', 'M', 'R', 'W', 'S', 'Y', 'K', "v", "h", "d", "b", "V", "H", "D", "B"]
    ### input files are from s6
    genes_result_s6 = seq_directory.replace("s1_Gene/", "s6_trimal/")
    ### mkdir output directory for s7
    genes_result_s7 = seq_directory.replace("s1_Gene/", "s7_well_trimal/")
    ### return outgroup list
    outgroups = input_outgroup(outgroup_path)
    output_directory = genes_result_s7 + "/s1_rm_polymorphism_sites/"
    if os.path.isdir(output_directory) == False:
        os.makedirs(output_directory)
    ### iterate each gene
    for file in os.listdir(genes_result_s6):
        if file != ".DS_Store":
            output_directory_file = output_directory + file
            fasta_name = genes_result_s6 + file
            sequences = glob(fasta_name)
            ### read each alignment sequences
            for sequence in sequences:
                print("sequence: " +sequence)
                alignment = AlignIO.read(sequence, 'fasta')
                # print(alignment)
                ### generate a new alignment sequences without outgroups.
                align = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
                for record in alignment:
                    if record.id not in outgroups:
                        # print(record.id)
                        # print(record.seq)
                        align.add_sequence(str(record.id), str(record.seq))
                print(align)
                # print(align.get_alignment_length())
                total_wrong_poly_sites = []
                ### change alignment to an array.
                align_array = np.array([list(rec) for rec in align])  ### , np.character
                # print(align_array)
                ### calculate the whole length of the alignment
                total_length = align.get_alignment_length()
                ### using 20bp-long sliding windows.
                for each in window(range(total_length), window_size):
                    # print(list(each))
                    poly_site_no_iupac = 0
                    poly_site_number = 0
                    column_position = []
                    ### for each block calculate the polymorphism sites number.
                    for column in each:
                        ### calculate each site (each column).
                        counter = Counter(align_array[:, column])
                        ### sorted by frequency
                        sorted_bases = counter.most_common()
                        # print(counter)
                        # print(sorted_bases)
                        # print(len(counter))
                        ### count the sites with different situations.
                        # NOTE(review): gap_yes reflects only the LAST entry
                        # of sorted_bases after this loop (each iteration
                        # overwrites it) — confirm this is intended.
                        gap_yes = 0
                        if len(counter) ==1:
                            # monomorphic column: nothing to count
                            poly_site_number = poly_site_number + 0
                            poly_site_no_iupac = poly_site_no_iupac + 0
                        elif len(counter) == 2:
                            for i in sorted_bases:
                                if i[0] == "-":
                                    gap_yes = 1
                                else:
                                    gap_yes = 0
                            # print("gap is 1 or 0:" + str(gap_yes))
                            if gap_yes == 1:
                                # print counter
                                poly_site_number = poly_site_number + 0
                                poly_site_no_iupac = poly_site_no_iupac + 0
                            else:
                                iupac_in_alignment = [ele for ele in sorted_bases if (ele[0] in iupac_bases)]
                                # print(iupac_in_alignment)
                                if len(iupac_in_alignment) == 1:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 0
                                if len(iupac_in_alignment) == 0:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 1
                                    # print(column)
                                    column_position.append(column)
                        elif len(counter) == 3:
                            for i in sorted_bases:
                                if i[0] == "-":
                                    gap_yes = 1
                                else:
                                    gap_yes = 0
                            # print("gap is 1 or 0:" + str(gap_yes))
                            if gap_yes == 1:
                                # print counter
                                iupac_in_alignment = [ele for ele in sorted_bases if (ele[0] in iupac_bases)]
                                # print(iupac_in_alignment)
                                if len(iupac_in_alignment) == 1:
                                    # poly_site_no_iupac = poly_site_no_iupac + 1
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 0
                                else:
                                    poly_site_number = poly_site_number + 1
                                    poly_site_no_iupac = poly_site_no_iupac + 1
                                    # print(column)
                                    column_position.append(column)
                            else:
                                poly_site_number = poly_site_number + 1
                                poly_site_no_iupac = poly_site_no_iupac + 1
                                # print(column)
                                column_position.append(column)
                        else:
                            poly_site_number = poly_site_number + 1
                            poly_site_no_iupac = poly_site_no_iupac + 1
                            # print(column)
                            column_position.append(column)
                    # print("column_position: " + str(column_position))
                    # print(len(column_position))
                    ### if there are more than 4 polymorphic sites in 20 base pairs, select those sites positions.
                    if len(column_position) > float(Max_p_sites):
                        print(column_position)
                        total_wrong_poly_sites = total_wrong_poly_sites + column_position
                #print(total_wrong_poly_sites)
                ### generate the unique positions
                # Always drop the first and last 10 alignment columns too.
                total_wrong_poly_sites = total_wrong_poly_sites + list(range(10))
                total_wrong_poly_sites = total_wrong_poly_sites + list(range(total_length-10, total_length))
                ### extract the polymorphic sites from alignment data, might be useful for delete the first 2 species.
                unique_wrong_sites = list(np.unique(total_wrong_poly_sites))
                print(len(unique_wrong_sites))
                # sum2 = alignment[:, total_length:total_length + 1]
                # for i in unique_wrong_sites:
                #     sum2 = sum2 + alignment[:, i:i+1]
                # print(sum2)
                # SeqIO.write(sum2, "/Users/zhouwenbin/Downloads/result/M40_total.phy", "phylip")
                ### operating: if any window has more than 3 polymorphic sites, use trimal to remove those sites.
                ### otherwise, copy the gene to the new folder.
                if len(unique_wrong_sites) > 0:
                    # Format indices as trimal's "\{ a,b,c \}" -selectcols syntax.
                    print(str(unique_wrong_sites).replace(" ", "").replace("[", "\{ ").replace("]", " \}"))
                    cmd_selected_col = str(unique_wrong_sites).replace(" ", "").replace("[", "\{ ").replace("]", " \}")
                    cmd = "trimal -in " + fasta_name + " -out " + output_directory_file + " -selectcols " + cmd_selected_col
                    print(cmd)
                    os.system(cmd)
                else:
                    cmd_2 = "cp " + fasta_name + " " + output_directory_file
                    print(cmd_2)
                    os.system(cmd_2)
# Script fragment: column-wise then sequence-wise missing-data filtering
# of an alignment.  Relies on names defined outside this chunk (aln_r,
# aln_a, aln, delsites, maxpcGap_s, maxpcGap, minLen, filt1_aln, np).
# NOTE(review): this chunk is TRUNCATED — the final loop over filt2_aln
# has no body here; the remainder must exist outside this view.
for i in aln_r:  # for each column in alignment
    site = list(aln_a[:, i])
    nSeq = len(site)  # count seqs
    nGap_s = float(
        site.count('N') + site.count('n') + site.count('-'))  # count gaps and missing data
    pcGap_s = nGap_s / nSeq * 100
    if pcGap_s > maxpcGap_s:
        delsites.append(i)  # if proportion of seqs in the column with missing data is above threshold, delete column
# Drop all flagged columns at once (axis=1 = columns).
aln_a = np.delete(aln_a, delsites, 1)
c = 0
for current_seq in aln:
    # Rebuild each row of the filtered array as a string sequence.
    filt1_aln.add_sequence(current_seq.id, ''.join(map(str, list(aln_a[c, ]))))
    c += 1
length = filt1_aln.get_alignment_length(
)  # if length of alignment after column-wise filter is above threshold, continue
if length >= minLen:
    filt2_aln = MultipleSeqAlignment([])
    for current_seq in filt1_aln:  # for each sequence
        seq = str(current_seq.seq)
        nGap = float(seq.count('N') + seq.count('n') + seq.count('-'))
        pcGap = nGap / length * 100
        if pcGap < maxpcGap:  # if proportion of missing data in seq is below threshold, print to filtered alignment
            filt2_aln.add_sequence(current_seq.id, str(current_seq.seq))
    filt3_aln = MultipleSeqAlignment([])
    for current_seq in filt2_aln:  # for each sequence
# Script fragment: concatenate per-gene FASTA alignments across a fixed
# taxon list.  Relies on `args` (argparse namespace) and file-level
# imports (glob, AlignIO, MultipleSeqAlignment).
# NOTE(review): this chunk is TRUNCATED — the trailing `if seq == "X":`
# has no body here; the remainder must exist outside this view.  Also,
# `seq` is only reset once per fasta, so a taxon missing from one file
# may inherit the previous taxon's sequence — confirm upstream.
files = '{0}{1}'.format("*.", args.input_extension)
outfile = args.outfile
taxa = args.taxa
taxa = taxa.split(",")
numfiles = len(glob.glob(files))
numtaxa = len(taxa)
line1 = '{0} fasta files and {1} taxa found, alignments will be concatenated and written to {2}\n'.format(
    numfiles, numtaxa, outfile)
print(line1)
if numfiles > 0:
    cataln = MultipleSeqAlignment([])
    for taxon in taxa:
        cataln.add_sequence(taxon, "")  # make alignment with all required taxa
    for fasta in glob.glob(files):
        fastaname = fasta.split('/')[-1]  # get fasta name without path
        aln = AlignIO.read(fasta, "fasta")  # extract alignment from fasta
        seqLen = aln.get_alignment_length()
        newaln = MultipleSeqAlignment([])
        seq = "X"  # sentinel meaning "not found in this fasta"
        for catrec in cataln:  # for each taxon
            catid = str(catrec.id)
            for rec in aln:
                if str(
                        rec.id
                ) == catid:  # find sequence in fasta alignment if it's there
                    seq = str(rec.seq)
            if seq == "X":
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment

if __name__ == "__main__":
    # Read one sequence per non-blank line of multialign.txt and print
    # them together as a MultipleSeqAlignment.
    with open("multialign.txt", "r") as handle:
        sequences = [stripped
                     for stripped in (raw.strip() for raw in handle)
                     if stripped]
    align = MultipleSeqAlignment([])
    for index, sequence in enumerate(sequences, start=1):
        align.add_sequence(f"seq-{index}", sequence)
    print(align)