def run_pal2nal(fname_aln, fname_nuc, fname_prot):
    """
    Generate a codon alignment via PAL2NAL.

    @param fname_aln: MSA of protein sequences in CLUSTAL format (.aln)
    @param fname_nuc: Nucleotide sequences in FASTA format (.fasta)
    @param fname_prot: Protein sequences in FASTA format (.fasta); must list
        the proteins in the same order as the nucleotides in fname_nuc, since
        the two files are paired record-by-record
    @return: Name of the codon alignment file ("homologs.codon.aln"), written
        by pal2nal.pl with "-output paml"
    """
    sys.stderr.write("\nSTEP: run_pal2nal(%s, %s)\n" % (fname_aln, fname_nuc))

    # Reorder fname_nuc according to the order of the proteins in fname_aln,
    # which was reordered due to CLUSTALW2.  Note that the first protein in
    # each of these files remains the same as at the start, however; this
    # first protein is our original query protein.
    nuc_records = list(SeqIO.parse(fname_nuc, "fasta"))
    prot_records = list(SeqIO.parse(fname_prot, "fasta"))
    # Map each protein id to the nucleotide record at the same position.
    records_map = dict((pr.id, nr) for pr, nr in zip(prot_records, nuc_records))

    fname_nuc2 = "homologs_ordered.dna.fasta"
    with open(fname_nuc2, "w") as f:
        for record in SeqIO.parse(fname_aln, "clustal"):
            SeqIO.write(records_map[record.id], f, "fasta")

    fname_codon = "homologs.codon.aln"
    # Pass the arguments as a list (no shell) so that file names containing
    # spaces or shell metacharacters are handed to pal2nal.pl unchanged, and
    # capture its stdout directly into the output file instead of using a
    # shell redirect.
    cmd = ["%s/pal2nal.pl" % bin_dir(), fname_aln, fname_nuc2, "-output", "paml"]
    with open(fname_codon, "w") as f_codon:
        status = subprocess.call(cmd, stdout=f_codon)
    if status != 0:
        # Report the failure rather than ignoring it; the file name is still
        # returned so existing callers see the same return value as before.
        sys.stderr.write("\tpal2nal.pl exited with status %d\n" % status)

    return fname_codon
def run_clustalw2(fname_prot):
    """
    Generate a MSA of the amino acids (in fasta format) via clustalw.

    Side effects: writes the alignment to "homologs.aa.aln" and clustalw2's
    stderr to "clustalw2.log" in the current directory, and deletes the
    ".dnd" file that sits next to fname_prot after the run.

    @param fname_prot: Protein sequences in FASTA format (.fasta)
    @return: MSA of protein sequences in CLUSTAL format (.aln)
    """
    sys.stderr.write("\nSTEP: run_clustalw2(%s)\n" % fname_prot)

    fname_aln = "homologs.aa.aln"
    fname_log = "clustalw2.log"

    sys.stderr.write("\tRunning clustalw2, please be patient (may take minutes)...\n")
    # Argument list instead of a shell string: absolute paths frequently
    # contain spaces, which would split the -INFILE/-OUTFILE options in a shell.
    cmd = [
        "%s/clustalw2" % bin_dir(),
        "-INFILE=%s" % os.path.abspath(fname_prot),
        "-OUTFILE=%s" % os.path.abspath(fname_aln),
    ]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _, stderr_data = proc.communicate()

    # communicate() yields bytes on Python 3, so the log is written in binary
    # mode; this also works unchanged on Python 2.
    with open(fname_log, "wb") as f_log:
        f_log.write(stderr_data)

    log_path = os.path.abspath(fname_log)
    if proc.returncode == 0:
        sys.stderr.write("\tclustalw2 run successful, log available at %s\n" % log_path)
    else:
        sys.stderr.write(
            "\tclustalw2 exited with status %d, log available at %s\n"
            % (proc.returncode, log_path)
        )

    # Remove the extra file: same path as the input with its last extension
    # (if the base name has one) replaced by ".dnd".
    if "." in os.path.basename(fname_prot):
        fname_dnd = fname_prot.rsplit(".", 1)[0] + ".dnd"
    else:
        fname_dnd = fname_prot + ".dnd"
    os.remove(fname_dnd)

    return fname_aln
def run_codeml(fname_ctl):
    """
    Run the codeml executable found in bin_dir() on a control file.

    The command is executed through the shell; whatever codeml prints goes to
    the inherited stdout/stderr, and the exit status is discarded.

    @param fname_ctl: Control file passed to codeml as its only argument
    @return: None
    """
    codeml_command = "%s/codeml %s" % (bin_dir(), fname_ctl)
    os.system(codeml_command)
def run_phyml(fname_aln, n_bootstrap):
    """
    Generate a phylogenetic tree via PHYML.

    Side effects: converts the alignment to "homologs.aa.phy" and writes
    phyml's stderr to "phyml.log", both in the current directory.

    @param fname_aln: MSA of protein sequences in CLUSTAL format (.aln)
    @param n_bootstrap: Number of bootstrap replicates; "-b" is only passed
        to phyml when this is greater than 1
    @return: (tree_file, bootstrap_file) = File of phylo tree with clade
        confidences (_tree.txt), file of bootstrapped phylo trees
        (_boot_trees.txt); bootstrap_file is None when n_bootstrap <= 1
    """
    sys.stderr.write("\nSTEP: run_phyml(%s, %s)\n" % (fname_aln, n_bootstrap))

    fname_phy = "homologs.aa.phy"
    # Hand file names to Biopython and let it open them: the "rU" open mode
    # used previously was removed in Python 3.11.
    SeqIO.convert(fname_aln, "clustal", fname_phy, "phylip-relaxed")

    current_dir = os.getcwd()
    fname_tree = fname_phy + "_phyml_tree.txt"

    # Argument list instead of a shell string, so paths with spaces survive.
    cmd = ["%s/phyml" % bin_dir(), "-i", os.path.abspath(fname_phy), "-d", "aa"]
    if n_bootstrap > 1:
        cmd.extend(["-b", "%d" % n_bootstrap])
        fname_boot_trees = fname_phy + "_phyml_boot_trees.txt"
    else:
        fname_boot_trees = None
    # NOTE(review): the original command line ends with the current working
    # directory as a bare argument; kept as-is -- confirm phyml needs it.
    cmd.append(current_dir)

    fname_log = "phyml.log"
    sys.stderr.write("\tRunning phyml, please be patient (may take minutes)...\n")
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _, stderr_data = proc.communicate()

    # communicate() yields bytes on Python 3, so the log is written in binary
    # mode; this also works unchanged on Python 2.
    with open(fname_log, "wb") as f_log:
        f_log.write(stderr_data)

    log_path = os.path.abspath(fname_log)
    if proc.returncode == 0:
        sys.stderr.write("\tphyml run successful, log available at %s\n" % log_path)
    else:
        sys.stderr.write(
            "\tphyml exited with status %d, log available at %s\n"
            % (proc.returncode, log_path)
        )

    return fname_tree, fname_boot_trees