def test_properties(self):
    """Run the standard procedure with options assigned as properties."""
    cline = MSAProbsCommandline(msaprobs_exe)
    # Every option is set by attribute assignment after construction,
    # instead of being passed as a constructor keyword argument.
    cline.infile = "Registry/seqs.fasta"
    cline.outfile = "temp_test.aln"
    cline.clustalw = True
    self.standard_test_procedure(cline)
def test_output_filename_with_spaces(self):
    """Run the standard procedure with spaces in the output filename."""
    options = {
        "infile": "Registry/seqs.fasta",
        "outfile": "temp with spaces.aln",
        "clustalw": True,
    }
    cline = MSAProbsCommandline(msaprobs_exe, **options)
    self.standard_test_procedure(cline)
def test_simple_fasta(self):
    """Run the standard procedure on a simple FASTA input file."""
    self.standard_test_procedure(
        MSAProbsCommandline(
            msaprobs_exe,
            infile="Registry/seqs.fasta",
            outfile="temp_test.aln",
            clustalw=True,
        )
    )
def test_invalid_format(self):
    """Expect return code 1 for an input file in an invalid format."""
    bad_input = "Medline/pubmed_result1.txt"
    # Guard against a missing fixture being mistaken for a format failure.
    self.assertTrue(os.path.isfile(bad_input))
    cline = MSAProbsCommandline(msaprobs_exe, infile=bad_input)
    try:
        stdout, stderr = cline()
    except ApplicationError as err:
        self.assertEqual(err.returncode, 1)
        return
    # Reaching this point means the tool exited successfully.
    self.fail("Should have failed, returned:\n%s\n%s" % (stdout, stderr))
def test_single_sequence(self):
    """Test an input file containing a single sequence.

    The tool is expected to fail on such input; the test checks the
    return code carried by the resulting ApplicationError.
    """
    # Imported locally so this method does not depend on a module-level
    # ``import sys`` being present.
    import sys

    input_file = "Fasta/f001"
    self.assertTrue(os.path.isfile(input_file))
    # Make sure the fixture really holds exactly one record.
    self.assertEqual(len(list(SeqIO.parse(input_file, "fasta"))), 1)
    cline = MSAProbsCommandline(msaprobs_exe, infile=input_file)
    try:
        stdout, stderr = cline()
    except ApplicationError as err:
        # The failure code is platform specific: 139 is the usual shell
        # encoding of SIGSEGV (128 + 11), while Windows reports the
        # access-violation status 0xC0000005.
        if sys.platform == "win32":
            expected = 0xC0000005
        else:
            expected = 139  # TODO: Check return codes on various other platforms
        self.assertEqual(expected, err.returncode)
    else:
        self.fail("Should have failed, returned:\n%s\n%s" % (stdout, stderr))
def test_input_filename_with_space(self):
    """Run the standard procedure with a space in the input filename."""
    spaced_input = "Clustalw/temp horses.fasta"
    # Create the input on the fly by rewriting a PHYLIP fixture as FASTA.
    with open(spaced_input, "w") as handle:
        records = SeqIO.parse("Phylip/hennigian.phy", "phylip")
        SeqIO.write(records, handle, "fasta")
    cline = MSAProbsCommandline(
        msaprobs_exe,
        infile=spaced_input,
        outfile="temp_test.aln",
        clustalw=True,
    )
    # Register the generated input so it is cleaned up afterwards.
    self.add_file_to_clean(spaced_input)
    self.standard_test_procedure(cline)
def test_empty_file(self):
    """Expect an ApplicationError when the input file does not exist."""
    missing_input = "does_not_exist.fasta"
    self.assertFalse(os.path.isfile(missing_input))
    cline = MSAProbsCommandline(msaprobs_exe, infile=missing_input)
    try:
        stdout, stderr = cline()
    except ApplicationError as err:
        message = str(err)
        # Any one of these fragments in the error text is accepted.
        accepted_fragments = (
            "Cannot open sequence file",
            "Cannot open input file",
            "Non-zero return code ",
        )
        self.assertTrue(
            any(fragment in message for fragment in accepted_fragments),
            message,
        )
    else:
        self.fail("Should have failed, returned:\n%s\n%s" % (stdout, stderr))
def alignfunc(self, f_in, f_out, c=5, ir=500, **kwargs): """ Create multiple sequence alignment from unaligned sequences :param f_in: The file of unaligned sequence. :param f_out: The desired output filename. :param ir: Specifies the -ir flag to msaprobs :param c: Specifies the -c flag to msaprobs .. note:: This function requires `msaprobs <http://msaprobs.sourceforge.net/homepage.htm#latest>`_. """ annotfile = '%s.annot' % f_out try: print "Attempting to align using MSAProbsCommandline." from Bio.Align.Applications import MSAProbsCommandline cline = MSAProbsCommandline(infile=f_in, outfile=f_out, annot=annotfile, **kwargs) cline() except ImportError as e1: print e1 print "Trying another way..." callstr = 'msaprobs -annot %s -c %s -ir %s' % (annotfile, c, ir) tf = tempfile.NamedTemporaryFile(delete=False) s, o = getstatusoutput("cat %s | sed 's/\*/X/g' > %s" % (f_in, tf.name)) if s == 0: s, o = getstatusoutput('%s %s > %s' % (callstr, tf.name, f_out)) try: os.remove(tf.name) except Exception as e: print "Error removing temporary file" print e if s != 0: print "ERROR in aligning sequence" print o raise OSError
def test_single_sequence(self):
    """Expect a platform-specific failure code for single-sequence input."""
    single_record_file = "Fasta/f001"
    self.assertTrue(os.path.isfile(single_record_file))
    # Confirm the fixture holds exactly one FASTA record.
    record_count = sum(1 for _ in SeqIO.parse(single_record_file, "fasta"))
    self.assertEqual(record_count, 1)
    cline = MSAProbsCommandline(msaprobs_exe, infile=single_record_file)
    try:
        stdout, stderr = cline()
    except ApplicationError as err:
        # TODO: Check return codes on various other platforms
        expected = 0xC0000005 if sys.platform == "win32" else 139
        self.assertEqual(expected, err.returncode)
    else:
        self.fail("Should have failed, returned:\n%s\n%s" % (stdout, stderr))
def run_multiple_sequence_alignment(records, workdir, msa):
    """
    Run a multiple sequence alignment with the chosen program.

    The user can choose between emma, clustalw (old and busted), clustal
    omega (recommended for proteins and also uses HMM), MUSCLE or MAFFT
    (recommended for nucleotide data, and MUSCLE should be pretty fast),
    T-Coffee (good for distantly related sequences), PRANK, MSAProbs,
    ProbCons and Dialign.

    The records are written to ``<workdir>/msa.fasta`` and the alignment is
    requested as ``<workdir>/msa.aln``; a guide tree is requested as
    ``<workdir>/msa.dnd`` for the programs whose call takes a tree file.
    For MAFFT and ProbCons the alignment is taken from the program's
    standard output and written to ``msa.aln`` here.

    :param records: Sequence records, written with ``SeqIO.write`` as FASTA.
    :param workdir: Directory in which the input and output files are placed.
    :param msa: Algorithm name, matched case-insensitively.
    :raises ValueError: If ``msa`` does not name a supported algorithm.  This
        is checked before any file is written.

    FUTURE: Add more iterative methods to improve runtime? Add HMMER?
    HHpred is also quite fast
    """
    # Normalise once instead of calling msa.lower() in every comparison.
    algorithm = msa.lower()

    sequence_list_file = os.path.join(workdir, "msa.fasta")
    outfile = os.path.join(workdir, "msa.aln")
    treefile = os.path.join(workdir, "msa.dnd")

    builders = _msa_command_builders(sequence_list_file, outfile, treefile)
    if algorithm not in builders:
        # Fail before touching the filesystem, and list every accepted name.
        raise ValueError(
            "Unsupported multiple sequence alignment algorithm %r; "
            "supported algorithms are: %s"
            % (msa, ", ".join(sorted(builders)))
        )

    # Write the sequences the aligner will read.
    SeqIO.write(records, sequence_list_file, "fasta")

    label, build_command = builders[algorithm]
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Aligning by %s" % label)
    cmd = build_command()

    # Execute the command.
    stdout, _stderr = cmd()

    # For algorithms that don't have an option to save output to file,
    # capture the stdout.
    if algorithm in ("mafft", "probcons"):
        with open(outfile, "w") as handle:
            handle.write(stdout)


def _msa_command_builders(sequence_list_file, outfile, treefile):
    """Map each accepted algorithm name to a ``(label, builder)`` pair.

    ``label`` is the text printed after "Aligning by ".  ``builder`` is a
    zero-argument callable returning the command-line wrapper, so nothing is
    instantiated until an algorithm has actually been selected.
    """
    clustal_omega = (
        "Clustal Omega",
        lambda: ClustalOmegaCommandline(
            infile=sequence_list_file, outfile=outfile, verbose=True,
            auto=True, guidetree_out=treefile, outfmt="clu", force=True),
    )
    # Should output tree file automatically.  The label spelling is kept
    # exactly as the original progress message emitted it.
    t_coffee = (
        "T-Coffeee",
        lambda: TCoffeeCommandline(
            infile=sequence_list_file, output="clustalw", outfile=outfile),
    )
    clustalw = (
        "ClustalW2",
        lambda: ClustalwCommandline(
            "clustalw", infile=sequence_list_file, outfile=outfile,
            tree=True, newtree=treefile),
    )
    return {
        # Output is fasta.
        "emma": (
            "emma",
            lambda: EmmaCommandline(
                sequence=sequence_list_file, outseq=outfile,
                dendoutfile=treefile),
        ),
        "clustalo": clustal_omega,
        "clustal_omega": clustal_omega,
        "clustal-omega": clustal_omega,
        "t-coffee": t_coffee,
        "t_coffee": t_coffee,
        # Alternative kept from the original code:
        # MuscleCommandline(input=sequence_list_file, out=outfile,
        #                   tree2=treefile, clw=True)
        "muscle": (
            "MUSCLE",
            lambda: MuscleCommandline(
                input=sequence_list_file, out=outfile, tree2=treefile),
        ),
        # Probably gonna save tree as input.tree.
        "mafft": (
            "MAFFT",
            lambda: MafftCommandline(
                input=sequence_list_file, clustalout=True, treeout=True),
        ),
        "clustalw": clustalw,
        "clustalw2": clustalw,
        # Output is fasta, tree will be outputted to .dnd file?
        "prank": (
            "PRANK",
            lambda: PrankCommandline(
                d=sequence_list_file, o=outfile, f=8, showtree=True,
                noxml=True),
        ),
        # Doesn't use a guide tree.
        "msaprobs": (
            "MSAprobs",
            lambda: MSAProbsCommandline(
                infile=sequence_list_file, outfile=outfile, clustalw=True),
        ),
        "probcons": (
            "ProbCons",
            lambda: ProbconsCommandline(
                input=sequence_list_file, clustalw=True),
        ),
        # Phylip tree should be created automatically, names are a mystery?
        "dialign": (
            "Dialign",
            lambda: DialignCommandline(
                input=sequence_list_file, cw=True, fn=outfile),
        ),
    }