def test_no_clustering(self):
    '''makedb with --clustering_divergence 0 must not merge similar OTUs.

    Two L11 OTU sequences that differ only in their last base are loaded
    into a database, and the resulting smafadb is queried directly. Both
    sequences are expected to come back as separate hits.
    '''
    l11_marker = 'ribosomal_protein_L11_rplK_gpkg'
    seq_ending_c = 'GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC'
    # Same as seq_ending_c apart from the final base.
    seq_ending_a = 'GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATA'
    # Same as seq_ending_c apart from the first base.
    query_seq = 'AGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC'

    rows = [
        self.headers,
        [l11_marker, 'minimal', seq_ending_c, '7', '4.95',
         'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
        [l11_marker, 'minimal', seq_ending_a, '6', '4.95',
         'Root; k__Bacteria; p__Firmicutes; c__Bacilli'],
        ['ribosomal_protein_S17_gpkg', 'minimal',
         'GCTAAATTAGGAGACATTGTTAAAATTCAAGAAACTCGTCCTTTATCAGCAACAAAACGT',
         '9', '4.95',
         'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus'],
    ]
    otu_table_text = "\n".join(["\t".join(row) for row in rows])

    expected_output = (
        'seq1\t' + query_seq + '\t' + seq_ending_a + '\t2\t60\n' +
        'seq1\t' + query_seq + '\t' + seq_ending_c + '\t1\t60\n')

    with tempfile.NamedTemporaryFile() as otu_file:
        otu_file.write(otu_table_text)
        otu_file.flush()

        with tempdir.TempDir() as workdir:
            makedb_cmd = "{} makedb --db_path {}/db --otu_table {} --clustering_divergence 0".format(
                path_to_script, workdir, otu_file.name)
            extern.run(makedb_cmd)

            with tempfile.NamedTemporaryFile() as query_file:
                query_file.write(">seq1\n")
                query_file.write(query_seq + "\n")
                query_file.flush()

                # Querying the smafadb directly should show no clustering
                smafadb_path = os.path.join(
                    workdir, 'db', l11_marker + '.smafadb')
                query_cmd = "smafa query {} {}".format(
                    smafadb_path, query_file.name)
                self.assertEqual(extern.run(query_cmd), expected_output)
def test_hello_world(self):
    '''Run DecoyFilter over two freshly built diamond databases.

    One database is built from self.eg1 alone, the other from self.eg1
    plus self.eg2. The filter is run on the eg1 file and is expected to
    return True and write exactly one sequence, named PROKKA_03952.
    '''
    # Paths of the diamond databases created below. They are removed in
    # the finally clause so a failing assertion does not leave them behind.
    dmnd_paths = []
    try:
        with tempfile.NamedTemporaryFile(prefix='graftm_decoy_test') as f1:
            with tempfile.NamedTemporaryFile(prefix='graftm_decoy_test') as f2:
                f1.write(self.eg1)
                f1.flush()
                dmnd_paths.append(f1.name + ".dmnd")
                extern.run("diamond makedb --in %s --db %s.dmnd" %
                           (f1.name, f1.name))

                f2.write(self.eg1)
                f2.write(self.eg2)
                f2.flush()
                dmnd_paths.append(f2.name + ".dmnd")
                extern.run("diamond makedb --in %s --db %s.dmnd" %
                           (f2.name, f2.name))

                with tempfile.NamedTemporaryFile(prefix='graftm_decoy_test') as f3:
                    with tempfile.NamedTemporaryFile(prefix='graftm_decoy_test') as f4:
                        # NOTE(review): f3 is populated but f1.name is what
                        # gets passed to filter() - confirm this is intended.
                        f3.write(self.eg1)
                        f3.flush()
                        ret = DecoyFilter(
                            Diamond(f2.name + ".dmnd"),
                            Diamond(f1.name + ".dmnd")).filter(f1.name, f4.name)
                        self.assertEqual(True, ret)
                        seqs = SequenceIO().read_fasta_file(f4.name)
                        self.assertEqual(1, len(seqs))
                        self.assertEqual("PROKKA_03952", seqs[0].name)
    finally:
        # clean up
        for dmnd_path in dmnd_paths:
            if os.path.exists(dmnd_path):
                os.remove(dmnd_path)
def run(self, input_sequence_file, input_sequence_type, daa_file_basename=None):
    '''Run input sequences in either blastp or blastx mode against the
    database specified in __init__.

    Parameters
    ----------
    input_sequence_file: str
        path to query sequences
    input_sequence_type: either 'nucleotide' or 'protein'
        the input_sequences are this kind of sequence
    daa_file_basename: str or None
        basename (without the '.daa' extension) of the diamond output
        file. When None, a temporary name is used and the resulting
        '.daa' file is deleted before returning.

    Returns
    -------
    DiamondSearchResult

    Raises
    ------
    Exception
        if input_sequence_type is neither of the recognised types.
    '''
    cmd_list = ["diamond"]
    if input_sequence_type == UnpackRawReads.PROTEIN_SEQUENCE_TYPE:
        cmd_list.append('blastp')
    elif input_sequence_type == UnpackRawReads.NUCLEOTIDE_SEQUENCE_TYPE:
        cmd_list.append('blastx')
    else:
        raise Exception("Programming error")

    basename = daa_file_basename
    if basename is None:
        with tempfile.NamedTemporaryFile(prefix='graftm_diamond') as t:
            # we are just stealing the name, don't need the file itself
            basename = t.name

    cmd_list.extend(['-k 1',
                     "-d", self._database,
                     "-q", "%s" % input_sequence_file,
                     "-a", basename])
    if self._threads:
        cmd_list.append("--threads")
        cmd_list.append(str(self._threads))
    if self._evalue:
        cmd_list.append("--evalue")
        cmd_list.append(str(self._evalue))

    extern.run(' '.join(cmd_list))

    daa_name = "%s.daa" % basename
    try:
        return DiamondSearchResult.import_from_daa_file(daa_name)
    finally:
        if daa_file_basename is None:
            # Diamond makes an extra file, need to remove this - even when
            # parsing the result fails, so temporary output never piles up.
            os.remove(daa_name)
def write_krona_plot(self, sample_names, read_taxonomies, output_krona_filename):
    '''Creates krona plot at the given location. Assumes the krona
    executable ktImportText is available on the shell PATH.

    Parameters
    ----------
    sample_names: list of str
        one name per sample; used as the temporary file suffix and as
        the label handed to ktImportText
    read_taxonomies:
        passed through to self._iterate_otu_table_rows, which yields
        (_, taxonomy, counts) with counts in sample_names order
    output_krona_filename: str
        path handed to ktImportText -o
    '''
    tempfiles = []
    try:
        for n in sample_names:
            tempfiles.append(
                tempfile.NamedTemporaryFile(prefix='GraftMkronaInput', suffix=n))

        delim = u'\t'
        for _, tax, counts in self._iterate_otu_table_rows(read_taxonomies):
            for i, c in enumerate(counts):
                # Only taxa actually observed in a sample are written to it.
                if c != 0:
                    tempfiles[i].write(
                        delim.join((str(c), delim.join(tax))) + "\n")

        for t in tempfiles:
            t.flush()

        cmd = ["ktImportText", '-o', output_krona_filename]
        for i, tmp in enumerate(tempfiles):
            cmd.append(','.join([tmp.name, sample_names[i]]))

        # run the actual krona
        extern.run(' '.join(cmd))
    finally:
        # close (and thereby delete) the tempfiles, also when krona fails
        for t in tempfiles:
            t.close()
def test_hello_world(self):
    '''`update` adds one extra sequence and its taxonomy to an existing package.

    Runs the update subcommand on mcrA.10seqs.gpkg inside a scratch
    directory, then checks the new package has exactly one more taxonomy
    entry and one more unaligned sequence than the original.
    '''
    with tempdir.in_tempdir():
        with tempfile.NamedTemporaryFile() as fasta:
            with tempfile.NamedTemporaryFile() as tax:
                fasta.write(Tests.extra_mcra_fasta)
                fasta.flush()
                tax.write(Tests.extra_mcra_taxonomy)
                tax.flush()

                original_path = os.path.join(path_to_data, 'mcrA.10seqs.gpkg')
                updated_path = 'updated.gpkg'
                update_cmd = "%s update --graftm_package %s --sequences %s --taxonomy %s --output %s" % (
                    path_to_script, original_path, fasta.name, tax.name,
                    updated_path)
                extern.run(update_cmd)

                original = GraftMPackage.acquire(original_path)
                updated = GraftMPackage.acquire(updated_path)

                # Taxonomy: one new entry, existing entries untouched.
                original_tax = original.taxonomy_hash()
                updated_tax = updated.taxonomy_hash()
                self.assertEqual(len(original_tax) + 1, len(updated_tax))
                self.assertEqual(
                    ['mcrA', 'Euryarchaeota_mcrA', 'Methanofastidiosa'],
                    updated_tax['KYC55281.1'])
                self.assertEqual(original_tax['638165755'],
                                 updated_tax['638165755'])

                # Sequence database: exactly one more sequence.
                reader = SequenceIO()
                num_original = len(reader.read_fasta_file(
                    original.unaligned_sequence_database_path()))
                num_updated = len(reader.read_fasta_file(
                    updated.unaligned_sequence_database_path()))
                self.assertEqual(num_original + 1, num_updated)
def test_query_with_otu_table_two_samples_same_sequence(self):
    '''Query a database with the same OTU table it was built from.

    The table holds one sequence present in two samples ('maximal' and
    'minimal'). Each query row is expected to hit both database rows
    with divergence 0.
    '''
    marker = 'ribosomal_protein_L11_rplK_gpkg'
    seq = 'CGTCGTTGGAACCCAAAAATGAAATAATATATCTTCACTGAGAGAAATGGTATTTATATA'
    # (sample name, taxonomy) of the two rows; both carry the same sequence.
    samples = (
        ('maximal', 'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'),
        ('minimal', 'Root; k__Bacteria; p__Firmicutes; c__Bacilli'),
    )

    table_rows = [self.headers] + [
        [marker, sample, seq, '7', '4.95', taxonomy]
        for sample, taxonomy in samples]

    # Every query sample paired with every database sample, in order.
    expected = [['query_name', 'query_sequence', 'divergence', 'num_hits',
                 'sample', 'marker', 'hit_sequence', 'taxonomy']] + [
        [query_sample + ';' + marker, seq, '0', '7',
         hit_sample, marker, seq, hit_taxonomy]
        for query_sample, _ in samples
        for hit_sample, hit_taxonomy in samples]

    with tempfile.NamedTemporaryFile() as f:
        f.write("\n".join(["\t".join(row) for row in table_rows]))
        f.flush()

        with tempdir.TempDir() as d:
            makedb_cmd = "{} makedb --db {}/sdb --otu_table {}".format(
                path_to_script, d, f.name)
            extern.run(makedb_cmd)

            query_cmd = "{} query --query_otu_table {} --db {}/sdb".format(
                path_to_script, f.name, d)
            observed = subprocess.check_output(query_cmd, shell=True)
            self.assertEqualOtuTable(expected, observed)
def test_alignment_rereplication(self): gpkg = os.path.join(path_to_data,'61_otus.gpkg') test_sequences=""">FCC0WM1ACXX:2:2208:12709:74426#GTCCAGAA_2/1 ACACTGCCCAGACACCTACGGGTGGCTGCAGTCGAGGATCTTCGGCAATGGGCGAAAGCCTGACCGAGCGACGCCGCGTGTGGGATGAAGGCCCTCGGGT >FCC0WM1ACXX:2:2208:12709:74426#GTCCAGAA/1 ACACTGCCCAGACACCTACGGGTGGCTGCAGTCGAGGATCTTCGGCAATGGGCGAAAGCCTGACCGAGCGACGCCGCGTGTGGGATGAAGGCCCTCGGGT >FCC0WM1ACXX:2:2208:12709:74426#GTCCAGAA/2 CGGGGTATCTAATCCCGTTCGCTCCCCTAGCTTTCGTGCCTCAGCGTCAGAAAAGACCCAGTGAGCCGCTTTCGCCCCCGGTGTTCCTTAGGATATCAAC """ expected_rereplicated_alignment=""">FCC0WM1ACXX:2:2208:12709:74426#GTCCAGAA/2 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------TTGATATCCTAAGGAACACCGGGGGCGAAAGCGGCTCACTGGGTCTTCTGACGCTGAGGCACGAAAGCTAGGGGAGCGAACGGGATTAGATACCCC---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
>FCC0WM1ACXX:2:2208:12709:74426#GTCCAGAA_2/1 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ACACTGCCCAGACACCTACGGGTGGCTGCAGTCGAGGATCTTCGGCAATGGGCGAAAGCCTGACCGAGCGACGCCGCGTGTGGGATGAAGGCCCTCGGG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- >FCC0WM1ACXX:2:2208:12709:74426#GTCCAGAA/1 
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ACACTGCCCAGACACCTACGGGTGGCTGCAGTCGAGGATCTTCGGCAATGGGCGAAAGCCTGACCGAGCGACGCCGCGTGTGGGATGAAGGCCCTCGGG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------""".split() with tempfile.NamedTemporaryFile(suffix=".fa") as tf: tf.write(test_sequences) tf.flush() with tempdir.TempDir() as tmp: cmd = "%s graft --forward %s --graftm_package %s --output_directory %s --force" % (path_to_script, tf.name, gpkg, tmp) extern.run(cmd) filename=os.path.splitext(os.path.basename(tf.name))[0] observed_rereplicated_alignment = [x.strip() for x in open(os.path.join(tmp, filename, "%s_hits.aln.fa" % filename))] self.assertEquals(expected_rereplicated_alignment, observed_rereplicated_alignment)
def _align_sequences(self, input_sequences_path, output_alignment_path, threads):
    '''Align the sequences in a FASTA file with mafft.

    Parameters
    ----------
    input_sequences_path: str
        path to input sequences in fasta format
    output_alignment_path: str
        path the mafft output is redirected to
    threads: str
        number of threads to use

    Returns
    -------
    Nothing
    '''
    logging.debug("Aligning sequences using mafft")
    mafft_cmd = "mafft --anysymbol --thread %s --auto /dev/stdin > %s" % (
        threads, output_alignment_path)

    fasta_lines = []
    with open(input_sequences_path) as handle:
        for name, seq, _ in SequenceIO().each(handle):
            # Do not include * characters in the HMM, as this means tree
            # insertion fails.
            fasta_lines.extend(('>%s' % name, seq.replace('*', '')))

    # The sequences are fed to mafft on stdin rather than via a file.
    extern.run(mafft_cmd, stdin="\n".join(fasta_lines))
def run(self, input_sequence_file, input_sequence_type, daa_file_basename=None, extra_args=''):
    '''Run input sequences in either blastp or blastx mode against the
    database specified in __init__.

    Parameters
    ----------
    input_sequence_file: str
        path to query sequences
    input_sequence_type: either 'nucleotide' or 'protein'
        the input_sequences are this kind of sequence
    daa_file_basename: str or None
        basename (without the '.daa' extension) of the diamond output
        file. When None, a temporary name is used and the resulting
        '.daa' file is deleted before returning.
    extra_args: str
        additional text inserted verbatim into the diamond command line

    Returns
    -------
    DiamondSearchResult

    Raises
    ------
    Exception
        if input_sequence_type is neither of the recognised types.
    '''
    cmd_list = ["diamond"]
    if input_sequence_type == UnpackRawReads.PROTEIN_SEQUENCE_TYPE:
        cmd_list.append('blastp')
    elif input_sequence_type == UnpackRawReads.NUCLEOTIDE_SEQUENCE_TYPE:
        cmd_list.append('blastx')
    else:
        raise Exception("Programming error")

    basename = daa_file_basename
    if basename is None:
        with tempfile.NamedTemporaryFile(prefix='graftm_diamond') as t:
            # we are just stealing the name, don't need the file itself
            basename = t.name

    cmd_list.extend(['-k 1',
                     "-d", self._database,
                     "-q", "%s" % input_sequence_file,
                     "-a", basename])
    if extra_args:
        # Skip the empty default so the command has no stray blank token.
        cmd_list.append(extra_args)
    if self._threads:
        cmd_list.append("--threads")
        cmd_list.append(str(self._threads))
    if self._evalue:
        cmd_list.append("--evalue")
        cmd_list.append(str(self._evalue))

    extern.run(' '.join(cmd_list))

    daa_name = "%s.daa" % basename
    try:
        return DiamondSearchResult.import_from_daa_file(daa_name)
    finally:
        if daa_file_basename is None:
            # Diamond makes an extra file, need to remove this - even when
            # parsing the result fails, so temporary output never piles up.
            os.remove(daa_name)
def summarise(**kwargs):
    '''Summarise an OTU table as a krona plot.

    Keyword arguments (both required, no others accepted)
    ------------------------------------------------------
    krona_output: str
        path handed to ktImportText -o
    table_collection:
        OTU tables, collapsed with
        Summariser._collapse_otu_table_into_gene_to_sample_to_taxonomy_to_count

    Raises
    ------
    KeyError
        if a required keyword argument is missing
    Exception
        if any unexpected keyword argument is given
    '''
    krona_output_file = kwargs.pop('krona_output')
    table_collection = kwargs.pop('table_collection')
    if kwargs:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    # prep the array
    gene_to_sample_to_taxonomy_to_count = \
        Summariser._collapse_otu_table_into_gene_to_sample_to_taxonomy_to_count(
            table_collection)

    # write the output krona files
    logging.info("Writing krona %s" % krona_output_file)
    cmd = 'ktImportText -o %s' % krona_output_file

    # Re-key the counts by sample first so that the plot is ordered by
    # sample and then by gene.
    sample_to_gene_to_taxonomy_to_count = {}
    all_sample_names = set()
    all_gene_names = set()
    for gene, sample_to_taxonomy_to_count in gene_to_sample_to_taxonomy_to_count.items():
        all_gene_names.add(gene)
        for sample, taxonomy_to_count in sample_to_taxonomy_to_count.items():
            all_sample_names.add(sample)
            if sample not in sample_to_gene_to_taxonomy_to_count:
                sample_to_gene_to_taxonomy_to_count[sample] = {}
            sample_to_gene_to_taxonomy_to_count[sample][gene] = taxonomy_to_count

    is_more_than_one_sample = len(sample_to_gene_to_taxonomy_to_count) > 1

    sample_tempfiles = []
    try:
        for sample in sorted(all_sample_names):
            for gene in sorted(all_gene_names):
                if gene in sample_to_gene_to_taxonomy_to_count[sample]:
                    f = tempfile.NamedTemporaryFile(
                        prefix='singlem_for_krona', mode='w')
                    sample_tempfiles.append(f)

                    taxonomy_to_count = \
                        sample_to_gene_to_taxonomy_to_count[sample][gene]
                    for taxonomy, coverage in taxonomy_to_count.items():
                        tax_split = taxonomy.split('; ')
                        # Drop a leading 'Root' unless it is the only level.
                        if tax_split[0] == 'Root' and len(tax_split) > 1:
                            tax_split = tax_split[1:]
                        f.write('\t'.join([str(coverage)] + tax_split))
                        f.write('\n')
                    f.flush()

                    if is_more_than_one_sample:
                        display_name = '%s: %s' % (sample, gene)
                    else:
                        display_name = gene
                    cmd += " %s,'%s'" % (f.name, display_name)
        extern.run(cmd)
    finally:
        # Closing deletes the temporary files, also when krona fails.
        for f in sample_tempfiles:
            f.close()
def test_bootstrap_executable(self):
    '''expand_search writes a HMM file with the expected header and NSEQ line.'''
    with tempfile.NamedTemporaryFile() as tf:
        cmd = '%s expand_search --verbosity 5 --contigs %s --output_hmm %s --search_hmm_files %s' % (
            path_to_script,
            os.path.join(path_to_data, 'bootstrapper', 'contigs.fna'),
            tf.name,
            os.path.join(path_to_data, 'bootstrapper', 'DNGNGWU00001.hmm'))
        extern.run(cmd)

        self.assertEqual(
            "HMMER3/f [3.1b2 | February 2015]\n",
            subprocess.check_output("head -n1 %s" % tf.name, shell=True))

        # Read through a context manager so the handle is closed promptly.
        with open(tf.name) as hmm_file:
            hmm_lines = hmm_file.readlines()
        # NOTE(review): confirm the spacing inside this literal matches the
        # HMM file format; it may have been altered by reformatting.
        self.assertEqual('NSEQ 2\n', hmm_lines[10])
def test_no_clustering(self):
    '''With --clustering_divergence 0, near-identical OTUs stay separate.

    The OTU table contains two L11 sequences that differ only in the
    last base. After makedb, querying the L11 smafadb directly must
    return both of them as individual hits.
    '''
    table_rows = [
        self.headers,
        ['ribosomal_protein_L11_rplK_gpkg', 'minimal',
         'GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC',
         '7', '4.95',
         'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
        # last base only is different to first sequence
        ['ribosomal_protein_L11_rplK_gpkg', 'minimal',
         'GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATA',
         '6', '4.95',
         'Root; k__Bacteria; p__Firmicutes; c__Bacilli'],
        ['ribosomal_protein_S17_gpkg', 'minimal',
         'GCTAAATTAGGAGACATTGTTAAAATTCAAGAAACTCGTCCTTTATCAGCAACAAAACGT',
         '9', '4.95',
         'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus'],
    ]
    table_text = "\n".join(["\t".join(fields) for fields in table_rows])

    with tempfile.NamedTemporaryFile() as table_file:
        table_file.write(table_text)
        table_file.flush()

        with tempdir.TempDir() as scratch:
            extern.run(
                "{} makedb --db_path {}/db --otu_table {} --clustering_divergence 0".format(
                    path_to_script, scratch, table_file.name))

            with tempfile.NamedTemporaryFile() as query_fasta:
                # The query matches the first table sequence except for
                # its first base.
                query_fasta.write(">seq1\n")
                query_fasta.write(
                    "AGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC\n")
                query_fasta.flush()

                # Querying the smafadb directly should show no clustering
                l11_db = os.path.join(
                    scratch, 'db', 'ribosomal_protein_L11_rplK_gpkg.smafadb')
                observed = extern.run(
                    "smafa query {} {}".format(l11_db, query_fasta.name))
                self.assertEqual(
                    observed,
                    'seq1\tAGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC\tGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATA\t2\t60\n' +
                    'seq1\tAGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC\tGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC\t1\t60\n')
def test_bootstrap_executable(self):
    '''expand_search writes a HMM file with a known header and NSEQ line.

    Two HMMER header strings are accepted for the first line.
    '''
    with tempfile.NamedTemporaryFile() as tf:
        cmd = '%s expand_search --verbosity 5 --contigs %s --output_hmm %s --search_hmm_files %s' % (
            path_to_script,
            os.path.join(path_to_data, 'bootstrapper', 'contigs.fna'),
            tf.name,
            os.path.join(path_to_data, 'bootstrapper', 'DNGNGWU00001.hmm'))
        extern.run(cmd)

        # assertIn reports the offending header when the check fails.
        self.assertIn(
            subprocess.check_output("head -n1 %s" % tf.name, shell=True),
            ["HMMER3/f [3.1b2 | February 2015]\n",
             "HMMER3/f [3.2.1 | June 2018]\n"])

        # Read through a context manager so the handle is closed promptly.
        with open(tf.name) as hmm_file:
            hmm_lines = hmm_file.readlines()
        # NOTE(review): confirm the spacing inside this literal matches the
        # HMM file format; it may have been altered by reformatting.
        self.assertEqual('NSEQ 2\n', hmm_lines[10])
def test_jplace_output(self):
    '''pipe --output_jplace writes the expected jplace file.

    The 'tree' and 'metadata' entries are overwritten with placeholders
    before comparison, so only the remaining keys are checked. The file
    is also passed to `guppy sing` to confirm it can be read.
    '''
    otu_sequence = 'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG'
    expected_placements = [
        ["o__Bacillales", 0.0874346630859, 13, 0.333350512423,
         -608.20180926, 6.11351501465e-06],
        ["o__Bacillales", 0.0643521435547, 14, 0.333326884837,
         -608.201880142, 6.11351501465e-06],
        ["p__Firmicutes", 5.97534179688e-06, 15, 0.33332260274,
         -608.201892989, 6.11351501465e-06],
    ]
    expected_jpace = {
        'fields': ['classification', 'distal_length', 'edge_num',
                   'like_weight_ratio', 'likelihood', 'pendant_length'],
        'metadata': 'the_metadata',
        'placements': [{
            'nm': [[otu_sequence, 2]],
            'p': expected_placements,
        }],
        'tree': 'tree_thanks',
        'version': 3,
    }

    with tempdir.TempDir() as d:
        cmd = ("%s pipe --sequences %s --otu_table /dev/null --output_jplace %s"
               " --singlem_packages %s") % (
            path_to_script,
            os.path.join(path_to_data, '1_pipe', 'jplace_test.fna'),
            os.path.join(d, "my_jplace"),
            os.path.join(path_to_data, '4.12.22seqs.spkg'))
        extern.run(cmd)

        jplace_path = os.path.join(
            d, 'my_jplace_jplace_test_4.12.22seqs.jplace')
        with open(jplace_path) as jplace_file:
            observed = json.load(jplace_file)
        observed['tree'] = 'tree_thanks'
        observed['metadata'] = 'the_metadata'
        self.assertEqual(expected_jpace, observed)

        # Make sure the guppy sing does not croak
        extern.run("guppy sing -o /dev/null '%s'" % jplace_path)
def test_biom_hello_world(self):
    '''summarise --biom_prefix writes one BIOM file for the marker gene.

    The single output file is converted back to TSV with `biom convert`
    and compared with the expected text.
    '''
    marker = '4.12.ribosomal_protein_L11_rplK'
    taxonomy = 'Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'
    first_seq = 'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG'
    second_seq = 'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTtttCAAGCAGGTGTG'

    insert_otu_table = [
        self.headers,
        [marker, 'insert', first_seq, '1', '2.44', taxonomy],
        [marker, 'insert', second_seq, '2', '2.94', taxonomy],
    ]
    biom_name = 'mybiom.4.12.ribosomal_protein_L11_rplK.biom'
    expected_tsv = (
        '# Constructed from biom file\n#OTU ID\tinsert\ttaxonomy\n' +
        taxonomy + '; ' + first_seq + '\t1.0\t' + taxonomy + '\n' +
        taxonomy + '; ' + second_seq + '\t2.0\t' + taxonomy)

    with tempdir.TempDir() as tmp:
        with tempfile.NamedTemporaryFile(suffix='.otu_table.csv') as n:
            # The extra '' element gives the table a trailing newline.
            table_lines = ["\t".join(row) for row in insert_otu_table] + ['']
            n.write("\n".join(table_lines))
            n.flush()

            extern.run("%s summarise --biom_prefix '%s' --input_otu_tables '%s'" % (
                path_to_script, os.path.join(tmp, "mybiom"), n.name))

            self.assertEqual([biom_name], os.listdir(tmp))
            self.assertEqual(
                expected_tsv,
                extern.run("biom convert -i '%s' -o /dev/stdout --to-tsv --header-key taxonomy" %
                           os.path.join(tmp, biom_name)))
def run(hmm_paths, output_directory, is_protein):
    '''Run a search-only graftM command with the given HMMs.

    `self` and `forward_read_files` are not parameters; they are taken
    from the scope that encloses this function. The first entry of
    hmm_paths is also used as the --aln_hmm_file.
    '''
    command_prefix = self._graftm_command_prefix(is_protein)
    search_arguments = (
        "--threads %i "
        "--forward %s "
        "--search_only "
        "--search_hmm_files %s "
        "--output_directory %s "
        "--aln_hmm_file %s " % (
            self._num_threads,
            ' '.join(forward_read_files),
            ' '.join(hmm_paths),
            output_directory,
            hmm_paths[0]))
    extern.run(command_prefix + search_arguments)
def test_jplace_output(self):
    '''pipe --output_jplace writes the expected jplace file.

    The 'tree' and 'metadata' entries are overwritten with placeholders
    before comparison, so only the remaining keys are checked. Here
    'placements' is expected to be a dict keyed by the OTU sequence.
    '''
    expected_jpace = {
        u'fields': [
            u'classification', u'distal_length', u'edge_num',
            u'like_weight_ratio', u'likelihood', u'pendant_length'
        ],
        u'metadata': 'the_metadata',
        u'placements': {
            u'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG': {
                u'nm': [[
                    u'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG',
                    2
                ]],
                u'p': [
                    [u'o__Bacillales', 0.0874346630859, 13, 0.333351177694,
                     -631.301684875, 0.150831104822],
                    [u'o__Bacillales', 0.0643521435547, 14, 0.333326655502,
                     -631.301758441, 0.15083915761],
                    [u'p__Firmicutes', 5.97534179688e-06, 15, 0.333322166804,
                     -631.301771907, 0.150839131805],
                ]
            }
        },
        u'tree': 'tree_thanks',
        u'version': 3
    }

    with tempdir.TempDir() as d:
        cmd = "%s pipe --sequences %s --otu_table /dev/null --output_jplace %s"\
              " --singlem_packages %s" % (
                  path_to_script,
                  os.path.join(path_to_data, '1_pipe', 'jplace_test.fna'),
                  os.path.join(d, "my_jplace"),
                  os.path.join(path_to_data, '4.12.22seqs.spkg'))
        extern.run(cmd)

        jplace_path = os.path.join(
            d, 'my_jplace_jplace_test_4.12.22seqs.jplace')
        # Use a context manager so the file is closed before the temporary
        # directory is removed.
        with open(jplace_path) as f:
            j = json.load(f)
        j['tree'] = 'tree_thanks'
        j['metadata'] = 'the_metadata'
        self.assertEqual(expected_jpace, j)
def _build_tree(self, alignment, base, ptype, fasttree):
    '''Run the given fasttree executable on an alignment.

    Parameters
    ----------
    alignment: str
        path to the alignment passed to fasttree
    base: str
        prefix for the output files; '.tre' and '.tre.log' are appended
    ptype:
        package type; nucleotide packages add the '-gtr -nt' flags
    fasttree: str
        name or path of the fasttree executable

    Returns
    -------
    (log_file, tre_file), both of which are also added to self.the_trash
    '''
    log_file = base + ".tre.log"
    tre_file = base + ".tre"

    if ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
        # If it's a nucleotide sequence
        command_template = "%s -quiet -gtr -nt -log %s -out %s %s"
    else:
        # Or if its an amino acid sequence
        command_template = "%s -quiet -log %s -out %s %s"
    extern.run(command_template % (fasttree, log_file, tre_file, alignment))

    self.the_trash += [log_file, tre_file]
    return log_file, tre_file
def global_search(self, query_otu_table_collection, subject_otu_table_collection, cluster_identity):
    '''Search a query OTU table against a subject OTU table, yield over
    UCEntry objects that have been modified so that the query and subject
    are the relevant OtuTableEntry objects rather than strings. Or they
    are None if there are no hits, since --output_no_hits is used.

    query_otu_table_collection: OtuTableCollection
    subject_otu_table_collection: OtuTableCollection
    cluster_identity: float or str
        reject hits if have lower identity than this (implemented with
        vsearch --id).
    '''
    logging.info("Caching query OTUs")
    query_otus = list(query_otu_table_collection)
    logging.info("Caching target OTUs")
    subject_otus = list(subject_otu_table_collection)

    def name_to_index(name):
        # Sequence names are written below as '<index>;size=<count>'.
        return int(name.split(';')[0])

    # write out fasta file numbered to corresponding to the OTU info
    with tempfile.NamedTemporaryFile(prefix='singlem_q_for_vsearch', mode='w') as query_f:
        for i, u in enumerate(query_otus):
            query_f.write(">%i;size=%i\n" % (i, u.count))
            query_f.write(u.sequence.replace('-', '') + "\n")
        query_f.flush()

        with tempfile.NamedTemporaryFile(prefix='singlem_db_for_vsearch', mode='w') as db_f:
            # Write from the cached list, not the collection itself: the
            # indices must match subject_otus, and a collection that can
            # only be iterated once would otherwise give an empty database.
            for i, u in enumerate(subject_otus):
                db_f.write(">%i;size=%i\n" % (i, u.count))
                db_f.write(u.sequence.replace('-', '') + "\n")
            db_f.flush()

            with tempfile.NamedTemporaryFile(prefix='singlem_uc') as uc:
                command = "vsearch --usearch_global %s --db %s --uc %s --id %s --output_no_hits" % (
                    query_f.name, db_f.name, uc.name, str(cluster_identity))
                logging.info("Running search")
                extern.run(command)
                logging.info("Finished running search")

                with open(uc.name) as uc_read:
                    for uc_entry in UCFile(uc_read):
                        uc_entry.query = query_otus[name_to_index(uc_entry.query)]
                        if uc_entry.target is not None:
                            uc_entry.target = subject_otus[
                                name_to_index(uc_entry.target)]
                        yield uc_entry
def test_jplace_output(self):
    '''pipe --output_jplace writes the expected jplace file.

    The 'tree' and 'metadata' entries are overwritten with placeholders
    before comparison, so only the remaining keys are checked. The file
    is also passed to `guppy sing` to confirm it can be read.
    '''
    expected_jpace = {
        u'fields': [
            u'classification', u'distal_length', u'edge_num',
            u'like_weight_ratio', u'likelihood', u'pendant_length'
        ],
        u'metadata': 'the_metadata',
        u'placements': [{
            u'nm': [[
                u'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG',
                2
            ]],
            u'p': [
                [u'o__Bacillales', 0.0874346630859, 13, 0.333351177694,
                 -631.301684875, 0.150831104822],
                [u'o__Bacillales', 0.0643521435547, 14, 0.333326655502,
                 -631.301758441, 0.15083915761],
                [u'p__Firmicutes', 5.97534179688e-06, 15, 0.333322166804,
                 -631.301771907, 0.150839131805],
            ]
        }],
        u'tree': 'tree_thanks',
        u'version': 3
    }

    with tempdir.TempDir() as d:
        cmd = "%s pipe --sequences %s --otu_table /dev/null --output_jplace %s"\
              " --singlem_packages %s" % (
                  path_to_script,
                  os.path.join(path_to_data, '1_pipe', 'jplace_test.fna'),
                  os.path.join(d, "my_jplace"),
                  os.path.join(path_to_data, '4.12.22seqs.spkg'))
        extern.run(cmd)

        jplace_path = os.path.join(
            d, 'my_jplace_jplace_test_4.12.22seqs.jplace')
        # Use a context manager so the file is closed before guppy reads
        # it and before the temporary directory is removed.
        with open(jplace_path) as f:
            j = json.load(f)
        j['tree'] = 'tree_thanks'
        j['metadata'] = 'the_metadata'
        self.assertEqual(expected_jpace, j)

        # Make sure the guppy sing does not croak
        extern.run("guppy sing -o /dev/null '%s'" % jplace_path)
def _create_dmnd_database(self, unaligned_sequences_path, daa_output):
    '''Build a diamond database by running `diamond makedb`.

    Parameters
    ----------
    unaligned_sequences_path: str
        path to a FASTA file containing unaligned sequences
    daa_output: str
        Name of output database.
    '''
    logging.debug("Building diamond database")
    # Both paths are single-quoted in the shell command.
    makedb_cmd = "diamond makedb --in '%s' -d '%s'" % (
        unaligned_sequences_path, daa_output)
    extern.run(makedb_cmd)
def global_search(self, query_otu_table_collection, subject_otu_table_collection, cluster_identity):
    '''Search a query OTU table against a subject OTU table, yield over
    UCEntry objects that have been modified so that the query and subject
    are the relevant OtuTableEntry objects rather than strings. Or they
    are None if there are no hits, since --output_no_hits is used.

    query_otu_table_collection: OtuTableCollection
    subject_otu_table_collection: OtuTableCollection
    cluster_identity: float or str
        reject hits if have lower identity than this (implemented with
        vsearch --id).
    '''
    logging.info("Caching query OTUs")
    query_otus = list(query_otu_table_collection)
    logging.info("Caching target OTUs")
    subject_otus = list(subject_otu_table_collection)

    def name_to_index(name):
        # Sequence names are written below as '<index>;size=<count>'.
        # The str method is used because the module-level string.split
        # function is not available on Python 3.
        return int(name.split(';')[0])

    # write out fasta file numbered to corresponding to the OTU info
    with tempfile.NamedTemporaryFile(prefix='singlem_q_for_vsearch') as query_f:
        for i, u in enumerate(query_otus):
            query_f.write(">%i;size=%i\n" % (i, u.count))
            query_f.write(u.sequence.replace('-', '') + "\n")
        query_f.flush()

        with tempfile.NamedTemporaryFile(prefix='singlem_db_for_vsearch') as db_f:
            # Write from the cached list, not the collection itself: the
            # indices must match subject_otus, and a collection that can
            # only be iterated once would otherwise give an empty database.
            for i, u in enumerate(subject_otus):
                db_f.write(">%i;size=%i\n" % (i, u.count))
                db_f.write(u.sequence.replace('-', '') + "\n")
            db_f.flush()

            with tempfile.NamedTemporaryFile(prefix='singlem_uc') as uc:
                command = "vsearch --usearch_global %s --db %s --uc %s --id %s --output_no_hits" % (
                    query_f.name, db_f.name, uc.name, str(cluster_identity))
                logging.info("Running search")
                extern.run(command)
                logging.info("Finished running search")

                with open(uc.name) as uc_read:
                    for uc_entry in UCFile(uc_read):
                        uc_entry.query = query_otus[name_to_index(uc_entry.query)]
                        if uc_entry.target is not None:
                            uc_entry.target = subject_otus[
                                name_to_index(uc_entry.target)]
                        yield uc_entry
def test_paired_reads_hello_world(self):
    '''pipe with --reverse merges a read pair into a single OTU.

    The reverse read is the reverse complement of the forward read, so
    one OTU with a count of 1 is expected.
    '''
    # Reads should be merged
    # NOTE(review): the whitespace inside the original literals had been
    # collapsed. The row is rebuilt here as tab-separated fields with an
    # empty sample column (the sample name is stripped from the output
    # below) - confirm against the real pipe output.
    expected = [
        "\t".join(self.headers),
        "\t".join([
            '4.11.22seqs',
            '',
            'TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA',
            '1',
            '2.44',
            'Root; d__Bacteria; p__Firmicutes']),
        '']
    # FASTA records are spelled with explicit newlines so the header and
    # sequence lines cannot be fused by reformatting.
    inseqs = (
        ">HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT\n"
        "ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA\n")
    # reverse complement of the forward, so should collapse.
    inseqs_reverse = (
        ">HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT\n"
        "TTCAGCTGCACGACGTACCATAGTGTTTTTGTATACTTTATACTCAACACCAGCTTCACGTAATTGTGAACGTAAGTCAGTAACTTCAGCTACTGTTAAT\n")

    with tempfile.NamedTemporaryFile(suffix='.fa') as n:
        n.write(inseqs)
        n.flush()
        with tempfile.NamedTemporaryFile(suffix='.fa') as n2:
            n2.write(inseqs_reverse)
            n2.flush()

            cmd = "{} pipe --sequences {} --otu_table /dev/stdout --singlem_packages {} --reverse {}".format(
                path_to_script,
                n.name,
                os.path.join(path_to_data, '4.11.22seqs.gpkg.spkg'),
                n2.name)
            # The sample name comes from the random temp file name, so it
            # is blanked out of the observed table before comparison.
            self.assertEqualOtuTable(
                list([line.split("\t") for line in expected]),
                extern.run(cmd).replace(
                    os.path.basename(n.name).replace('.fa', ''), ''))
def test_seqs_dna(self):
    '''`seqs` on a DNA alignment reports the best window starting at 14.

    Standard output is expected to be empty; the window position is
    read from the debug log on standard error.
    '''
    # NOTE(review): the line breaks inside the original alignment literal
    # had been collapsed. Each record is rebuilt as a header line followed
    # by one sequence line - confirm against the intended fixture.
    aln = (
        ">s1\n"
        "ga-------------TATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGACGCTGATGTG\n"
        ">s2 asdas\n"
        "ca---------GAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGACGCTGA----\n"
        ">s3\n"
        "ga-------------TATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGGGCTGATGTG-\n"
        ">d4\n"
        "-g----------AGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGACGCTGATG--\n")

    with tempfile.NamedTemporaryFile() as a:
        a.write(aln)
        a.flush()
        with tempfile.NamedTemporaryFile() as stderr:
            cmd = "%s --debug seqs --alignment %s --alignment_type dna"\
                  " --window_size 20 2>%s" % (
                      path_to_script, a.name, stderr.name)
            self.assertEqual('', extern.run(cmd))

            # This includes ignored columns at the front, which were
            # messing things up.
            with open(stderr.name) as log:
                log_text = log.read()
            self.assertTrue(
                'Found best section of the alignment starting from 14\n'
                in log_text)
def _get_hmm_from_alignment(self, alignment, hmm_filename, output_alignment_filename):
    '''Build a HMM from an alignment and write the sequences aligned to it.

    Parameters
    ----------
    alignment: str
        path to aligned proteins
    hmm_filename: str
        write the hmm to this file path
    output_alignment_filename: str
        write the output alignment to this file path

    Returns
    -------
    Return the pipeline type of the HMM.
    '''
    logging.info("Building HMM from alignment")

    with tempfile.NamedTemporaryFile(suffix='.fasta', prefix='graftm', mode='w') as tempaln:
        # hmmbuild sends its alignment (-O) to stdout, which extern.run
        # returns, and its summary output (-o) to stderr.
        hmmbuild_cmd = "hmmbuild -O /dev/stdout -o /dev/stderr '%s' '%s'" % (
            hmm_filename, alignment)
        stockholm_text = extern.run(hmmbuild_cmd)

        # Convert the Stockholm alignment to FASTA in the temporary file.
        records = SeqIO.parse(StringIO(stockholm_text), 'stockholm')
        SeqIO.write(records, tempaln, 'fasta')
        tempaln.flush()

        ptype, _ = self._pipe_type(hmm_filename)
        searcher = SequenceSearcher(hmm_filename)
        searcher.alignment_correcter([tempaln.name], output_alignment_filename)
        return ptype
def test_two_nucleotide_packages(self):
    """Run ``pipe`` with two nucleotide packages and check both OTU rows."""
    # NOTE(review): literals are reproduced exactly as seen here; whitespace
    # inside them may have been collapsed -- confirm tabs/newlines.
    expected = [
        "\t".join(self.headers),
        '61_otus.v3 GGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGACTGACGCTGATGTGCGAAAGCG 2 5.13 Root; k__Bacteria; p__Proteobacteria',
        '61_otus.second.v3 TTAGGTAGTTGCTGGGGTAACGTCCCAACAAGCCGATAATCGGTACGGGTTGTGAGAGCA 1 1.66 Root; k__Archaea; p__Euryarchaeota',
        '']
    inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1105:6981:63483 1:N:0:AAGAGGCAAAGGAGTA GATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGCTGATGTGCGAAAGCGTGGGGATCAAACAGGATTAGATACCCTGGTAGT >HWI-ST1243:156:D1K83ACXX:7:1105:6981:63483_revcom ACTACCAGGGTATCTAATCCTGTTTGATCCCCACGCTTTCGCACATCAGCGTCAGTTACAGACCAGAAAGTCGCCTTCGCCACTGGTGTTCCTCCATATC >NS500333:10:H0V2GAGXX:2:13211:8623:16289 1:N:0:GATCAG ATTAGGTAGTTGCTGGGGTAACGTCCCAACAAGCCGATAATCGGTACGGGTTGTGAGAGCAAGAGCCCGGAGATGGATTCTGAGACACGAATCCAGGTCCTACGGGGCGCAGCAGGCGCGAAAACTTTACACTGCGCGAAAGCGCGATA '''
    # mode='w': the default 'w+b' mode rejects str writes on Python 3.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.fa') as n:
        n.write(inseqs)
        n.flush()

        cmd = "%s pipe --sequences %s --otu_table /dev/stdout --singlem_packages %s %s" % (
            path_to_script,
            n.name,
            os.path.join(path_to_data, '61_otus.v3.gpkg.spkg'),
            os.path.join(path_to_data, 'second_packge.spkg'))
        # The sample name is derived from the temp file name, so blank it
        # out of the output before comparing.
        sample_name = os.path.basename(n.name).replace('.fa', '')
        self.assertEqualOtuTable(
            [line.split("\t") for line in expected],
            extern.run(cmd).replace(sample_name, ''))
def test_known_sequence_taxonomy(self):
    """Run ``pipe`` with ``--known_sequence_taxonomy`` and check the taxonomy column.

    Two identical reads are given the taxonomies 'mytax; yeh' and
    'mytax; yeh; 2'; the expected OTU row carries 'mytax; yeh'.
    """
    # NOTE(review): literals are reproduced exactly as seen here; whitespace
    # inside them may have been collapsed -- confirm tabs/newlines.
    expected = [
        "\t".join(self.headers),
        '4.11.22seqs TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA 2 4.88 mytax; yeh',
        '']
    inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA >another ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA '''
    # mode='w': the default 'w+b' mode rejects str writes on Python 3.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.fa') as n:
        n.write(inseqs)
        n.flush()

        with tempfile.NamedTemporaryFile(mode='w') as taxf:
            taxf.write("HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482\tmytax; yeh\n")
            taxf.write("another\tmytax; yeh; 2\n")
            taxf.flush()

            cmd = ("%s pipe --sequences %s --otu_table /dev/stdout --singlem_packages %s "
                   "--no_assign_taxonomy --known_sequence_taxonomy %s") % (
                       path_to_script,
                       n.name,
                       os.path.join(path_to_data, '4.11.22seqs.gpkg.spkg'),
                       taxf.name)
            # The sample name is derived from the temp file name, so blank it
            # out of the output before comparing.
            sample_name = os.path.basename(n.name).replace('.fa', '')
            self.assertEqual(
                expected,
                extern.run(cmd).replace(sample_name, '').split("\n"))
def test_one_read_two_orfs_two_diamond_hits(self):
    """Run ``pipe`` with ``diamond_example`` assignment on a single read.

    The output lines are compared after sorting, with the temp-file-derived
    sample name blanked out.
    """
    # what a pain the real world is
    # NOTE(review): literals are reproduced exactly as seen here; whitespace
    # inside them may have been collapsed -- confirm tabs/newlines.
    seq = '''>HWI-ST1240:128:C1DG3ACXX:7:2204:6599:65352 1:N:0:GTAGAGGATAGATCGC ACCCACAGCTCGGGGTTGCCCTTGCCCGACCCCATGCGTGTCTCGGCGGGCTTCTGGTGACGGGCTTGTCCGGGAAGACGCGGATCCAGACCTTGCCTCCGCGCTTGACGTGCCGGGTCATCGCGATACGGGCCGCCTCGATCTGACGTGC '''
    rows = [
        self.headers,
        ['S1.7.ribosomal_protein_L16_L10E_rplP CGCGTCTTCCCGGACAAGCCCGTCACCAGAAGCCCGCCGAGACACGCATGGGGTCGGGCA 1 1.64 GCA_000949295.1'],
    ]
    exp = sorted(["\t".join(row) for row in rows] + [''])

    with tempfile.NamedTemporaryFile(mode='w', prefix='singlem_test', suffix='.fa') as t:
        t.write(seq)
        t.flush()

        cmd = "%s --quiet pipe --sequences %s --otu_table /dev/stdout --threads 4 --assignment_method diamond_example" % (
            path_to_script, t.name)
        sample_name = os.path.basename(t.name).replace('.fa', '')
        observed = extern.run(cmd).replace(sample_name, '')
        self.assertEqual(exp, sorted(observed.split("\n")))
def test_paired_reads_one_read_each_diamond_example(self):
    """Run paired-end ``pipe`` with ``--output_extras`` and ``diamond_example``.

    Each pair has its informative sequence in a different file (forward for
    the first read, reverse for ``seq2``); the expected table has one row
    with a count of 2.
    """
    # Reads should be merged
    # NOTE(review): literals are reproduced exactly as seen here; whitespace
    # inside them may have been collapsed -- confirm tabs/newlines.
    expected = [
        "\t".join(self.headers_with_extras),
        '4.11.22seqs TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA 2 4.88 2524614704 HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 seq2 60 60 False',
        '']
    inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA >seq2 AAAAAAAAAAAAAAAAA '''
    # reverse complement of the forward, so should collapse.
    inseqs_reverse = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT AAAAAAAAAAAAAAAAA >seq2 TTCAGCTGCACGACGTACCATAGTGTTTTTGTATACTTTATACTCAACACCAGCTTCACGTAATTGTGAACGTAAGTCAGTAACTTCAGCTACTGTTAAT '''
    # mode='w': the default 'w+b' mode rejects str writes on Python 3.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.fa') as n:
        n.write(inseqs)
        n.flush()
        with tempfile.NamedTemporaryFile(mode='w', suffix='.fa') as n2:
            n2.write(inseqs_reverse)
            n2.flush()

            cmd = "{} pipe --sequences {} --otu_table /dev/stdout --singlem_packages {} --reverse {} --output_extras --assignment_method diamond_example".format(
                path_to_script,
                n.name,
                os.path.join(path_to_data, '4.11.22seqs.gpkg.spkg'),
                n2.name)
            # The sample name is derived from the temp file name, so blank it
            # out of the output before comparing.
            sample_name = os.path.basename(n.name).replace('.fa', '')
            self.assertEqualOtuTable(
                [line.split("\t") for line in expected],
                extern.run(cmd).replace(sample_name, ''))
def test_query_with_otu_table(self):
    """Query the ``a.sdb`` database with a one-row OTU table and check the hit."""
    query_rows = [
        self.headers,
        # second sequence with an extra A at the end
        ['ribosomal_protein_L11_rplK_gpkg',
         'minimal',
         'CGTCGTTGGAACCCAAAAATGAAAAAATATATCTTCACTGAGAGAAATGGTATTTATATA',
         '7',
         '4.95',
         'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
    ]
    expected_rows = [
        ['query_name', 'query_sequence', 'divergence', 'num_hits',
         'sample', 'marker', 'hit_sequence', 'taxonomy'],
        ['minimal;ribosomal_protein_L11_rplK_gpkg',
         'CGTCGTTGGAACCCAAAAATGAAAAAATATATCTTCACTGAGAGAAATGGTATTTATATA',
         '1',
         '6',
         'minimal',
         'ribosomal_protein_S2_rpsB_gpkg',
         'CGTCGTTGGAACCCAAAAATGAAAAAATATATCTTCACTGAGAGAAATGGTATTTATATC',
         'Root; k__Bacteria; p__Firmicutes; c__Bacilli'],
    ]
    with tempfile.NamedTemporaryFile(mode='w') as f:
        f.write("\n".join(["\t".join(row) for row in query_rows]))
        f.flush()

        cmd = "%s query --query_otu_table %s --db %s" % (
            path_to_script, f.name, os.path.join(path_to_data, 'a.sdb'))

        # The trailing '' accounts for the final newline of the output.
        expected = ["\t".join(row) for row in expected_rows] + ['']
        self.assertEqual(expected, extern.run(cmd).split('\n'))
def test_paired_reads_hello_world(self):
    """Run ``pipe`` with ``--reverse`` on one read pair and check the OTU table.

    The reverse read is the reverse complement of the forward read, so the
    pair is expected to collapse into the single OTU row in ``expected``.
    """
    # Reads should be merged
    # NOTE(review): literals are reproduced exactly as seen here; whitespace
    # inside them may have been collapsed -- confirm tabs/newlines.
    expected = [
        "\t".join(self.headers),
        '4.11.22seqs TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA 1 2.44 Root; d__Bacteria; p__Firmicutes',
        ''
    ]
    inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA '''
    # reverse complement of the forward, so should collapse.
    inseqs_reverse = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT TTCAGCTGCACGACGTACCATAGTGTTTTTGTATACTTTATACTCAACACCAGCTTCACGTAATTGTGAACGTAAGTCAGTAACTTCAGCTACTGTTAAT '''
    # mode='w': the default 'w+b' mode rejects str writes on Python 3.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.fa') as n:
        n.write(inseqs)
        n.flush()
        with tempfile.NamedTemporaryFile(mode='w', suffix='.fa') as n2:
            n2.write(inseqs_reverse)
            n2.flush()

            cmd = "{} pipe --sequences {} --otu_table /dev/stdout --singlem_packages {} --reverse {}".format(
                path_to_script, n.name,
                os.path.join(path_to_data, '4.11.22seqs.gpkg.spkg'),
                n2.name)
            # The sample name is derived from the temp file name, so blank it
            # out of the output before comparing.
            sample_name = os.path.basename(n.name).replace('.fa', '')
            self.assertEqualOtuTable(
                [line.split("\t") for line in expected],
                extern.run(cmd).replace(sample_name, ''))
def test_cluster_across_samples_via_script(self):
    """Run ``summarise --cluster`` on a two-row OTU table and check the result.

    The two input sequences differ only in their last base; the expected
    output reports both rows under the second row's gene, sequence and
    taxonomy while keeping each row's own sample, count and coverage.
    """
    e = [['gene', 'sample', 'sequence', 'num_hits', 'coverage', 'taxonomy'],
         ['4.11.ribosomal_protein_L10', 'minimal', 'TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACT', '2', '4.88', 'Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus'],
         ['4.12.ribosomal_protein_L11_rplK', 'ma', 'TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACA', '4', '9.76', 'Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales']
         ]

    # mode='w': the default 'w+b' mode rejects str writes on Python 3.
    with tempfile.NamedTemporaryFile(mode='w', prefix='singlem_cluster') as f:
        cmd = "%s summarise --cluster --cluster_id %f --input_otu_tables %s --output_otu_table /dev/stdout" % (
            path_to_script, 58.5/60, f.name)
        for l in ["\t".join(o) for o in e]:
            f.write(l + "\n")
        f.flush()
        output = extern.run(cmd)
        # Splitting on "\n" leaves a trailing [''] entry for the final newline.
        out_clusters = [o.split("\t") for o in output.split("\n")]
        self.assertEqual(
            [['gene', 'sample', 'sequence', 'num_hits', 'coverage', 'taxonomy'],
             ['4.12.ribosomal_protein_L11_rplK', 'ma', 'TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACA', '4', '9.76', 'Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
             ['4.12.ribosomal_protein_L11_rplK', 'minimal', 'TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACA', '2', '4.88', 'Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
             ['']],
            out_clusters)
def test_paired_reads_one_read_each_diamond_example(self):
    """Run paired-end ``pipe`` with ``--output_extras`` and ``diamond_example``.

    Each pair has its informative sequence in a different file (forward for
    the first read, reverse for ``seq2``); the expected table has one row
    with a count of 2.
    """
    # Reads should be merged
    # NOTE(review): literals are reproduced exactly as seen here; whitespace
    # inside them may have been collapsed -- confirm tabs/newlines.
    expected = [
        "\t".join(self.headers_with_extras),
        '4.11.22seqs TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA 2 4.88 2524614704 HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 seq2 60 60 False',
        ''
    ]
    inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA >seq2 AAAAAAAAAAAAAAAAA '''
    # reverse complement of the forward, so should collapse.
    inseqs_reverse = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT AAAAAAAAAAAAAAAAA >seq2 TTCAGCTGCACGACGTACCATAGTGTTTTTGTATACTTTATACTCAACACCAGCTTCACGTAATTGTGAACGTAAGTCAGTAACTTCAGCTACTGTTAAT '''
    # mode='w': the default 'w+b' mode rejects str writes on Python 3.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.fa') as n:
        n.write(inseqs)
        n.flush()
        with tempfile.NamedTemporaryFile(mode='w', suffix='.fa') as n2:
            n2.write(inseqs_reverse)
            n2.flush()

            cmd = "{} pipe --sequences {} --otu_table /dev/stdout --singlem_packages {} --reverse {} --output_extras --assignment_method diamond_example".format(
                path_to_script, n.name,
                os.path.join(path_to_data, '4.11.22seqs.gpkg.spkg'),
                n2.name)
            # The sample name is derived from the temp file name, so blank it
            # out of the output before comparing.
            sample_name = os.path.basename(n.name).replace('.fa', '')
            self.assertEqualOtuTable(
                [line.split("\t") for line in expected],
                extern.run(cmd).replace(sample_name, ''))
def test_diamond_assign_taxonomy(self):
    """Run ``pipe`` with ``--assignment_method diamond`` on one read.

    Expected and observed lines are compared only down to family level:
    everything from '; g__' onwards is stripped from both before comparing.
    """
    # mode='w': the default 'w+b' mode rejects str writes on Python 3.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta') as f:
        query = "\n".join([
            '>HWI-ST1243:156:D1K83ACXX:7:1109:18214:9910 1:N:0:TCCTGAGCCTAAGCCT',
            'GTTAAATTACAAATTCCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATCATGGGATTCTGTAAAGAGT',
            ''
        ])
        f.write(query)
        f.flush()

        cmd = "%s --debug pipe --sequences %s --otu_table /dev/stdout --assignment_method diamond --threads 4" % (
            path_to_script, f.name)

        expected = [
            self.headers,
            [
                'S1.5.ribosomal_protein_L11_rplK',
                # Sample name: the temp file's basename without '.fasta'.
                os.path.basename(f.name)[:-6],
                'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG',
                '1',
                '2.44',
                'Root; d__Bacteria; p__Firmicutes; c__Bacilli_A; o__Thermoactinomycetales; f__Thermoactinomycetaceae'
            ]
        ]
        expected = ["\t".join(x) for x in expected] + ['']
        observed = extern.run(cmd).split("\n")
        # Do not test beyond genus level because updated diamond version change slightly.
        r = re.compile('; g__.*')
        self.assertEqual([r.sub('', e) for e in expected],
                         [r.sub('', e) for e in observed])
def test_two_nucleotide_packages(self):
    """Run ``pipe`` with two nucleotide packages and check both OTU rows."""
    # NOTE(review): literals are reproduced exactly as seen here; whitespace
    # inside them may have been collapsed -- confirm tabs/newlines.
    expected = [
        "\t".join(self.headers),
        '61_otus.v3 GGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGACTGACGCTGATGTGCGAAAGCG 2 5.13 Root; k__Bacteria; p__Proteobacteria',
        '61_otus.second.v3 TTAGGTAGTTGCTGGGGTAACGTCCCAACAAGCCGATAATCGGTACGGGTTGTGAGAGCA 1 1.66 Root; k__Archaea; p__Euryarchaeota',
        ''
    ]
    inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1105:6981:63483 1:N:0:AAGAGGCAAAGGAGTA GATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGTAACTGACGCTGATGTGCGAAAGCGTGGGGATCAAACAGGATTAGATACCCTGGTAGT >HWI-ST1243:156:D1K83ACXX:7:1105:6981:63483_revcom ACTACCAGGGTATCTAATCCTGTTTGATCCCCACGCTTTCGCACATCAGCGTCAGTTACAGACCAGAAAGTCGCCTTCGCCACTGGTGTTCCTCCATATC >NS500333:10:H0V2GAGXX:2:13211:8623:16289 1:N:0:GATCAG ATTAGGTAGTTGCTGGGGTAACGTCCCAACAAGCCGATAATCGGTACGGGTTGTGAGAGCAAGAGCCCGGAGATGGATTCTGAGACACGAATCCAGGTCCTACGGGGCGCAGCAGGCGCGAAAACTTTACACTGCGCGAAAGCGCGATA '''
    # mode='w': the default 'w+b' mode rejects str writes on Python 3.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.fa') as n:
        n.write(inseqs)
        n.flush()

        cmd = "%s pipe --sequences %s --otu_table /dev/stdout --singlem_packages %s %s" % (
            path_to_script, n.name,
            os.path.join(path_to_data, '61_otus.v3.gpkg.spkg'),
            os.path.join(path_to_data, 'second_packge.spkg'))
        # The sample name is derived from the temp file name, so blank it
        # out of the output before comparing.
        sample_name = os.path.basename(n.name).replace('.fa', '')
        self.assertEqualOtuTable(
            [line.split("\t") for line in expected],
            extern.run(cmd).replace(sample_name, ''))
def test_known_sequence_taxonomy(self):
    """Run ``pipe`` with ``--known_sequence_taxonomy`` and check the taxonomy column.

    Two identical reads are given the taxonomies 'mytax; yeh' and
    'mytax; yeh; 2'; the expected OTU row carries 'mytax; yeh'.
    """
    # NOTE(review): literals are reproduced exactly as seen here; whitespace
    # inside them may have been collapsed -- confirm tabs/newlines.
    expected = [
        "\t".join(self.headers),
        '4.11.22seqs TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA 2 4.88 mytax; yeh',
        ''
    ]
    inseqs = '''>HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482 1:N:0:TAAGGCGACTAAGCCT ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA >another ATTAACAGTAGCTGAAGTTACTGACTTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTACGTCGTGCAGCTGAA '''
    # mode='w': the default 'w+b' mode rejects str writes on Python 3.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.fa') as n:
        n.write(inseqs)
        n.flush()

        with tempfile.NamedTemporaryFile(mode='w') as taxf:
            taxf.write(
                "HWI-ST1243:156:D1K83ACXX:7:1106:18671:79482\tmytax; yeh\n"
            )
            taxf.write("another\tmytax; yeh; 2\n")
            taxf.flush()

            cmd = "%s pipe --sequences %s --otu_table /dev/stdout --singlem_packages %s "\
                "--no_assign_taxonomy --known_sequence_taxonomy %s" % (
                    path_to_script,
                    n.name,
                    os.path.join(path_to_data, '4.11.22seqs.gpkg.spkg'),
                    taxf.name)
            # The sample name is derived from the temp file name, so blank it
            # out of the output before comparing.
            sample_name = os.path.basename(n.name).replace('.fa', '')
            self.assertEqual(
                expected,
                extern.run(cmd).replace(sample_name, '').split("\n"))
def test_seqs_dna(self):
    """Run ``seqs`` on a DNA alignment and check the reported window start.

    Standard output must be empty, and the debug log (captured from stderr)
    must contain the 'Found best section' line for position 14.
    """
    # NOTE(review): literals are reproduced exactly as seen here; whitespace
    # inside them may have been collapsed -- confirm newlines.
    aln = '''>s1 ga-------------TATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGACGCTGATGTG >s2 asdas ca---------GAGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGACGCTGA---- >s3 ga-------------TATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGGGCTGATGTG- >d4 -g----------AGATATGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGTCTGtaACTGACGCTGATG-- '''
    expected = '''TATGGAGGAACACCAGTGGC TATGGAGGAACACCAGTGGC TATGGAGGAACACCAGTGGC TATGGAGGAACACCAGTGGC '''
    with tempfile.NamedTemporaryFile(mode='w') as a:
        a.write(aln)
        a.flush()
        with tempfile.NamedTemporaryFile() as stderr:
            # Redirect the script's stderr into the second temp file.
            cmd = ("%s --debug seqs --alignment %s --alignment_type dna"
                   " --window_size 20 2>%s") % (
                       path_to_script, a.name, stderr.name)
            self.assertEqual('', extern.run(cmd))
            # This includes ignored columns at the front, which were messing things up.
            with open(stderr.name) as stde:
                debug_log = stde.read()
                self.assertTrue(
                    'Found best section of the alignment starting from 14\n' in debug_log)
def create_diamond_db(self):
    '''Build a diamond database from this package's unaligned sequences.

    Runs ``diamond makedb`` on the unaligned sequence database, then renames
    the resulting ``<base>.dmnd`` file to self.diamond_database_path().

    Returns
    -------
    The ``<base>.dmnd`` path that diamond wrote to, e.g. 'my_sequences.dmnd'.
    NOTE(review): this is the path before the rename; it only still exists
    if it equals self.diamond_database_path() -- confirm what callers expect.
    '''
    db_prefix = self.unaligned_sequence_database_path()
    makedb_cmd = "diamond makedb --in '%s' -d '%s'" % (
        self.unaligned_sequence_database_path(), db_prefix)
    extern.run(makedb_cmd)

    created_db = '%s.dmnd' % db_prefix
    # Mostly this moves a file to its current location because Create
    # follows this same logic, but there's a specially crafted
    # test/data/mcrA.gpkg which is slightly different.
    final_location = self.diamond_database_path()
    os.rename(created_db, final_location)

    return created_db
def test_known_tax_table(self):
    """Check that ``--known_otu_tables`` overrides the assigned taxonomy.

    First ``pipe`` is run without a known table and the assigned taxonomy is
    checked. Then the header and first row (with taxonomy 'some1') are given
    as a known OTU table, and that row is expected to report 'some1'.
    """
    seq_4_12 = 'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG'
    seq_4_11 = 'TTACGTTCACAATTACGTGAAGCTGGTGTTGAGTATAAAGTATACAAAAACACTATGGTA'

    # Phase 1: no known table.
    expected = [
        self.headers,
        ['4.12.22seqs', 'small', seq_4_12, '4', '9.76',
         'Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
        ['4.11.22seqs', 'small', seq_4_11, '2', '4.88',
         'Root; d__Bacteria; p__Firmicutes'],
    ]
    exp = sorted(["\t".join(row) for row in expected] + [''])

    cmd = "%s --quiet pipe --sequences %s/1_pipe/small.fa --otu_table /dev/stdout --threads 4 --singlem_packages %s" % (
        path_to_script, path_to_data, self.two_packages)
    self.assertEqual(exp, sorted(extern.run(cmd).split("\n")))

    # Phase 2: the first OTU's taxonomy comes from the known table.
    expected = [
        self.headers,
        ['4.12.22seqs', 'small', seq_4_12, '4', '9.76', 'some1'],
        ['4.11.22seqs', 'small', seq_4_11, '2', '4.88',
         'Root; d__Bacteria; p__Firmicutes'],
    ]
    exp = sorted(["\t".join(row) for row in expected] + [''])

    with tempfile.NamedTemporaryFile(mode='w', prefix='singlem_test_known') as t:
        # Only the header and the first row go into the known table.
        known_table = '\n'.join(["\t".join(row) for row in expected[:2]])
        t.write(known_table)
        t.flush()

        cmd = "%s --quiet pipe --sequences %s/1_pipe/small.fa --otu_table /dev/stdout --threads 4 --known_otu_tables %s --singlem_packages %s"\
            % (path_to_script, path_to_data, t.name, self.two_packages)
        self.assertEqual(exp, sorted(extern.run(cmd).split("\n")))
def summarise(**kwargs):
    '''Summarise an OTU table as a krona plot.

    One temporary text file is written per (sample, gene) combination that
    has data, and all of them are passed to a single ``ktImportText`` call.

    Parameters
    ----------
    krona_output: str
        path handed to ``ktImportText -o``
    table_collection:
        OTU table collection, passed unchanged to
        Summariser._collapse_otu_table_into_gene_to_sample_to_taxonomy_to_count

    Raises
    ------
    KeyError if either keyword argument is missing, Exception if any
    unexpected keyword argument is supplied.
    '''
    krona_output_file = kwargs.pop('krona_output')
    table_collection = kwargs.pop('table_collection')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    # prep the array
    gene_to_sample_to_taxonomy_to_count = Summariser._collapse_otu_table_into_gene_to_sample_to_taxonomy_to_count(table_collection)

    # write the output krona files
    logging.info("Writing krona %s" % krona_output_file)
    cmd = 'ktImportText -o %s' % krona_output_file
    sample_tempfiles = []

    # Re-key the counts by sample first so the datasets can be emitted in
    # sorted (sample, gene) order.
    sample_to_gene_to_taxonomy_to_count = {}
    all_sample_names = set()
    all_gene_names = set()
    for gene, sample_to_taxonomy_to_count in gene_to_sample_to_taxonomy_to_count.items():
        all_gene_names.add(gene)
        for sample, taxonomy_to_count in sample_to_taxonomy_to_count.items():
            all_sample_names.add(sample)
            if sample not in sample_to_gene_to_taxonomy_to_count:
                sample_to_gene_to_taxonomy_to_count[sample] = {}
            sample_to_gene_to_taxonomy_to_count[sample][gene] = taxonomy_to_count
    is_more_than_one_sample = len(sample_to_gene_to_taxonomy_to_count) > 1

    # try/finally so the temporary files are closed (and so removed) even
    # when writing or ktImportText fails.
    try:
        for sample in sorted(all_sample_names):
            for gene in sorted(all_gene_names):
                if gene in sample_to_gene_to_taxonomy_to_count[sample]:
                    # mode='w': the default 'w+b' mode rejects str writes
                    # on Python 3.
                    f = tempfile.NamedTemporaryFile(mode='w', prefix='singlem_for_krona')
                    sample_tempfiles.append(f)

                    taxonomy_to_count = sample_to_gene_to_taxonomy_to_count[sample][gene]
                    # .items() works on Python 2 and 3; iteritems() is
                    # Python 2 only.
                    for taxonomy, coverage in taxonomy_to_count.items():
                        tax_split = taxonomy.split('; ')
                        # Drop a leading 'Root' unless it is the only level.
                        if tax_split[0] == 'Root' and len(tax_split) > 1:
                            tax_split = tax_split[1:]
                        f.write('\t'.join([str(coverage)] + tax_split))
                        f.write('\n')
                    f.flush()
                    if is_more_than_one_sample:
                        display_name = '%s: %s' % (sample, gene)
                    else:
                        display_name = gene
                    cmd += " %s,'%s'" % (f.name, display_name)
        extern.run(cmd)
    finally:
        for f in sample_tempfiles:
            f.close()
def hmmsearch(self, input_pipe, hmms, output_files):
    r"""Run hmmsearch with every HMM, writing one output file per HMM.

    Parameters
    ----------
    input_pipe: String
        A partial command line which, when run, writes fasta formatted
        protein sequences to STDOUT for hmmsearch to read.
    hmms: list of paths
        (string) paths to the HMM files to search with.
    output_files: list of paths
        (string) paths to the output CSV files, one per HMM, in the same
        order as `hmms`.

    Returns
    -------
    N/A

    Raises
    ------
    Exception if the two lists differ in length,
    NoInputSequencesException if hmmsearch reports an empty or misformatted
    sequence file; any other extern.ExternCalledProcessError is re-raised."""
    # Check input and output paths are the same length
    if len(hmms) != len(output_files):
        raise Exception(
            "Programming error: number of supplied HMMs differs from the number of supplied output files"
        )

    # Pair each HMM with its output file.
    pending = [[hmm, output_file] for hmm, output_file in zip(hmms, output_files)]

    # Process batches until nothing is left.
    while pending:
        batch = self._munch_off_batch(pending)

        # Run hmmsearches with each of the pairs
        cmd = self._hmm_command(input_pipe, batch)
        logging.debug("Running command: %s" % cmd)

        try:
            extern.run(cmd)
        except extern.ExternCalledProcessError as e:
            if e.stderr == b'\nError: Sequence file - is empty or misformatted\n\n':
                raise NoInputSequencesException(cmd)
            raise e
def test_clustering(self):
    """Build a db with ``--clustering_divergence 3`` and query one sequence.

    The query is the first OTU's sequence with its first base changed; both
    L11 OTUs are expected back, with divergences 1 and 2.
    """
    first_seq = 'GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC'
    second_seq = 'GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATA'
    query_seq = 'AGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC'

    table_rows = [
        self.headers,
        ['ribosomal_protein_L11_rplK_gpkg', 'minimal', first_seq, '7', '4.95',
         'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
        # last base only is different to first sequence
        ['ribosomal_protein_L11_rplK_gpkg', 'minimal', second_seq, '6', '4.95',
         'Root; k__Bacteria; p__Firmicutes; c__Bacilli'],
        ['ribosomal_protein_S17_gpkg', 'minimal',
         'GCTAAATTAGGAGACATTGTTAAAATTCAAGAAACTCGTCCTTTATCAGCAACAAAACGT',
         '9', '4.95',
         'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus'],
    ]
    otu_table = "\n".join(["\t".join(row) for row in table_rows])

    with tempfile.NamedTemporaryFile(mode='w') as f:
        f.write(otu_table)
        f.flush()

        with tempdir.TempDir() as d:
            cmd = "{} makedb --db_path {}/db --otu_table {} --clustering_divergence 3".format(
                path_to_script, d, f.name)
            subprocess.check_call(cmd, shell=True)

            # first sequence with an extra A at the start
            cmd = "%s query --query_sequence %s --db %s/db" % (
                path_to_script, query_seq, d)

            expected = [
                ['query_name', 'query_sequence', 'divergence', 'num_hits',
                 'sample', 'marker', 'hit_sequence', 'taxonomy'],
                ['unnamed_sequence', query_seq, '1', '7', 'minimal',
                 'ribosomal_protein_L11_rplK_gpkg', first_seq,
                 'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
                ['unnamed_sequence', query_seq, '2', '6', 'minimal',
                 'ribosomal_protein_L11_rplK_gpkg', second_seq,
                 'Root; k__Bacteria; p__Firmicutes; c__Bacilli'],
            ]
            self.assertEqualOtuTable(expected, extern.run(cmd))
def get_dmnd(self):
    '''
    Create a temporary DIAMOND database for the search method.

    The unaligned sequence databases of all packages in
    self.singlem_packages are concatenated and piped into
    ``diamond makedb``.

    Returns
    -------
    Path of the temporary database. The file is created with delete=False,
    so it is not removed here.
    '''
    fasta_paths = [
        pkg.graftm_package().unaligned_sequence_database_path()
        for pkg in self.singlem_packages
    ]

    # Only the reserved name is needed; close the handle straight away
    # rather than leaving that to garbage collection.
    with tempfile.NamedTemporaryFile(
            mode="w",
            prefix='singlem-diamond-prefilter',
            suffix='.dmnd',
            delete=False) as placeholder:
        temp_dmnd = placeholder.name

    cmd = 'cat %s | '\
        'diamond makedb --in - --db %s' % (' '.join(fasta_paths), temp_dmnd)
    extern.run(cmd)

    return temp_dmnd
def test_hello_world(self):
    """Run ``create`` to build a package, then ``graft`` a sequence file against it.

    The test passes if neither command raises.
    """
    create_data = os.path.join(path_to_data, 'create')
    with tempdir.TempDir() as tmp:
        with tempdir.TempDir() as tmp2:
            # NOTE(review): the package and output directory are siblings of
            # the temp dirs (tmp+".gpkg", tmp2+"_"), not inside them --
            # confirm they get cleaned up elsewhere.
            create_args = (
                path_to_script,
                os.path.join(path_to_data, 'create', 'homologs.trimmed.unaligned.faa'),
                os.path.join(path_to_data, 'create', 'homologs.trimmed.aligned.faa'),
                os.path.join(path_to_data, 'create', 'homologs.tax2tree.rerooted.decorated.tree-consensus-strings'),
                os.path.join(path_to_data, 'create', 'homologstre.tree'),
                tmp + ".gpkg")
            cmd1 = "%s create --verbosity 2 --sequences %s --alignment %s --taxonomy %s --rerooted_tree %s --output %s" % create_args
            extern.run(cmd1)

            graft_args = (
                path_to_script,
                "%s.gpkg" % tmp,
                os.path.join(create_data, 'test.faa'),
                tmp2 + "_")
            cmd2 = "%s graft --verbosity 2 --graftm_package %s --forward %s --output_directory %s" % graft_args
            extern.run(cmd2)
def test_print_insert(self):
    """Run ``pipe --include_inserts`` on insert.fna and check both OTU rows.

    The second expected sequence contains a lowercase 'ttt' insert.
    """
    gene = '4.12.ribosomal_protein_L11_rplK'
    taxonomy = 'Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'
    rows = [
        self.headers,
        [gene, 'insert',
         'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG',
         '1', '2.44', taxonomy],
        [gene, 'insert',
         'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTtttCAAGCAGGTGTG',
         '1', '2.51', taxonomy],
    ]
    # Compare as sorted lines; '' matches the trailing newline.
    exp = sorted(["\t".join(row) for row in rows] + [''])

    cmd = "%s --debug pipe --sequences %s/1_pipe/insert.fna --otu_table /dev/stdout --threads 4 --include_inserts" % (
        path_to_script, path_to_data)
    observed = sorted(extern.run(cmd).split("\n"))
    self.assertEqual(exp, observed)
def test_bootstrap_executable(self):
    """Run ``expand_search`` and sanity-check the HMM file it writes.

    The first line must be one of two known HMMER3 banners and line 11 must
    report two sequences.
    """
    bootstrapper_data = os.path.join(path_to_data, 'bootstrapper')
    accepted_banners = [
        "HMMER3/f [3.1b2 | February 2015]\n",
        "HMMER3/f [3.2.1 | June 2018]\n"
    ]
    with tempfile.NamedTemporaryFile() as tf:
        cmd = '%s expand_search --verbosity 5 --contigs %s --output_hmm %s --search_hmm_files %s' % (
            path_to_script,
            os.path.join(bootstrapper_data, 'contigs.fna'),
            tf.name,
            os.path.join(bootstrapper_data, 'DNGNGWU00001.hmm'))
        extern.run(cmd)

        with open(tf.name) as tf2:
            lines = tf2.readlines()
            first_line = lines[0]
            self.assertTrue(first_line in accepted_banners, msg=first_line)
            self.assertEqual('NSEQ  2\n', lines[10])
def test_print_insert(self):
    """Run ``pipe --include_inserts`` on insert.fna and check both OTU rows.

    The second expected sequence contains a lowercase 'ttt' insert.
    """
    gene = 'S1.5.ribosomal_protein_L11_rplK'
    taxonomy = 'Root; d__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'
    rows = [
        self.headers,
        [gene, 'insert',
         'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG',
         '1', '2.44', taxonomy],
        [gene, 'insert',
         'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTtttCAAGCAGGTGTG',
         '1', '2.51', taxonomy],
    ]
    # Compare as sorted lines; '' matches the trailing newline.
    exp = sorted(["\t".join(row) for row in rows] + [''])

    cmd = "%s --debug pipe --sequences %s/1_pipe/insert.fna --otu_table /dev/stdout --threads 4 --include_inserts" % (
        path_to_script, path_to_data)
    observed = sorted(extern.run(cmd).split("\n"))
    self.assertEqual(exp, observed)
def hmmsearch(self, input_pipe, hmms, output_files):
    r"""Run HMMsearch with all the HMMs, generating output files

    Parameters
    ----------
    input_pipe: String
        A string which is a partial command line. When this command is run
        it outputs to STDOUT fasta formatted protein sequences, which
        hmmsearch runs on.
    hmms: list of paths
        A list of (string) paths to HMM files which are used to search with.
    output_files: list of paths
        A list of (string) paths to output CSV files to be generated by the
        HMM searching, one per HMM and in the same order.

    Returns
    -------
    N/A

    Raises
    ------
    Exception if the two lists differ in length,
    NoInputSequencesException if hmmsearch reports an empty or misformatted
    sequence file; any other extern.ExternCalledProcessError is re-raised."""

    # Check input and output paths are the same length
    if len(hmms) != len(output_files):
        raise Exception("Programming error: number of supplied HMMs differs from the number of supplied output files")

    # Create queue data structure
    queue = [[hmm, output_file] for hmm, output_file in zip(hmms, output_files)]

    # stderr may be reported as str or as bytes depending on the Python
    # version, so accept both forms of the message.
    empty_input_error = '\nError: Sequence file - is empty or misformatted\n\n'
    empty_input_errors = (empty_input_error, empty_input_error.encode('ascii'))

    # While there are more things left in the queue
    while len(queue) > 0:
        pairs_to_run = self._munch_off_batch(queue)

        # Run hmmsearches with each of the pairs
        cmd = self._hmm_command(input_pipe, pairs_to_run)
        logging.debug("Running command: %s" % cmd)

        try:
            extern.run(cmd)
        # "except X as e" is valid on Python 2.6+ and 3; "except X, e" is a
        # syntax error on Python 3.
        except extern.ExternCalledProcessError as e:
            if e.stderr in empty_input_errors:
                raise NoInputSequencesException(cmd)
            else:
                raise e
def _generate_tree_log_file(self, tree, alignment, output_tree_file_path, output_log_file_path, residue_type, fasttree):
    '''Generate the FastTree log file given a tree and the alignment that
    made that tree.

    Parameters
    ----------
    tree: str
        path passed to FastTree as -intree
    alignment: str
        path to the alignment, passed as the final argument
    output_tree_file_path: str
        path passed as -out
    output_log_file_path: str
        path passed as -log
    residue_type:
        Create._NUCLEOTIDE_PACKAGE_TYPE or Create._PROTEIN_PACKAGE_TYPE
    fasttree: str
        the FastTree executable to run

    Returns
    -------
    Nothing. The log file as parameter is written as the log file.

    Raises
    ------
    ValueError if residue_type is neither of the two known package types.
    '''
    if residue_type==Create._NUCLEOTIDE_PACKAGE_TYPE:
        cmd = "%s -quiet -gtr -nt -nome -mllen -intree '%s' -log %s -out %s %s" %\
                   (fasttree, tree, output_log_file_path,
                    output_tree_file_path, alignment)
    elif residue_type==Create._PROTEIN_PACKAGE_TYPE:
        cmd = "%s -quiet -nome -mllen -intree '%s' -log %s -out %s %s" %\
                   (fasttree, tree, output_log_file_path,
                    output_tree_file_path, alignment)
    else:
        # Without this branch `cmd` would be unbound below and the failure
        # would surface as an UnboundLocalError.
        raise ValueError(
            "Programming error: unexpected residue type: %s" % residue_type)
    extern.run(cmd)
def test_get_tree_default(self):
    """Run ``get_tree`` with no arguments and check the listing it prints.

    The output must start with the 'marker<TAB>tree_file' header, have more
    than 10 lines, and every listed tree file must exist.
    """
    observed = extern.run("{} get_tree".format(path_to_script))
    output_lines = observed.split('\n')

    self.assertEqual('marker\ttree_file', output_lines[0])
    self.assertEqual('.tre', output_lines[1][-4:])
    self.assertGreater(len(output_lines), 10)

    # Skip the header and the empty string after the final newline.
    for entry in output_lines[1:-1]:
        tree_path = entry.split('\t')[1]
        self.assertTrue(os.path.exists(tree_path))
def test_get_tree_default(self):
    """Run ``get_tree`` with no arguments and check the listing it prints.

    The output must start with the 'marker<TAB>tree_file' header, have more
    than 10 lines, and every listed tree file must exist.
    """
    get_tree_cmd = "{} get_tree".format(path_to_script)
    listing = extern.run(get_tree_cmd).split('\n')

    header, first_entry = listing[0], listing[1]
    self.assertEqual('marker\ttree_file', header)
    self.assertEqual('.tre', first_entry[-4:])
    self.assertGreater(len(listing), 10)

    # Skip the header and the empty string after the final newline.
    for entry in listing[1:-1]:
        fields = entry.split('\t')
        self.assertTrue(os.path.exists(fields[1]))
def _generate_tree_log_file(self, tree, alignment, output_tree_file_path,
                            output_log_file_path, residue_type, fasttree):
    '''Generate the FastTree log file given a tree and the alignment that
    made that tree.

    Parameters
    ----------
    tree: str
        path passed to FastTree as -intree
    alignment: str
        path to the alignment, passed as the final argument
    output_tree_file_path: str
        path passed as -out
    output_log_file_path: str
        path passed as -log
    residue_type:
        Create._NUCLEOTIDE_PACKAGE_TYPE or Create._PROTEIN_PACKAGE_TYPE
    fasttree: str
        the FastTree executable to run

    Returns
    -------
    Nothing. The log file as parameter is written as the log file.

    Raises
    ------
    ValueError if residue_type is neither of the two known package types.
    '''
    if residue_type == Create._NUCLEOTIDE_PACKAGE_TYPE:
        cmd = "%s -quiet -gtr -nt -nome -mllen -intree '%s' -log %s -out %s %s" %\
                   (fasttree, tree, output_log_file_path,
                    output_tree_file_path, alignment)
    elif residue_type == Create._PROTEIN_PACKAGE_TYPE:
        cmd = "%s -quiet -nome -mllen -intree '%s' -log %s -out %s %s" %\
                   (fasttree, tree, output_log_file_path,
                    output_tree_file_path, alignment)
    else:
        # Without this branch `cmd` would be unbound below and the failure
        # would surface as an UnboundLocalError.
        raise ValueError(
            "Programming error: unexpected residue type: %s" % residue_type)
    extern.run(cmd)
def _build_tree(self, alignment, base, ptype, fasttree):
    '''Run FastTree on an alignment, writing <base>.tre and <base>.tre.log.

    Nucleotide packages are built with "-gtr -nt"; anything else is treated
    as amino acid sequence. Both output paths are appended to self.the_trash.

    Returns
    -------
    (log_file, tre_file) tuple of the two output paths.
    '''
    log_file = base + ".tre.log"
    tre_file = base + ".tre"

    # Only the model flags differ between the two package types.
    if ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
        model_flags = "-gtr -nt "
    else:
        model_flags = ""
    cmd = "%s -quiet %s-log %s -out %s %s" % (
        fasttree, model_flags, log_file, tre_file, alignment)
    extern.run(cmd)

    self.the_trash += [log_file, tre_file]
    return log_file, tre_file
def test_dump(self):
    """Run ``query --dump`` on a.sdb and compare with the expected OTU table."""
    # NOTE(review): literal is reproduced exactly as seen here; whitespace
    # inside it may have been collapsed -- confirm tabs/newlines.
    expected = """gene sample sequence num_hits coverage taxonomy ribosomal_protein_L11_rplK_gpkg minimal GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC 7 15.1 Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales ribosomal_protein_S2_rpsB_gpkg minimal CGTCGTTGGAACCCAAAAATGAAAAAATATATCTTCACTGAGAGAAATGGTATTTATATC 6 12.4 Root; k__Bacteria; p__Firmicutes; c__Bacilli ribosomal_protein_S17_gpkg minimal GCTAAATTAGGAGACATTGTTAAAATTCAAGAAACTCGTCCTTTATCAGCAACAAAACGT 9 19.5 Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus"""
    expected_rows = [line.split("\t") for line in expected.split("\n")]

    cmd = "{} query --db {}/a.sdb --dump".format(path_to_script, path_to_data)
    observed = extern.run(cmd)
    self.assertEqualOtuTable(expected_rows, observed)
def test_diamond_example_assign_taxonomy(self):
    """Run ``pipe`` with ``diamond_example`` assignment on minimal.fa.

    Before comparing, the regex below strips everything from the first tab
    to the end of each line, so only the leading field is actually checked.
    """
    rows = [
        self.headers,
        ['S1.5.ribosomal_protein_L11_rplK',
         'minimal',
         'CCTGCAGGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTG',
         '4',
         '9.76',
         '2513237297'],
    ]
    exp = sorted(["\t".join(row) for row in rows] + [''])

    cmd = "%s --debug pipe --sequences %s/1_pipe/minimal.fa --otu_table /dev/stdout --threads 4 --assignment_method diamond_example" % (
        path_to_script, path_to_data)
    observed = sorted(extern.run(cmd).split("\n"))

    # Do not test the exact genome number because updated diamond version change this slightly.
    r = re.compile('\t.*?$')
    trimmed_expected = [r.sub('', line) for line in exp]
    trimmed_observed = [r.sub('', line) for line in observed]
    self.assertEqual(trimmed_expected, trimmed_observed)
def test_query_by_taxonomy(self):
    """Query a.sdb with ``--taxonomy o__Bacillales`` and check the two rows returned."""
    rows = [
        self.headers,
        ['ribosomal_protein_L11_rplK_gpkg',
         'minimal',
         'GGTAAAGCGAATCCAGCACCACCAGTTGGTCCAGCATTAGGTCAAGCAGGTGTGAACATC',
         '7',
         '15.10',
         'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales'],
        ['ribosomal_protein_S17_gpkg',
         'minimal',
         'GCTAAATTAGGAGACATTGTTAAAATTCAAGAAACTCGTCCTTTATCAGCAACAAAACGT',
         '9',
         '19.50',
         'Root; k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus'],
    ]
    # The trailing '' accounts for the final newline of the output.
    expected = ["\t".join(row) for row in rows] + ['']

    cmd = "%s query --db %s/a.sdb --taxonomy o__Bacillales" % (
        path_to_script, path_to_data)
    observed = extern.run(cmd).split('\n')
    self.assertEqual(expected, observed)
def _concatenate_file(self, file_list, output):
    '''
    Concatenate a list of files into one output file, like unix "cat".

    The copy is done in-process rather than through a shell, so paths with
    spaces or shell metacharacters are handled, and an empty list yields an
    empty output file.

    Parameters
    ----------
    file_list: list
        List of strings, each leading to a file. These files are the ones to
        be concatenated together. E.g.:
            ["/path/to/file1", "/path/to/file2"]
    output: str
        Path to the file that the files in file_list are concatenated into.
        It is created, or truncated if it already exists.

    Raises
    ------
    IOError/OSError if an input cannot be read or the output cannot be
    written.
    '''
    import shutil

    # Materialise once so a generator argument can be both logged and copied.
    paths = list(file_list)
    to_cat = ' '.join(paths)
    logging.debug("Concatenating files: %s" % (to_cat))
    # Binary mode copies the bytes unchanged, as cat does.
    with open(output, 'wb') as destination:
        for path in paths:
            with open(path, 'rb') as source:
                shutil.copyfileobj(source, destination)