def _precommand_initiation(self, input_fp, output_dir, working_dir, params):
    """Prepare shared reference resources for the chosen chimera method.

    The behavior depends on params['chimera_detection_method']:

    * 'blast_fragments': a blast database is built from
      params['reference_seqs_fp'] with working_dir as output directory
      and its name is stored in params['blast_db'].
    * 'ChimeraSlayer': the aligned reference sequences (and the unaligned
      ones, if given) are copied into working_dir, a blast database and a
      .cidx index are built once, and params is updated to point at the
      copies.

    Every file created here is registered in self.files_to_remove.

    Raises:
        ValueError: if the chimera detection method is not recognized.
    """
    method = params['chimera_detection_method']

    if method == 'blast_fragments':
        blast_db, db_files_to_remove = build_blast_db_from_fasta_path(
            params['reference_seqs_fp'], output_dir=working_dir)
        self.files_to_remove += db_files_to_remove
        params['blast_db'] = blast_db

    elif method == 'ChimeraSlayer':
        # Copy the reference files to the working dir: ChimeraSlayer creates
        # an index file of the reference and will crash without write
        # permission in the reference seqs dir.
        aligned_reference_seqs_fp = params['aligned_reference_seqs_fp']
        _, new_ref_filename = split(aligned_reference_seqs_fp)
        copy(aligned_reference_seqs_fp, working_dir)
        aligned_reference_seqs_fp = working_dir + "/" + new_ref_filename

        self.files_to_remove.append(aligned_reference_seqs_fp)
        params['aligned_reference_seqs_fp'] = aligned_reference_seqs_fp

        # If given, also copy the unaligned reference db ...
        reference_seqs_fp = params['reference_seqs_fp']
        if reference_seqs_fp:
            _, new_ref_filename = split(reference_seqs_fp)
            copy(reference_seqs_fp, working_dir)
            reference_seqs_fp = working_dir + "/" + new_ref_filename
        else:
            # ... otherwise create it by degapping the aligned reference.
            # The handle is closed as soon as the degapped file is written.
            with open(aligned_reference_seqs_fp) as aligned_ref_f:
                reference_seqs_fp = write_degapped_fasta_to_file(
                    MinimalFastaParser(aligned_ref_f), tmp_dir=working_dir)
        # Either way the working copy is deleted afterwards.
        self.files_to_remove.append(reference_seqs_fp)
        params['reference_seqs_fp'] = reference_seqs_fp

        # Build the blast db of the reference here, otherwise ChimeraSlayer
        # will do it and parallel jobs clash.
        _, db_files_to_remove = \
            build_blast_db_from_fasta_path(reference_seqs_fp)
        self.files_to_remove += db_files_to_remove

        # Make the index file globally. ChimeraSlayer first checks whether
        # the index file is there and tries to create it if not, which can
        # lead to a race condition when several parallel jobs try to create
        # it at the same time.
        make_cidx_file(aligned_reference_seqs_fp)
        self.files_to_remove.append(aligned_reference_seqs_fp + ".cidx")

    else:
        raise ValueError("Unrecognized chimera detection method '%s'."
                         % params['chimera_detection_method'])
def test_seqs_to_taxonomy(self):
    """BlastTaxonAssigner._seqs_to_taxonomy: functions as expected
    """
    p = BlastTaxonAssigner({
        'reference_seqs_filepath': self.reference_seqs_fp,
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp})

    # build the id_to_taxonomy_map as this test doesn't execute __call__
    id_to_taxonomy_map = {
        "AY800210":
        "Archaea;Euryarchaeota;Halobacteriales;uncultured",
        "EU883771":
        "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.",
        "EF503699":
        "Archaea;Crenarchaeota;uncultured;uncultured",
        "DQ260310":
        "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium",
        "EF503697":
        "Archaea;Crenarchaeota;uncultured;uncultured",
    }

    # build the blast database and keep track of the files to clean up
    blast_db, files_to_remove = \
        build_blast_db_from_fasta_path(self.reference_seqs_fp)
    self._paths_to_clean_up += files_to_remove

    # read the input file into (seq_id, seq) pairs; the list is fully
    # materialized inside the with-block so the handle can be closed
    with open(self.input_seqs_fp) as input_seqs_f:
        seqs = list(MinimalFastaParser(input_seqs_f))

    actual = p._seqs_to_taxonomy(seqs, blast_db, id_to_taxonomy_map)
    self.assertEqual(actual, self.expected1)

    # passing empty list of seqs functions as expected
    actual = p._seqs_to_taxonomy([], blast_db, id_to_taxonomy_map)
    self.assertEqual(actual, {})
def _precommand_initiation(self, input_fp, output_dir, working_dir, params):
    """Make sure params["blast_db"] is set before jobs are generated.

    If no database was supplied, one is built from
    params["reference_seqs_fp"] so that all procs access a single db
    rather than creating one per proc. The files created for it are
    added to self.files_to_remove.
    """
    if params["blast_db"]:
        # A database was provided by the caller; nothing to build.
        return

    built_db = build_blast_db_from_fasta_path(params["reference_seqs_fp"])
    db_name, db_files = built_db
    self.files_to_remove += db_files
    params["blast_db"] = db_name
def test_seqs_to_taxonomy(self):
    """BlastTaxonAssigner._seqs_to_taxonomy: functions as expected
    """
    p = BlastTaxonAssigner({
        'reference_seqs_filepath': self.reference_seqs_fp,
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp})

    # build the id_to_taxonomy_map as this test doesn't execute __call__
    id_to_taxonomy_map = {
        "AY800210":
        "Archaea;Euryarchaeota;Halobacteriales;uncultured",
        "EU883771":
        "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.",
        "EF503699":
        "Archaea;Crenarchaeota;uncultured;uncultured",
        "DQ260310":
        "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium",
        "EF503697":
        "Archaea;Crenarchaeota;uncultured;uncultured",
    }

    # build the blast database and keep track of the files to clean up
    blast_db, files_to_remove = \
        build_blast_db_from_fasta_path(self.reference_seqs_fp)
    self._paths_to_clean_up += files_to_remove

    # read the input file into (seq_id, seq) pairs, closing the handle
    # once every record has been parsed
    with open(self.input_seqs_fp) as input_seqs_f:
        seqs = list(MinimalFastaParser(input_seqs_f))

    actual = p._seqs_to_taxonomy(seqs, blast_db, id_to_taxonomy_map)
    self.assertEqual(actual, self.expected1)

    # passing empty list of seqs functions as expected
    actual = p._seqs_to_taxonomy([], blast_db, id_to_taxonomy_map)
    self.assertEqual(actual, {})
def test_generate_new_otus_stats(self):
    """Test generating new OTU stats on valid input data."""
    exp = [('New.CleanUp.ReferenceOTU972',
            'ATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGATGTTTAAG'
            'TGGGATGTGAAATCCCCGGGCTTAACCTGGGGGCTGC', 'foo;bar;baz', 2, 60.0,
            4.349999999999994, {'Env2': 25.0, 'Env1': 35.0}),
           ('New.CleanUp.ReferenceOTU969',
            'ATACGTAGGTCCCGAGCGTTGTCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCATGGCAAG'
            'TCTGAAGTGAAAACCCAGGGCTCAACCCTGGGACTGC', 'foo;bar;baz', 2, 14.0,
            12.5, {'Env2': 8.0, 'Env1': 6.0}),
           ('New.CleanUp.ReferenceOTU964',
            'ATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGACGGCGAAGCAAG'
            'TCTGAAGTGAAAGCCCGGGGCTCAACCGCGGGACTGC', 'foo;bar;baz', 2, 5.0,
            14.769999999999996, {'Env2': 2.0, 'Env1': 3.0}),
           ('New.CleanUp.ReferenceOTU999',
            'ATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGATGTTTAAG'
            'TGGGATGTGAAATCCCCGGGCTTAACCTGGGGGCTGC', 'foo;bar;bazz', 1, 99.0,
            4.349999999999994, {'Env2': 0.0, 'Env1': 99.0})]

    ref_seqs_db, ref_seqs_db_files_to_remove = \
        build_blast_db_from_fasta_path(self.ref_seqs_f.name)
    try:
        obs = _generate_new_otus_stats(self.otu_table_f, self.rep_set_f,
                                       self.ref_seqs_f, ref_seqs_db,
                                       self.mapping_f,
                                       self.grouping_category, self.top_n)
    finally:
        # Remove the temporary blast db files even if the call above
        # raises, so a failing run does not leave them behind.
        remove_files(ref_seqs_db_files_to_remove)

    self.assertFloatEqual(obs, exp)
def qiime_blast_seqs(seqs, blast_constructor=Blastall, blast_program='blastn',
                     blast_db=None, refseqs=None, refseqs_fp=None,
                     blast_mat_root=None, params=None, WorkingDir=None,
                     seqs_per_blast_run=1000, HALT_EXEC=False):
    """Blast list of sequences.

    seqs: a list (or object with list-like interface) of (seq_id, seq)
     tuples (e.g., the output of MinimalFastaParser)
    blast_db, refseqs_fp, refseqs: at least one must be provided. When
     refseqs_fp (a fasta filepath) or refseqs is given, a blast database
     is built from it in WorkingDir and its files are removed again
     before returning.
    params: dict of blast parameters; the "-d" and "-p" entries are set
     by this function (a dict passed by the caller is updated in place).
     Defaults to a fresh empty dict on every call.
    seqs_per_blast_run: number of sequences passed to each blast call.

    Returns a BlastResult holding the combined results of all runs.
    """
    assert blast_db or refseqs_fp or refseqs, \
        'Must provide either a blast_db or a fasta ' + \
        'filepath containing sequences to build one.'

    # A literal {} default would be shared (and mutated) across calls.
    if params is None:
        params = {}

    if refseqs_fp:
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(refseqs_fp, output_dir=WorkingDir)
    elif refseqs:
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_file(refseqs, output_dir=WorkingDir)
    else:
        db_files_to_remove = []

    try:
        params["-d"] = blast_db
        params["-p"] = blast_program

        blast_app = blast_constructor(
            params=params,
            blast_mat_root=blast_mat_root,
            InputHandler='_input_as_seq_id_seq_pairs',
            WorkingDir=WorkingDir,
            SuppressStderr=True,
            HALT_EXEC=HALT_EXEC)

        current_seqs = []
        blast_results = BlastResult([])
        for seq in seqs:
            current_seqs.append(seq)
            if len(current_seqs) % seqs_per_blast_run == 0:
                if blast_results:
                    blast_results.update(
                        BlastResult(blast_app(current_seqs)['StdOut']))
                else:
                    blast_results = \
                        BlastResult(blast_app(current_seqs)['StdOut'])
                current_seqs = []

        # clean-up run: blast the remaining sequences
        blast_results.update(
            BlastResult(blast_app(current_seqs)['StdOut']))
    finally:
        # Temporary database files are removed even when a blast run fails.
        remove_files(db_files_to_remove)

    return blast_results
def _precommand_initiation(self, input_fp, output_dir, working_dir, params):
    """Build a shared blast database when reference sequences are given.

    If params['refseqs_path'] is set, a database is built from it so that
    all procs access one db rather than creating one per proc; its name is
    stored in params['blast_db'] and its files are added to
    self.files_to_remove. Otherwise params is left untouched.
    """
    refseqs_path = params['refseqs_path']
    if not refseqs_path:
        return

    db_name, db_files = build_blast_db_from_fasta_path(refseqs_path)
    self.files_to_remove += db_files
    params['blast_db'] = db_name
def _precommand_initiation(self, input_fp, output_dir, working_dir, params):
    """Guarantee that params['blast_db'] is populated.

    A caller-supplied database is used as is. Otherwise the database is
    built once from params['reference_seqs_fp'] -- all procs then access
    one db rather than creating one per proc -- and the files created for
    it are queued in self.files_to_remove.
    """
    if params['blast_db']:
        return

    reference_fp = params['reference_seqs_fp']
    new_db, new_db_files = build_blast_db_from_fasta_path(reference_fp)
    self.files_to_remove += new_db_files
    params['blast_db'] = new_db
def __call__(self, seq_path, result_path=None, log_path=None,
             blast_db=None, refseqs_fp=None):
    """Cluster the sequences in seq_path against a blast database.

    seq_path: path to the fasta file of sequences to cluster
    result_path: if provided, clusters are written there (one
     tab-separated line per cluster) and None is returned
    log_path: if provided, a log of the run is written there
    blast_db: existing blast database to use; if not provided, a
     temporary one is built from refseqs_fp and removed afterwards
    refseqs_fp: fasta filepath used to build the database when blast_db
     is not given

    Returns the clusters as a dict of {otu_id: [seq_ids]}, or None when
    the results were written to result_path.
    """
    self.log_lines = []

    if not blast_db:
        self.blast_db, self.db_files_to_remove = \
            build_blast_db_from_fasta_path(refseqs_fp)
        self.log_lines.append('Reference seqs fp (to build blast db): %s' %
                              refseqs_fp)
    else:
        self.blast_db = blast_db
        self.db_files_to_remove = []

    try:
        self.log_lines.append('Blast database: %s' % self.blast_db)

        # The input handle is closed as soon as clustering has consumed it.
        with open(seq_path) as seq_f:
            clusters, failures = \
                self._cluster_seqs(MinimalFastaParser(seq_f))
        self.log_lines.append('Num OTUs: %d' % len(clusters))

        if result_path:
            # if the user provided a result_path, write the
            # results to file with one tab-separated line per
            # cluster
            with open(result_path, 'w') as of:
                for cluster_id, cluster in clusters.items():
                    of.write('%s\t%s\n' % (cluster_id, '\t'.join(cluster)))
            result = None
            self.log_lines.append('Result path: %s\n' % result_path)
        else:
            # if the user did not provide a result_path, store
            # the clusters in a dict of {otu_id:[seq_ids]}, where
            # otu_id is arbitrary
            result = clusters
            self.log_lines.append('Result path: None, returned as dict.')

        if log_path:
            # if the user provided a log file path, log the run; the
            # with-block makes sure the log is flushed and closed
            with open(log_path, 'w') as log_file:
                self.log_lines = [str(self)] + self.log_lines
                log_file.write('\n'.join(self.log_lines))
                failures.sort()
                log_file.write('Num failures: %d\n' % len(failures))
                log_file.write('Failures: %s\n' % '\t'.join(failures))
    finally:
        # Remove a temporary database even when clustering or writing fails.
        remove_files(self.db_files_to_remove, error_on_missing=False)

    # return the result (note this is None if the data was
    # written to file)
    return result
def _precommand_initiation(self, input_fp, output_dir, working_dir, params):
    """Fill in params['blast_db'] and params['min_length'] where unset.

    If no blast database was provided, one is built from
    params['template_fp'] -- all procs will then access one db rather
    than create one per proc -- and its files are added to
    self.files_to_remove. A negative params['min_length'] is replaced by
    the value computed from the input sequences in input_fp.
    """
    if not params['blast_db']:
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(params['template_fp'])
        self.files_to_remove += db_files_to_remove
        params['blast_db'] = blast_db

    if params['min_length'] < 0:
        # Close the input handle once the minimum length is computed.
        with open(input_fp, 'U') as input_f:
            params['min_length'] = compute_min_alignment_length(input_f)
def test_call_existing_blast_db(self):
    """BlastTaxonAssigner.__call__ works with a pre-built blast db
    """
    # format the reference database up front and register its files
    # so they get cleaned up
    db_name, db_files = build_blast_db_from_fasta_path(
        self.reference_seqs_fp)
    self._paths_to_clean_up += db_files

    assigner_params = {
        'blast_db': db_name,
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
    }
    assigner = BlastTaxonAssigner(assigner_params)

    observed = assigner(self.input_seqs_fp)
    self.assertEqual(observed, self.expected1)
def test_call_existing_blast_db(self):
    """BlastTaxonAssigner.__call__ accepts an already-built blast db
    """
    # the database is created here rather than by the assigner; its
    # files are tracked for removal during clean-up
    existing_db, existing_db_files = build_blast_db_from_fasta_path(
        self.reference_seqs_fp)
    self._paths_to_clean_up += existing_db_files

    taxon_assigner = BlastTaxonAssigner(
        {'blast_db': existing_db,
         'id_to_taxonomy_filepath': self.id_to_taxonomy_fp})

    result = taxon_assigner(self.input_seqs_fp)
    self.assertEqual(result, self.expected1)
def __call__(self, seq_path, result_path=None, log_path=None,
             blast_db=None, refseqs_fp=None):
    """Cluster the sequences in seq_path against a blast database.

    seq_path: path to the fasta file of sequences to cluster
    result_path: if provided, clusters are written there (one
     tab-separated line per cluster) and None is returned
    log_path: if provided, a log of the run is written there
    blast_db: existing blast database to use; if not provided, a
     temporary one is built from refseqs_fp and removed afterwards
    refseqs_fp: fasta filepath used to build the database when blast_db
     is not given

    Returns the clusters as a dict of {otu_id: [seq_ids]}, or None when
    the results were written to result_path.
    """
    self.log_lines = []

    if not blast_db:
        self.blast_db, self.db_files_to_remove = \
            build_blast_db_from_fasta_path(refseqs_fp)
        self.log_lines.append(
            "Reference seqs fp (to build blast db): %s" % refseqs_fp)
    else:
        self.blast_db = blast_db
        self.db_files_to_remove = []

    try:
        self.log_lines.append("Blast database: %s" % self.blast_db)

        # Close the input handle once clustering has consumed the parser.
        with open(seq_path) as seq_f:
            clusters, failures = \
                self._cluster_seqs(MinimalFastaParser(seq_f))
        self.log_lines.append("Num OTUs: %d" % len(clusters))

        if result_path:
            # if the user provided a result_path, write the
            # results to file with one tab-separated line per
            # cluster
            with open(result_path, "w") as of:
                for cluster_id, cluster in clusters.items():
                    of.write("%s\t%s\n" % (cluster_id, "\t".join(cluster)))
            result = None
            self.log_lines.append("Result path: %s\n" % result_path)
        else:
            # if the user did not provide a result_path, store
            # the clusters in a dict of {otu_id:[seq_ids]}, where
            # otu_id is arbitrary
            result = clusters
            self.log_lines.append("Result path: None, returned as dict.")

        if log_path:
            # if the user provided a log file path, log the run; using a
            # with-block guarantees the log is flushed and closed
            with open(log_path, "w") as log_file:
                self.log_lines = [str(self)] + self.log_lines
                log_file.write("\n".join(self.log_lines))
                failures.sort()
                log_file.write("Num failures: %d\n" % len(failures))
                log_file.write("Failures: %s\n" % "\t".join(failures))
    finally:
        # A temporary database is removed even if the run fails part-way.
        remove_files(self.db_files_to_remove, error_on_missing=False)

    # return the result (note this is None if the data was
    # written to file)
    return result
def __init__(self, params):
    """Return new BlastFragmentsChimeraChecker object with specified params.

    params: dict of parameters. 'id_to_taxonomy_fp' is required, as is
     either 'blast_db' or 'reference_seqs_fp' (from which a blast
     database is built when 'blast_db' is missing or None). The optional
     'max_e_value', 'min_pct_id', 'num_fragments' and 'taxonomy_depth'
     override the defaults set below.

    Raises ValueError if a required parameter is missing.
    """
    _params = {
        'max_e_value': 1e-30,
        'min_pct_id': 0.90,
        'num_fragments': 3,
        'taxonomy_depth': 4
    }
    _params.update(params)

    try:
        id_to_taxonomy_fp = params['id_to_taxonomy_fp']
    except KeyError:
        # The message names the key that is actually looked up.
        raise ValueError("id_to_taxonomy_fp must be provided to %s"
                         % self.Name)

    # Create the blast database if it hasn't been provided
    if 'blast_db' not in params or params['blast_db'] is None:
        try:
            reference_seqs_fp = params['reference_seqs_fp']
        except KeyError:
            raise ValueError(
                "reference_seqs_fp or blast_db must be provided to %s"
                % self.Name)
        blast_db, self._db_files_to_remove = \
            build_blast_db_from_fasta_path(reference_seqs_fp)
    else:
        blast_db = params['blast_db']
        self._db_files_to_remove = []

    self._taxon_assigner = BlastTaxonAssigner({
        'blast_db': blast_db,
        'id_to_taxonomy_filepath': id_to_taxonomy_fp,
        'Max E value': _params['max_e_value'],
        'Min percent identity': _params['min_pct_id']
    })

    ChimeraChecker.__init__(self, _params)
def test_build_blast_db_from_fasta_path_aln(self):
    """build_blast_db_from_fasta_path handles an alignment file as input
    """
    blast_db, db_files = build_blast_db_from_fasta_path(self.in_aln1_fp)
    self.assertEqual(blast_db, self.in_aln1_fp)

    # every expected database file is the db name plus a known extension
    db_extensions = (".nhr", ".nin", ".nsq", ".nsd", ".nsi", ".log")
    expected_db_files = set(blast_db + ext for ext in db_extensions)
    self.assertEqual(set(db_files), expected_db_files)

    # blasting against the new db returns a result
    hits = blastn(self.test_seq, blast_db=blast_db, e_value=0.0)
    self.assertEqual(len(hits), 1)

    # all reported db files are on disk ...
    for db_file in db_files:
        self.assertTrue(exists(db_file))

    # ... and none of them survive the removal
    remove_files(db_files)
    for db_file in db_files:
        self.assertFalse(exists(db_file))
def test_build_blast_db_from_fasta_path(self):
    """build_blast_db_from_fasta_path builds a usable db from a fasta path
    """
    blast_db, db_files = build_blast_db_from_fasta_path(self.in_seqs1_fp)
    self.assertEqual(blast_db, self.in_seqs1_fp)

    # every expected database file is the input path plus a known extension
    db_extensions = (".nhr", ".nin", ".nsq", ".nsd", ".nsi", ".log")
    expected_db_files = set(self.in_seqs1_fp + ext for ext in db_extensions)
    self.assertEqual(set(db_files), expected_db_files)

    # blasting against the new db returns a result
    hits = blastn(self.test_seq, blast_db=blast_db)
    self.assertEqual(len(hits), 1)

    # all reported db files are on disk ...
    for db_file in db_files:
        self.assertTrue(exists(db_file))

    # ... and none of them survive the removal
    remove_files(db_files)
    for db_file in db_files:
        self.assertFalse(exists(db_file))
def test_build_blast_db_from_fasta_path_aln(self):
    """build_blast_db_from_fasta_path accepts an alignment as input
    """
    blast_db, db_files = build_blast_db_from_fasta_path(self.in_aln1_fp)
    self.assertEqual(blast_db, self.in_aln1_fp)

    # the files making up the db share the db name and differ by extension
    suffixes = ('.nhr', '.nin', '.nsq', '.nsd', '.nsi', '.log')
    expected_db_files = set(blast_db + suffix for suffix in suffixes)
    self.assertEqual(set(db_files), expected_db_files)

    # a result is returned when blasting against the new db
    blast_hits = blastn(self.test_seq, blast_db=blast_db, e_value=0.0)
    self.assertEqual(len(blast_hits), 1)

    # each db file exists before removal
    for path in db_files:
        self.assertTrue(exists(path))

    remove_files(db_files)

    # and is gone afterwards
    for path in db_files:
        self.assertFalse(exists(path))
def __init__(self, params):
    """Return new BlastFragmentsChimeraChecker object with specified params.

    params: dict of parameters. 'id_to_taxonomy_fp' is required, as is
     either 'blast_db' or 'reference_seqs_fp' (from which a blast
     database is built when 'blast_db' is missing or None). The optional
     'max_e_value', 'min_pct_id', 'num_fragments' and 'taxonomy_depth'
     override the defaults set below.

    Raises ValueError if a required parameter is missing.
    """
    _params = {'max_e_value': 1e-30,
               'min_pct_id': 0.90,
               'num_fragments': 3,
               'taxonomy_depth': 4}
    _params.update(params)

    try:
        id_to_taxonomy_fp = params['id_to_taxonomy_fp']
    except KeyError:
        # Report the key that is actually read from params.
        raise ValueError(
            "id_to_taxonomy_fp must be provided to %s" % self.Name)

    # Create the blast database if it hasn't been provided
    if 'blast_db' not in params or params['blast_db'] is None:
        try:
            reference_seqs_fp = params['reference_seqs_fp']
        except KeyError:
            raise ValueError(
                "reference_seqs_fp or blast_db must be provided to %s" %
                self.Name)
        blast_db, self._db_files_to_remove = \
            build_blast_db_from_fasta_path(reference_seqs_fp)
    else:
        blast_db = params['blast_db']
        self._db_files_to_remove = []

    self._taxon_assigner = BlastTaxonAssigner(
        {'blast_db': blast_db,
         'id_to_taxonomy_filepath': id_to_taxonomy_fp,
         'Max E value': _params['max_e_value'],
         'Min percent identity': _params['min_pct_id']
         })

    ChimeraChecker.__init__(self, _params)
def test_call_logs_run(self):
    """BlastTaxonAssigner.__call__ writes the expected log when asked to
    """
    log_path = get_tmp_filename(
        prefix='BlastTaxonAssignerTests_', suffix='.fasta')
    self._paths_to_clean_up.append(log_path)

    # build the blast database and keep track of the files to clean up
    db_name, db_files = build_blast_db_from_fasta_path(
        self.reference_seqs_fp)
    self._paths_to_clean_up += db_files

    assigner = BlastTaxonAssigner({
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
        'blast_db': db_name})
    assigner(self.input_seqs_fp, log_path=log_path)

    with open(log_path) as log_file:
        log_file_str = log_file.read()

    log_file_exp = [
        "BlastTaxonAssigner parameters:",
        'Min percent identity:0.9',
        'Application:blastn/megablast',
        'Max E value:1e-30',
        'Result path: None, returned as dict.',
        'blast_db:%s' % str(self.reference_seqs_fp)[1:-1],
        'id_to_taxonomy_filepath:%s' % self.id_to_taxonomy_fp,
        'Number of sequences inspected: 6',
        'Number with no blast hits: 1',
        '',
    ]

    # Compare the lines of the log file to the expected lines without
    # regard to order: the parameters come from a dict, so the order in
    # which they are logged is not guaranteed.
    observed_lines = log_file_str.split('\n')
    self.assertEqualItems(observed_lines, log_file_exp)
def test_call_logs_run(self):
    """BlastTaxonAssigner.__call__ logs the run when a log path is given
    """
    log_path = get_tmp_filename(
        prefix='BlastTaxonAssignerTests_', suffix='.fasta')
    self._paths_to_clean_up.append(log_path)

    # the db is built here; its files are queued for clean-up
    blast_db, blast_db_files = build_blast_db_from_fasta_path(
        self.reference_seqs_fp)
    self._paths_to_clean_up += blast_db_files

    params = {
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
        'blast_db': blast_db,
    }
    BlastTaxonAssigner(params)(self.input_seqs_fp, log_path=log_path)

    with open(log_path) as log_f:
        logged_lines = log_f.read().split('\n')

    log_file_exp = [
        "BlastTaxonAssigner parameters:",
        'Min percent identity:0.9',
        'Application:blastn/megablast',
        'Max E value:1e-30',
        'Result path: None, returned as dict.',
        'blast_db:%s' % str(self.reference_seqs_fp)[1:-1],
        'id_to_taxonomy_filepath:%s' % self.id_to_taxonomy_fp,
        'Number of sequences inspected: 6',
        'Number with no blast hits: 1',
        '',
    ]

    # NOTE: the parameters are held in a dict, so the order of the logged
    # lines is not guaranteed; the comparison therefore only requires the
    # same unordered collection of lines in actual and expected.
    self.assertEqualItems(logged_lines, log_file_exp)
def test_get_blast_hits(self):
    """BlastTaxonAssigner._get_blast_hits works against an existing db
    """
    # build the blast database and keep track of the files to clean up
    db_name, db_files = build_blast_db_from_fasta_path(
        self.reference_seqs_fp)
    self._paths_to_clean_up += db_files

    assigner = BlastTaxonAssigner({})
    seq_coll_blast_results = assigner._get_blast_hits(db_name,
                                                      self.test_seqs)

    # mapping from identifier in test_seq_coll to the id of the sequence
    # in the refseq collection (a silva derivative)
    expected_matches = {
        's1': 'AY800210',
        's2': 'EU883771',
        's3': 'EF503699',
        's4': 'DQ260310',
        's5': 'EF503697',
    }

    # no results for s6 (which is a randomly-generated sequence)
    self.assertEqual(seq_coll_blast_results['s6'], [])

    # expected results for all other query sequences
    for query_id, reference_id in expected_matches.items():
        hits = dict(seq_coll_blast_results[query_id])
        # The membership check is redundant with the lookup below, but it
        # gives a useful failure message when the hit is missing (e.g.
        # because no blast results were returned).
        self.assertTrue(reference_id in hits)
        # the perfect match should get a 0.0 e-value on this data
        self.assertEqual(hits[reference_id], 0.0)
def test_build_blast_db_from_fasta_path(self):
    """build_blast_db_from_fasta_path returns a working db and its files
    """
    blast_db, db_files = build_blast_db_from_fasta_path(self.in_seqs1_fp)
    self.assertEqual(blast_db, self.in_seqs1_fp)

    # the files making up the db share the input path and differ by
    # extension
    suffixes = ('.nhr', '.nin', '.nsq', '.nsd', '.nsi', '.log')
    expected_db_files = set(self.in_seqs1_fp + suffix
                            for suffix in suffixes)
    self.assertEqual(set(db_files), expected_db_files)

    # a result is returned when blasting against the new db
    blast_hits = blastn(self.test_seq, blast_db=blast_db)
    self.assertEqual(len(blast_hits), 1)

    # each db file exists before removal
    for path in db_files:
        self.assertTrue(exists(path))

    remove_files(db_files)

    # and is gone afterwards
    for path in db_files:
        self.assertFalse(exists(path))
def test_get_blast_hits(self):
    """BlastTaxonAssigner._get_blast_hits returns hits from an existing db
    """
    # the db is built here; its files are queued for clean-up
    blast_db, blast_db_files = build_blast_db_from_fasta_path(
        self.reference_seqs_fp)
    self._paths_to_clean_up += blast_db_files

    taxon_assigner = BlastTaxonAssigner({})
    results_by_query = taxon_assigner._get_blast_hits(blast_db,
                                                      self.test_seqs)

    # s6 is a randomly-generated sequence, so it has no results
    self.assertEqual(results_by_query['s6'], [])

    # mapping from identifier in test_seq_coll to the id of the sequence
    # in the refseq collection (a silva derivative)
    expected_matches = {'s1': 'AY800210',
                        's2': 'EU883771',
                        's3': 'EF503699',
                        's4': 'DQ260310',
                        's5': 'EF503697'}

    # expected results for all other query sequences
    for seq_id, expected_hit in expected_matches.items():
        e_values = dict(results_by_query[seq_id])
        # Checking membership first is redundant, but it produces a
        # useful error message if the hit is absent (e.g. because no
        # blast results were returned).
        self.assertTrue(expected_hit in e_values)
        # the perfect match should have an e-value of 0.0 on this data
        self.assertEqual(e_values[expected_hit], 0.0)
def main():
    """Set up and start parallel BLAST-based OTU picking jobs.

    Parses the command line, builds the blast database if none was given,
    splits the input fasta file, writes the jobs file (plus the poller
    bookkeeping files unless polling is suppressed), submits the jobs and
    optionally runs the poller directly.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.blast_db is None and opts.refseqs_fp is None:
        option_parser.error('Either blast_db or refseqs_fp must be provided.')
    if opts.poll_directly and opts.suppress_polling:
        # The poller command is only generated when polling is enabled, so
        # this combination would fail after the jobs were submitted.
        option_parser.error(
            'poll_directly cannot be used when polling is suppressed.')

    # create local copies of command-line options
    python_exe_fp = opts.python_exe_fp
    pick_otus_fp = opts.pick_otus_fp
    refseqs_fp = opts.refseqs_fp
    cluster_jobs_fp = opts.cluster_jobs_fp
    input_fasta_fp = opts.input_fasta_fp
    jobs_to_start = opts.jobs_to_start
    output_dir = opts.output_dir
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    max_e_value = opts.max_e_value
    similarity = opts.similarity
    poll_directly = opts.poll_directly
    min_aligned_percent = opts.min_aligned_percent

    created_temp_paths = []

    if not opts.blast_db:
        # Build the blast database from the reference_seqs_fp -- all procs
        # will then access one db rather than create one per proc
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(refseqs_fp)
        created_temp_paths += db_files_to_remove
    else:
        blast_db = opts.blast_db

    # base filename of the input fasta file, without directory or extension
    input_fasta_fn = split(input_fasta_fp)[1]
    input_file_basename = splitext(input_fasta_fn)[0]

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with POTU
    job_prefix = opts.job_prefix or get_random_job_prefix('POTU')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir, job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists
        pass

    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp, jobs_to_start)

    # split the fasta files and get the list of resulting files; the
    # input handle is closed once the split files have been written
    with open(input_fasta_fp) as input_fasta_f:
        tmp_fasta_fps = split_fasta(input_fasta_f, num_seqs_per_file,
                                    job_prefix, working_dir=output_dir)
    created_temp_paths += tmp_fasta_fps

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes and the list of
    # output files generated by each job
    commands, job_result_filepaths = \
        get_job_commands(python_exe_fp, pick_otus_fp, tmp_fasta_fps,
                         output_dir, blast_db, job_prefix, working_dir,
                         max_e_value, similarity, min_aligned_percent)
    created_temp_paths += job_result_filepaths

    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths,
                                expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which described how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        process_run_results_f = \
            'qiime.parallel.pick_otus_blast.parallel_blast_process_run_results_f'
        write_merge_map_file_pick_otus(job_result_filepaths, output_dir,
                                       merge_map_filepath,
                                       input_file_basename)
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath,
                                   merge_map_filepath,
                                   deletion_list_filepath,
                                   process_run_results_f,
                                   seconds_to_sleep=seconds_to_sleep)
            created_temp_paths += poller_result_filepaths
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            poller_command, poller_result_filepaths = \
                get_poller_command(python_exe_fp, poller_fp,
                                   expected_files_filepath,
                                   merge_map_filepath,
                                   deletion_list_filepath,
                                   process_run_results_f,
                                   seconds_to_sleep=seconds_to_sleep,
                                   command_prefix='', command_suffix='')
            created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths,
                                    deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([], deletion_list_filepath)

    # write the commands to the 'jobs files'
    write_jobs_file(commands, job_prefix=job_prefix, jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix)

    if poll_directly:
        try:
            check_call(poller_command.split())
        except CalledProcessError as e:
            print('**Error occuring when calling the poller directly. ' +
                  'Jobs may have been submitted, but are not being polled.')
            print(str(e))
            exit(-1)
def __call__(self, seq_path=None, seqs=None, result_path=None, log_path=None):
    """Assign taxonomy to each input sequence via blast.

    seq_path: path to a fasta file of sequences to assign (takes
     precedence over seqs when both are given)
    seqs: iterable of (seq_id, seq) pairs
    result_path: if provided, the assignments are written there as
     tab-separated lines (seq_id, lineage, confidence, blast_hit_id) and
     None is returned
    log_path: passed to self._get_logger; if not None, summary counts
     are logged as well

    Returns dict mapping {seq_id: (lineage, confidence, blast_hit_id)},
    or None when the results were written to result_path.
    """
    assert seq_path or seqs, \
        "Must provide either seqs or seq_path when calling a BlastTaxonAssigner."

    # initialize the logger
    logger = self._get_logger(log_path)
    logger.info(str(self))

    # assign the blast database, either as a pre-existing database
    # specified as self.Params['blast_db'] or by creating a
    # temporary database from the sequence file specified
    # as self.Params['reference_seqs_filepath']. Only files of a
    # database built here are removed at the end.
    db_files_to_remove = []
    try:
        blast_db = self.Params['blast_db']
    except KeyError:
        # build a temporary blast_db
        reference_seqs_path = self.Params['reference_seqs_filepath']
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(reference_seqs_path)

    seq_file = None
    try:
        # build the mapping of sequence identifier
        # (wrt to the blast db seqs) to taxonomy
        with open(self.Params['id_to_taxonomy_filepath'], 'U') as tax_f:
            id_to_taxonomy_map = self._parse_id_to_taxonomy_file(tax_f)

        # Iterate over the input self.SeqsPerBlastRun seqs at a time.
        # There are two competing issues here when dealing with very large
        # inputs. If all sequences are read in at once, the containing
        # object can be very large, causing the system to page. On the
        # other hand, it would be very slow to treat each sequence
        # individually, since blast requires a filepath: each call would
        # involve writing a single sequence to file, opening/closing and
        # removing the file. To balance this, sequences are read in and
        # blasted in chunks of self.SeqsPerBlastRun at a time.
        if seq_path:
            # Get a seq iterator
            seq_file = open(seq_path)
            seqs = MinimalFastaParser(seq_file)

        # Build object to keep track of the current set of sequence to be
        # blasted, and the results (i.e., seq_id -> (taxonomy, quality
        # score) mapping)
        current_seqs = []
        result = {}
        for seq_id, seq in seqs:
            # append the current seq_id,seq to list of seqs to be blasted
            current_seqs.append((seq_id, seq))
            # When a full chunk has accumulated, blast it
            if len(current_seqs) == self.SeqsPerBlastRun:
                result.update(self._seqs_to_taxonomy(
                    current_seqs, blast_db, id_to_taxonomy_map))
                # reset the list of seqs to be blasted
                current_seqs = []
        # Assign taxonomy to the remaining sequences
        result.update(self._seqs_to_taxonomy(
            current_seqs, blast_db, id_to_taxonomy_map))

        # Write log data if we have a path (while the logger can handle
        # being called if we are not logging, some of these steps are slow).
        if log_path is not None:
            num_inspected = len(result)
            logger.info('Number of sequences inspected: %s' % num_inspected)
            num_null_hits = [r[1] for r in result.values()].count(None)
            logger.info('Number with no blast hits: %s' % num_null_hits)

        if result_path:
            # if the user provided a result_path, write the
            # results to file
            with open(result_path, 'w') as of:
                for seq_id, (lineage, confidence, blast_hit_id) in \
                        result.items():
                    of.write('%s\t%s\t%s\t%s\n' %
                             (seq_id, lineage, confidence, blast_hit_id))
            result = None
            logger.info('Result path: %s' % result_path)
        else:
            # if no result_path was provided, return the data as a dict;
            # no modification to result is necessary
            logger.info('Result path: None, returned as dict.')
    finally:
        if seq_file is not None:
            seq_file.close()
        # clean-up temp blastdb files, if a temp blastdb was created; an
        # explicit loop is used so the removal does not depend on map()
        # being eager
        for db_file in db_files_to_remove:
            remove(db_file)

    # return the result
    return result
def main():
    """Split a FASTA file into chunks and set up parallel BLAST jobs.

    Reads its settings from the command line via
    parse_command_line_parameters(**script_info). It then:

    * creates a working directory named after the job prefix inside the
      output directory,
    * formats the reference BLAST database unless suppressed,
    * splits the input FASTA file and builds one command per chunk,
    * unless polling is suppressed, writes the expected-output list, the
      merge map and the deletion list used by the poller,
    * writes the jobs file and submits it unless submission is suppressed,
    * optionally runs the poller directly, exiting with status -1 if the
      poller command fails.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # create local copies of command-line options
    input_fasta_fp = opts.infile_path
    refseqs_path = opts.refseqs_path
    python_exe_fp = opts.python_exe_fp
    num_jobs_to_start = opts.jobs_to_start
    blastall_fp = opts.blastall_fp
    blastmat_dir = opts.blastmat_dir
    cluster_jobs_fp = opts.cluster_jobs_fp
    e_value = opts.e_value
    word_size = opts.word_size
    num_hits = opts.num_hits
    output_dir = opts.output_dir
    suppress_format_blastdb = opts.suppress_format_blastdb
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    poll_directly = opts.poll_directly
    disable_low_complexity_filter = opts.disable_low_complexity_filter

    created_temp_paths = []

    # only the base filename (without directory or extension) is needed,
    # to name the merged output files
    input_fasta_fn = split(input_fasta_fp)[1]
    input_file_basename = splitext(input_fasta_fn)[0]

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with BLAST
    job_prefix = opts.job_prefix or get_random_job_prefix('BLAST')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir, job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # working dir already exists: it was not created here, so it is
        # deliberately left out of the deletion list
        pass

    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp,
                                              num_jobs_to_start)

    # Build the blast database if necessary; only the list of generated
    # files is needed here, the commands refer to refseqs_path directly
    if not suppress_format_blastdb:
        _, db_files_to_remove = build_blast_db_from_fasta_path(refseqs_path)
        created_temp_paths += db_files_to_remove

    # split the fasta files and get the list of resulting files
    with open(input_fasta_fp) as input_fasta_f:
        tmp_fasta_fps = split_fasta(input_fasta_f, num_seqs_per_file,
                                    job_prefix, output_dir)
    created_temp_paths += tmp_fasta_fps

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes
    commands, job_result_filepaths = get_commands(
        tmp_fasta_fps, refseqs_path, blastall_fp, blastmat_dir, e_value,
        word_size, num_hits, output_dir, working_dir,
        disable_low_complexity_filter=disable_low_complexity_filter,
        command_prefix=None, command_suffix=None)
    created_temp_paths += job_result_filepaths

    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths, expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which describes how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        write_merge_map_file_blast(job_result_filepaths, output_dir,
                                   merge_map_filepath, input_file_basename)
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        if not poll_directly:
            # Generate the command to run the poller, and the list of temp
            # files created by the poller
            poller_command, poller_result_filepaths = get_poller_command(
                python_exe_fp, poller_fp, expected_files_filepath,
                merge_map_filepath, deletion_list_filepath,
                seconds_to_sleep=seconds_to_sleep)
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            # The poller is run from this process below, so it is built
            # without the command prefix/suffix and not added to the jobs.
            poller_command, poller_result_filepaths = get_poller_command(
                python_exe_fp, poller_fp, expected_files_filepath,
                merge_map_filepath, deletion_list_filepath,
                seconds_to_sleep=seconds_to_sleep,
                command_prefix='', command_suffix='')
        created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths,
                                    deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([], deletion_list_filepath)

    # write the commands to the 'jobs files'
    write_jobs_file(commands, job_prefix=job_prefix, jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix)

    # poller_command only exists when polling was set up above; without
    # this guard, combining poll_directly with suppress_polling raised a
    # NameError after the jobs had already been submitted.
    if poll_directly and not suppress_polling:
        try:
            check_call(poller_command.split())
        except CalledProcessError as e:
            print('**Error occuring when calling the poller directly. ' +
                  'Jobs may have been submitted, but are not being polled.')
            print(str(e))
            exit(-1)
def __call__(self, seq_path=None, seqs=None, result_path=None, log_path=None):
    """Assign taxonomy to query sequences by BLASTing them in chunks.

    Parameters:
        seq_path: path to a FASTA file of query sequences. When given it
            takes precedence over ``seqs``.
        seqs: iterable of (seq_id, seq) pairs; used when ``seq_path`` is
            not given.
        result_path: if given, results are written to this file as
            tab-separated lines and None is returned.
        log_path: passed to self._get_logger. Summary counts are only
            computed and logged when it is not None.

    Returns:
        None when ``result_path`` is given. Otherwise a dict keyed by
        seq_id whose values are whatever self._seqs_to_taxonomy produces
        (the result writer unpacks each value as
        (lineage, confidence, blast_hit_id)).

    Raises:
        AssertionError: neither ``seq_path`` nor ``seqs`` was provided.
        KeyError: self.Params lacks 'id_to_taxonomy_filepath', or lacks
            both 'blast_db' and 'reference_seqs_filepath'.
    """
    assert seq_path or seqs, \
        "Must provide either seqs or seq_path when calling a BlastTaxonAssigner."

    # initialize the logger
    logger = self._get_logger(log_path)
    logger.info(str(self))

    # Files of a temporary blast db built by this call. The list stays
    # empty when self.Params['blast_db'] names a pre-existing database, so
    # clean-up never touches an unbound name.
    db_files_to_remove = []
    # Handles opened by this call; released in the finally clause, after
    # every consumer of them has finished.
    opened_handles = []
    try:
        # assign the blast database, either as a pre-existing database
        # specified as self.Params['blast_db'] or by creating a
        # temporary database from the sequence file specified
        # as self.Params['reference_seqs_filepath']
        try:
            blast_db = self.Params['blast_db']
        except KeyError:
            reference_seqs_path = self.Params['reference_seqs_filepath']
            blast_db, db_files_to_remove = \
                build_blast_db_from_fasta_path(reference_seqs_path)

        # build the mapping of sequence identifier
        # (wrt to the blast db seqs) to taxonomy
        id_to_taxonomy_f = open(self.Params['id_to_taxonomy_filepath'], 'U')
        opened_handles.append(id_to_taxonomy_f)
        id_to_taxonomy_map = self._parse_id_to_taxonomy_file(id_to_taxonomy_f)

        if seq_path:
            # Get a seq iterator
            seqs_f = open(seq_path)
            opened_handles.append(seqs_f)
            seqs = MinimalFastaParser(seqs_f)

        # Sequences are read and blasted in chunks of self.SeqsPerBlastRun.
        # Loading all of them at once can make the container large enough
        # to page; blasting them one by one means a temp file per
        # sequence. Chunking balances the two.
        current_seqs = []
        result = {}
        for seq_id, seq in seqs:
            current_seqs.append((seq_id, seq))
            # When the chunk is full, blast it and start a new one
            if len(current_seqs) == self.SeqsPerBlastRun:
                result.update(
                    self._seqs_to_taxonomy(current_seqs,
                                           blast_db,
                                           id_to_taxonomy_map))
                current_seqs = []
        # Assign taxonomy to the remaining sequences
        result.update(
            self._seqs_to_taxonomy(current_seqs,
                                   blast_db,
                                   id_to_taxonomy_map))

        # Write log data if we have a path (while the logger can handle
        # being called if we are not logging, some of these steps are
        # slow).
        if log_path is not None:
            num_inspected = len(result)
            logger.info('Number of sequences inspected: %s' % num_inspected)
            num_null_hits = [r[1] for r in result.values()].count(None)
            logger.info('Number with no blast hits: %s' % num_null_hits)

        if result_path:
            # if the user provided a result_path, write the
            # results to file and return None
            with open(result_path, 'w') as of:
                for seq_id, (lineage, confidence, blast_hit_id) in \
                        result.items():
                    of.write('%s\t%s\t%s\t%s\n' %
                             (seq_id, lineage, confidence, blast_hit_id))
            result = None
            logger.info('Result path: %s' % result_path)
        else:
            # if no result_path was provided, return the data as a dict
            logger.info('Result path: None, returned as dict.')
    finally:
        for handle in opened_handles:
            handle.close()
        # clean-up temp blastdb files, if a temp blastdb was created. A
        # plain loop is used because map() would defer the removals on
        # interpreters where it returns an iterator.
        for db_fp in db_files_to_remove:
            remove(db_fp)

    # return the result
    return result