def test_map_ids_to_taxonomy(self):
    """Mapping sequence ids to taxonomy functions as expected"""
    assigner = BlastTaxonAssigner({})
    taxonomy_lookup = {
        "AY800210": "Archaea;Euryarchaeota;Halobacteriales;uncultured",
        "EU883771": "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.",
        "EF503699": "Archaea;Crenarchaeota;uncultured;uncultured",
        "DQ260310": "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium",
        "EF503697": "Archaea;Crenarchaeota;uncultured;uncultured",
    }
    # hit values cover a normal e-value, a non-numeric "confidence",
    # a float, and the no-hit (None) case
    hits = {
        's1': ("AY800210", 1e-99),
        's5': ("EU883771", 'weird confidence value'),
        's3': ("DQ260310", 42.),
        's4': None,
    }
    expected = {
        's1': ("Archaea;Euryarchaeota;Halobacteriales;uncultured",
               1e-99, "AY800210"),
        's5': ('Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.',
               'weird confidence value', "EU883771"),
        's3': ("Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium",
               42., "DQ260310"),
        's4': ('No blast hit', None, None),
    }
    observed = assigner._map_ids_to_taxonomy(hits, taxonomy_lookup)
    self.assertEqual(observed, expected)
def test_seqs_to_taxonomy(self):
    """BlastTaxonAssigner._seqs_to_taxonomy: functions as expected"""
    p = BlastTaxonAssigner({
        'reference_seqs_filepath': self.reference_seqs_fp,
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp})
    # build the id_to_taxonomy_map as this test doesn't execute __call__
    id_to_taxonomy_map = {
        "AY800210": "Archaea;Euryarchaeota;Halobacteriales;uncultured",
        "EU883771": "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.",
        "EF503699": "Archaea;Crenarchaeota;uncultured;uncultured",
        "DQ260310": "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium",
        "EF503697": "Archaea;Crenarchaeota;uncultured;uncultured",
    }
    # build the blast database and keep track of the files to clean up
    blast_db, files_to_remove = \
        build_blast_db_from_fasta_path(self.reference_seqs_fp)
    self._paths_to_clean_up += files_to_remove
    # read the input file into (seq_id, seq) pairs; the context manager
    # closes the handle (the original left the file open)
    with open(self.input_seqs_fp) as in_f:
        seqs = list(MinimalFastaParser(in_f))
    actual = p._seqs_to_taxonomy(seqs, blast_db, id_to_taxonomy_map)
    self.assertEqual(actual, self.expected1)
    # passing empty list of seqs functions as expected
    actual = p._seqs_to_taxonomy([], blast_db, id_to_taxonomy_map)
    self.assertEqual(actual, {})
def test_map_ids_to_taxonomy(self):
    """Mapping sequence ids to taxonomy functions as expected"""
    p = BlastTaxonAssigner({})
    halo = "Archaea;Euryarchaeota;Halobacteriales;uncultured"
    methano_m = ("Archaea;Euryarchaeota;Methanomicrobiales;"
                 "Methanomicrobium et rel.")
    cren = "Archaea;Crenarchaeota;uncultured;uncultured"
    methano_b = "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium"
    id_to_taxonomy_map = {
        "AY800210": halo,
        "EU883771": methano_m,
        "EF503699": cren,
        "DQ260310": methano_b,
        "EF503697": cren,
    }
    # mix of e-value types plus the no-hit (None) case
    hits = {
        's1': ("AY800210", 1e-99),
        's5': ("EU883771", 'weird confidence value'),
        's3': ("DQ260310", 42.),
        's4': None,
    }
    expected = {
        's1': (halo, 1e-99, "AY800210"),
        's5': (methano_m, 'weird confidence value', "EU883771"),
        's3': (methano_b, 42., "DQ260310"),
        's4': ('No blast hit', None, None),
    }
    self.assertEqual(
        p._map_ids_to_taxonomy(hits, id_to_taxonomy_map), expected)
def test_seqs_to_taxonomy(self):
    """BlastTaxonAssigner._seqs_to_taxonomy: functions as expected"""
    p = BlastTaxonAssigner({
        'reference_seqs_filepath': self.reference_seqs_fp,
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp})
    # build the id_to_taxonomy_map as this test doesn't execute __call__
    id_to_taxonomy_map = {
        "AY800210": "Archaea;Euryarchaeota;Halobacteriales;uncultured",
        "EU883771": "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.",
        "EF503699": "Archaea;Crenarchaeota;uncultured;uncultured",
        "DQ260310": "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium",
        "EF503697": "Archaea;Crenarchaeota;uncultured;uncultured",
    }
    # build the blast database and keep track of the files to clean up
    blast_db, files_to_remove = \
        build_blast_db_from_fasta_path(self.reference_seqs_fp)
    self._paths_to_clean_up += files_to_remove
    # read the input file into (seq_id, seq) pairs; use a context manager
    # so the file handle is closed (the original leaked it)
    with open(self.input_seqs_fp) as in_f:
        seqs = list(MinimalFastaParser(in_f))
    actual = p._seqs_to_taxonomy(seqs, blast_db, id_to_taxonomy_map)
    self.assertEqual(actual, self.expected1)
    # passing empty list of seqs functions as expected
    actual = p._seqs_to_taxonomy([], blast_db, id_to_taxonomy_map)
    self.assertEqual(actual, {})
def test_parse_id_to_taxonomy_file(self):
    """Parsing taxonomy files functions as expected"""
    assigner = BlastTaxonAssigner({})
    expected = {
        "AY800210": "Archaea;Euryarchaeota;Halobacteriales;uncultured",
        "EU883771": "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.",
        "EF503699": "Archaea;Crenarchaeota;uncultured;uncultured",
        "DQ260310": "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium",
        "EF503697": "Archaea;Crenarchaeota;uncultured;uncultured",
    }
    observed = assigner._parse_id_to_taxonomy_file(
        id_to_taxonomy_string.splitlines())
    self.assertEqual(observed, expected)
def test_get_first_blast_hit_per_seq(self):
    """Extracting the first blast hit for each seq functions as expected"""
    assigner = BlastTaxonAssigner({})
    blast_hits = {
        's1': [('blah', 0.0)],
        's3': [('dsasd', 1e-42), ('rrr', 1e-12), ('qqq', 0.001)],
        's2': [],
    }
    # only the first (best) hit is retained; an empty hit list maps to None
    expected = {
        's1': ('blah', 0.0),
        's3': ('dsasd', 1e-42),
        's2': None,
    }
    observed = assigner._get_first_blast_hit_per_seq(blast_hits)
    self.assertEqual(observed, expected)
def test_parse_id_to_taxonomy_file(self):
    """Parsing taxonomy files functions as expected"""
    lines = id_to_taxonomy_string.splitlines()
    p = BlastTaxonAssigner({})
    expected = {
        "AY800210": "Archaea;Euryarchaeota;Halobacteriales;uncultured",
        "EU883771": "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.",
        "EF503699": "Archaea;Crenarchaeota;uncultured;uncultured",
        "DQ260310": "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium",
        "EF503697": "Archaea;Crenarchaeota;uncultured;uncultured",
    }
    self.assertEqual(p._parse_id_to_taxonomy_file(lines), expected)
def test_get_first_blast_hit_per_seq(self):
    """Extracting the first blast hit for each seq functions as expected"""
    p = BlastTaxonAssigner({})
    # one hit, several hits (already ordered best-first), and no hits
    blast_hits = {
        's1': [('blah', 0.0)],
        's3': [('dsasd', 1e-42), ('rrr', 1e-12), ('qqq', 0.001)],
        's2': [],
    }
    expected = {
        's1': ('blah', 0.0),
        's3': ('dsasd', 1e-42),
        's2': None,
    }
    self.assertEqual(p._get_first_blast_hit_per_seq(blast_hits), expected)
def test_call_output_to_file(self):
    """BlastTaxonAssigner.__call__ functions w output to file"""
    result_path = get_tmp_filename(
        prefix='BlastTaxonAssignerTests_', suffix='.fasta')
    self._paths_to_clean_up.append(result_path)
    p = BlastTaxonAssigner({
        'reference_seqs_filepath': self.reference_seqs_fp,
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
    })
    actual = p(self.input_seqs_fp, result_path=result_path)
    # compared as sets because output line order is not guaranteed
    expected_lines = set([
        's1\tArchaea;Euryarchaeota;Halobacteriales;uncultured\t0.0\tAY800210\n',
        's2\tArchaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.\t0.0\tEU883771\n',
        's3\tArchaea;Crenarchaeota;uncultured;uncultured\t0.0\tEF503699\n',
        's4\tArchaea;Euryarchaeota;Methanobacteriales;Methanobacterium\t0.0\tDQ260310\n',
        's5\tArchaea;Crenarchaeota;uncultured;uncultured\t0.0\tEF503697\n',
        's6\tNo blast hit\tNone\tNone\n',
    ])
    # context manager guarantees the handle is closed even if an
    # assertion below raises (the original used a manual open/close pair)
    with open(result_path) as f:
        observed_lines = set(f.readlines())
    self.assertEqual(observed_lines, expected_lines)
    # Return value is None when result_path is provided (Not sure
    # if this is what we want yet, or if we would want both so
    # results could be logged to file...)
    self.assertEqual(actual, None)
def test_init(self):
    """BlastTaxonAssigner __init__ should store name, params"""
    assigner = BlastTaxonAssigner({})
    self.assertEqual(assigner.Name, 'BlastTaxonAssigner')
    # default parameters correctly initialized
    self.assertEqual(assigner.Params, {
        'Min percent identity': 0.90,
        'Max E value': 1e-30,
        'Application': 'blastn/megablast',
    })
def test_call_on_the_fly_blast_db(self):
    """BlastTaxonAssigner.__call__ functions w creating blast db"""
    # no 'blast_db' param, so the assigner must build the database
    # itself from the reference sequences
    assigner = BlastTaxonAssigner({
        'reference_seqs_filepath': self.reference_seqs_fp,
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
    })
    self.assertEqual(assigner(self.input_seqs_fp), self.expected1)
def test_call_existing_blast_db(self):
    """BlastTaxonAssigner.__call__ functions w existing db"""
    # build the blast database up front and register its files for cleanup
    blast_db, files_to_remove = build_blast_db_from_fasta_path(
        self.reference_seqs_fp)
    self._paths_to_clean_up += files_to_remove
    assigner = BlastTaxonAssigner({
        'blast_db': blast_db,
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
    })
    self.assertEqual(assigner(self.input_seqs_fp), self.expected1)
def test_get_blast_hits(self):
    """BlastTaxonAssigner._get_blast_hits functions w existing db"""
    # build the blast database and keep track of the files to clean up
    blast_db, files_to_remove = build_blast_db_from_fasta_path(
        self.reference_seqs_fp)
    self._paths_to_clean_up += files_to_remove
    assigner = BlastTaxonAssigner({})
    results = assigner._get_blast_hits(blast_db, self.test_seqs)
    # mapping from identifier in test_seq_coll to the id of the sequence
    # in the refseq collection (a silva derivative)
    expected_matches = {
        's1': 'AY800210',
        's2': 'EU883771',
        's3': 'EF503699',
        's4': 'DQ260310',
        's5': 'EF503697',
    }
    # no results for s6 (which is a randomly-generated sequence)
    self.assertEqual(results['s6'], [])
    # expected results for all other query sequences
    for seq_id, ref_id in expected_matches.items():
        hits_by_subject = dict(results[seq_id])
        # membership is checked first (redundant, but yields a useful
        # failure message if e.g. blast returned no results at all)
        self.assertTrue(ref_id in hits_by_subject)
        # the perfect match should score a 0.0 e-value on this data
        self.assertEqual(hits_by_subject[ref_id], 0.0)
def test_get_blast_hits(self):
    """BlastTaxonAssigner._get_blast_hits functions w existing db"""
    # build the blast database and keep track of the files to clean up
    blast_db, files_to_remove = build_blast_db_from_fasta_path(
        self.reference_seqs_fp)
    self._paths_to_clean_up += files_to_remove
    p = BlastTaxonAssigner({})
    seq_coll_blast_results = p._get_blast_hits(blast_db, self.test_seqs)
    # mapping from identifier in test_seq_coll to the id of the sequence
    # in the refseq collection (a silva derivative)
    expected_matches = {
        's1': 'AY800210',
        's2': 'EU883771',
        's3': 'EF503699',
        's4': 'DQ260310',
        's5': 'EF503697',
    }
    # no results for s6 (which is a randomly-generated sequence)
    self.assertEqual(seq_coll_blast_results['s6'], [])
    # expected results for all other query sequences
    for seq_id in expected_matches:
        blast_results_d = dict(seq_coll_blast_results[seq_id])
        # explicitly checks that the result is in the data before
        # pulling it out (this is redundant, but allows for a useful
        # error message if the data wasn't in there b/c e.g. there
        # were no blast results returned)
        self.assertTrue(expected_matches[seq_id] in blast_results_d)
        # now check that the perfect match got a 0.0 e-value as it
        # should on this data
        self.assertEqual(blast_results_d[expected_matches[seq_id]], 0.0)
def __init__(self, params):
    """Return new BlastFragmentsChimeraChecker object with specified params.

    params : dict of parameters; recognized keys:
        id_to_taxonomy_fp (required),
        reference_seqs_fp or blast_db (exactly one required),
        max_e_value (default 1e-30), min_pct_id (default 0.90),
        num_fragments (default 3), taxonomy_depth (default 4)

    Raises ValueError if id_to_taxonomy_fp is missing, or if neither
    reference_seqs_fp nor blast_db is provided.
    """
    _params = {
        'max_e_value': 1e-30,
        'min_pct_id': 0.90,
        'num_fragments': 3,
        'taxonomy_depth': 4
    }
    _params.update(params)

    try:
        id_to_taxonomy_fp = params['id_to_taxonomy_fp']
    except KeyError:
        # error message names the actual required key (the original
        # said 'id_to_taxonomy_filepath', which is not the key checked)
        raise ValueError(
            "id_to_taxonomy_fp must be provided to %s" % self.Name)

    # Create the blast database if it hasn't been provided
    if params.get('blast_db') is None:
        try:
            reference_seqs_fp = params['reference_seqs_fp']
        except KeyError:
            # error message names the actual required key (the original
            # said 'refseqs_fp', which is not the key checked)
            raise ValueError(
                "reference_seqs_fp or blast_db must be provided to %s"
                % self.Name)
        blast_db, self._db_files_to_remove = \
            build_blast_db_from_fasta_path(reference_seqs_fp)
    else:
        blast_db = params['blast_db']
        self._db_files_to_remove = []

    # delegate taxon assignment to a BlastTaxonAssigner configured with
    # the thresholds from _params
    self._taxon_assigner = BlastTaxonAssigner({
        'blast_db': blast_db,
        'id_to_taxonomy_filepath': id_to_taxonomy_fp,
        'Max E value': _params['max_e_value'],
        'Min percent identity': _params['min_pct_id']
    })

    ChimeraChecker.__init__(self, _params)
def test_call_alt_input_types(self):
    """BlastTaxonAssigner.__call__ functions w alt input types"""
    p = BlastTaxonAssigner({
        'reference_seqs_filepath': self.reference_seqs_fp,
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp})
    # neither seqs or seq_fp passed results in AssertionError
    self.assertRaises(AssertionError, p)
    # Functions with a list of (seq_id, seq) pairs; the context manager
    # closes the input file (the original leaked the handle)
    with open(self.input_seqs_fp) as in_f:
        seqs = list(MinimalFastaParser(in_f))
    actual = p(seqs=seqs)
    self.assertEqual(actual, self.expected1)
    # Functions with input path
    actual = p(self.input_seqs_fp)
    self.assertEqual(actual, self.expected1)
    # same result when passing fp or seqs
    self.assertEqual(p(seqs=seqs), p(self.input_seqs_fp))
def test_call_logs_run(self):
    """BlastTaxonAssigner.__call__ logs the run when expected"""
    log_path = get_tmp_filename(
        prefix='BlastTaxonAssignerTests_', suffix='.fasta')
    self._paths_to_clean_up.append(log_path)
    # build the blast database and keep track of the files to clean up
    blast_db, files_to_remove = \
        build_blast_db_from_fasta_path(self.reference_seqs_fp)
    self._paths_to_clean_up += files_to_remove
    p = BlastTaxonAssigner({
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
        'blast_db': blast_db})
    actual = p(self.input_seqs_fp, log_path=log_path)
    # context manager closes the log even if reading raises (the
    # original used a manual open/read/close sequence)
    with open(log_path) as log_file:
        log_file_str = log_file.read()
    log_file_exp = [
        "BlastTaxonAssigner parameters:",
        'Min percent identity:0.9',
        'Application:blastn/megablast',
        'Max E value:1e-30',
        'Result path: None, returned as dict.',
        'blast_db:%s' % str(self.reference_seqs_fp)[1:-1],
        'id_to_taxonomy_filepath:%s' % self.id_to_taxonomy_fp,
        'Number of sequences inspected: 6',
        'Number with no blast hits: 1',
        '',
    ]
    # compare data in log file to fake expected log file
    # NOTE: Since p.params is a dict, the order of lines is not
    # guaranteed, so testing is performed to make sure that
    # the equal unordered lists of lines is present in actual and expected
    self.assertEqualItems(log_file_str.split('\n'), log_file_exp)