Exemplo n.º 1
0
 def test_map_ids_to_taxonomy(self):
     """_map_ids_to_taxonomy attaches a lineage string to every blast hit.

     Each hit is a (reference id, confidence) pair; the expected result
     is (lineage, confidence, reference id).  A hit of None is expected
     to come back as ('No blast hit', None, None).
     """
     halobacteriales = "Archaea;Euryarchaeota;Halobacteriales;uncultured"
     methanomicrobium = (
         "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.")
     methanobacterium = (
         "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium")
     crenarchaeota = "Archaea;Crenarchaeota;uncultured;uncultured"

     taxonomy_by_ref_id = {
         "AY800210": halobacteriales,
         "EU883771": methanomicrobium,
         "EF503699": crenarchaeota,
         "DQ260310": methanobacterium,
         "EF503697": crenarchaeota,
     }

     # the confidence slot deliberately holds mixed types: the expected
     # output below shows it being carried over unchanged
     first_hits = {
         's1': ("AY800210", 1e-99),
         's5': ("EU883771", 'weird confidence value'),
         's3': ("DQ260310", 42.),
         's4': None,
     }
     expected_assignments = {
         's1': (halobacteriales, 1e-99, "AY800210"),
         's5': (methanomicrobium, 'weird confidence value', "EU883771"),
         's3': (methanobacterium, 42., "DQ260310"),
         's4': ('No blast hit', None, None),
     }

     assigner = BlastTaxonAssigner({})
     observed = assigner._map_ids_to_taxonomy(first_hits, taxonomy_by_ref_id)
     self.assertEqual(observed, expected_assignments)
Exemplo n.º 2
0
    def test_seqs_to_taxonomy(self):
        """BlastTaxonAssigner._seqs_to_taxonomy: functions as expected

        Calls the private method directly (bypassing __call__) against a
        freshly built blast database, first with the sequences read from
        self.input_seqs_fp and then with an empty list of sequences.
        """
        p = BlastTaxonAssigner({
            'reference_seqs_filepath': self.reference_seqs_fp,
            'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
        })

        # build the id_to_taxonomy_map as this test doesn't execute __call__
        id_to_taxonomy_map = {
            "AY800210":
            "Archaea;Euryarchaeota;Halobacteriales;uncultured",
            "EU883771":
            "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.",
            "EF503699":
            "Archaea;Crenarchaeota;uncultured;uncultured",
            "DQ260310":
            "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium",
            "EF503697":
            "Archaea;Crenarchaeota;uncultured;uncultured",
        }

        # build the blast database and keep track of the files to clean up
        blast_db, files_to_remove = \
            build_blast_db_from_fasta_path(self.reference_seqs_fp)
        self._paths_to_clean_up += files_to_remove

        # read the input file into (seq_id, seq) pairs; the context manager
        # guarantees the handle is closed even if parsing raises
        with open(self.input_seqs_fp) as input_seqs_f:
            seqs = list(MinimalFastaParser(input_seqs_f))

        actual = p._seqs_to_taxonomy(seqs, blast_db, id_to_taxonomy_map)
        self.assertEqual(actual, self.expected1)

        # passing empty list of seqs functions as expected
        actual = p._seqs_to_taxonomy([], blast_db, id_to_taxonomy_map)
        self.assertEqual(actual, {})
Exemplo n.º 3
0
 def test_map_ids_to_taxonomy(self):
     """Blast hits are translated into (lineage, confidence, id) triples.

     The input maps a query id to a (reference id, confidence) pair, or
     to None when there was no hit; a None hit is expected to yield
     ('No blast hit', None, None).
     """
     lineage_ay800210 = "Archaea;Euryarchaeota;Halobacteriales;uncultured"
     lineage_eu883771 = (
         "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.")
     lineage_dq260310 = (
         "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium")
     lineage_uncultured = "Archaea;Crenarchaeota;uncultured;uncultured"

     id_to_lineage = {
         "AY800210": lineage_ay800210,
         "EU883771": lineage_eu883771,
         "EF503699": lineage_uncultured,
         "DQ260310": lineage_dq260310,
         "EF503697": lineage_uncultured,
     }
     query_hits = {
         's1': ("AY800210", 1e-99),
         's5': ("EU883771", 'weird confidence value'),
         's3': ("DQ260310", 42.),
         's4': None,
     }
     # second element of each triple mirrors the confidence given above
     wanted = {
         's1': (lineage_ay800210, 1e-99, "AY800210"),
         's5': (lineage_eu883771, 'weird confidence value', "EU883771"),
         's3': (lineage_dq260310, 42., "DQ260310"),
         's4': ('No blast hit', None, None),
     }

     taxon_assigner = BlastTaxonAssigner({})
     got = taxon_assigner._map_ids_to_taxonomy(query_hits, id_to_lineage)
     self.assertEqual(got, wanted)
Exemplo n.º 4
0
 def test_seqs_to_taxonomy(self):
     """BlastTaxonAssigner._seqs_to_taxonomy: functions as expected

     Exercises the private method directly (without going through
     __call__): once with the sequences parsed from self.input_seqs_fp
     and once with an empty list, which must give an empty dict.
     """
     p = BlastTaxonAssigner({
         'reference_seqs_filepath': self.reference_seqs_fp,
         'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
     })

     # build the id_to_taxonomy_map as this test doesn't execute __call__
     id_to_taxonomy_map = {
         "AY800210":
         "Archaea;Euryarchaeota;Halobacteriales;uncultured",
         "EU883771":
         "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.",
         "EF503699":
         "Archaea;Crenarchaeota;uncultured;uncultured",
         "DQ260310":
         "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium",
         "EF503697":
         "Archaea;Crenarchaeota;uncultured;uncultured",
     }

     # build the blast database and keep track of the files to clean up
     blast_db, files_to_remove = \
         build_blast_db_from_fasta_path(self.reference_seqs_fp)
     self._paths_to_clean_up += files_to_remove

     # read the input file into (seq_id, seq) pairs; using a context
     # manager so the file handle is released even when parsing fails
     with open(self.input_seqs_fp) as input_seqs_f:
         seqs = list(MinimalFastaParser(input_seqs_f))

     actual = p._seqs_to_taxonomy(seqs, blast_db, id_to_taxonomy_map)
     self.assertEqual(actual, self.expected1)

     # passing empty list of seqs functions as expected
     actual = p._seqs_to_taxonomy([], blast_db, id_to_taxonomy_map)
     self.assertEqual(actual, {})
Exemplo n.º 5
0
 def test_parse_id_to_taxonomy_file(self):
     """_parse_id_to_taxonomy_file turns taxonomy lines into an id->lineage dict.

     The input lines come from the module-level id_to_taxonomy_string.
     """
     taxonomy_lines = id_to_taxonomy_string.splitlines()
     assigner = BlastTaxonAssigner({})

     # two reference ids share the same uncultured Crenarchaeota lineage
     uncultured_crenarchaeota = "Archaea;Crenarchaeota;uncultured;uncultured"
     expected_map = {
         "AY800210": "Archaea;Euryarchaeota;Halobacteriales;uncultured",
         "EU883771":
         "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.",
         "EF503699": uncultured_crenarchaeota,
         "DQ260310":
         "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium",
         "EF503697": uncultured_crenarchaeota,
     }

     parsed_map = assigner._parse_id_to_taxonomy_file(taxonomy_lines)
     self.assertEqual(parsed_map, expected_map)
Exemplo n.º 6
0
 def test_get_first_blast_hit_per_seq(self):
     """_get_first_blast_hit_per_seq keeps only the leading hit per query.

     A query with an empty hit list is expected to map to None.
     """
     single_hit = ('blah', 0.0)
     leading_hit = ('dsasd', 1e-42)

     hits_per_query = {
         's1': [single_hit],
         's3': [leading_hit, ('rrr', 1e-12), ('qqq', 0.001)],
         's2': [],
     }
     first_hit_per_query = {
         's1': single_hit,
         's3': leading_hit,
         's2': None,
     }

     assigner = BlastTaxonAssigner({})
     observed = assigner._get_first_blast_hit_per_seq(hits_per_query)
     self.assertEqual(observed, first_hit_per_query)
Exemplo n.º 7
0
 def test_parse_id_to_taxonomy_file(self):
     """Taxonomy lines are parsed into a dict keyed by reference id.

     The lines are taken from the module-level id_to_taxonomy_string.
     """
     lines = id_to_taxonomy_string.splitlines()
     taxon_assigner = BlastTaxonAssigner({})

     # (reference id, lineage) pairs the parser is expected to produce
     expected_pairs = [
         ("AY800210", "Archaea;Euryarchaeota;Halobacteriales;uncultured"),
         ("EU883771",
          "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel."),
         ("EF503699", "Archaea;Crenarchaeota;uncultured;uncultured"),
         ("DQ260310",
          "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium"),
         ("EF503697", "Archaea;Crenarchaeota;uncultured;uncultured"),
     ]
     wanted = dict(expected_pairs)

     self.assertEqual(taxon_assigner._parse_id_to_taxonomy_file(lines),
                      wanted)
Exemplo n.º 8
0
 def test_get_first_blast_hit_per_seq(self):
     """Only the first blast hit of each query should be retained.

     Queries without any hit (empty list) are expected to map to None.
     """
     taxon_assigner = BlastTaxonAssigner({})

     all_hits = {}
     all_hits['s1'] = [('blah', 0.0)]
     all_hits['s3'] = [('dsasd', 1e-42), ('rrr', 1e-12), ('qqq', 0.001)]
     all_hits['s2'] = []

     wanted = {}
     wanted['s1'] = ('blah', 0.0)
     wanted['s3'] = ('dsasd', 1e-42)
     wanted['s2'] = None

     got = taxon_assigner._get_first_blast_hit_per_seq(all_hits)
     self.assertEqual(got, wanted)
Exemplo n.º 9
0
    def test_call_output_to_file(self):
        """BlastTaxonAssigner.__call__ functions w output to file

        With result_path supplied, the assignments are expected in that
        file as tab-separated lines and the call itself returns None.
        """
        result_path = get_tmp_filename(prefix='BlastTaxonAssignerTests_',
                                       suffix='.fasta')
        self._paths_to_clean_up.append(result_path)

        p = BlastTaxonAssigner({
            'reference_seqs_filepath': self.reference_seqs_fp,
            'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
        })
        actual = p(self.input_seqs_fp, result_path=result_path)

        # lines are compared as sets, so their order in the file is ignored
        expected_lines = set([
            's1\tArchaea;Euryarchaeota;Halobacteriales;uncultured\t0.0\tAY800210\n',
            's2\tArchaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.\t0.0\tEU883771\n',
            's3\tArchaea;Crenarchaeota;uncultured;uncultured\t0.0\tEF503699\n',
            's4\tArchaea;Euryarchaeota;Methanobacteriales;Methanobacterium\t0.0\tDQ260310\n',
            's5\tArchaea;Crenarchaeota;uncultured;uncultured\t0.0\tEF503697\n',
            's6\tNo blast hit\tNone\tNone\n',
        ])
        # context manager closes the result file even if reading it raises
        with open(result_path) as result_f:
            observed_lines = set(result_f.readlines())
        self.assertEqual(observed_lines, expected_lines)

        # Return value is None when result_path is provided (Not sure
        # if this is what we want yet, or if we would want both so
        # results could be logged to file...)
        self.assertEqual(actual, None)
Exemplo n.º 10
0
 def test_init(self):
     """A BlastTaxonAssigner built from {} exposes its name and defaults."""
     assigner = BlastTaxonAssigner({})

     # with no overrides passed in, Params should equal exactly this dict
     expected_defaults = {
         'Min percent identity': 0.90,
         'Max E value': 1e-30,
         'Application': 'blastn/megablast',
     }

     self.assertEqual(assigner.Name, 'BlastTaxonAssigner')
     self.assertEqual(assigner.Params, expected_defaults)
Exemplo n.º 11
0
    def test_call_on_the_fly_blast_db(self):
        """Calling the assigner without a 'blast_db' entry gives expected1.

        Only the reference sequences path and the id-to-taxonomy path are
        passed to the constructor here.
        """
        assigner_params = {
            'reference_seqs_filepath': self.reference_seqs_fp,
            'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
        }
        assigner = BlastTaxonAssigner(assigner_params)

        observed = assigner(self.input_seqs_fp)
        self.assertEqual(observed, self.expected1)
Exemplo n.º 12
0
    def test_call_existing_blast_db(self):
        """Calling the assigner with a pre-built 'blast_db' gives expected1."""
        # create the database up front; its files are queued for clean-up
        db_and_files = build_blast_db_from_fasta_path(self.reference_seqs_fp)
        prebuilt_db, db_files = db_and_files
        self._paths_to_clean_up += db_files

        assigner_params = {
            'blast_db': prebuilt_db,
            'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
        }
        assigner = BlastTaxonAssigner(assigner_params)

        observed = assigner(self.input_seqs_fp)
        self.assertEqual(observed, self.expected1)
Exemplo n.º 13
0
    def test_get_blast_hits(self):
        """_get_blast_hits finds the expected reference for each query.

        Queries s1-s5 must each hit their known reference id with an
        e-value of 0.0; s6 must come back with an empty hit list.
        """
        # create the database up front; its files are queued for clean-up
        db_and_files = build_blast_db_from_fasta_path(self.reference_seqs_fp)
        blast_db, db_files = db_and_files
        self._paths_to_clean_up += db_files

        # query id in self.test_seqs -> id of the matching sequence in the
        # refseq collection (a silva derivative)
        expected_matches = {
            's1': 'AY800210',
            's2': 'EU883771',
            's3': 'EF503699',
            's4': 'DQ260310',
            's5': 'EF503697',
        }

        assigner = BlastTaxonAssigner({})
        hits_by_query = assigner._get_blast_hits(blast_db, self.test_seqs)

        # s6 is a randomly-generated sequence, so nothing should match it
        self.assertEqual(hits_by_query['s6'], [])

        for query_id, reference_id in expected_matches.items():
            e_value_by_hit = dict(hits_by_query[query_id])
            # redundant with the lookup below, but a failed membership
            # check gives a clearer failure than a KeyError when e.g. no
            # blast results were returned at all
            self.assertTrue(reference_id in e_value_by_hit)
            # a perfect match should score an e-value of 0.0 on this data
            self.assertEqual(e_value_by_hit[reference_id], 0.0)
Exemplo n.º 14
0
 def test_get_blast_hits(self):
     """Blast hits per query contain the known reference with e-value 0.0.

     s6 is the exception: it is expected to return no hits at all.
     """
     # the database is built here; remember its files for later removal
     blast_db, db_files = build_blast_db_from_fasta_path(
         self.reference_seqs_fp)
     self._paths_to_clean_up += db_files

     taxon_assigner = BlastTaxonAssigner({})
     results = taxon_assigner._get_blast_hits(blast_db, self.test_seqs)

     # s6 is a randomly-generated sequence and should match nothing
     no_hit_results = results['s6']
     self.assertEqual(no_hit_results, [])

     # (query id, id of the matching refseq sequence) pairs; the refseq
     # collection is a silva derivative
     expected_matches = {
         's1': 'AY800210',
         's2': 'EU883771',
         's3': 'EF503699',
         's4': 'DQ260310',
         's5': 'EF503697',
     }
     for seq_id in expected_matches:
         wanted_ref_id = expected_matches[seq_id]
         hit_to_e_value = dict(results[seq_id])
         # checked separately from the lookup below so that missing blast
         # results fail with a readable assertion rather than a KeyError
         self.assertTrue(wanted_ref_id in hit_to_e_value)
         # the perfect match is expected to have an e-value of 0.0
         self.assertEqual(hit_to_e_value[wanted_ref_id], 0.0)
Exemplo n.º 15
0
    def __init__(self, params):
        """Return new BlastFragmentsChimeraChecker object with specified params.

        params: dict of settings, layered over these defaults:
            'max_e_value': 1e-30, 'min_pct_id': 0.90,
            'num_fragments': 3, 'taxonomy_depth': 4
          It must contain 'id_to_taxonomy_fp', plus either a non-None
          'blast_db' or a 'reference_seqs_fp' from which a blast database
          is built.

        Raises ValueError when 'id_to_taxonomy_fp' is missing, or when
        neither 'blast_db' nor 'reference_seqs_fp' is available.
        """
        _params = {
            'max_e_value': 1e-30,
            'min_pct_id': 0.90,
            'num_fragments': 3,
            'taxonomy_depth': 4
        }
        _params.update(params)

        # NOTE(review): self.Name is read before ChimeraChecker.__init__
        # runs, so it is presumably a class-level attribute -- confirm.
        try:
            id_to_taxonomy_fp = params['id_to_taxonomy_fp']
        except KeyError:
            # the message names the key that is actually looked up
            raise ValueError("id_to_taxonomy_fp must be provided to %s" %
                             self.Name)

        # Create the blast database if it hasn't been provided (a missing
        # key and an explicit None are treated the same way)
        blast_db = params.get('blast_db')
        if blast_db is None:
            try:
                reference_seqs_fp = params['reference_seqs_fp']
            except KeyError:
                raise ValueError(
                    "reference_seqs_fp or blast_db must be provided to %s" %
                    self.Name)
            # keep the second return value so the files belonging to the
            # on-the-fly database stay known to this object
            blast_db, self._db_files_to_remove = \
                build_blast_db_from_fasta_path(reference_seqs_fp)
        else:
            # nothing was created here, so nothing is recorded for removal
            self._db_files_to_remove = []

        # the assigner uses its own parameter names for the two thresholds
        self._taxon_assigner = BlastTaxonAssigner({
            'blast_db': blast_db,
            'id_to_taxonomy_filepath': id_to_taxonomy_fp,
            'Max E value': _params['max_e_value'],
            'Min percent identity': _params['min_pct_id'],
        })

        ChimeraChecker.__init__(self, _params)
0
    def test_call_alt_input_types(self):
        """BlastTaxonAssigner.__call__ functions w alt input types

        Covers three ways of calling the assigner: with no input (must
        raise AssertionError), with a list of (seq_id, seq) pairs, and
        with a path to the input sequences.
        """
        p = BlastTaxonAssigner({
            'reference_seqs_filepath': self.reference_seqs_fp,
            'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
        })

        # neither seqs or seq_fp passed results in AssertionError
        self.assertRaises(AssertionError, p)

        # Functions with a list of (seq_id, seq) pairs; the context manager
        # closes the input file even if parsing raises
        with open(self.input_seqs_fp) as input_seqs_f:
            seqs = list(MinimalFastaParser(input_seqs_f))
        actual = p(seqs=seqs)
        self.assertEqual(actual, self.expected1)

        # Functions with input path
        actual = p(self.input_seqs_fp)
        self.assertEqual(actual, self.expected1)

        # same result when passing fp or seqs
        self.assertEqual(p(seqs=seqs), p(self.input_seqs_fp))
Exemplo n.º 17
0
    def test_call_logs_run(self):
        """BlastTaxonAssigner.__call__ logs the run when expected

        Runs the assigner with log_path set and compares the lines of the
        resulting log file, ignoring their order, with the expected ones.
        """
        log_path = get_tmp_filename(
            prefix='BlastTaxonAssignerTests_', suffix='.fasta')
        self._paths_to_clean_up.append(log_path)

        # build the blast database and keep track of the files to clean up
        blast_db, files_to_remove = \
            build_blast_db_from_fasta_path(self.reference_seqs_fp)
        self._paths_to_clean_up += files_to_remove

        p = BlastTaxonAssigner({
            'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
            'blast_db': blast_db,
        })
        # the return value is irrelevant here; only the written log matters
        p(self.input_seqs_fp, log_path=log_path)

        # context manager closes the log file even if reading it raises
        with open(log_path) as log_file:
            log_file_str = log_file.read()

        log_file_exp = [
            "BlastTaxonAssigner parameters:",
            'Min percent identity:0.9',
            'Application:blastn/megablast',
            'Max E value:1e-30',
            'Result path: None, returned as dict.',
            # NOTE(review): the [1:-1] slice drops the first and last
            # character of the path string -- presumably surrounding
            # quotes; confirm against how reference_seqs_fp is built.
            'blast_db:%s' % str(self.reference_seqs_fp)[1:-1],
            'id_to_taxonomy_filepath:%s' % self.id_to_taxonomy_fp,
            'Number of sequences inspected: 6',
            'Number with no blast hits: 1',
            '',
        ]
        # compare data in log file to fake expected log file
        # NOTE: Since p.params is a dict, the order of lines is not
        # guaranteed, so testing is performed to make sure that
        # the equal unordered lists of lines is present in actual and expected
        self.assertEqualItems(log_file_str.split('\n'), log_file_exp)