Python build_blast_db_from_fasta_path示例，cogent.app.formatdb.build_blast_db_from_fasta_path Python示例

示例#1

0

显示文件

文件： identify_chimeric_seqs.py 项目： HongweiZhou/qiime

    def _precommand_initiation(self, input_fp, output_dir, working_dir,
                               params):
        """Set up shared reference files before the chimera-check jobs run.

        The work done depends on ``params['chimera_detection_method']``:

        * ``'blast_fragments'``: a blast database is built from
          ``params['reference_seqs_fp']`` inside ``working_dir`` and stored
          in ``params['blast_db']``.
        * ``'ChimeraSlayer'``: the aligned reference file (and the unaligned
          one, if given) is copied into ``working_dir``; when no unaligned
          reference is given it is generated from the aligned copy. A blast
          database and a ``.cidx`` index are then built once, up front.

        ``params`` is modified in place and every file registered here is
        appended to ``self.files_to_remove``. ``input_fp`` and ``output_dir``
        are not used by this method.

        Raises:
            ValueError: if the chimera detection method is not recognized.
        """
        if params['chimera_detection_method'] == 'blast_fragments':
            blast_db, db_files_to_remove = \
                build_blast_db_from_fasta_path(params['reference_seqs_fp'],
                                               output_dir=working_dir)
            self.files_to_remove += db_files_to_remove
            params['blast_db'] = blast_db
        elif params['chimera_detection_method'] == 'ChimeraSlayer':
            # copy the reference files to working dir
            # ChimeraSlayer creates an index file of the ref and
            # will crash without write permission in the ref seqs dir
            aligned_reference_seqs_fp = params['aligned_reference_seqs_fp']
            _, new_ref_filename = split(aligned_reference_seqs_fp)
            copy(aligned_reference_seqs_fp, working_dir)
            aligned_reference_seqs_fp = working_dir + "/" + new_ref_filename

            self.files_to_remove.append(aligned_reference_seqs_fp)
            params['aligned_reference_seqs_fp'] = aligned_reference_seqs_fp

            # if given, also copy the unaligned ref db
            reference_seqs_fp = params['reference_seqs_fp']
            if reference_seqs_fp:
                _, new_ref_filename = split(reference_seqs_fp)
                copy(reference_seqs_fp, working_dir)
                reference_seqs_fp = working_dir + "/" + new_ref_filename
            else:
                # otherwise create it from the aligned copy; the handle is
                # closed as soon as the degapped file has been written
                # (previously it was left open)
                with open(aligned_reference_seqs_fp) as aligned_f:
                    reference_seqs_fp = write_degapped_fasta_to_file(
                        MinimalFastaParser(aligned_f),
                        tmp_dir=working_dir)
            # delete it afterwards
            self.files_to_remove.append(reference_seqs_fp)
            params['reference_seqs_fp'] = reference_seqs_fp

            # build blast db of reference, otherwise ChimeraSlayer will do it
            # and parallel jobs clash
            _, db_files_to_remove = \
                build_blast_db_from_fasta_path(reference_seqs_fp)
            self.files_to_remove += db_files_to_remove

            # make the index file globally
            # Reason: ChimeraSlayer first checks to see if the index file is
            # there. If not it tries to create it. This can lead to race
            # condition if several parallel jobs try to create it at the same
            # time.
            make_cidx_file(aligned_reference_seqs_fp)
            self.files_to_remove.append(aligned_reference_seqs_fp + ".cidx")
        else:
            raise ValueError("Unrecognized chimera detection method '%s'." %
                             params['chimera_detection_method'])

示例#2

0

显示文件

文件： identify_chimeric_seqs.py 项目： rob-knight/qiime

    def _precommand_initiation(self, input_fp, output_dir, working_dir,
                               params):
        """Create the reference resources shared by all chimera-check jobs.

        For the ``'blast_fragments'`` method a blast database is built from
        ``params['reference_seqs_fp']`` in ``working_dir`` and recorded as
        ``params['blast_db']``.

        For the ``'ChimeraSlayer'`` method the aligned reference file is
        copied to ``working_dir``; the unaligned reference is copied as well
        when given, or otherwise derived from the aligned copy. The blast
        database and the ``.cidx`` index are then created here, once.

        ``params`` is updated in place; all files registered by this method
        are added to ``self.files_to_remove``. ``input_fp`` and
        ``output_dir`` are accepted but not used.

        Raises:
            ValueError: for any other ``params['chimera_detection_method']``.
        """
        method = params['chimera_detection_method']

        if method == 'blast_fragments':
            blast_db, db_files_to_remove = build_blast_db_from_fasta_path(
                params['reference_seqs_fp'], output_dir=working_dir)
            self.files_to_remove += db_files_to_remove
            params['blast_db'] = blast_db
        elif method == 'ChimeraSlayer':
            # Copy the reference files to the working dir: ChimeraSlayer
            # creates an index file next to the reference and will crash
            # without write permission in the reference seqs dir.
            aligned_reference_seqs_fp = params['aligned_reference_seqs_fp']
            _, new_ref_filename = split(aligned_reference_seqs_fp)
            copy(aligned_reference_seqs_fp, working_dir)
            aligned_reference_seqs_fp = working_dir + "/" + new_ref_filename

            self.files_to_remove.append(aligned_reference_seqs_fp)
            params['aligned_reference_seqs_fp'] = aligned_reference_seqs_fp

            # If given, also copy the unaligned reference db.
            reference_seqs_fp = params['reference_seqs_fp']
            if reference_seqs_fp:
                _, new_ref_filename = split(reference_seqs_fp)
                copy(reference_seqs_fp, working_dir)
                reference_seqs_fp = working_dir + "/" + new_ref_filename
            else:
                # Otherwise create it. The aligned file is opened in a
                # ``with`` block so its handle no longer leaks.
                with open(aligned_reference_seqs_fp) as aligned_f:
                    reference_seqs_fp = write_degapped_fasta_to_file(
                        MinimalFastaParser(aligned_f),
                        tmp_dir=working_dir)
            # Delete it afterwards.
            self.files_to_remove.append(reference_seqs_fp)
            params['reference_seqs_fp'] = reference_seqs_fp

            # Build the blast db of the reference here, otherwise
            # ChimeraSlayer will do it and parallel jobs clash.
            _, db_files_to_remove = \
                build_blast_db_from_fasta_path(reference_seqs_fp)
            self.files_to_remove += db_files_to_remove

            # Make the index file globally. ChimeraSlayer first checks
            # whether the index file exists and creates it if not, which can
            # race when several parallel jobs try to create it at once.
            make_cidx_file(aligned_reference_seqs_fp)
            self.files_to_remove.append(aligned_reference_seqs_fp + ".cidx")
        else:
            raise ValueError("Unrecognized chimera detection method '%s'." %
                             method)

示例#3

0

显示文件

文件： test_assign_taxonomy.py 项目： carze/clovr-base

    def test_seqs_to_taxonomy(self):
        """BlastTaxonAssigner._seqs_to_taxonomy: functions as expected
        """
        p = BlastTaxonAssigner({
            'reference_seqs_filepath': self.reference_seqs_fp,
            'id_to_taxonomy_filepath': self.id_to_taxonomy_fp})

        # build the id_to_taxonomy_map as this test doesn't execute __call__
        id_to_taxonomy_map = {
            "AY800210":
            "Archaea;Euryarchaeota;Halobacteriales;uncultured",
            "EU883771":
            "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.",
            "EF503699":
            "Archaea;Crenarchaeota;uncultured;uncultured",
            "DQ260310":
            "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium",
            "EF503697":
            "Archaea;Crenarchaeota;uncultured;uncultured",
        }

        # build the blast database and keep track of the files to clean up
        blast_db, files_to_remove = \
            build_blast_db_from_fasta_path(self.reference_seqs_fp)
        self._paths_to_clean_up += files_to_remove

        # read the input file into (seq_id, seq) pairs; the parser output is
        # materialized inside the ``with`` block so the handle is closed
        # instead of being leaked
        with open(self.input_seqs_fp) as input_seqs_f:
            seqs = list(MinimalFastaParser(input_seqs_f))

        actual = p._seqs_to_taxonomy(seqs, blast_db, id_to_taxonomy_map)
        self.assertEqual(actual, self.expected1)

        # passing empty list of seqs functions as expected
        actual = p._seqs_to_taxonomy([], blast_db, id_to_taxonomy_map)
        self.assertEqual(actual, {})

示例#4

0

显示文件

文件： assign_taxonomy.py 项目： qinjunjie/qiime

 def _precommand_initiation(self, input_fp, output_dir, working_dir, params):
     """Build a shared blast database unless ``params["blast_db"]`` is set.

     When no database is supplied, one is built from
     ``params["reference_seqs_fp"]`` so that all procs access a single db
     rather than each creating its own. The new database is stored in
     ``params["blast_db"]`` and its files are added to
     ``self.files_to_remove``.
     """
     if params["blast_db"]:
         # A database was already provided; nothing to build.
         return

     built = build_blast_db_from_fasta_path(params["reference_seqs_fp"])
     db_path, created_files = built
     self.files_to_remove += created_files
     params["blast_db"] = db_path

示例#5

0

显示文件

文件： test_assign_taxonomy.py 项目： carze/clovr-base

 def test_seqs_to_taxonomy(self):
     """BlastTaxonAssigner._seqs_to_taxonomy: functions as expected
     """
     p = BlastTaxonAssigner({
         'reference_seqs_filepath': self.reference_seqs_fp,
         'id_to_taxonomy_filepath': self.id_to_taxonomy_fp})

     # build the id_to_taxonomy_map as this test doesn't execute __call__
     id_to_taxonomy_map = {
         "AY800210":
         "Archaea;Euryarchaeota;Halobacteriales;uncultured",
         "EU883771":
         "Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.",
         "EF503699":
         "Archaea;Crenarchaeota;uncultured;uncultured",
         "DQ260310":
         "Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium",
         "EF503697":
         "Archaea;Crenarchaeota;uncultured;uncultured",
     }

     # build the blast database and keep track of the files to clean up
     blast_db, files_to_remove = \
         build_blast_db_from_fasta_path(self.reference_seqs_fp)
     self._paths_to_clean_up += files_to_remove

     # read the input file into (seq_id, seq) pairs, closing the handle
     # once the list has been built (it used to be left open)
     with open(self.input_seqs_fp) as input_seqs_f:
         seqs = list(MinimalFastaParser(input_seqs_f))

     actual = p._seqs_to_taxonomy(seqs, blast_db, id_to_taxonomy_map)
     self.assertEqual(actual, self.expected1)

     # passing empty list of seqs functions as expected
     actual = p._seqs_to_taxonomy([], blast_db, id_to_taxonomy_map)
     self.assertEqual(actual, {})

示例#6

0

显示文件

文件： test_most_wanted_otus.py 项目： seangibbons/isme14

    def test_generate_new_otus_stats(self):
        """Test generating new OTU stats on valid input data."""
        exp = [('New.CleanUp.ReferenceOTU972',
            'ATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGATGTTTAAG'
            'TGGGATGTGAAATCCCCGGGCTTAACCTGGGGGCTGC', 'foo;bar;baz', 2, 60.0,
            4.349999999999994, {'Env2': 25.0, 'Env1': 35.0}),
            ('New.CleanUp.ReferenceOTU969',
            'ATACGTAGGTCCCGAGCGTTGTCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCATGGCAAG'
            'TCTGAAGTGAAAACCCAGGGCTCAACCCTGGGACTGC', 'foo;bar;baz', 2, 14.0,
            12.5, {'Env2': 8.0, 'Env1': 6.0}), ('New.CleanUp.ReferenceOTU964',
            'ATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGACGGCGAAGCAAG'
            'TCTGAAGTGAAAGCCCGGGGCTCAACCGCGGGACTGC', 'foo;bar;baz', 2, 5.0,
            14.769999999999996, {'Env2': 2.0, 'Env1': 3.0}),
            ('New.CleanUp.ReferenceOTU999',
            'ATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGATGTTTAAG'
            'TGGGATGTGAAATCCCCGGGCTTAACCTGGGGGCTGC', 'foo;bar;bazz', 1, 99.0,
            4.349999999999994, {'Env2': 0.0, 'Env1': 99.0})]

        ref_seqs_db, ref_seqs_db_files_to_remove = \
            build_blast_db_from_fasta_path(self.ref_seqs_f.name)
        # Remove the temporary blast database files even when the function
        # under test raises, so a failing run does not leave them on disk.
        try:
            obs = _generate_new_otus_stats(self.otu_table_f, self.rep_set_f,
                    self.ref_seqs_f, ref_seqs_db, self.mapping_f,
                    self.grouping_category, self.top_n)
        finally:
            remove_files(ref_seqs_db_files_to_remove)

        self.assertFloatEqual(obs, exp)

示例#7

0

显示文件

def qiime_blast_seqs(seqs,
                     blast_constructor=Blastall,
                     blast_program='blastn',
                     blast_db=None,
                     refseqs=None,
                     refseqs_fp=None,
                     blast_mat_root=None,
                     params=None,
                     WorkingDir=None,
                     seqs_per_blast_run=1000,
                     HALT_EXEC=False):
    """Blast list of sequences.

    seqs: a list (or object with list-like interface) of (seq_id, seq)
     tuples (e.g., the output of MinimalFastaParser)
    blast_constructor: callable used to create the blast application
    blast_program: value passed to blast as the "-p" parameter
    blast_db: existing blast database; used as-is when neither refseqs_fp
     nor refseqs is given
    refseqs: passed to build_blast_db_from_fasta_file to build a
     temporary database (only used when refseqs_fp is not given)
    refseqs_fp: fasta filepath passed to build_blast_db_from_fasta_path
     to build a temporary database
    params: optional dict of additional blast parameters; it is copied,
     so the caller's dict is never modified
    seqs_per_blast_run: number of sequences passed to each blast call

    Returns a BlastResult combining the results of all runs. Database
    files built by this function are removed before it returns, including
    when a blast run raises.
    """
    assert blast_db or refseqs_fp or refseqs, \
     'Must provide either a blast_db or a fasta '+\
     'filepath containing sequences to build one.'

    # Work on a private copy: the previous ``params={}`` default was shared
    # between calls and the caller's dict was mutated with "-d" / "-p".
    params = dict(params) if params else {}

    if refseqs_fp:
        blast_db, db_files_to_remove =\
         build_blast_db_from_fasta_path(refseqs_fp,output_dir=WorkingDir)
    elif refseqs:
        blast_db, db_files_to_remove =\
         build_blast_db_from_fasta_file(refseqs,output_dir=WorkingDir)
    else:
        db_files_to_remove = []

    try:
        params["-d"] = blast_db
        params["-p"] = blast_program

        blast_app = blast_constructor(
            params=params,
            blast_mat_root=blast_mat_root,
            InputHandler='_input_as_seq_id_seq_pairs',
            WorkingDir=WorkingDir,
            SuppressStderr=True,
            HALT_EXEC=HALT_EXEC)

        current_seqs = []
        blast_results = BlastResult([])
        for seq in seqs:
            current_seqs.append(seq)
            if len(current_seqs) % seqs_per_blast_run == 0:
                if blast_results:
                    blast_results.update(
                        BlastResult(blast_app(current_seqs)['StdOut']))
                else:
                    blast_results = \
                        BlastResult(blast_app(current_seqs)['StdOut'])
                current_seqs = []

        # clean-up run: blast the remaining sequences
        # NOTE(review): this also runs when no sequences remain, exactly as
        # before -- confirm blast_app handles an empty list as intended.
        blast_results.update(
            BlastResult(blast_app(current_seqs)['StdOut']))
    finally:
        # Always delete the temporary database files, even on failure.
        remove_files(db_files_to_remove)

    return blast_results

示例#8

0

显示文件

文件： blast.py 项目： rob-knight/qiime

 def _precommand_initiation(self, input_fp, output_dir, working_dir, params):
     """Build one shared blast database when ``params['refseqs_path']`` is set.

     The database is built from the reference sequences so that all procs
     access one db rather than creating one each. It is stored in
     ``params['blast_db']`` and its files are added to
     ``self.files_to_remove``. Nothing happens when ``refseqs_path`` is
     falsy.
     """
     if not params['refseqs_path']:
         return

     built = build_blast_db_from_fasta_path(params['refseqs_path'])
     db_path, created_files = built
     self.files_to_remove += created_files
     params['blast_db'] = db_path

示例#9

0

显示文件

 def _precommand_initiation(self, input_fp, output_dir, working_dir, params):
     """Ensure ``params['blast_db']`` is populated before jobs start.

     If no blast database was supplied, a single one is built from
     ``params['reference_seqs_fp']`` (so all procs access one db rather than
     creating one per proc), its files are added to
     ``self.files_to_remove`` and its name is written back to ``params``.
     """
     if params['blast_db']:
         # Caller provided a database already.
         return

     db_name, temp_db_files = \
         build_blast_db_from_fasta_path(params['reference_seqs_fp'])
     self.files_to_remove += temp_db_files
     params['blast_db'] = db_name

示例#10

0

显示文件

    def __call__(self,
                 seq_path,
                 result_path=None,
                 log_path=None,
                 blast_db=None,
                 refseqs_fp=None):
        """Cluster the sequences in seq_path and return or write the result.

        seq_path: path of the fasta file holding the sequences to cluster
        result_path: if given, clusters are written there (one
         tab-separated line per cluster) and None is returned
        log_path: if given, a log of the run is written there
        blast_db: existing blast database to use
        refseqs_fp: fasta filepath used to build a blast database when
         blast_db is not given

        Returns the clusters as a dict when result_path is not given,
        otherwise None. Database files built here are removed at the end.
        """
        self.log_lines = []

        if not blast_db:
            self.blast_db, self.db_files_to_remove = \
                build_blast_db_from_fasta_path(refseqs_fp)
            self.log_lines.append('Reference seqs fp (to build blast db): %s'%\
             refseqs_fp)
        else:
            self.blast_db = blast_db
            self.db_files_to_remove = []

        self.log_lines.append('Blast database: %s' % self.blast_db)

        # Close the sequence file once clustering is done; the handle used
        # to be left open.
        with open(seq_path) as seq_f:
            clusters, failures = self._cluster_seqs(MinimalFastaParser(seq_f))
        self.log_lines.append('Num OTUs: %d' % len(clusters))

        if result_path:
            # if the user provided a result_path, write the
            # results to file with one tab-separated line per
            # cluster
            with open(result_path, 'w') as of:
                for cluster_id, cluster in clusters.items():
                    of.write('%s\t%s\n' % (cluster_id, '\t'.join(cluster)))
            result = None
            self.log_lines.append('Result path: %s\n' % result_path)
        else:
            # if the user did not provide a result_path, store
            # the clusters in a dict of {otu_id:[seq_ids]}, where
            # otu_id is arbitrary
            result = clusters
            self.log_lines.append('Result path: None, returned as dict.')

        if log_path:
            # if the user provided a log file path, log the run; the log
            # file is now closed (and flushed) instead of being leaked
            with open(log_path, 'w') as log_file:
                self.log_lines = [str(self)] + self.log_lines
                # NOTE(review): when result_path is not given, the last
                # log line has no trailing newline, so 'Num failures' is
                # appended directly to it. Left as-is to keep the log
                # output unchanged.
                log_file.write('\n'.join(self.log_lines))
                failures.sort()
                log_file.write('Num failures: %d\n' % len(failures))
                log_file.write('Failures: %s\n' % '\t'.join(failures))

        remove_files(self.db_files_to_remove, error_on_missing=False)
        # return the result (note this is None if the data was
        # written to file)
        return result

示例#11

0

显示文件

文件： align_seqs.py 项目： qiime/qp-refactor

 def _precommand_initiation(self,input_fp,output_dir,working_dir,params):
     """Prepare the shared blast database and the minimum alignment length.

     If ``params['blast_db']`` is not set, a blast database is built from
     ``params['template_fp']``, stored in ``params['blast_db']``, and its
     files are added to ``self.files_to_remove``.

     If ``params['min_length']`` is negative, it is replaced by the value
     ``compute_min_alignment_length`` returns for the input file.
     """
     if not params['blast_db']:
         # Build the blast database from the template_fp -- all procs
         # will then access one db rather than create one per proc
         blast_db, db_files_to_remove = \
              build_blast_db_from_fasta_path(params['template_fp'])
         self.files_to_remove += db_files_to_remove
         params['blast_db'] = blast_db

     if params['min_length'] < 0:
         # Open the input in a ``with`` block so the handle is closed
         # after the length is computed (it used to be leaked).
         with open(input_fp,'U') as input_f:
             params['min_length'] = compute_min_alignment_length(input_f)

示例#12

0

显示文件

 def _precommand_initiation(self,input_fp,output_dir,working_dir,params):
     """Fill in ``params['blast_db']`` and ``params['min_length']`` if unset.

     A blast database is built from ``params['template_fp']`` when
     ``params['blast_db']`` is falsy; the files it creates are added to
     ``self.files_to_remove``. A negative ``params['min_length']`` is
     replaced by the result of ``compute_min_alignment_length`` on the
     input file.
     """
     if not params['blast_db']:
         # Build the blast database from the template_fp -- all procs
         # will then access one db rather than create one per proc
         blast_db, db_files_to_remove = \
              build_blast_db_from_fasta_path(params['template_fp'])
         self.files_to_remove += db_files_to_remove
         params['blast_db'] = blast_db

     if params['min_length'] < 0:
         # The input file handle is now closed explicitly rather than
         # being left for garbage collection.
         with open(input_fp,'U') as input_seqs_f:
             min_length = compute_min_alignment_length(input_seqs_f)
         params['min_length'] = min_length

示例#13

0

显示文件

文件： test_assign_taxonomy.py 项目： carze/clovr-base

    def test_call_existing_blast_db(self):
        """BlastTaxonAssigner.__call__ functions w existing db
        """
        # Build the database up front and register its files for clean-up.
        db_path, db_files = build_blast_db_from_fasta_path(
            self.reference_seqs_fp)
        self._paths_to_clean_up += db_files

        assigner_params = {
            'blast_db': db_path,
            'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
        }
        assigner = BlastTaxonAssigner(assigner_params)
        observed = assigner(self.input_seqs_fp)

        self.assertEqual(observed, self.expected1)

示例#14

0

显示文件

文件： test_assign_taxonomy.py 项目： carze/clovr-base

 def test_call_existing_blast_db(self):
     """BlastTaxonAssigner.__call__ functions w existing db
     """
     # Create the blast database first; its files are cleaned up later.
     built = build_blast_db_from_fasta_path(self.reference_seqs_fp)
     existing_db, created_files = built
     self._paths_to_clean_up += created_files

     taxon_assigner = BlastTaxonAssigner({
         'blast_db': existing_db,
         'id_to_taxonomy_filepath': self.id_to_taxonomy_fp})
     result = taxon_assigner(self.input_seqs_fp)

     self.assertEqual(result, self.expected1)

示例#15

0

显示文件

文件： pick_otus.py 项目： Ecogenomics/FrankenQIIME

    def __call__(self, seq_path, result_path=None, log_path=None, blast_db=None, refseqs_fp=None):
        """Cluster the sequences in ``seq_path``.

        Args:
            seq_path: path of the fasta file with the sequences to cluster.
            result_path: optional path; when given, one tab-separated line
                per cluster is written there and ``None`` is returned.
            log_path: optional path; when given, a log of the run is
                written there.
            blast_db: existing blast database to use.
            refseqs_fp: fasta filepath used to build a blast database when
                ``blast_db`` is not given.

        Returns:
            The clusters as a dict when ``result_path`` is not given,
            otherwise ``None``.
        """
        self.log_lines = []

        if not blast_db:
            self.blast_db, self.db_files_to_remove = build_blast_db_from_fasta_path(refseqs_fp)
            self.log_lines.append("Reference seqs fp (to build blast db): %s" % refseqs_fp)
        else:
            self.blast_db = blast_db
            self.db_files_to_remove = []

        self.log_lines.append("Blast database: %s" % self.blast_db)

        # The sequence file is closed as soon as clustering finishes;
        # previously the handle was never closed.
        with open(seq_path) as seq_file:
            clusters, failures = self._cluster_seqs(MinimalFastaParser(seq_file))
        self.log_lines.append("Num OTUs: %d" % len(clusters))

        if result_path:
            # if the user provided a result_path, write the
            # results to file with one tab-separated line per
            # cluster
            with open(result_path, "w") as of:
                for cluster_id, cluster in clusters.items():
                    of.write("%s\t%s\n" % (cluster_id, "\t".join(cluster)))
            result = None
            self.log_lines.append("Result path: %s\n" % result_path)
        else:
            # if the user did not provide a result_path, store
            # the clusters in a dict of {otu_id:[seq_ids]}, where
            # otu_id is arbitrary
            result = clusters
            self.log_lines.append("Result path: None, returned as dict.")

        if log_path:
            # if the user provided a log file path, log the run; the file
            # is closed on exit from the ``with`` block (it used to leak)
            with open(log_path, "w") as log_file:
                self.log_lines = [str(self)] + self.log_lines
                # NOTE(review): without a result_path the final log line
                # carries no trailing newline, so "Num failures" follows it
                # on the same line. Kept unchanged to preserve log output.
                log_file.write("\n".join(self.log_lines))
                failures.sort()
                log_file.write("Num failures: %d\n" % len(failures))
                log_file.write("Failures: %s\n" % "\t".join(failures))

        remove_files(self.db_files_to_remove, error_on_missing=False)
        # return the result (note this is None if the data was
        # written to file)
        return result

示例#16

0

显示文件

文件： identify_chimeric_seqs.py 项目： franny911/qiime

    def __init__(self, params):
        """Return new BlastFragmentsChimeraChecker object with specified params.

        Values in ``params`` override the built-in defaults. A blast
        database is built from ``params['reference_seqs_fp']`` unless
        ``params['blast_db']`` is present and not None; the files created
        for it are kept in ``self._db_files_to_remove``.

        Raises ValueError when ``params`` lacks ``'id_to_taxonomy_fp'``, or
        lacks ``'reference_seqs_fp'`` while no blast database is supplied.
        """
        settings = {'max_e_value': 1e-30, 'min_pct_id': 0.90,
                    'num_fragments': 3, 'taxonomy_depth': 4}
        settings.update(params)

        try:
            id_to_taxonomy_fp = params['id_to_taxonomy_fp']
        except KeyError:
            raise ValueError("id_to_taxonomy_filepath must be provided to %s" %
                             self.Name)

        # Use the supplied blast database when there is one; otherwise
        # build it from the reference sequences.
        supplied_db = params['blast_db'] if 'blast_db' in params else None
        if supplied_db is not None:
            blast_db = supplied_db
            self._db_files_to_remove = []
        else:
            try:
                reference_seqs_fp = params['reference_seqs_fp']
            except KeyError:
                raise ValueError(
                    "refseqs_fp or blast_db must be provided to  %s" %
                    self.Name)
            built = build_blast_db_from_fasta_path(reference_seqs_fp)
            blast_db, self._db_files_to_remove = built

        assigner_params = {
            'blast_db': blast_db,
            'id_to_taxonomy_filepath': id_to_taxonomy_fp,
            'Max E value': settings['max_e_value'],
            'Min percent identity': settings['min_pct_id'],
        }
        self._taxon_assigner = BlastTaxonAssigner(assigner_params)

        ChimeraChecker.__init__(self, settings)

示例#17

0

显示文件

文件： test_formatdb.py 项目： yesimon/pycogent

    def test_build_blast_db_from_fasta_path_aln(self):
        """build_blast_db_from_fasta_path works with alignment as input
        """
        db_name, created_files = build_blast_db_from_fasta_path(self.in_aln1_fp)
        self.assertEqual(db_name, self.in_aln1_fp)

        # One file per extension is expected next to the database name.
        extensions = [".nhr", ".nin", ".nsq", ".nsd", ".nsi", ".log"]
        expected_files = set()
        for extension in extensions:
            expected_files.add(db_name + extension)
        self.assertEqual(set(created_files), expected_files)

        # Blasting against the new database returns a result.
        hits = blastn(self.test_seq, blast_db=db_name, e_value=0.0)
        self.assertEqual(len(hits), 1)

        # Every reported database file exists ...
        for path in created_files:
            self.assertTrue(exists(path))

        # ... and is gone again after removal.
        remove_files(created_files)
        for path in created_files:
            self.assertFalse(exists(path))

示例#18

0

显示文件

文件： test_formatdb.py 项目： yesimon/pycogent

    def test_build_blast_db_from_fasta_path(self):
        """build_blast_db_from_fasta_path convenience function works as expected
        """
        db_name, created_files = build_blast_db_from_fasta_path(self.in_seqs1_fp)
        self.assertEqual(db_name, self.in_seqs1_fp)

        # The database files are the input path plus each extension.
        expected_files = set()
        for extension in [".nhr", ".nin", ".nsq", ".nsd", ".nsi", ".log"]:
            expected_files.add(self.in_seqs1_fp + extension)
        self.assertEqual(set(created_files), expected_files)

        # Blasting against the new database returns a result.
        hits = blastn(self.test_seq, blast_db=db_name)
        self.assertEqual(len(hits), 1)

        # All database files exist before removal ...
        for path in created_files:
            self.assertTrue(exists(path))

        remove_files(created_files)

        # ... and none of them exist afterwards.
        for path in created_files:
            self.assertFalse(exists(path))

示例#19

0

显示文件

    def test_build_blast_db_from_fasta_path_aln(self):
        """build_blast_db_from_fasta_path works with alignment as input
        """
        built = build_blast_db_from_fasta_path(self.in_aln1_fp)
        aln_db, aln_db_files = built
        self.assertEqual(aln_db, self.in_aln1_fp)

        suffixes = ['.nhr', '.nin', '.nsq', '.nsd', '.nsi', '.log']
        expected_paths = set()
        for suffix in suffixes:
            expected_paths.add(aln_db + suffix)
        self.assertEqual(set(aln_db_files), expected_paths)

        # A blast against the freshly built database yields one result.
        blast_hits = blastn(self.test_seq, blast_db=aln_db, e_value=0.0)
        self.assertEqual(len(blast_hits), 1)

        # The database files are present until they are removed.
        for db_file in aln_db_files:
            self.assertTrue(exists(db_file))

        remove_files(aln_db_files)

        # After removal none of the files remain.
        for db_file in aln_db_files:
            self.assertFalse(exists(db_file))

示例#20

0

显示文件

文件： identify_chimeric_seqs.py 项目： TheSchwa/qiime

    def __init__(self, params):
        """Return new BlastFragmentsChimeraChecker object with specified params.

        Defaults are applied first and then overridden by ``params``. When
        ``params`` carries a non-None ``'blast_db'`` it is used directly;
        otherwise a database is built from ``params['reference_seqs_fp']``
        and the files created are stored in ``self._db_files_to_remove``.

        Raises ValueError if ``'id_to_taxonomy_fp'`` is missing, or if
        ``'reference_seqs_fp'`` is missing when a database must be built.
        """
        merged_params = {
            'max_e_value': 1e-30,
            'min_pct_id': 0.90,
            'num_fragments': 3,
            'taxonomy_depth': 4,
        }
        merged_params.update(params)

        try:
            id_to_taxonomy_fp = params['id_to_taxonomy_fp']
        except KeyError:
            raise ValueError(
                "id_to_taxonomy_filepath must be provided to %s" %
                self.Name)

        if 'blast_db' in params and params['blast_db'] is not None:
            # A ready-made blast database was provided.
            blast_db = params['blast_db']
            self._db_files_to_remove = []
        else:
            # Create the blast database since it hasn't been provided.
            try:
                reference_seqs_fp = params['reference_seqs_fp']
            except KeyError:
                raise ValueError(
                    "refseqs_fp or blast_db must be provided to  %s" %
                    self.Name)
            blast_db, self._db_files_to_remove = \
                build_blast_db_from_fasta_path(reference_seqs_fp)

        taxon_assigner_params = {}
        taxon_assigner_params['blast_db'] = blast_db
        taxon_assigner_params['id_to_taxonomy_filepath'] = id_to_taxonomy_fp
        taxon_assigner_params['Max E value'] = merged_params['max_e_value']
        taxon_assigner_params['Min percent identity'] = \
            merged_params['min_pct_id']
        self._taxon_assigner = BlastTaxonAssigner(taxon_assigner_params)

        ChimeraChecker.__init__(self, merged_params)

示例#21

0

显示文件

文件： test_assign_taxonomy.py 项目： carze/clovr-base

    def test_call_logs_run(self):
        """BlastTaxonAssigner.__call__ logs the run when expected
        """
        log_path = get_tmp_filename(
            prefix='BlastTaxonAssignerTests_', suffix='.fasta')
        self._paths_to_clean_up.append(log_path)

        # Build the blast database; its files are cleaned up afterwards.
        built = build_blast_db_from_fasta_path(self.reference_seqs_fp)
        db_path, db_files = built
        self._paths_to_clean_up += db_files

        assigner = BlastTaxonAssigner({
            'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
            'blast_db': db_path})
        actual = assigner(self.input_seqs_fp, log_path=log_path)

        with open(log_path) as log_file:
            observed_log = log_file.read()

        trimmed_ref_fp = str(self.reference_seqs_fp)[1:-1]
        expected_lines = [
            "BlastTaxonAssigner parameters:",
            'Min percent identity:0.9',
            'Application:blastn/megablast',
            'Max E value:1e-30',
            'Result path: None, returned as dict.',
            'blast_db:%s' % trimmed_ref_fp,
            'id_to_taxonomy_filepath:%s' % self.id_to_taxonomy_fp,
            'Number of sequences inspected: 6',
            'Number with no blast hits: 1',
            '',
        ]
        # The parameters come from a dict, so the order of the log lines is
        # not guaranteed; compare the lines as unordered collections.
        observed_lines = observed_log.split('\n')
        self.assertEqualItems(observed_lines, expected_lines)

示例#22

0

显示文件

文件： test_assign_taxonomy.py 项目： carze/clovr-base

    def test_call_logs_run(self):
        """BlastTaxonAssigner.__call__ logs the run when expected
        """
        tmp_log_fp = get_tmp_filename(
            prefix='BlastTaxonAssignerTests_', suffix='.fasta')
        self._paths_to_clean_up.append(tmp_log_fp)

        # Create the blast database and remember its files for clean-up.
        ref_db, ref_db_files = build_blast_db_from_fasta_path(
            self.reference_seqs_fp)
        self._paths_to_clean_up += ref_db_files

        assigner_params = {
            'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
            'blast_db': ref_db,
        }
        taxon_assigner = BlastTaxonAssigner(assigner_params)
        actual = taxon_assigner(self.input_seqs_fp, log_path=tmp_log_fp)

        with open(tmp_log_fp) as log_f:
            log_text = log_f.read()

        blast_db_line = 'blast_db:%s' % str(self.reference_seqs_fp)[1:-1]
        taxonomy_line = \
            'id_to_taxonomy_filepath:%s' % self.id_to_taxonomy_fp
        log_lines_exp = [
            "BlastTaxonAssigner parameters:",
            'Min percent identity:0.9',
            'Application:blastn/megablast',
            'Max E value:1e-30',
            'Result path: None, returned as dict.',
            blast_db_line,
            taxonomy_line,
            'Number of sequences inspected: 6',
            'Number with no blast hits: 1',
            '',
        ]
        # Line order is not guaranteed because the parameters are held in a
        # dict, so actual and expected are compared as unordered lists.
        self.assertEqualItems(log_text.split('\n'), log_lines_exp)

示例#23

0

显示文件

文件： test_assign_taxonomy.py 项目： carze/clovr-base

    def test_get_blast_hits(self):
        """BlastTaxonAssigner._get_blast_hits functions w existing db
        """
        # Build the blast database and register its files for clean-up.
        db_path, db_files = build_blast_db_from_fasta_path(
            self.reference_seqs_fp)
        self._paths_to_clean_up += db_files

        assigner = BlastTaxonAssigner({})
        hits_by_query = assigner._get_blast_hits(db_path, self.test_seqs)

        # Maps each query id in the test sequences to the id of the
        # reference sequence it is expected to match.
        expected_matches = {
            's1': 'AY800210',
            's2': 'EU883771',
            's3': 'EF503699',
            's4': 'DQ260310',
            's5': 'EF503697',
        }

        # s6 is a randomly generated sequence and must have no hits.
        self.assertEqual(hits_by_query['s6'], [])

        # Every other query must hit its expected reference sequence.
        for query_id, reference_id in expected_matches.items():
            hits = dict(hits_by_query[query_id])
            # Checking membership first is redundant, but gives a useful
            # error message if the hit is missing (e.g. because no blast
            # results were returned).
            self.assertTrue(reference_id in hits)
            # The perfect match should have an e-value of 0.0 on this data.
            self.assertEqual(hits[reference_id], 0.0)

示例#24

0

显示文件

    def test_build_blast_db_from_fasta_path(self):
        """build_blast_db_from_fasta_path convenience function works as expected
        """
        built = build_blast_db_from_fasta_path(self.in_seqs1_fp)
        seqs_db, seqs_db_files = built
        self.assertEqual(seqs_db, self.in_seqs1_fp)

        suffixes = ['.nhr', '.nin', '.nsq', '.nsd', '.nsi', '.log']
        expected_paths = set()
        for suffix in suffixes:
            expected_paths.add(self.in_seqs1_fp + suffix)
        self.assertEqual(set(seqs_db_files), expected_paths)

        # A blast against the freshly built database yields one result.
        blast_hits = blastn(self.test_seq, blast_db=seqs_db)
        self.assertEqual(len(blast_hits), 1)

        # The database files are present until they are removed.
        for db_file in seqs_db_files:
            self.assertTrue(exists(db_file))

        remove_files(seqs_db_files)

        # After removal none of the files remain.
        for db_file in seqs_db_files:
            self.assertFalse(exists(db_file))

示例#25

0

显示文件

文件： test_assign_taxonomy.py 项目： carze/clovr-base

 def test_get_blast_hits(self):
     """BlastTaxonAssigner._get_blast_hits functions w existing db
     """
     # Create the blast database; its files are removed during clean-up.
     built = build_blast_db_from_fasta_path(self.reference_seqs_fp)
     reference_db, reference_db_files = built
     self._paths_to_clean_up += reference_db_files

     taxon_assigner = BlastTaxonAssigner({})
     all_hits = taxon_assigner._get_blast_hits(reference_db, self.test_seqs)

     # Query id in the test sequences -> id of the expected reference
     # sequence.
     expected_matches = {
         's1': 'AY800210',
         's2': 'EU883771',
         's3': 'EF503699',
         's4': 'DQ260310',
         's5': 'EF503697'}

     # The randomly generated sequence s6 must not produce any hit.
     s6_hits = all_hits['s6']
     self.assertEqual(s6_hits, [])

     # All other queries must hit their expected reference sequence.
     for seq_id, expected_ref_id in expected_matches.items():
         hits_for_seq = dict(all_hits[seq_id])
         # The membership check is redundant but produces a useful error
         # message when the hit is absent (e.g. no blast results at all).
         self.assertTrue(expected_ref_id in hits_for_seq)
         # On this data the perfect match should get a 0.0 e-value.
         self.assertEqual(hits_for_seq[expected_ref_id], 0.0)

示例#26

0

显示文件

文件： parallel_pick_otus_blast.py 项目： Ecogenomics/FrankenQIIME

def main():
    """Set up and launch a parallel blast-based OTU picking run.

    Parses the command line, builds a blast database from the reference
    sequences unless an existing one was supplied, splits the input fasta
    file into per-job chunks, writes the jobs file (plus the poller's
    bookkeeping files unless polling is suppressed), submits the jobs unless
    submission is suppressed and, when poll_directly is set, runs the poller
    from this process.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Use the same truthiness test as the database selection below, so an
    # empty value is rejected here instead of reaching the database builder.
    if not opts.blast_db and not opts.refseqs_fp:
        option_parser.error('Either blast_db or refseqs_fp must be provided.')

    # create local copies of command-line options
    python_exe_fp = opts.python_exe_fp
    pick_otus_fp = opts.pick_otus_fp
    refseqs_fp = opts.refseqs_fp
    cluster_jobs_fp = opts.cluster_jobs_fp
    input_fasta_fp = opts.input_fasta_fp
    jobs_to_start = opts.jobs_to_start
    output_dir = opts.output_dir
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    max_e_value = opts.max_e_value
    similarity = opts.similarity
    poll_directly = opts.poll_directly
    min_aligned_percent = opts.min_aligned_percent

    created_temp_paths = []

    if not opts.blast_db:
        # Build the blast database from the reference_seqs_fp -- all procs
        # will then access one db rather than create one per proc
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(refseqs_fp)
        created_temp_paths += db_files_to_remove
    else:
        blast_db = opts.blast_db

    # base name of the input file, without directory or extension
    input_fasta_fn = split(input_fasta_fp)[1]
    input_file_basename = splitext(input_fasta_fn)[0]

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with POTU
    job_prefix = opts.job_prefix or get_random_job_prefix('POTU')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir, job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # treated as "working dir already exists"; it is then not
        # registered for deletion
        pass

    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp, jobs_to_start)

    # split the fasta files and get the list of resulting files; the input
    # handle is closed as soon as the split is done
    with open(input_fasta_fp) as input_fasta_f:
        tmp_fasta_fps = split_fasta(input_fasta_f, num_seqs_per_file,
                                    job_prefix, working_dir=output_dir)
    created_temp_paths += tmp_fasta_fps

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes and the list of
    # output files generated by each job
    commands, job_result_filepaths = get_job_commands(
        python_exe_fp, pick_otus_fp, tmp_fasta_fps, output_dir, blast_db,
        job_prefix, working_dir, max_e_value, similarity,
        min_aligned_percent)
    created_temp_paths += job_result_filepaths

    # Stays None when polling is suppressed, so the direct-poll step at the
    # end can tell that there is nothing to run.
    poller_command = None

    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths, expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which described how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        process_run_results_f = \
            'qiime.parallel.pick_otus_blast.parallel_blast_process_run_results_f'
        write_merge_map_file_pick_otus(job_result_filepaths, output_dir,
                                       merge_map_filepath,
                                       input_file_basename)
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            poller_command, poller_result_filepaths = get_poller_command(
                python_exe_fp, poller_fp, expected_files_filepath,
                merge_map_filepath, deletion_list_filepath,
                process_run_results_f, seconds_to_sleep=seconds_to_sleep)
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            # the poller is run from this process below, so it is not
            # added to the jobs file
            poller_command, poller_result_filepaths = get_poller_command(
                python_exe_fp, poller_fp, expected_files_filepath,
                merge_map_filepath, deletion_list_filepath,
                process_run_results_f, seconds_to_sleep=seconds_to_sleep,
                command_prefix='', command_suffix='')
        created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths,
                                    deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([], deletion_list_filepath)

    # write the commands to the 'jobs files'
    write_jobs_file(commands, job_prefix=job_prefix, jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix)

    # Poll from this process only when a poller command was actually built;
    # with polling suppressed there is none, and referencing it used to
    # raise a NameError after the jobs had already been submitted.
    if poll_directly and poller_command is not None:
        try:
            check_call(poller_command.split())
        except CalledProcessError as e:
            print('**Error occuring when calling the poller directly. ' +
                  'Jobs may have been submitted, but are not being polled.')
            print(str(e))
            exit(-1)

示例#27

0

显示文件

文件： assign_taxonomy.py 项目： andrea-campisano/qiime

    def __call__(self, seq_path=None, seqs=None, result_path=None, log_path=None):
        """Assign taxonomy to each input sequence based on its blast hits.

        seq_path: path to a fasta file of query sequences; when given it
         is used in place of seqs
        seqs: iterable of (seq_id, seq) pairs, used when seq_path is not
         given
        result_path: if provided, results are written to this path as
         tab-separated lines (seq_id, lineage, confidence, blast hit id)
         and None is returned
        log_path: passed to self._get_logger; the summary counts are only
         computed and logged when it is not None

        Returns a dict mapping seq_id to the values produced by
         self._seqs_to_taxonomy (unpacked below as lineage, confidence,
         blast hit id), or None if result_path was provided.

        The blast database is self.Params['blast_db'] if that key is
         present; otherwise a temporary one is built from
         self.Params['reference_seqs_filepath'] and its files are removed
         before returning, including when an error is raised.
        """
        assert seq_path or seqs, \
         "Must provide either seqs or seq_path when calling a BlastTaxonAssigner."

        # initialize the logger
        logger = self._get_logger(log_path)
        logger.info(str(self))

        # Everything recorded in these two lists is released in the finally
        # clause below. db_files_to_remove stays empty when a pre-existing
        # database is used, so cleanup is safe even if Params contains both
        # 'blast_db' and 'reference_seqs_filepath'.
        db_files_to_remove = []
        open_files = []
        try:
            try:
                blast_db = self.Params['blast_db']
            except KeyError:
                # build a temporary blast_db
                reference_seqs_path = self.Params['reference_seqs_filepath']
                blast_db, db_files_to_remove = \
                 build_blast_db_from_fasta_path(reference_seqs_path)

            # build the mapping of sequence identifier
            # (wrt to the blast db seqs) to taxonomy
            taxonomy_f = open(self.Params['id_to_taxonomy_filepath'],'U')
            open_files.append(taxonomy_f)
            id_to_taxonomy_map = self._parse_id_to_taxonomy_file(taxonomy_f)

            if seq_path:
                # Get a seq iterator
                seq_f = open(seq_path)
                open_files.append(seq_f)
                seqs = MinimalFastaParser(seq_f)

            # Sequences are blasted in chunks of self.SeqsPerBlastRun.
            # Reading everything at once can produce a very large object,
            # while blasting one sequence at a time means writing and
            # removing a file per sequence, since blast requires a filepath.
            current_seqs = []
            result = {}
            for seq_id, seq in seqs:
                current_seqs.append((seq_id,seq))
                if len(current_seqs) == self.SeqsPerBlastRun:
                    # chunk is full: blast it and start a new one
                    result.update(self._seqs_to_taxonomy(
                     current_seqs,blast_db,id_to_taxonomy_map))
                    current_seqs = []
            # Assign taxonomy to the remaining sequences
            result.update(self._seqs_to_taxonomy(
             current_seqs,blast_db,id_to_taxonomy_map))

            # Write log data if we have a path (while the logger can handle
            # being called if we are not logging, some of these steps are
            # slow).
            if log_path is not None:
                num_inspected = len(result)
                logger.info('Number of sequences inspected: %s' % num_inspected)
                num_null_hits = [r[1] for r in result.values()].count(None)
                logger.info('Number with no blast hits: %s' % num_null_hits)

            if result_path:
                # if the user provided a result_path, write the
                # results to file and return nothing
                with open(result_path,'w') as of:
                    for seq_id, (lineage, confidence, blast_hit_id) in \
                     result.items():
                        of.write('%s\t%s\t%s\t%s\n' %
                         (seq_id, lineage, confidence, blast_hit_id))
                result = None
                logger.info('Result path: %s' % result_path)
            else:
                # if no result_path was provided, return the data as a dict
                logger.info('Result path: None, returned as dict.')
        finally:
            for open_file in open_files:
                open_file.close()
            # clean-up temp blastdb files, if a temp blastdb was created
            for db_fp in db_files_to_remove:
                remove(db_fp)

        # return the result
        return result

示例#28

0

显示文件

文件： parallel_blast.py 项目： Ecogenomics/FrankenQIIME

def main():
    """Set up and launch a parallel blast run.

    Parses the command line, splits the input fasta file into per-job
    chunks, formats the reference sequences as a blast database unless that
    is suppressed, writes the jobs file (plus the poller's bookkeeping files
    unless polling is suppressed), submits the jobs unless submission is
    suppressed and, when poll_directly is set, runs the poller from this
    process.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    # create local copies of command-line options
    input_fasta_fp = opts.infile_path
    refseqs_path = opts.refseqs_path
    python_exe_fp = opts.python_exe_fp
    num_jobs_to_start = opts.jobs_to_start
    blastall_fp = opts.blastall_fp
    blastmat_dir = opts.blastmat_dir
    cluster_jobs_fp = opts.cluster_jobs_fp
    e_value = opts.e_value
    word_size = opts.word_size
    num_hits = opts.num_hits
    output_dir = opts.output_dir
    suppress_format_blastdb = opts.suppress_format_blastdb
    poller_fp = opts.poller_fp
    retain_temp_files = opts.retain_temp_files
    suppress_polling = opts.suppress_polling
    seconds_to_sleep = opts.seconds_to_sleep
    poll_directly = opts.poll_directly
    disable_low_complexity_filter = opts.disable_low_complexity_filter

    created_temp_paths = []

    # base name of the input file, without directory or extension
    input_fasta_fn = split(input_fasta_fp)[1]
    input_file_basename = splitext(input_fasta_fn)[0]

    # set the job_prefix either based on what the user passed in,
    # or a random string beginning with BLAST
    job_prefix = opts.job_prefix or get_random_job_prefix('BLAST')

    # A temporary output directory is created in output_dir named
    # job_prefix. Output files are then moved from the temporary
    # directory to the output directory when they are complete, allowing
    # a poller to detect when runs complete by the presence of their
    # output files.
    working_dir = '%s/%s' % (output_dir, job_prefix)
    try:
        makedirs(working_dir)
        created_temp_paths.append(working_dir)
    except OSError:
        # treated as "working dir already exists"; it is then not
        # registered for deletion
        pass

    # compute the number of sequences that should be included in
    # each file after splitting the input fasta file
    num_seqs_per_file = compute_seqs_per_file(input_fasta_fp,
                                              num_jobs_to_start)

    # Build the blast database if necessary. The db files are registered
    # for deletion only when they were created here; doing so
    # unconditionally raised a NameError whenever formatting was suppressed.
    # The returned db name is not needed: the job commands are given
    # refseqs_path.
    if not suppress_format_blastdb:
        _, db_files_to_remove = build_blast_db_from_fasta_path(refseqs_path)
        created_temp_paths += db_files_to_remove

    # split the fasta files and get the list of resulting files; the input
    # handle is closed as soon as the split is done
    with open(input_fasta_fp) as input_fasta_f:
        tmp_fasta_fps = split_fasta(input_fasta_f, num_seqs_per_file,
                                    job_prefix, output_dir)
    created_temp_paths += tmp_fasta_fps

    # build the filepath for the 'jobs script'
    jobs_fp = '%s/%sjobs.txt' % (output_dir, job_prefix)
    created_temp_paths.append(jobs_fp)

    # generate the list of commands to be pushed out to nodes
    commands, job_result_filepaths = get_commands(
        tmp_fasta_fps, refseqs_path, blastall_fp, blastmat_dir, e_value,
        word_size, num_hits, output_dir, working_dir,
        disable_low_complexity_filter=disable_low_complexity_filter,
        command_prefix=None, command_suffix=None)
    created_temp_paths += job_result_filepaths

    # Stays None when polling is suppressed, so the direct-poll step at the
    # end can tell that there is nothing to run.
    poller_command = None

    # Set up poller apparatus if the user does not suppress polling
    if not suppress_polling:
        # Write the list of files which must exist for the jobs to be
        # considered complete
        expected_files_filepath = '%s/expected_out_files.txt' % working_dir
        write_filepaths_to_file(job_result_filepaths, expected_files_filepath)
        created_temp_paths.append(expected_files_filepath)

        # Write the mapping file which described how the output files from
        # each job should be merged into the final output files
        merge_map_filepath = '%s/merge_map.txt' % working_dir
        write_merge_map_file_blast(job_result_filepaths, output_dir,
                                   merge_map_filepath, input_file_basename)
        created_temp_paths.append(merge_map_filepath)

        # Create the filepath listing the temporary files to be deleted,
        # but don't write it yet
        deletion_list_filepath = '%s/deletion_list.txt' % working_dir
        created_temp_paths.append(deletion_list_filepath)

        # Generate the command to run the poller, and the list of temp files
        # created by the poller
        if not poll_directly:
            poller_command, poller_result_filepaths = get_poller_command(
                python_exe_fp, poller_fp, expected_files_filepath,
                merge_map_filepath, deletion_list_filepath,
                seconds_to_sleep=seconds_to_sleep)
            # append the poller command to the list of job commands
            commands.append(poller_command)
        else:
            # the poller is run from this process below, so it is not
            # added to the jobs file
            poller_command, poller_result_filepaths = get_poller_command(
                python_exe_fp, poller_fp, expected_files_filepath,
                merge_map_filepath, deletion_list_filepath,
                seconds_to_sleep=seconds_to_sleep,
                command_prefix='', command_suffix='')
        created_temp_paths += poller_result_filepaths

        if not retain_temp_files:
            # If the user wants temp files deleted, now write the list of
            # temp files to be deleted
            write_filepaths_to_file(created_temp_paths,
                                    deletion_list_filepath)
        else:
            # Otherwise just write an empty file
            write_filepaths_to_file([], deletion_list_filepath)

    # write the commands to the 'jobs files'
    write_jobs_file(commands, job_prefix=job_prefix, jobs_fp=jobs_fp)

    # submit the jobs file using cluster_jobs, if not suppressed by the
    # user
    if not opts.suppress_submit_jobs:
        submit_jobs(cluster_jobs_fp, jobs_fp, job_prefix)

    # Poll from this process only when a poller command was actually built;
    # with polling suppressed there is none, and referencing it used to
    # raise a NameError after the jobs had already been submitted.
    if poll_directly and poller_command is not None:
        try:
            check_call(poller_command.split())
        except CalledProcessError as e:
            print('**Error occuring when calling the poller directly. ' +
                  'Jobs may have been submitted, but are not being polled.')
            print(str(e))
            exit(-1)

示例#29

0

显示文件

文件： assign_taxonomy.py 项目： franny911/qiime

    def __call__(self,
                 seq_path=None,
                 seqs=None,
                 result_path=None,
                 log_path=None):
        """Assign taxonomy to each input sequence based on its blast hits.

        seq_path: path to a fasta file of query sequences; when given it
            is used in place of seqs
        seqs: iterable of (seq_id, seq) pairs, used when seq_path is not
            given
        result_path: if provided, results are written to this path as
            tab-separated lines (seq_id, lineage, confidence, blast hit id)
            and None is returned
        log_path: passed to self._get_logger; the summary counts are only
            computed and logged when it is not None

        Returns a dict mapping seq_id to the values produced by
            self._seqs_to_taxonomy (unpacked below as lineage, confidence,
            blast hit id), or None if result_path was provided.

        The blast database is self.Params['blast_db'] if that key is
            present; otherwise a temporary one is built from
            self.Params['reference_seqs_filepath'] and its files are
            removed before returning, including when an error is raised.
        """
        assert seq_path or seqs, \
            "Must provide either seqs or seq_path when calling a BlastTaxonAssigner."

        # initialize the logger
        logger = self._get_logger(log_path)
        logger.info(str(self))

        # Everything recorded in these two lists is released in the finally
        # clause below. temp_db_files stays empty when a pre-existing
        # database is used, so cleanup is safe even if Params contains both
        # 'blast_db' and 'reference_seqs_filepath'.
        temp_db_files = []
        opened_handles = []
        try:
            try:
                blast_db = self.Params['blast_db']
            except KeyError:
                # build a temporary blast_db
                reference_seqs_path = self.Params['reference_seqs_filepath']
                blast_db, temp_db_files = \
                    build_blast_db_from_fasta_path(reference_seqs_path)

            # build the mapping of sequence identifier
            # (wrt to the blast db seqs) to taxonomy
            id_to_taxonomy_f = open(
                self.Params['id_to_taxonomy_filepath'], 'U')
            opened_handles.append(id_to_taxonomy_f)
            id_to_taxonomy_map = \
                self._parse_id_to_taxonomy_file(id_to_taxonomy_f)

            if seq_path:
                # Get a seq iterator
                seqs_f = open(seq_path)
                opened_handles.append(seqs_f)
                seqs = MinimalFastaParser(seqs_f)

            # Sequences are blasted in chunks of self.SeqsPerBlastRun.
            # Reading everything at once can produce a very large object,
            # while blasting one sequence at a time means writing and
            # removing a file per sequence, since blast requires a filepath.
            current_seqs = []
            result = {}
            for seq_id, seq in seqs:
                current_seqs.append((seq_id, seq))
                if len(current_seqs) == self.SeqsPerBlastRun:
                    # chunk is full: blast it and start a new one
                    result.update(
                        self._seqs_to_taxonomy(current_seqs, blast_db,
                                               id_to_taxonomy_map))
                    current_seqs = []
            # Assign taxonomy to the remaining sequences
            result.update(
                self._seqs_to_taxonomy(current_seqs, blast_db,
                                       id_to_taxonomy_map))

            # Write log data if we have a path (while the logger can handle
            # being called if we are not logging, some of these steps are
            # slow).
            if log_path is not None:
                num_inspected = len(result)
                logger.info(
                    'Number of sequences inspected: %s' % num_inspected)
                num_null_hits = [r[1] for r in result.values()].count(None)
                logger.info('Number with no blast hits: %s' % num_null_hits)

            if result_path:
                # if the user provided a result_path, write the
                # results to file and return nothing
                with open(result_path, 'w') as of:
                    for seq_id, (lineage, confidence, blast_hit_id) in \
                            result.items():
                        of.write('%s\t%s\t%s\t%s\n' %
                                 (seq_id, lineage, confidence, blast_hit_id))
                result = None
                logger.info('Result path: %s' % result_path)
            else:
                # if no result_path was provided, return the data as a dict
                logger.info('Result path: None, returned as dict.')
        finally:
            for handle in opened_handles:
                handle.close()
            # clean-up temp blastdb files, if a temp blastdb was created
            for db_fp in temp_db_files:
                remove(db_fp)

        # return the result
        return result