Exemplo n.º 1
0
    def _precommand_initiation(self, input_fp, output_dir, working_dir,
                               params):
        """Prepare shared reference resources for chimera detection.

        The work done depends on params['chimera_detection_method']:

        'blast_fragments'
            A blast database is built from params['reference_seqs_fp']
            inside working_dir and stored in params['blast_db'].
        'ChimeraSlayer'
            The aligned reference (and the unaligned one, if given) is
            copied into working_dir; a missing unaligned reference is
            generated from the aligned one.  The blast database and the
            .cidx index are then built once, up front.

        params is modified in place to point at the working copies, and
        every file created here is registered in self.files_to_remove.

        input_fp and output_dir are accepted but not used by this method.

        Raises ValueError if the detection method is not one of the two
        values above.
        """
        method = params['chimera_detection_method']

        if method == 'blast_fragments':
            blast_db, db_files_to_remove = \
                build_blast_db_from_fasta_path(params['reference_seqs_fp'],
                                               output_dir=working_dir)
            self.files_to_remove += db_files_to_remove
            params['blast_db'] = blast_db
        elif method == 'ChimeraSlayer':
            # copy the reference files to working dir
            # ChimeraSlayer creates an index file of the ref and
            # will crash without write permission in the ref seqs dir
            aligned_reference_seqs_fp = params['aligned_reference_seqs_fp']
            _, new_ref_filename = split(aligned_reference_seqs_fp)
            copy(aligned_reference_seqs_fp, working_dir)
            aligned_reference_seqs_fp = working_dir + "/" + new_ref_filename

            self.files_to_remove.append(aligned_reference_seqs_fp)
            params['aligned_reference_seqs_fp'] = aligned_reference_seqs_fp

            # if given, also copy the unaligned ref db
            reference_seqs_fp = params['reference_seqs_fp']
            if reference_seqs_fp:
                _, new_ref_filename = split(reference_seqs_fp)
                copy(reference_seqs_fp, working_dir)
                reference_seqs_fp = working_dir + "/" + new_ref_filename
            else:
                # otherwise create it from the (copied) aligned reference;
                # the with-block makes sure the alignment handle is closed
                # instead of being left to the garbage collector
                with open(aligned_reference_seqs_fp) as aligned_f:
                    reference_seqs_fp = write_degapped_fasta_to_file(
                        parse_fasta(aligned_f),
                        tmp_dir=working_dir)
            # delete it afterwards
            self.files_to_remove.append(reference_seqs_fp)
            params['reference_seqs_fp'] = reference_seqs_fp

            # build blast db of reference, otherwise ChimeraSlayer will do it
            # and parallel jobs clash
            _, db_files_to_remove = \
                build_blast_db_from_fasta_path(reference_seqs_fp)
            self.files_to_remove += db_files_to_remove

            # make the index file globally
            # Reason: ChimeraSlayer first checks to see if the index file is
            # there. If not it tries to create it. This can lead to race
            # condition if several parallel jobs try to create it at the same
            # time.
            make_cidx_file(aligned_reference_seqs_fp)
            self.files_to_remove.append(aligned_reference_seqs_fp + ".cidx")
        else:
            raise ValueError("Unrecognized chimera detection method '%s'." %
                             method)
Exemplo n.º 2
0
 def _precommand_initiation(self, input_fp, output_dir, working_dir, params):
     """Ensure params["blast_db"] is populated before jobs are started.

     When params["blast_db"] is falsy, one database is built from
     params["reference_seqs_fp"] so that all procs share it rather than
     each creating their own.  The files created for the database are
     added to self.files_to_remove.
     """
     if params["blast_db"]:
         # A database was supplied already; nothing to prepare.
         return

     reference_fp = params["reference_seqs_fp"]
     db_name, created_files = build_blast_db_from_fasta_path(reference_fp)
     self.files_to_remove += created_files
     params["blast_db"] = db_name
Exemplo n.º 3
0
 def _precommand_initiation(self, input_fp, output_dir, working_dir,
                            params):
     """Ensure params['blast_db'] is populated before jobs are started.

     When params['blast_db'] is falsy, a single database is built from
     params['refseqs_fp'] -- all procs then access that one db rather than
     creating one per proc.  The database files are added to
     self.files_to_remove.
     """
     if params['blast_db']:
         # A database was supplied already; nothing to prepare.
         return

     refseqs_fp = params['refseqs_fp']
     db_name, created_files = build_blast_db_from_fasta_path(refseqs_fp)
     self.files_to_remove += created_files
     params['blast_db'] = db_name
Exemplo n.º 4
0
 def _precommand_initiation(
         self, input_fp, output_dir, working_dir, params):
     """Build a shared blast database when reference sequences are given.

     If params['refseqs_path'] is truthy, one database is built from it --
     all procs then access that db rather than creating one per proc --
     and stored in params['blast_db'].  The database files are added to
     self.files_to_remove.  Otherwise params is left untouched.
     """
     refseqs_path = params['refseqs_path']
     if not refseqs_path:
         # No reference sequences to build from.
         return

     db_name, created_files = build_blast_db_from_fasta_path(refseqs_path)
     self.files_to_remove += created_files
     params['blast_db'] = db_name
Exemplo n.º 5
0
    def _precommand_initiation(
            self, input_fp, output_dir, working_dir, params):
        """Fill in params['blast_db'] and params['min_length'] if unset.

        * If params['blast_db'] is falsy, a single blast database is built
          from params['template_fp'] in the QIIME temp dir, so all procs
          access one db rather than creating one per proc.  The database
          files are added to self.files_to_remove.
        * If params['min_length'] is negative, it is replaced by the value
          compute_min_alignment_length returns for the input file.

        output_dir and working_dir are accepted but not used here.
        """
        if not params['blast_db']:
            blast_db, db_files_to_remove = \
                build_blast_db_from_fasta_path(params['template_fp'],
                                               output_dir=get_qiime_temp_dir())
            self.files_to_remove += db_files_to_remove
            params['blast_db'] = blast_db

        if params['min_length'] < 0:
            # Close the input handle deterministically instead of leaving
            # it to the garbage collector.
            with open(input_fp, 'U') as input_f:
                params['min_length'] = compute_min_alignment_length(input_f)
Exemplo n.º 6
0
    def _precommand_initiation(self, input_fp, output_dir, working_dir,
                               params):
        """Fill in params['blast_db'] and params['min_length'] if unset.

        * If params['blast_db'] is falsy, a single blast database is built
          from params['template_fp'] in the QIIME temp dir, so all procs
          access one db rather than creating one per proc.  The database
          files are added to self.files_to_remove.
        * If params['min_length'] is negative, it is replaced by the value
          compute_min_alignment_length returns for the input file.

        output_dir and working_dir are accepted but not used here.
        """
        if not params['blast_db']:
            blast_db, db_files_to_remove = \
                build_blast_db_from_fasta_path(params['template_fp'],
                                               output_dir=get_qiime_temp_dir())
            self.files_to_remove += db_files_to_remove
            params['blast_db'] = blast_db

        if params['min_length'] < 0:
            # Close the input handle deterministically instead of leaving
            # it to the garbage collector.
            with open(input_fp, 'U') as input_f:
                params['min_length'] = compute_min_alignment_length(input_f)
Exemplo n.º 7
0
    def __init__(self, params):
        """Return new BlastFragmentsChimeraChecker object with specified params.

        params is a dict that must contain 'id_to_taxonomy_fp' and either a
        non-None 'blast_db' or a 'reference_seqs_fp' from which a blast
        database is built.  Values for 'max_e_value', 'min_pct_id',
        'num_fragments' and 'taxonomy_depth' override the defaults below.

        Files created for a freshly built database are recorded in
        self._db_files_to_remove (an empty list when 'blast_db' was given).

        Raises ValueError if a required entry is missing from params.
        """
        _params = {'max_e_value': 1e-30,
                   'min_pct_id': 0.90,
                   'num_fragments': 3,
                   'taxonomy_depth': 4}
        _params.update(params)

        try:
            id_to_taxonomy_fp = params['id_to_taxonomy_fp']
        except KeyError:
            # Name the key that is actually looked up so the caller can
            # fix their params.
            raise ValueError(
                "id_to_taxonomy_fp must be provided to %s" %
                self.Name)

        # Create the blast database if it hasn't been provided
        if 'blast_db' not in params or params['blast_db'] is None:
            try:
                reference_seqs_fp = params['reference_seqs_fp']
            except KeyError:
                raise ValueError(
                    "reference_seqs_fp or blast_db must be provided to %s" %
                    self.Name)
            blast_db, self._db_files_to_remove = \
                build_blast_db_from_fasta_path(reference_seqs_fp)
        else:
            blast_db = params['blast_db']
            self._db_files_to_remove = []

        # The taxon assigner uses its own parameter names, so translate the
        # relevant entries.
        self._taxon_assigner = BlastTaxonAssigner(
            {'blast_db': blast_db,
             'id_to_taxonomy_filepath': id_to_taxonomy_fp,
             'Max E value': _params['max_e_value'],
             'Min percent identity': _params['min_pct_id']
             })

        ChimeraChecker.__init__(self, _params)
Exemplo n.º 8
0
 def test_build_blast_db_from_fasta_path_aln(self):
     """build_blast_db_from_fasta_path works with alignment as input
     """
     blast_db, db_files = build_blast_db_from_fasta_path(self.in_aln1_fp)

     # The database is named after the input file itself.
     self.assertEqual(blast_db, self.in_aln1_fp)

     # One file per extension is reported back.
     extensions = ['.nhr', '.nin', '.nsq', '.nsd', '.nsi', '.log']
     expected_db_files = set([blast_db + ext for ext in extensions])
     self.assertEqual(set(db_files), expected_db_files)

     # Blasting against the new db returns a result.
     hits = blastn(self.test_seq, blast_db=blast_db, e_value=0.0)
     self.assertEqual(len(hits), 1)

     # All reported files exist on disk ...
     for db_fp in db_files:
         self.assertTrue(exists(db_fp))

     # ... and are gone again once removed.
     remove_files(db_files)
     for db_fp in db_files:
         self.assertFalse(exists(db_fp))
Exemplo n.º 9
0
    def test_build_blast_db_from_fasta_path(self):
        """build_blast_db_from_fasta_path convenience function works as expected
        """
        blast_db, db_files = build_blast_db_from_fasta_path(self.in_seqs1_fp)

        # The database is named after the input file itself.
        self.assertEqual(blast_db, self.in_seqs1_fp)

        # One file per extension is reported back.
        extensions = ['.nhr', '.nin', '.nsq', '.nsd', '.nsi', '.log']
        expected_db_files = set(
            [self.in_seqs1_fp + ext for ext in extensions])
        self.assertEqual(set(db_files), expected_db_files)

        # Blasting against the new db returns a result.
        hits = blastn(self.test_seq, blast_db=blast_db)
        self.assertEqual(len(hits), 1)

        # All reported files exist on disk ...
        for db_fp in db_files:
            self.assertTrue(exists(db_fp))

        # ... and are gone again once removed.
        remove_files(db_files)
        for db_fp in db_files:
            self.assertFalse(exists(db_fp))
Exemplo n.º 10
0
    def __call__(self,
                 seq_path=None,
                 seqs=None,
                 result_path=None,
                 log_path=None):
        """Assign taxonomy to each input sequence using blast.

        seq_path: path to a fasta file, read with parse_fasta; used in
            preference to seqs when both are given.
        seqs: iterable of (seq_id, seq) pairs, used when seq_path is not
            given.
        result_path: if given, results are written to this file as
            tab-separated lines (seq_id, lineage, confidence, blast_hit_id)
            and None is returned.
        log_path: passed to self._get_logger; the summary counts are only
            logged when it is not None.

        Returns a dict mapping each seq_id to the value produced for it by
        self._seqs_to_taxonomy, or None when result_path was given.

        The blast database is self.Params['blast_db'] if that key exists;
        otherwise a temporary one is built from
        self.Params['reference_seqs_filepath'] and its files are removed
        again before returning, even if an error occurs.
        """
        assert seq_path or seqs, \
            "Must provide either seqs or seq_path when calling a BlastTaxonAssigner."

        # initialize the logger
        logger = self._get_logger(log_path)
        logger.info(str(self))

        # Stays empty unless a temporary database is built below, so the
        # cleanup at the end never touches a caller-supplied database.
        db_files_to_remove = []
        try:
            blast_db = self.Params['blast_db']
        except KeyError:
            # build a temporary blast_db
            blast_db, db_files_to_remove = build_blast_db_from_fasta_path(
                self.Params['reference_seqs_filepath'])

        tax_file = None
        seq_file = None
        try:
            # build the mapping of sequence identifier
            # (wrt to the blast db seqs) to taxonomy
            tax_file = open(self.Params['id_to_taxonomy_filepath'], 'U')
            id_to_taxonomy_map = self._parse_id_to_taxonomy_file(tax_file)

            if seq_path:
                # The handle is kept open until the finally-block because
                # the parsed sequences are consumed by the loop below.
                seq_file = open(seq_path)
                seqs = parse_fasta(seq_file)

            # Sequences are blasted in chunks of self.SeqsPerBlastRun.
            # Reading everything at once can make the containing object
            # very large, while blasting one sequence at a time means
            # writing, opening and removing a file per sequence, since
            # blast requires a filepath.  Chunking balances the two.
            current_seqs = []
            result = {}
            for seq_id, seq in seqs:
                current_seqs.append((seq_id, seq))

                # Blast a full chunk and start collecting the next one
                if len(current_seqs) == self.SeqsPerBlastRun:
                    result.update(
                        self._seqs_to_taxonomy(current_seqs, blast_db,
                                               id_to_taxonomy_map))
                    current_seqs = []
            # Assign taxonomy to the remaining sequences
            result.update(
                self._seqs_to_taxonomy(current_seqs, blast_db,
                                       id_to_taxonomy_map))

            # Write log data if we have a path (while the logger can handle
            # being called if we are not logging, some of these steps are
            # slow).
            if log_path is not None:
                num_inspected = len(result)
                logger.info(
                    'Number of sequences inspected: %s' % num_inspected)
                num_null_hits = [r[1] for r in result.values()].count(None)
                logger.info('Number with no blast hits: %s' % num_null_hits)

            if result_path:
                # if the user provided a result_path, write the
                # results to file
                with open(result_path, 'w') as of:
                    for seq_id, (lineage, confidence, blast_hit_id) in \
                            result.items():
                        of.write('%s\t%s\t%s\t%s\n' %
                                 (seq_id, lineage, confidence, blast_hit_id))
                result = None
                logger.info('Result path: %s' % result_path)
            else:
                # if no result_path was provided, return the data as a dict
                logger.info('Result path: None, returned as dict.')
        finally:
            for handle in (tax_file, seq_file):
                if handle is not None:
                    handle.close()
            # clean-up temp blastdb files, if a temp blastdb was created.
            # An explicit loop is used because map() is lazy on Python 3
            # and would not remove anything.
            for db_fp in db_files_to_remove:
                remove(db_fp)

        # return the result
        return result
Exemplo n.º 11
0
    def __call__(self, seq_path=None, seqs=None,
                 result_path=None, log_path=None):
        """Assign taxonomy to each input sequence using blast.

        seq_path: path to a fasta file, read with parse_fasta; used in
            preference to seqs when both are given.
        seqs: iterable of (seq_id, seq) pairs, used when seq_path is not
            given.
        result_path: if given, results are written to this file as
            tab-separated lines (seq_id, lineage, confidence, blast_hit_id)
            and None is returned.
        log_path: passed to self._get_logger; the summary counts are only
            logged when it is not None.

        Returns a dict mapping each seq_id to the value produced for it by
        self._seqs_to_taxonomy, or None when result_path was given.

        The blast database is self.Params['blast_db'] if that key exists;
        otherwise a temporary one is built from
        self.Params['reference_seqs_filepath'] and its files are removed
        again before returning, even if an error occurs.
        """
        assert seq_path or seqs, \
            "Must provide either seqs or seq_path when calling a BlastTaxonAssigner."

        # initialize the logger
        logger = self._get_logger(log_path)
        logger.info(str(self))

        # Stays empty unless a temporary database is built below, so the
        # cleanup at the end never touches a caller-supplied database.
        db_files_to_remove = []
        try:
            blast_db = self.Params['blast_db']
        except KeyError:
            # build a temporary blast_db
            blast_db, db_files_to_remove = build_blast_db_from_fasta_path(
                self.Params['reference_seqs_filepath'])

        tax_file = None
        seq_file = None
        try:
            # build the mapping of sequence identifier
            # (wrt to the blast db seqs) to taxonomy
            tax_file = open(self.Params['id_to_taxonomy_filepath'], 'U')
            id_to_taxonomy_map = self._parse_id_to_taxonomy_file(tax_file)

            if seq_path:
                # The handle is kept open until the finally-block because
                # the parsed sequences are consumed by the loop below.
                seq_file = open(seq_path)
                seqs = parse_fasta(seq_file)

            # Sequences are blasted in chunks of self.SeqsPerBlastRun.
            # Reading everything at once can make the containing object
            # very large, while blasting one sequence at a time means
            # writing, opening and removing a file per sequence, since
            # blast requires a filepath.  Chunking balances the two.
            current_seqs = []
            result = {}
            for seq_id, seq in seqs:
                current_seqs.append((seq_id, seq))

                # Blast a full chunk and start collecting the next one
                if len(current_seqs) == self.SeqsPerBlastRun:
                    result.update(self._seqs_to_taxonomy(
                        current_seqs, blast_db, id_to_taxonomy_map))
                    current_seqs = []
            # Assign taxonomy to the remaining sequences
            result.update(self._seqs_to_taxonomy(
                current_seqs, blast_db, id_to_taxonomy_map))

            # Write log data if we have a path (while the logger can handle
            # being called if we are not logging, some of these steps are
            # slow).
            if log_path is not None:
                num_inspected = len(result)
                logger.info(
                    'Number of sequences inspected: %s' % num_inspected)
                num_null_hits = [r[1] for r in result.values()].count(None)
                logger.info('Number with no blast hits: %s' % num_null_hits)

            if result_path:
                # if the user provided a result_path, write the
                # results to file
                with open(result_path, 'w') as of:
                    for seq_id, (lineage, confidence, blast_hit_id) in \
                            result.items():
                        of.write('%s\t%s\t%s\t%s\n' %
                                 (seq_id, lineage, confidence, blast_hit_id))
                result = None
                logger.info('Result path: %s' % result_path)
            else:
                # if no result_path was provided, return the data as a dict
                logger.info('Result path: None, returned as dict.')
        finally:
            for handle in (tax_file, seq_file):
                if handle is not None:
                    handle.close()
            # clean-up temp blastdb files, if a temp blastdb was created.
            # An explicit loop is used because map() is lazy on Python 3
            # and would not remove anything.
            for db_fp in db_files_to_remove:
                remove(db_fp)

        # return the result
        return result