Example No. 1
import logging
from pipebricks.PipeLogger import PipeLogger
from pipebricks.Toolset import Tool
from pipebricks.Toolset import ToolException
from busco.BuscoConfig import BuscoConfig

#: working directory
_plot_dir = ''
#: r file name
_r_file = 'busco_figure.R'

# set to True to avoid running R
_no_r = False

#: Get an instance of _logger for keeping track of events
_logger = PipeLogger.get_logger(__name__)

RCODE = '######################################\n'\
        '#\n'\
        '# BUSCO summary figure\n'\
        '# @version 3.0.0\n'\
        '# @since BUSCO 2.0.0\n'\
        '# \n' \
        '# Copyright (c) 2016-2017, Evgeny Zdobnov ([email protected])\n'\
        '# Licensed under the MIT license. See LICENSE.md file.\n'\
        '#\n'\
        '######################################\n'\
        '\n'\
        '# Load the required libraries\n'\
        'library(ggplot2)\n'\
        'library("grid")\n'\
Example No. 2
def main():
    """
    This function runs a BUSCO analysis according to the provided parameters.
    See the help for more details:
    ``python run_BUSCO.py -h``
    :raises SystemExit: if any errors occur
    """
    start_time = time.time()
    # 1) Load a busco config file that will figure out all the params from all sources
    # i.e. provided config file, dataset cfg, and user args
    if os.environ.get('BUSCO_CONFIG_FILE') and os.access(
            os.environ.get('BUSCO_CONFIG_FILE'), os.R_OK):
        config_file = os.environ.get('BUSCO_CONFIG_FILE')
    else:
        config_file = '/BUSCOconfig.ini'
    config = BuscoConfig(config_file, _parse_args())
    # Define a logger; the config is passed so the logger knows whether quiet mode was requested
    logger = PipeLogger.get_logger(__name__, config)

    # And now, BUSCO
    # Import the needed classes here so their loggers are set up with the proper configuration (quiet or debug)
    from pipebricks.Toolset import ToolException
    from busco.BuscoAnalysis import BuscoAnalysis
    try:

        try:
            logger.info(
                '****************** Start a BUSCO %s analysis, current time: %s **'
                '****************' %
                (BuscoConfig.VERSION, time.strftime('%m/%d/%Y %H:%M:%S')))
            logger.info('Configuration loaded from %s' % config_file)
            # 2) Load the analysis, this will check the dependencies and return the appropriate analysis object
            analysis = BuscoAnalysis.get_analysis(config)

            # 3) Run the analysis
            analysis.run_analysis()

            if not logger.has_warning():
                logger.info(
                    'BUSCO analysis done. Total running time: %s seconds' %
                    str(time.time() - start_time))
            else:
                logger.info(
                    'BUSCO analysis done with WARNING(s). Total running time: %s seconds'
                    % str(time.time() - start_time))

            logger.info('Results written in %s\n' % analysis.mainout)

        except ToolException as e:
            #
            logger.error(e)
            raise SystemExit

    except SystemExit:
        logger.error('BUSCO analysis failed !')
        logger.error(
            'Check the logs, read the user guide, if you still need technical '
            'support, then please contact %s\n' % BuscoConfig.CONTACT)
        raise SystemExit

    except KeyboardInterrupt:
        logger.error('A signal was sent to kill the process')
        logger.error('BUSCO analysis failed !')
        logger.error(
            'Check the logs, read the user guide, if you still need technical '
            'support, then please contact %s\n' % BuscoConfig.CONTACT)
        raise SystemExit

    except BaseException:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        logger.critical(
            'Unhandled exception occurred: %s\n' %
            traceback.format_exception(exc_type, exc_value, exc_traceback))
        logger.error('BUSCO analysis failed !')
        logger.error(
            'Check the logs, read the user guide, if you still need technical '
            'support, then please contact %s\n' % BuscoConfig.CONTACT)
        raise SystemExit
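
# Illustrative sketch, not part of the original snippet: _parse_args() is
# called by main() above but not shown here. A minimal argparse-based stand-in
# returning a dict could look as follows; the flag names are hypothetical and
# the real run_BUSCO.py defines more options.
def _parse_args():
    import argparse
    parser = argparse.ArgumentParser(description='BUSCO')
    parser.add_argument('-i', '--in', dest='in', help='input sequence file')
    parser.add_argument('-o', '--out', help='name to use for the run')
    parser.add_argument('-l', '--lineage_path', help='path to the lineage dataset')
    parser.add_argument('-m', '--mode', help='genome, transcriptome or proteins')
    parser.add_argument('-q', '--quiet', action='store_true', default=False)
    return vars(parser.parse_args())
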
class TranscriptomeAnalysis(BuscoAnalysis):
    """
    Analysis on a transcriptome.
    """

    _logger = PipeLogger.get_logger(__name__)

    #
    # magic or public, meant to be accessed by external scripts [instance]
    #

    def __init__(self, params):
        """
        Initialize an instance.
        :param params: Values of all parameters that have to be defined
        :type params: BuscoConfig
        """
        self._mode = 'transcriptome'
        super(TranscriptomeAnalysis, self).__init__(params)
        self._transcriptome_by_scaff = {}
        # data integrity checks not done by the parent class
        if self.check_nucleotide_file() is False:
            TranscriptomeAnalysis._logger.error(
                'Please provide a nucleotide file as input')
            raise SystemExit

    # @overrides
    def run_analysis(self):
        """
        This function calls all needed steps for running the analysis.
        """

        super(TranscriptomeAnalysis, self).run_analysis()

        if self._restart:
            checkpoint = self.get_checkpoint(reset_random_suffix=True)
            TranscriptomeAnalysis._logger.warning(
                'Restarting an uncompleted run')
        else:
            checkpoint = 0  # all steps will be done
        if checkpoint < 1:
            TranscriptomeAnalysis._logger.info(
                '****** Step 1/2, current time: %s ******' %
                time.strftime("%m/%d/%Y %H:%M:%S"))
            if self._has_variants_file:
                self._run_tblastn(ancestral_variants=True)
            else:
                self._run_tblastn(ancestral_variants=False)
            self._set_checkpoint(1)
        TranscriptomeAnalysis._logger.info(
            '****** Step 2/2, current time: %s ******' %
            time.strftime("%m/%d/%Y %H:%M:%S"))
        self._load_score()
        self._load_length()
        self._get_coordinates()
        self._run_hmmer()
        self._produce_short_summary()
        self.cleanup()
        if self._tarzip:
            self._run_tarzip_hmmer_output()
            self._run_tarzip_translated_proteins()
        # remove the checkpoint, run is done
        self._set_checkpoint()

    # @overrides
    def cleanup(self):
        """
        This function cleans temporary files.
        """
        super(TranscriptomeAnalysis, self).cleanup()
        self._p_open(
            ['rm %s*%s%s_.temp' % (self._tmp, self._out, self._random)],
            'bash',
            shell=True)
        self._p_open([
            'rm %(tmp)s%(abrev)s.*ns? %(tmp)s%(abrev)s.*nin %(tmp)s%(abrev)s.*nhr'
            % {
                'tmp': self._tmp,
                'abrev': self._out + str(self._random)
            }
        ],
                     'bash',
                     shell=True)

    # @overrides
    def _check_dataset(self):
        """
        Check the dataset integrity, i.e. whether the expected files and folders are present
        :raises SystemExit: if the dataset is missing files or folders
        """
        super(TranscriptomeAnalysis, self)._check_dataset()
        # note: score and length cutoffs are checked when read,
        # see _load_scores and _load_lengths
        # ancestral would cause blast to fail, and be detected, see _blast()
        # dataset.cfg is not mandatory

        # check whether the ancestral_variants file is present
        if os.path.exists('%sancestral_variants' % self._lineage_path):
            self._has_variants_file = True
        else:
            self._has_variants_file = False
            BuscoAnalysis._logger.warning(
                'The dataset you provided does not contain the file '
                'ancestral_variants, likely because it is an old version. '
                'All blast steps will use the file ancestral instead')

    #
    # public, meant to be accessed by external scripts [class]
    #

    # Nothing

    #
    # method that should be used as if protected, for internal use [instance]
    # to move to public and rename if meaningful
    #

    def _sixpack(self, seq):
        """
        Gets the six-frame translation of the provided sequence
        :param seq: the sequence to be translated
        :type seq: str
        :return: the six translated sequences
        :rtype: list
        """
        s1 = seq
        s2 = seq[1:]
        s3 = seq[2:]
        rev = ''
        for letter in seq[::-1]:
            try:
                rev += BuscoAnalysis.COMP[letter]
            except KeyError:
                rev += BuscoAnalysis.COMP['N']
        r1 = rev
        r2 = rev[1:]
        r3 = rev[2:]
        transc = []
        frames = [s1, s2, s3, r1, r3, r2]
        for sequence in frames:
            part = ''
            new = ''
            for letter in sequence:
                if len(part) == 3:
                    try:
                        new += BuscoAnalysis.CODONS[part]
                    except KeyError:
                        new += 'X'
                    part = ''
                    part += letter
                else:
                    part += letter
            if len(part) == 3:
                try:
                    new += BuscoAnalysis.CODONS[part]
                except KeyError:
                    new += 'X'
            transc.append(new)
        return transc
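
    # Illustrative usage, not in the original source: assuming
    # BuscoAnalysis.CODONS is the standard codon table and BuscoAnalysis.COMP
    # maps each nucleotide to its complement, a short sequence translates as
    #
    #   >>> analysis._sixpack('ATGGCC')
    #   ['MA', 'W', 'G', 'GH', 'P', 'A']
    #
    # i.e. the three forward frames followed by the reverse-complement frames,
    # in the [s1, s2, s3, r1, r3, r2] order used above.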

    def _reformats_seq_id(self, seq_id):
        """
        This function reformats the sequence id to its original value
        :param seq_id: the seq id to reformat
        :type seq_id: str
        :return: the reformatted seq_id
        :rtype: str
        """
        return "_".join(seq_id.split('_')[:-1])

    def _get_coordinates(self):
        """
        This function gets coordinates for candidate regions from
        the tblastn result file
        """

        TranscriptomeAnalysis._logger.info(
            'Maximum number of candidate transcript per BUSCO limited to: %s' %
            self._region_limit)

        TranscriptomeAnalysis._logger.info(
            'Getting coordinates for candidate transcripts...')
        # open input file
        f = open('%sblast_output/tblastn_%s.tsv' % (self.mainout, self._out))
        transcriptome_by_busco = {}
        self._transcriptome_by_scaff = {}
        maxi = 0
        for i in f:  # get a dictionary of BUSCO matches vs candidate scaffolds
            if i.startswith('#'):
                pass
            else:
                line = i.strip().split()
                if self._has_variants_file:
                    # this pattern can support names like EOG00_1234_1
                    busco = '_'.join(line[0].split("_")[:-1])
                else:
                    busco = line[0]
                scaff = line[1]
                leng = int(line[3])
                blast_eval = float(line[10])
                if busco not in transcriptome_by_busco.keys():
                    # Simply add it
                    # Use a single entry dict to keep scaffs id and their
                    # blast eval, for each busco
                    transcriptome_by_busco[busco] = [{scaff: blast_eval}]
                    # and keep a list of each busco by scaff
                    try:
                        self._transcriptome_by_scaff[scaff].append(busco)
                    except KeyError:
                        self._transcriptome_by_scaff[scaff] = [busco]
                    maxi = leng
                elif len(transcriptome_by_busco[busco]) < self._region_limit \
                        and leng >= 0.7 * maxi:
                    # check that this transcript is not already in, and update
                    # its eval if needed
                    add = True
                    for scaff_dict in transcriptome_by_busco[busco]:
                        if list(scaff_dict.keys())[0] == scaff:
                            add = False
                            # update the eval for this scaff
                            if blast_eval < list(scaff_dict.values())[0]:
                                scaff_dict[scaff] = blast_eval
                    if add:
                        transcriptome_by_busco[busco].append(
                            {scaff: blast_eval})
                        try:
                            self._transcriptome_by_scaff[scaff].append(busco)
                        except KeyError:
                            self._transcriptome_by_scaff[scaff] = [busco]
                        if leng > maxi:
                            maxi = leng
                elif len(transcriptome_by_busco[busco]) >= self._region_limit \
                        and leng >= 0.7 * maxi:
                    # replace the lowest scoring transcript if the current one
                    # has a better score. Needed because multiple blast queries
                    # can have the same name when using ancestral_variants and
                    # are not sorted by eval in the tblastn result file
                    to_replace = None
                    # Define if something has to be replaced
                    for entry in transcriptome_by_busco[busco]:
                        if list(entry.values())[0] > blast_eval:
                            # check if there is already a to_replace entry and
                            # compare the eval
                            if (to_replace and
                                    list(entry.values())[0] >
                                    list(to_replace.values())[0]) or \
                                        not to_replace:
                                to_replace = {
                                    list(entry.keys())[0]:
                                    list(entry.values())[0]
                                }

                    if to_replace:
                        # try to add the new one
                        # check that this scaffold is not already in,
                        # and update the eval if needed
                        # if the scaff was already in, do not replace
                        # the to_replace entry to keep the max number of
                        # candidate regions
                        add = True
                        for scaff_dict in transcriptome_by_busco[busco]:
                            if list(scaff_dict.keys())[0] == scaff:
                                add = False
                                if blast_eval < list(scaff_dict.values())[0]:
                                    # update the eval for this scaff
                                    scaff_dict[scaff] = blast_eval
                        if add:
                            # add the new one
                            transcriptome_by_busco[busco].append(
                                {scaff: blast_eval})
                            try:
                                self._transcriptome_by_scaff[scaff].append(
                                    busco)
                            except KeyError:
                                self._transcriptome_by_scaff[scaff] = [busco]

                            # remove the old one
                            for entry in transcriptome_by_busco[busco]:
                                if list(entry.keys())[0] == \
                                        list(to_replace.keys())[0]:
                                    scaff_to_remove = list(entry.keys())[0]
                                    break
                            transcriptome_by_busco[busco].remove(entry)

                            for entry in self._transcriptome_by_scaff[
                                    scaff_to_remove]:
                                if entry == busco:
                                    break
                            self._transcriptome_by_scaff[
                                scaff_to_remove].remove(entry)

                            if leng > maxi:
                                maxi = leng

        TranscriptomeAnalysis._logger.info(
            'Extracting candidate transcripts...')
        f = open(self._sequences)
        check = 0
        out = None
        for i in f:
            if i.startswith('>'):
                i = i.strip().split()
                i = i[0][1:]
                if i in list(self._transcriptome_by_scaff.keys()):
                    out = open(
                        '%s%s%s%s_.temp' %
                        (self._tmp, i, self._out, self._random), 'w')
                    out.write('>%s\n' % i)
                    check = 1
                else:
                    check = 0
            elif check == 1:
                out.write(i)
        f.close()
        if out:
            out.close()
        if not os.path.exists('%stranslated_proteins' % self.mainout):
            os.makedirs('%stranslated_proteins' % self.mainout)
        files = os.listdir(self._tmp)
        files.sort()
        lista = []
        for entry in files:
            if entry.endswith(self._out + str(self._random) + '_.temp'):
                lista.append(entry)

        TranscriptomeAnalysis._logger.info(
            'Translating candidate transcripts...')
        for entry in lista:
            raw_seq = open(self._tmp + entry)
            # this works even if the runname is in the header
            name = self._out.join(
                entry.replace('_.temp', '').split(self._out)[:-1])
            trans_seq = open(
                self.mainout + 'translated_proteins/' + name + '.faa', 'w')
            nucl_seq = ''
            header = ''
            for line in raw_seq:
                if line.startswith('>'):
                    header = line.strip() + '_'
                else:
                    nucl_seq += line.strip()
            seq_count = 0
            for translation in self._sixpack(nucl_seq):
                seq_count += 1
                trans_seq.write('%s%s\n%s\n' %
                                (header, seq_count, translation))
            raw_seq.close()
            trans_seq.close()

        # open target scores file
        f2 = open('%sscores_cutoff' % self._lineage_path)
        # Load dictionary of HMM expected scores and full list of groups
        score_dic = {}
        for i in f2:
            i = i.strip().split()
            try:
                # float values: [1]=mean; [2]=minimum
                score_dic[i[0]] = float(i[1])
            except IndexError:
                pass
        f2.close()
        self._totalbuscos = len(list(score_dic.keys()))

    #
    # method that should be considered as if protected, for internal use [class]
    # to move to public and rename if meaningful
    #

    # Nothing

    def _run_tarzip_translated_proteins(self):
        """
        This function tarzips the results folder
        """
        # translated_proteins
        self._p_open([
            'tar', '-C',
            '%s' % self.mainout, '-zcf',
            '%stranslated_proteins.tar.gz' % self.mainout,
            'translated_proteins', '--remove-files'
        ],
                     'bash',
                     shell=False)

    # @overrides
    def _run_hmmer(self):
        """
        This function runs hmmsearch.
        """
        TranscriptomeAnalysis._logger.info(
            'Running HMMER to confirm transcript orthology:')
        files = os.listdir('%stranslated_proteins/' % self.mainout)
        files.sort()
        if not os.path.exists('%shmmer_output' % self.mainout):
            os.makedirs('%shmmer_output' % self.mainout)

        count = 0

        busco_index = {}

        for f in files:
            if f.endswith('.faa'):
                count += 1
                scaff = f[:-4]
                scaff_buscos = self._transcriptome_by_scaff[scaff]
                for busco in scaff_buscos:

                    try:
                        busco_index[busco] += 1
                    except KeyError:
                        busco_index[busco] = 1

                    hmmer_job = self._hmmer.create_job()
                    hmmer_job.add_parameter('--domtblout')
                    hmmer_job.add_parameter(
                        '%shmmer_output/%s.out.%s' %
                        (self.mainout, busco, busco_index[busco]))
                    hmmer_job.add_parameter('-o')
                    hmmer_job.add_parameter(
                        '%stemp_%s%s' %
                        (self._tmp, self._out, str(self._random)))
                    hmmer_job.add_parameter('--cpu')
                    hmmer_job.add_parameter('1')
                    hmmer_job.add_parameter('%shmms/%s.hmm' %
                                            (self._lineage_path, busco))
                    hmmer_job.add_parameter('%stranslated_proteins/%s' %
                                            (self.mainout, f))

        # Run hmmer
        self._hmmer.run_jobs(self._cpus)
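
    # For reference, not in the original source: each job created above is
    # equivalent to a command line of roughly this shape (the configured
    # hmmsearch path and the run-specific names are placeholders):
    #
    #   hmmsearch --domtblout <run_folder>/hmmer_output/<busco>.out.<n> \
    #             -o <tmp>/temp_<out><random> --cpu 1 \
    #             <lineage_path>/hmms/<busco>.hmm \
    #             <run_folder>/translated_proteins/<scaffold>.faa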
Example No. 4
class BuscoConfig(PipeConfig):
    """
    This class extends pipebricks.PipeConfig to read the config.ini file. Furthermore, it uses extra args that can be
    provided through the command line and information available in the dataset.cfg file to produce a single instance
    containing all correct parameters to be injected into a busco.BuscoAnalysis instance.
    """

    FORBIDDEN_HEADER_CHARS = ['ç', '¬', '¢', '´', 'ê', 'î', 'ô', 'ŵ', 'ẑ', 'û', 'â', 'ŝ', 'ĝ', 'ĥ', 'ĵ', 'ŷ',
                              'ĉ', 'é', 'ï', 'ẅ', 'ë', 'ẅ', 'ë', 'ẗ,', 'ü', 'í', 'ö', 'ḧ', 'é', 'ÿ', 'ẍ', 'è', 'é',
                              'à', 'ä', '¨', '€', '£', 'á']

    FORBIDDEN_HEADER_CHARS_BEFORE_SPLIT = ['/', '\'']

    HMMER_VERSION = 3.1

    MAX_FLANK = 20000

    VERSION = busco.__version__

    CONTACT = 'mailto:[email protected]'

    DEFAULT_ARGS_VALUES = {'cpu': 1, 'evalue': 1e-3, 'species': 'fly', 'tmp_path': './tmp/', 'limit': 3,
                           'out_path': os.getcwd(), 'domain': 'eukaryota', 'clade_name': 'N/A',
                           'dataset_creation_date': 'N/A',
                           'dataset_nb_buscos': 'N/A', 'dataset_nb_species': 'N/A', 'augustus_parameters': '',
                           'long': False, 'restart': False, 'quiet': False, 'debug': False, 'force': False,
                           'tarzip': False, 'blast_single_core': False}

    MANDATORY_USER_PROVIDED_PARAMS = ['in', 'out', 'lineage_path', 'mode']

    _logger = PipeLogger.get_logger(__name__)

    def __init__(self, conf_file, args, checks=True):
        """
        :param conf_file: a path to a config.ini file
        :type conf_file: str
        :param args: key and values matching BUSCO parameters to override config.ini values
        :type args: dict
        :param checks: whether to run the mandatory-parameter and file-dependency
         checks used in a main BUSCO analysis. Default True
        :type checks: bool
        """
        try:
            super(BuscoConfig, self).__init__(conf_file)
        except TypeError:
            try:
                PipeConfig.__init__(self, conf_file)  # Python 2.7
            except ParsingError as e:
                BuscoConfig._logger.error('Error in the config file: %s' % e)
                raise SystemExit
        except DuplicateOptionError as e:
            BuscoConfig._logger.error('Duplicated entry in the config.ini file: %s' % e)
            raise SystemExit
        except DuplicateSectionError as e:
            BuscoConfig._logger.error('Duplicated entry in the config.ini file: %s' % e)
            raise SystemExit
        except ParsingError as e:
            BuscoConfig._logger.error('Error in the config file: %s' % e)
            raise SystemExit

        try:

            # Update the config with args provided by the user, else keep config
            for key in args:
                if args[key] is not None and type(args[key]) is not bool:
                    self.set('busco', key, str(args[key]))
                elif args[key] is True:
                    self.set('busco', key, 'True')

            # Validate that all keys that are mandatory are there
            if checks:
                for param in BuscoConfig.MANDATORY_USER_PROVIDED_PARAMS:
                    try:
                        self.get('busco', param)
                    except NoOptionError:
                        BuscoConfig._logger.error('The parameter \'--%s\' was not provided. '
                                                  'Please add it in the config '
                                                  'file or provide it through the command line' % param)
                        raise SystemExit

            # Edit all paths in the config to make them clean
            for item in self.items('busco'):
                if item[0].endswith('_path'):
                    self.set('busco', item[0], BuscoConfig.nice_path(item[1]))

            # load the dataset config, or warn the user if not present
            # Update the config with the info from dataset, when appropriate
            domain = None
            try:
                target_species_file = open('%sdataset.cfg' % self.get('busco', 'lineage_path'))
                for l in target_species_file:
                    if l.split("=")[0] == "name":
                        self.set('busco', 'clade_name', l.strip().split("=")[1])
                    elif l.split("=")[0] == "species":
                        try:
                            self.get('busco', 'species')
                            if checks:
                                BuscoConfig._logger.warning('An augustus species is mentioned in the config file, '
                                                            'dataset default species (%s) will be ignored'
                                                            % l.strip().split("=")[1])
                        except NoOptionError:
                            self.set('busco', 'species', l.strip().split("=")[1])
                    elif l.split("=")[0] == "domain":
                        try:
                            self.get('busco', 'domain')
                            if checks:
                                BuscoConfig._logger.warning('A domain for augustus training is mentioned in the config '
                                                            'file, dataset default domain (%s) will be ignored'
                                                            % l.strip().split("=")[1])
                        except NoOptionError:
                            self.set('busco', 'domain', l.strip().split("=")[1])
                        domain = l.strip().split("=")[1]
                    elif l.split("=")[0] == "creation_date":
                        self.set('busco', 'dataset_creation_date', l.strip().split("=")[1])
                    elif l.split("=")[0] == "number_of_BUSCOs":
                        self.set('busco', 'dataset_nb_buscos', l.strip().split("=")[1])
                    elif l.split("=")[0] == "number_of_species":
                        self.set('busco', 'dataset_nb_species', l.strip().split("=")[1])
                if checks and domain != 'prokaryota' and domain != 'eukaryota':
                    BuscoConfig._logger.error(
                        'Corrupted dataset.cfg file: domain is %s, should be eukaryota or prokaryota' % domain)
                    raise SystemExit
            except IOError:
                if checks:
                    BuscoConfig._logger.warning("The dataset you provided does not contain the file dataset.cfg, "
                                                "likely because it is an old version. Default species (%s, %s) will be "
                                                "used as augustus species"
                                                % (BuscoConfig.DEFAULT_ARGS_VALUES['species'],
                                                   BuscoConfig.DEFAULT_ARGS_VALUES['domain']))

            # Fill the other with default values if not present
            for param in list(BuscoConfig.DEFAULT_ARGS_VALUES.keys()):
                try:
                    self.get('busco', param)
                except NoOptionError:
                    self.set('busco', param, str(BuscoConfig.DEFAULT_ARGS_VALUES[param]))

            # Edit all paths in the config to make them clean, again
            for item in self.items('busco'):
                if item[0].endswith('_path'):
                    self.set('busco', item[0], BuscoConfig.nice_path(item[1]))

            # Convert the ~ into full home path
            if checks:
                for key in self.sections():
                    for item in self.items(key):
                        if item[0].endswith('_path') or item[0] == 'path' or item[0] == 'in':
                            if item[1].startswith('~'):
                                self.set(key, item[0], os.path.expanduser(item[1]))

            # And check that the 'in' and 'lineage_path' entries actually exist on disk
            if checks:
                for item in self.items('busco'):
                    if item[0] == 'lineage_path' or item[0] == 'in':
                        BuscoConfig.check_path_exist(item[1])
            # Prevent the user from using "/" in the out name
            if checks:
                if '/' in self.get('busco', 'out'):
                    BuscoConfig._logger.error('Please do not provide a full path in --out parameter, no slash.'
                                              ' Use out_path in the config.ini file to specify the full path.')
                    raise SystemExit

            # Check the value of limit
            if checks:
                if self.getint('busco', 'limit') == 0 or self.getint('busco', 'limit') > 20:
                    BuscoConfig._logger.error('Limit must be an integer between 1 and 20 (you have used: %s). '
                                              'Note that this parameter is not needed by the protein mode.'
                                              % self.getint('busco', 'limit'))
                    raise SystemExit

            # Warn if custom evalue
            if checks:
                if self.getfloat('busco', 'evalue') != BuscoConfig.DEFAULT_ARGS_VALUES['evalue']:
                    BuscoConfig._logger.warning('You are using a custom e-value cutoff')

        except NoSectionError:
            BuscoConfig._logger.error('No section [busco] found in %s. Please make sure both the file and this section '
                                      'exist, see userguide.' % conf_file)
            raise SystemExit

        except NoOptionError:
            pass  # mandatory options are not required when the BuscoConfig instance is not meant to be used
            # in a regular BUSCO analysis but by an additional script.

        for item in self.items('busco'):
            BuscoConfig._logger.debug(item)

    @staticmethod
    def check_path_exist(path):
        """
        This function checks whether the provided path exists
        :param path: the path to be tested
        :type path: str
        :raises SystemExit: if the path cannot be reached
        """
        if not os.path.exists(path):
            BuscoConfig._logger.error('Impossible to read %s' % path)
            raise SystemExit

    @staticmethod
    def nice_path(path):
        """
        :param path: a path to check
        :type path: str
        :return: the same but cleaned path
        :rtype: str
        """
        try:
            if path[-1] != '/':
                path += '/'
            return path
        except TypeError:
            return None
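
# Illustrative sketch, not part of the original source: a minimal way the class
# above might be instantiated from another script. The file and dataset names
# are hypothetical placeholders; checks=False skips the mandatory-parameter and
# file-existence checks.
args = {'in': 'genome.fna', 'out': 'run_example',
        'lineage_path': './example_odb9/', 'mode': 'genome'}
config = BuscoConfig('config.ini', args, checks=False)
# BuscoConfig.nice_path('./example_odb9') -> './example_odb9/' (a trailing
# slash is appended only when it is missing).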
Example No. 5
class Tool:
    """
    Collection of utility methods used by all tools
    """

    _logger = PipeLogger.get_logger(__name__)

    @staticmethod
    def check_tool_available(name, config, without_path=False):
        """
        Check the tool's availability:
        1. the section [name] is present in the config
        2. this section contains the key 'path'
        3. the concatenation of the 'path' value and the tool name
        is the full path to an executable command
        :param name: the name of the tool to execute
        :type name: str
        :param config: initialized instance of ConfigParser
        :type config: configparser.ConfigParser
        :param without_path: whether the tool must also be runnable without the full path prefix
        :type without_path: bool
        :return: True if the tool can be run, False otherwise
        :rtype: bool
        """
        if not config.has_section(name):
            raise ToolException('Section for the tool [\'%s\'] is not '
                                'present in the config file' % name)

        if not config.has_option(name, 'path'):
            raise ToolException('Key \'path\' in the section [\'%s\'] is not '
                                'present in the config file' % name)

        if without_path:
            cmd = name
            without_path_check = subprocess.call('type %s' % cmd,
                                                 shell=True,
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE) == 0
        else:
            without_path_check = True

        cmd = os.path.join(config.get(name, 'path'), name)

        return without_path_check and subprocess.call(
            'type %s' % cmd,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE) == 0

    def __init__(self, name, config):
        """
        Initialize job list for a tool
        :param name: the name of the tool to execute
        :type name: str
        :param config: initialized instance of ConfigParser
        :type config: configparser.ConfigParser
        """
        if not config.has_section(name):
            raise ToolException('Section for the tool [\'%s\'] is not '
                                'configured in the config.ini file' % name)

        if not config.has_option(name, 'path'):
            raise ToolException('Key \'path\' in the section [\'%s\'] is not '
                                'configured in the config.ini file' % name)

        self.name = name

        self.cmd = [os.path.join(config.get(name, 'path'), name)]

        keys = sorted(item[0] for item in config.items(name))

        for key in keys:
            if not key == 'path' and not config.has_option('DEFAULT', key):
                self.cmd.append(config.get(name, key))

        self.jobs_to_run = []
        self.jobs_running = []

    def create_job(self):
        """
        Create one work item
        """
        job_id = 1 + len(self.jobs_to_run) + len(self.jobs_running)
        job = Job(self.name, self.cmd[:], job_id)
        self.jobs_to_run.append(job)
        return job

    def remove_job(self, job):
        """
        Remove one work item
        :param job: the Job to remove
        :type job: Job
        """
        self.jobs_to_run.remove(job)

    def run_jobs(self, max_threads, log_it=True):
        """
        This method runs all jobs created for the Tool and redirects
        the standard output and error to the current logger
        :param max_threads: the number of threads to run simultaneously
        :type max_threads: int
        :param log_it: whether to log the progress of the tasks. Default True
        :type log_it: bool
        """
        # Wait for all threads to finish and log progress
        total = len(self.jobs_to_run)
        already_logged = 0
        while len(self.jobs_to_run) > 0 or len(self.jobs_running) > 0:
            time.sleep(0.001)
            for j in self.jobs_to_run:
                if len(self.jobs_running) < max_threads:
                    self.jobs_running.append(j)
                    self.jobs_to_run.remove(j)
                    j.start()
                    Tool._logger.debug(j.cmd_line)
            for j in self.jobs_running:
                if not j.is_alive():
                    self.jobs_running.remove(j)

            nb_done = total - (len(self.jobs_to_run) + len(self.jobs_running))

            if (nb_done == total or int(nb_done % (float(total) / 10))
                    == 0) and nb_done != already_logged:
                if log_it:
                    Tool._logger.info(
                        '[%s]\t%i of %i task(s) completed at %s' %
                        (self.name, nb_done, total,
                         time.strftime("%m/%d/%Y %H:%M:%S")))
                else:
                    Tool._logger.debug(
                        '[%s]\t%i of %i task(s) completed at %s' %
                        (self.name, nb_done, total,
                         time.strftime("%m/%d/%Y %H:%M:%S")))
                already_logged = nb_done
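
# Illustrative sketch, not part of the original source: driving the Tool class
# directly, assuming hmmsearch is installed under the (hypothetical) path given
# below and that the in-memory config stands in for the usual config.ini.
import configparser

config = configparser.ConfigParser()
config.add_section('hmmsearch')
config.set('hmmsearch', 'path', '/usr/local/bin/')  # assumed install location

if Tool.check_tool_available('hmmsearch', config):
    tool = Tool('hmmsearch', config)
    job = tool.create_job()
    job.add_parameter('-h')        # just ask hmmsearch to print its help
    tool.run_jobs(max_threads=1)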
Example No. 6
class Job(threading.Thread):
    """
    Builds and executes one work item in an external process
    """
    _logger = PipeLogger.get_logger(__name__)

    def __init__(self, tool_name, name, thread_id):
        """
        :param tool_name: the name of the tool, used when logging
        :type tool_name: str
        :param name: the command line (executable plus base arguments) to be run
        :type name: list
        :param thread_id: an int id for the thread
        :type thread_id: int
        """
        # initialize parent
        super(Job, self).__init__()

        self.tool_name = tool_name
        self.cmd_line = name
        self.thread_id = thread_id
        self.stdout_file = None
        self.stderr_file = None
        self.stdout = subprocess.PIPE
        self.stderr = subprocess.PIPE

    def add_parameter(self, parameter):
        """
        Append parameter to the command line
        :param parameter: the parameter to append
        :type parameter: str
        """
        self.cmd_line.append(parameter)

    # @override
    def run(self):
        """
        Start the external process and block the current thread's execution
        until the process has finished
        """
        if self.stdout_file:
            self.stdout = open(self.stdout_file[0], self.stdout_file[1])
        if self.stderr_file:
            self.stderr = open(self.stderr_file[0], self.stderr_file[1])
        process = subprocess.Popen(self.cmd_line,
                                   shell=False,
                                   stderr=self.stderr,
                                   stdout=self.stdout)
        Job._logger.debug('%s thread nb %i has started' %
                          (self.tool_name, self.thread_id))
        process.wait()
        process_out = []
        if process.stdout:
            process_out += process.stdout.readlines()
        if process.stderr:
            process_out += process.stderr.readlines()
        for line in process_out:
            Job._logger.info_external_tool(self.tool_name,
                                           line.decode("utf-8").strip())
        if self.stdout_file:
            self.stdout.close()
        if self.stderr_file:
            self.stderr.close()
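
# Illustrative sketch, not part of the original source: jobs are normally
# created through Tool.create_job(), but stdout/stderr can be redirected to
# files by setting the *_file attributes before starting the thread. The
# hmmsearch path below is a hypothetical placeholder.
job = Job('hmmsearch', ['/usr/local/bin/hmmsearch', '-h'], 1)
job.stdout_file = ('hmmsearch_help.out', 'w')  # opened by run() for stdout
job.stderr_file = ('hmmsearch_help.err', 'w')
job.start()
job.join()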
Example No. 7
class GeneSetAnalysis(BuscoAnalysis):
    """
    This class runs a BUSCO analysis on a gene set.
    """

    _logger = PipeLogger.get_logger(__name__)

    #
    # magic or public, meant to be accessed by external scripts [instance]
    #

    def __init__(self, params):
        """
        Initialize an instance.
        :param params: Values of all parameters that have to be defined
        :type params: PipeConfig
        """
        self._mode = 'proteins'
        super(GeneSetAnalysis, self).__init__(params)
        if self._params.getboolean('busco', 'restart'):
            GeneSetAnalysis._logger.error(
                'There is no restart allowed for the protein mode')
            raise SystemExit
        # data integrity checks not done by the parent class
        if self.check_protein_file() is False:
            GeneSetAnalysis._logger.error('Please provide a protein file as input')
            raise SystemExit

    # @overrides
    def run_analysis(self):
        """
        This function calls all needed steps for running the analysis.
        """
        super(GeneSetAnalysis, self).run_analysis()
        # validate sequence file
        if super(GeneSetAnalysis, self).check_protein_file() is False:
            GeneSetAnalysis._logger.error('Please provide a protein file as input')
            raise SystemExit
        self._load_score()
        self._load_length()
        self._run_hmmer()
        self._produce_short_summary()
        self.cleanup()
        if self._tarzip:
            self._run_tarzip_hmmer_output()

    #
    # public, meant to be accessed by external scripts [class]
    #

    # Nothing

    #
    # method that should be used as if protected, for internal use [instance]
    # to move to public and rename if meaningful
    #

    # @overrides
    def _init_tools(self):
        """
        Init the tools needed for the analysis
        """
        GeneSetAnalysis._logger.info('Init tools...')
        self._hmmer = Tool('hmmsearch', self._params)
        GeneSetAnalysis._logger.info('Check dependencies...')
        self._check_tool_dependencies()

    # @override
    def _run_hmmer(self):
        """
        This function runs hmmsearch.
        """

        # Run hmmer
        GeneSetAnalysis._logger.info('Running HMMER on the proteins:')

        if not os.path.exists(self.mainout + 'hmmer_output'):
            os.makedirs('%shmmer_output' % self.mainout)
            
        files = os.listdir(self._lineage_path + '/hmms')
        files.sort()
        # open target scores file
        f2 = open('%sscores_cutoff' % self._lineage_path)
        #   Load dictionary of HMM expected scores and full list of groups
        score_dic = {}
        for i in f2:
            i = i.strip().split()
            try:
                score_dic[i[0]] = float(i[1])  # values; [1] = mean; [2] = min
            except IndexError:
                pass
        self._totalbuscos = len(list(score_dic.keys()))
        f2.close()

        hmmer_tool = Tool('hmmsearch', self._params)
        for entry in files:
            name = entry[:-4]
            if name in score_dic:
                hmmer_job = hmmer_tool.create_job()
                hmmer_job.add_parameter('--domtblout')
                hmmer_job.add_parameter('%shmmer_output/%s.out.1' % (self.mainout, name))
                hmmer_job.add_parameter('-o')
                hmmer_job.add_parameter('%stemp_%s%s' % (self._tmp, self._out, self._random))
                hmmer_job.add_parameter('--cpu')
                hmmer_job.add_parameter('1')
                hmmer_job.add_parameter('%shmms/%s.hmm' % (self._lineage_path, name))
                hmmer_job.add_parameter('%s' % self._sequences)
        
        hmmer_tool.run_jobs(self._cpus)
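
    # For reference, not in the original source: the scores_cutoff file parsed
    # above is whitespace-separated with one BUSCO group per line; column 1 is
    # the group id, column 2 the mean score loaded into score_dic, and column 3
    # the minimum score (the values shown here are made up):
    #
    #   EOG00ABCD   250.0   180.5
    #   EOG00WXYZ   310.2   240.0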

    # @override
    def _check_tool_dependencies(self):
        """
        check dependencies on tools
        :raises SystemExit: if a Tool is not available
        """

        # check 'hmmsearch' command availability
        if not Tool.check_tool_available('hmmsearch', self._params):
            BuscoAnalysis._logger.error(
                '\"hmmsearch\" is not accessible, '
                'add or modify its path in the config file. Do not include the command '
                'in the path !')
            raise SystemExit

        # check version
        if self._get_hmmer_version(self._hmmer.cmd[0]) >= BuscoConfig.HMMER_VERSION:
            pass
        else:
            BuscoAnalysis._logger.error(
                'HMMer version detected is not supported, please use HMMer '
                ' v.%s +' % BuscoConfig.HMMER_VERSION)
            raise SystemExit