Exemplo n.º 1
0
    def run(self, input_tree, msa_file, num_replicates, model, base_type, frac,
            output_dir):
        """Infer bootstrap replicates and map their support onto a tree.

        Replicate trees are expected in ``<output_dir>/replicates`` under the
        names ``bootstrap_tree.r_<index>.tree``.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Model of evolution; must be 'wag', 'lg' or 'jtt'.
        base_type : str
          'nt' for nucleotides or 'prot' for amino acids.
        frac : float
          Fraction of alignment to subsample.
        output_dir : str
          Directory for bootstrap trees.

        Returns
        -------
        str
          Path handed to ``bootstrap_support`` as the output tree.
        """

        assert model in ('wag', 'lg', 'jtt')
        assert base_type in ('nt', 'prot')

        # stash the settings on the instance for use by self._producer
        self.frac = frac
        self.base_type = base_type
        self.model = model

        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # full alignment that the replicates are drawn from
        self.msa = seq_io.read(msa_file)

        # infer one tree per replicate
        self.logger.info('Calculating bootstrap replicates:')
        workers = Parallel(self.cpus)
        workers.run(self._producer,
                    None,
                    range(num_replicates),
                    self._progress)

        # gather the replicate trees and compute support values
        rep_tree_files = [
            os.path.join(self.replicate_dir,
                         'bootstrap_tree.r_' + str(idx) + '.tree')
            for idx in range(num_replicates)
        ]

        tree_prefix = remove_extension(input_tree)
        output_tree = os.path.join(output_dir, tree_prefix + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Exemplo n.º 2
0
    def bootstrap(self, input_tree, msa_file, seq_type, model_str, gamma,
                  num_replicates, output_dir, cpus):
        """Run a non-parametric bootstrap and decorate a tree with support.

        Each replicate tree is expected at
        ``<output_dir>/rep_<index>/bootstrap.tree``.

        Parameters
        ----------
        input_tree : str
            File containing newick tree to decorate with bootstraps.
        msa_file : str
            Fasta file containing multiple sequence alignment.
        seq_type : str
            Alignment type, 'nt' or 'prot' (checked case-insensitively).
        model_str : str
            Model name; one of 'wag', 'lg', 'jtt' or 'gtr'
            (checked case-insensitively).
        gamma : bool
            Indicates if GAMMA model should be used.
        num_replicates : int
            Number of replicates to perform.
        output_dir : str
            Output directory to contain bootstrap trees.
        cpus : int
            Number of cpus to use.

        Returns
        -------
        str
            Path handed to ``bootstrap_support`` as the output tree.
        """

        assert seq_type.upper() in ('NT', 'PROT')
        assert model_str.upper() in ('WAG', 'LG', 'JTT', 'GTR')

        # settings read by self._bootstrap
        self.gamma = gamma
        self.model = model_str
        self.seq_type = seq_type
        self.output_dir = output_dir
        self.msa = seq_io.read(msa_file)

        # infer one tree per replicate
        replicate_numbers = list(range(num_replicates))
        pool = Parallel(cpus)
        pool.run(self._bootstrap, None, replicate_numbers, None)

        # gather replicate trees and compute support values
        rep_tree_files = [
            os.path.join(self.output_dir, 'rep_%d' % rep_num, 'bootstrap.tree')
            for rep_num in replicate_numbers
        ]

        base_name = os.path.basename(input_tree)
        tree_name, _ext = os.path.splitext(base_name)
        output_tree = os.path.join(output_dir, tree_name + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Exemplo n.º 3
0
    def bootstrap(self, input_tree, msa_file, model_str, num_replicates, output_dir, cpus):
        """Run a non-parametric bootstrap and decorate a tree with support.

        Each replicate tree is expected at
        ``<output_dir>/rep_<index>/RAxML_bestTree.support``.

        Parameters
        ----------
        input_tree : str
            File containing newick tree to decorate with bootstraps.
        msa_file : str
            Fasta file containing multiple sequence alignment.
        model_str : str
            Either the 'WAG' or 'LG' model (checked case-insensitively).
        num_replicates : int
            Number of replicates to perform.
        output_dir : str
            Output directory to contain bootstrap trees.
        cpus : int
            Number of cpus to use.

        Returns
        -------
        str
            Path handed to ``bootstrap_support`` as the output tree.
        """

        # checked before any other work is done
        check_on_path('seqmagick')

        assert model_str.upper() in ('WAG', 'LG')

        # settings read by self._bootstrap
        self.model = model_str
        self.output_dir = output_dir
        self.msa = seq_io.read(msa_file)

        # infer one tree per replicate
        replicate_numbers = list(range(num_replicates))
        pool = Parallel(cpus)
        pool.run(self._bootstrap, None, replicate_numbers, None)

        # gather replicate trees and compute support values
        rep_tree_files = [
            os.path.join(output_dir, 'rep_%d' % rep_num, 'RAxML_bestTree.support')
            for rep_num in replicate_numbers
        ]

        base_name = os.path.basename(input_tree)
        tree_name, _ext = os.path.splitext(base_name)
        output_tree = os.path.join(output_dir, tree_name + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Exemplo n.º 4
0
    def bootstrap(self, input_tree, msa_file, seq_type, model_str, num_replicates, output_tree, cpus):
        """Perform non-parametric bootstrapping.

        Replicate trees are produced in a temporary directory that is removed
        before this method returns, whether or not it succeeds.

        Parameters
        ----------
        input_tree : str
            File containing newick tree to decorate with bootstraps.
        msa_file : str
            Fasta file containing multiple sequence alignment.
        seq_type : str
            Specifies multiple sequences alignment is of 'nt' or 'prot'.
        model_str : str
            Specified either the 'wag' or 'jtt' model.
        num_replicates : int
            Number of replicates to perform.
        output_tree: str
            Output file containing tree with bootstrap values.
        cpus : int
            Number of cpus to use.
        """

        assert(seq_type in ['nt', 'prot'])
        assert(model_str in ['wag', 'jtt'])

        self.seq_type = seq_type
        self.model = model_str
        self.msa = seq_io.read(msa_file)

        # created only after the alignment has been read, so a failed read
        # does not leave a temporary directory behind
        self.replicate_dir = tempfile.mkdtemp()
        try:
            # calculate replicates
            parallel = Parallel(cpus)
            parallel.run(self._bootstrap, None, range(num_replicates), None)

            # calculate support values
            rep_tree_files = [
                os.path.join(self.replicate_dir,
                             'bootstrap.tree.' + str(rep_index) + '.tre')
                for rep_index in range(num_replicates)
            ]

            bootstrap_support(input_tree, rep_tree_files, output_tree)
        finally:
            # always remove the temporary replicates, even if inference or
            # the support calculation raised
            shutil.rmtree(self.replicate_dir)
Exemplo n.º 5
0
    def run(self,
            input_tree,
            msa_file,
            num_replicates,
            model,
            gamma,
            base_type,
            frac,
            boot_dir,
            output_dir):
        """Bootstrap multiple sequence alignment.

        When `boot_dir` is empty/None, replicate trees are inferred and
        expected in ``<output_dir>/replicates``. Otherwise no replicates are
        inferred and every file in `boot_dir` ending in '.tree' or '.tre' is
        used as a replicate tree.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
          Only read when `boot_dir` is not given.
        num_replicates : int
          Number of replicates to perform. Ignored when `boot_dir` is given.
        model : str
          Desired model of evolution; must be 'wag', 'lg' or 'jtt'.
        gamma : bool
          Stored on the instance as ``self.gamma``; presumably selects the
          GAMMA rate model during inference (confirm against the producer).
        base_type : str
          Indicates if bases are nucleotides ('nt') or amino acids ('prot').
        frac : float
          Fraction of alignment to subsample.
        boot_dir : str
          Directory of previously computed replicate trees, or a false
          value to compute the replicates here.
        output_dir : str
          Directory for bootstrap trees.

        Returns
        -------
        str
          Path handed to ``bootstrap_support`` as the output tree.
        """

        assert(model in ['wag', 'lg', 'jtt'])
        assert(base_type in ['nt', 'prot'])

        self.model = model
        self.gamma = gamma
        self.base_type = base_type
        self.frac = frac

        if not boot_dir:
            self.replicate_dir = os.path.join(output_dir, 'replicates')
            make_sure_path_exists(self.replicate_dir)

            # read full multiple sequence alignment
            self.msa = seq_io.read(msa_file)

            # calculate replicates; range() works on both Python 2 and 3
            self.logger.info('Calculating bootstrap replicates:')
            parallel = Parallel(self.cpus)
            parallel.run(self._producer, None, range(num_replicates), self._progress)

            rep_tree_files = [
                os.path.join(self.replicate_dir,
                             'bootstrap_tree.r_' + str(rep_index) + '.tree')
                for rep_index in range(num_replicates)
            ]
        else:
            # reuse existing replicate trees
            rep_tree_files = [
                os.path.join(boot_dir, f)
                for f in os.listdir(boot_dir)
                if f.endswith(('.tree', '.tre'))
            ]
            self.logger.info('Read %d bootstrap replicates.' % len(rep_tree_files))

        # calculate support values
        self.logger.info('Calculating bootstrap support values.')
        output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Exemplo n.º 6
0
    def run(self, input_tree, msa_file, marker_info_file, mask_file,
            perc_markers_to_keep, num_replicates, model, jk_dir, output_dir):
        """Jackknife marker genes.

        The marker file is tab-separated with one header line (skipped) and
        the columns:
          <marker id>  <marker name>  <marker desc>  <length>

        When `jk_dir` is empty/None, replicate trees are inferred and expected
        in ``<output_dir>/replicates``. Otherwise no replicates are inferred
        and every file in `jk_dir` ending in '.tree' or '.tre' is used as a
        replicate tree.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        marker_info_file : str
          File indicating database id, HMM name, description and length of each marker in the alignment.
        mask_file : str
          File indicating masking of multiple sequence alignment. Only its
          first line is read.
        perc_markers_to_keep : float [0, 1]
          Percentage of marker genes to keep in each replicate.
        num_replicates : int
          Number of replicates to perform. Ignored when `jk_dir` is given.
        model : str
          Desired model of evolution; must be 'wag' or 'jtt'.
        jk_dir : str
          Directory of previously computed jackknife trees, or a false value
          to compute the replicates here.
        output_dir : str
          Output directory for jackknife trees.

        Returns
        -------
        str
          Path handed to ``bootstrap_support`` as the output tree.
        """

        assert (model in ['wag', 'jtt'])

        self.model = model
        self.perc_markers_to_keep = perc_markers_to_keep

        if not jk_dir:
            self.replicate_dir = os.path.join(output_dir, 'replicates')
            make_sure_path_exists(self.replicate_dir)

            # determine length of each marker gene in alignment
            marker_lengths = []
            with open(marker_info_file) as f:
                f.readline()  # skip header line
                for line in f:
                    if not line.strip():
                        # tolerate blank (e.g. trailing) lines
                        continue
                    marker_lengths.append(int(line.split('\t')[3]))
            total_len = sum(marker_lengths)

            self.logger.info('Concatenated length of markers: %d' % total_len)

            # read mask; the context manager ensures the handle is closed
            with open(mask_file) as f:
                mask = f.readline().strip()

            # columns flagged '0' in the mask are subtracted from the length
            # of the marker they fall in
            self.marker_lengths = []
            start = 0
            for ml in marker_lengths:
                end = start + ml
                zeros = mask[start:end].count('0')
                start = end
                self.marker_lengths.append(ml - zeros)
            total_mask_len = sum(self.marker_lengths)

            self.logger.info('Concatenated length of filtered MSA: %d' %
                             total_mask_len)

            # read full multiple sequence alignment
            self.msa = seq_io.read(msa_file)

            # dict views are not indexable on Python 3, so take the first
            # sequence through an iterator; an empty MSA compares as length 0
            first_seq = next(iter(self.msa.values()), '')
            if len(first_seq) != total_mask_len:
                self.logger.error(
                    'Length of MSA does not meet length of mask.')
                # non-zero status so callers can detect the failure
                sys.exit(1)

            # calculate replicates; range() works on both Python 2 and 3
            self.logger.info('Calculating jackknife marker replicates:')
            parallel = Parallel(self.cpus)
            parallel.run(self._producer, None, range(num_replicates),
                         self._progress)

            # calculate support
            self.logger.info('Calculating support for %d replicates.' %
                             num_replicates)
            rep_tree_files = [
                os.path.join(self.replicate_dir,
                             'jk_markers.tree.' + str(rep_index) + '.tre')
                for rep_index in range(num_replicates)
            ]
        else:
            # reuse existing replicate trees
            rep_tree_files = [
                os.path.join(jk_dir, f)
                for f in os.listdir(jk_dir)
                if f.endswith(('.tree', '.tre'))
            ]
            self.logger.info('Read %d jackknife replicates.' %
                             len(rep_tree_files))

        output_tree = os.path.join(
            output_dir,
            remove_extension(input_tree) + '.jk_markers.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Exemplo n.º 7
0
    def run(self, input_tree,
            msa_file,
            marker_info_file,
            mask_file,
            perc_markers_to_keep,
            num_replicates,
            model,
            jk_dir,
            output_dir):
        """Jackknife marker genes.

        The marker file is tab-separated with one header line (skipped) and
        the columns:
          <marker id>  <marker name>  <marker desc>  <length>

        When `jk_dir` is empty/None, replicate trees are inferred and expected
        in ``<output_dir>/replicates``. Otherwise no replicates are inferred
        and every file in `jk_dir` ending in '.tree' or '.tre' is used as a
        replicate tree.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        marker_info_file : str
          File indicating database id, HMM name, description and length of each marker in the alignment.
        mask_file : str
          File indicating masking of multiple sequence alignment. Only its
          first line is read.
        perc_markers_to_keep : float [0, 1]
          Percentage of marker genes to keep in each replicate.
        num_replicates : int
          Number of replicates to perform. Ignored when `jk_dir` is given.
        model : str
          Desired model of evolution; must be 'wag' or 'jtt'.
        jk_dir : str
          Directory of previously computed jackknife trees, or a false value
          to compute the replicates here.
        output_dir : str
          Output directory for jackknife trees.

        Returns
        -------
        str
          Path handed to ``bootstrap_support`` as the output tree.
        """

        assert(model in ['wag', 'jtt'])

        self.model = model
        self.perc_markers_to_keep = perc_markers_to_keep

        if not jk_dir:
            self.replicate_dir = os.path.join(output_dir, 'replicates')
            make_sure_path_exists(self.replicate_dir)

            # determine length of each marker gene in alignment
            marker_lengths = []
            total_len = 0
            with open(marker_info_file) as marker_fh:
                marker_fh.readline()  # skip header line
                for line in marker_fh:
                    if not line.strip():
                        # tolerate blank (e.g. trailing) lines
                        continue
                    ml = int(line.split('\t')[3])
                    marker_lengths.append(ml)
                    total_len += ml

            self.logger.info('Concatenated length of markers: %d' % total_len)

            # read mask; the context manager ensures the handle is closed
            with open(mask_file) as mask_fh:
                mask = mask_fh.readline().strip()

            # columns flagged '0' in the mask are subtracted from the length
            # of the marker they fall in
            start = 0
            self.marker_lengths = []
            total_mask_len = 0
            for ml in marker_lengths:
                end = start + ml
                zeros = mask[start:end].count('0')
                start = end

                self.marker_lengths.append(ml - zeros)
                total_mask_len += ml - zeros

            self.logger.info('Concatenated length of filtered MSA: %d' % total_mask_len)

            # read full multiple sequence alignment
            self.msa = seq_io.read(msa_file)

            # dict views are not indexable on Python 3, so take the first
            # sequence through an iterator; an empty MSA compares as length 0
            first_seq = next(iter(self.msa.values()), '')
            if len(first_seq) != total_mask_len:
                self.logger.error('Length of MSA does not meet length of mask.')
                # non-zero status so callers can detect the failure
                sys.exit(1)

            # calculate replicates; range() works on both Python 2 and 3
            self.logger.info('Calculating jackknife marker replicates:')
            parallel = Parallel(self.cpus)
            parallel.run(self._producer, None, range(num_replicates), self._progress)

            # calculate support
            self.logger.info('Calculating support for %d replicates.' % num_replicates)
            rep_tree_files = [
                os.path.join(self.replicate_dir, 'jk_markers.tree.' + str(rep_index) + '.tre')
                for rep_index in range(num_replicates)
            ]
        else:
            # reuse existing replicate trees
            rep_tree_files = [
                os.path.join(jk_dir, f)
                for f in os.listdir(jk_dir)
                if f.endswith(('.tree', '.tre'))
            ]
            self.logger.info('Read %d jackknife replicates.' % len(rep_tree_files))

        output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_markers.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree