Exemplo n.º 1
0
    def from_phytrees(cls, phytrees_file):
        """
        Rebuild a PhyTrees object from a tree file and its companion report
        file (same path with the extension replaced by ".rep"). A relative
        'phytrees_file' is resolved through get_abspath().

        Arguments :
            phytrees_file  ( string )
                Tree file (newick format) generated by PhyTrees.write().

        Raises :
            ValueError
                If the number of trees read doesn't match the number stored in
                the first line of the report file, or if a history line does
                not hold exactly three fields.
        """
        trees_path = get_abspath(phytrees_file)
        base_path = os.path.splitext(trees_path)[0]
        rep_path = base_path + '.rep'
        # The trees are parsed before the report is opened
        trees = [tree for tree in Phylo.parse(trees_path, 'newick')]
        history = []
        with open(rep_path, 'r') as rep_handle:
            # First line: "<label>: <number of trees>"
            header = rep_handle.readline()
            expected = int(header.split(':')[-1])
            if expected != len(trees):
                raise ValueError("The number of trees at report file doesn't "
                                 "match the number of trees loaded")
            # Second line is the "History:" title, which carries no data
            rep_handle.readline()
            # Remaining lines: fields separated by four spaces
            for entry in rep_handle.readlines():
                fields = entry.strip().split('    ')
                date_time, filepath, fileformat = fields
                history.append((date_time, filepath, fileformat))
        return cls(trees, history)
Exemplo n.º 2
0
def gen_args(args, infile_path, outfile):
    """
    Build the [infile, outfile] argument list used to call Consense.

    Arguments :
        args  ( string )
            Keyword or arguments to use in the call of Consense, excluding
            infile and outfile arguments. Not used by this function.
        infile_path  ( string )
            Input file path, placed first in the returned list.
        outfile  ( string )
            Consensus tree output file. When empty or None, a path inside the
            system temporary directory is generated instead.

    Returns :
        list
            List of arguments (excluding binary file) to call Consense.
    """
    if not outfile:
        # No output file requested: generate a path in the temporary
        # directory (the file itself is not created here)
        tmp_dir = tempfile.gettempdir()
        tmp_name = tempfile.gettempprefix() + \
                   next(tempfile._get_candidate_names())
        outfile_path = os.path.join(tmp_dir, tmp_name)
    else:
        outfile_path = get_abspath(outfile)
    return [infile_path, outfile_path]
Exemplo n.º 3
0
    def write(self, bioseqs_file):
        """
        Dump every sequence of the BioSeqs object into 'bioseqs_file' (GENBANK
        format) and write a companion report file whose path is the same one
        with the extension replaced by ".rep". A relative 'bioseqs_file' is
        resolved through get_abspath(). Existing files are overwritten without
        warning.

        Arguments:
            bioseqs_file  (string)
                New BioSeqs sequence file.

        Raises:
            IOError
                Re-raised untouched (no file is removed in that case).

        * On any other exception, both output files are removed (if present)
        before the exception is re-raised.
        """
        seqs_path = get_abspath(bioseqs_file)
        rep_path = os.path.splitext(seqs_path)[0] + '.rep'
        # One report entry per line, fields separated by four spaces
        history_lines = []
        for entry in self._report:
            history_lines.append('    '.join(entry))
        history = '\n'.join(history_lines)
        template = 'Num. sequences: {:d}\nHistory:\n{:s}'
        try:
            SeqIO.write(viewvalues(self.data), seqs_path, 'genbank')
            with open(rep_path, 'w') as rep_handle:
                rep_handle.write(template.format(len(self), history))
        except IOError:
            raise
        except:
            # Do not leave half-written files behind
            for leftover in (seqs_path, rep_path):
                if os.path.lexists(leftover):
                    os.remove(leftover)
            raise
Exemplo n.º 4
0
    def write(self, phytrees_file):
        """
        Dump every tree of the PhyTrees object into 'phytrees_file' (newick
        format) and write a companion report file whose path is the same one
        with the extension replaced by ".rep". A relative 'phytrees_file' is
        resolved through get_abspath(). Existing files are overwritten without
        warning.

        Arguments :
            phytrees_file  ( string )
                New PhyTrees tree file.

        Raises :
            IOError
                Re-raised untouched (no file is removed in that case).

        * On any other exception, both output files are removed (if they are
        regular files) before the exception is re-raised.
        """
        trees_path = get_abspath(phytrees_file)
        rep_path = os.path.splitext(trees_path)[0] + '.rep'
        # One report entry per line, fields separated by four spaces
        history_lines = []
        for entry in self._report:
            history_lines.append('    '.join(entry))
        history = '\n'.join(history_lines)
        template = 'Num. trees: {:d}\nHistory:\n{:s}'
        try:
            Phylo.write(self.data, trees_path, 'newick')
            with open(rep_path, 'w') as rep_handle:
                rep_handle.write(template.format(len(self), history))
        except IOError:
            raise
        except:
            # Do not leave half-written files behind
            for leftover in (trees_path, rep_path):
                if os.path.isfile(leftover):
                    os.remove(leftover)
            raise
Exemplo n.º 5
0
    def from_bioseqs(cls, bioseqs_file):
        """
        Rebuild a BioSeqs object from a sequence file and its companion report
        file (same path with the extension replaced by ".rep"). A relative
        'bioseqs_file' is resolved through get_abspath().

        Arguments:
            bioseqs_file  (string)
                Sequence file (GENBANK format) generated by BioSeqs.write().

        Raises:
            ValueError
                If the number of sequences read doesn't match the number stored
                in the first line of the report file, or if a history line does
                not hold exactly four fields.
        """
        seqs_path = get_abspath(bioseqs_file)
        base_path = os.path.splitext(seqs_path)[0]
        rep_path = base_path + '.rep'
        # The sequences are parsed before the report is opened
        records = SeqIO.to_dict(SeqIO.parse(seqs_path, 'genbank'))
        history = []
        with open(rep_path, 'r') as rep_handle:
            # First line: "<label>: <number of sequences>"
            header = rep_handle.readline()
            expected = int(header.split(':')[-1])
            if expected != len(records):
                raise ValueError("The number of sequences at report file "
                                 "doesn't match the number of sequences "
                                 "loaded")
            # Second line is the "History:" title, which carries no data
            rep_handle.readline()
            # Remaining lines: fields separated by four spaces
            for entry in rep_handle.readlines():
                fields = entry.strip().split('    ')
                date_time, src_type, src, details = fields
                history.append((date_time, src_type, src, details))
        return cls(records, history)
Exemplo n.º 6
0
def get_results(command, output):
    """
    Extract resultant phylogeny and its log-likelihood score from 'output' and
    files generated during the execution of 'command'.

    Arguments :
        command  ( list )
            FastTree's command line executed. It must contain a '-log'
            argument followed by the log file path.
        output  ( string )
            Output from 'command' execution (newick tree).

    Returns :
        Bio.Phylo.BaseTree
            Resultant phylogenetic tree.
        float
            Log-likelihood score of the phylogeny.

    Raises :
        ValueError
            If 'command' has no '-log' argument, or if the log file contains
            no "TreeLogLk" line to read the score from.
    """
    phylogeny = Phylo.read(StringIO(output), 'newick')
    # Read the log file to get the log-likelihood score of the final phylogeny
    index = command.index('-log') + 1
    logfile_path = get_abspath(command[index])
    score = None
    with open(logfile_path, 'r') as logfile:
        # The score is taken from the last line containing "TreeLogLk", so
        # later matches overwrite earlier ones
        for line in logfile:
            if 'TreeLogLk' in line:
                score = float(line.split('\t')[2])
    if score is None:
        # Without this check the function would fail with an opaque
        # UnboundLocalError on the return statement
        raise ValueError('No "TreeLogLk" entry found in the log file: '
                         '{}'.format(logfile_path))
    return (phylogeny, score)
Exemplo n.º 7
0
def gen_args(args, infile_path, outfile):
    """
    Build the [infile, outfile] argument list used to call Consense.

    Arguments :
        args  ( string )
            Keyword or arguments to use in the call of Consense, excluding
            infile and outfile arguments. Not used by this function.
        infile_path  ( string )
            Input file path, placed first in the returned list.
        outfile  ( string )
            Consensus tree output file. When empty or None, a path inside the
            system temporary directory is generated instead.

    Returns :
        list
            List of arguments (excluding binary file) to call Consense.
    """
    if outfile:
        return [infile_path, get_abspath(outfile)]
    # No output file requested: generate a path in the temporary directory
    # (the file itself is not created here)
    tmp_dir = tempfile.gettempdir()
    tmp_prefix = tempfile.gettempprefix()
    random_part = next(tempfile._get_candidate_names())
    return [infile_path, os.path.join(tmp_dir, tmp_prefix + random_part)]
Exemplo n.º 8
0
    def from_bioseqs(cls, bioseqs_file):
        """
        Rebuild a BioSeqs object from a sequence file and its companion report
        file (same path with the extension replaced by ".rep"). A relative
        'bioseqs_file' is resolved through get_abspath().

        Arguments :
            bioseqs_file  ( string )
                Sequence file (GENBANK format) generated by BioSeqs.write().

        Raises :
            ValueError
                If the number of sequences read doesn't match the number stored
                in the first line of the report file, or if a history line does
                not hold exactly four fields.
        """
        seqs_path = get_abspath(bioseqs_file)
        rep_path = os.path.splitext(seqs_path)[0] + ".rep"
        # The sequences are parsed before the report is opened
        parsed_records = SeqIO.parse(seqs_path, "genbank")
        records = SeqIO.to_dict(parsed_records)
        history = []
        with open(rep_path, "r") as rep_handle:
            # First line: "<label>: <number of sequences>"
            count_field = rep_handle.readline().split(":")[-1]
            if int(count_field) != len(records):
                raise ValueError("The number of sequences at report file "
                                 "doesn't match the number of sequences "
                                 "loaded")
            # Second line is the "History:" title, which carries no data
            rep_handle.readline()
            # Remaining lines: fields separated by four spaces
            for entry in rep_handle.readlines():
                date_time, src_type, src, details = \
                    entry.strip().split("    ")
                history.append((date_time, src_type, src, details))
        return cls(records, history)
Exemplo n.º 9
0
    def write(self, bioseqs_file):
        """
        Dump every sequence of the BioSeqs object into 'bioseqs_file' (GENBANK
        format) and write a companion report file whose path is the same one
        with the extension replaced by ".rep". A relative 'bioseqs_file' is
        resolved through get_abspath(). Existing files are overwritten without
        warning.

        Arguments :
            bioseqs_file  ( string )
                New BioSeqs sequence file.

        Raises :
            IOError
                Re-raised untouched (no file is removed in that case).

        * On any other exception, both output files are removed (if present)
        before the exception is re-raised.
        """
        seqs_path = get_abspath(bioseqs_file)
        rep_path = os.path.splitext(seqs_path)[0] + ".rep"
        # One report entry per line, fields separated by four spaces
        history = "\n".join("    ".join(entry) for entry in self._report)
        try:
            SeqIO.write(viewvalues(self.data), seqs_path, "genbank")
            with open(rep_path, "w") as rep_handle:
                content = "Num. sequences: {:d}\nHistory:\n{:s}".format(
                    len(self), history)
                rep_handle.write(content)
        except IOError:
            raise
        except:
            # Do not leave half-written files behind
            for leftover in (seqs_path, rep_path):
                if os.path.lexists(leftover):
                    os.remove(leftover)
            raise
Exemplo n.º 10
0
def get_results(command, output):
    """
    Extract resultant phylogeny and its log-likelihood score from 'output' and
    files generated during the execution of 'command'.

    Arguments :
        command  ( list )
            FastTree's command line executed. It must contain a "-log"
            argument followed by the log file path.
        output  ( string )
            Output from 'command' execution (newick tree).

    Returns :
        Bio.Phylo.BaseTree
            Resultant phylogenetic tree.
        float
            Log-likelihood score of the phylogeny.

    Raises :
        ValueError
            If 'command' has no "-log" argument, or if the log file contains
            no "TreeLogLk" line to read the score from.
    """
    phylogeny = Phylo.read(StringIO(output), "newick")
    # Read the log file to get the log-likelihood score of the final phylogeny
    index = command.index("-log") + 1
    logfile_path = get_abspath(command[index])
    score = None
    with open(logfile_path, "r") as logfile:
        # The score is taken from the last line containing "TreeLogLk", so
        # later matches overwrite earlier ones
        for line in logfile:
            if "TreeLogLk" in line:
                score = float(line.split("\t")[2])
    if score is None:
        # Without this check the function would fail with an opaque
        # UnboundLocalError on the return statement
        raise ValueError('No "TreeLogLk" entry found in the log file: '
                         '{}'.format(logfile_path))
    return (phylogeny, score)
Exemplo n.º 11
0
    def from_treefile(cls, treefile, fileformat):
        """
        Build a PhyTrees object from the trees stored in 'treefile', starting
        a new report with a single entry (timestamp, absolute path, format).
        A relative 'treefile' is resolved through get_abspath().

        Arguments :
            treefile  ( string )
                Input tree file.
            fileformat  ( string )
                Input file format.

        Raises :
            IOError
                If the path or the file provided doesn't exist.

        * The file format must be supported by Bio.Phylo.
        * If the file format provided doesn't correspond to the actual file
        format, an empty tree list will be created.
        """
        source_path = get_abspath(treefile)
        trees = [tree for tree in Phylo.parse(source_path, fileformat)]
        # The report records when and from where the trees were loaded
        timestamp = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
        first_entry = (timestamp, source_path, fileformat)
        return cls(trees, [first_entry])
Exemplo n.º 12
0
def cleanup(command):
    """
    Remove the temporary output file (if any) whose path was generated in
    gen_args() function. Files located outside the system temporary directory
    are never touched.

    Arguments :
        command  ( list )
            Consense's command line executed. The output file path is read
            from position 2.
    """
    outfile_path = get_abspath(command[2])
    in_tempdir = os.path.dirname(outfile_path) == tempfile.gettempdir()
    # gen_args() only generates the temporary path; the file exists only if
    # the tool actually wrote it, so check before removing to avoid raising
    # from the cleanup step
    if in_tempdir and os.path.lexists(outfile_path):
        os.remove(outfile_path)
Exemplo n.º 13
0
def cleanup(command):
    """
    Remove the temporary output file (if any) whose path was generated in
    gen_args() function. Files located outside the system temporary directory
    are never touched.

    Arguments :
        command  ( list )
            Consense's command line executed. The output file path is read
            from position 2.
    """
    outfile_path = get_abspath(command[2])
    in_tempdir = os.path.dirname(outfile_path) == tempfile.gettempdir()
    # gen_args() only generates the temporary path; the file exists only if
    # the tool actually wrote it, so check before removing to avoid raising
    # from the cleanup step
    if in_tempdir and os.path.lexists(outfile_path):
        os.remove(outfile_path)
Exemplo n.º 14
0
def cleanup(command):
    """
    Delete the log file named after the "-log" argument of 'command', but only
    when it lives directly in the system temporary directory and exists.

    Arguments :
        command  ( list )
            FastTree's command line executed.
    """
    log_position = command.index("-log") + 1
    log_path = get_abspath(command[log_position])
    # Log files placed elsewhere are left alone
    if os.path.dirname(log_path) != tempfile.gettempdir():
        return
    if os.path.lexists(log_path):
        os.remove(log_path)
Exemplo n.º 15
0
def cleanup(command):
    """
    Delete the log file named after the '-log' argument of 'command', but only
    when it lives directly in the system temporary directory and exists.

    Arguments :
        command  ( list )
            FastTree's command line executed.
    """
    log_position = command.index('-log') + 1
    log_path = get_abspath(command[log_position])
    is_temporary = os.path.dirname(log_path) == tempfile.gettempdir()
    # Log files placed elsewhere are left alone
    if is_temporary:
        if os.path.lexists(log_path):
            os.remove(log_path)
Exemplo n.º 16
0
    def from_seqfile(cls, seqfile, fileformat):
        """
        Build a BioSeqs object from the sequences stored in 'seqfile',
        starting a new report with a single entry (timestamp, 'local',
        absolute path, format). A relative 'seqfile' is resolved through
        get_abspath().

        Arguments:
            seqfile  (string)
                Input sequences file.
            fileformat  (string)
                Input file format.

        Raises:
            IOError
                If the path or the file provided doesn't exist.

        * The file format must be supported by Bio.SeqIO.
        * If the file format provided doesn't correspond to the actual file
        format, an empty sequence dictionary will be created.
        * Records sharing the same id overwrite each other (last one wins).
        """
        source_path = get_abspath(seqfile)
        records_by_id = {}
        for seq_record in SeqIO.parse(source_path, fileformat):
            # Some formats (e.g. FASTA) yield a generic single-letter alphabet
            # that makes the later GENBANK writing fail. Replace it by a
            # specific one: try DNA first and fall back to protein when the
            # sequence doesn't verify against the DNA alphabet
            has_generic_alphabet = isinstance(seq_record.seq.alphabet,
                                              Alphabet.SingleLetterAlphabet)
            if has_generic_alphabet:
                seq_record.seq.alphabet = Alphabet.IUPAC.ExtendedIUPACDNA()
                is_dna = Alphabet._verify_alphabet(seq_record.seq)
                if not is_dna:
                    seq_record.seq.alphabet = \
                        Alphabet.IUPAC.ExtendedIUPACProtein()
            records_by_id[seq_record.id] = seq_record
        timestamp = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
        first_entry = (timestamp, 'local', source_path, fileformat)
        return cls(records_by_id, [first_entry])
Exemplo n.º 17
0
    def from_seqfile(cls, seqfile, fileformat):
        """
        Build a BioSeqs object from the sequences stored in 'seqfile',
        starting a new report with a single entry (timestamp, "local",
        absolute path, format). A relative 'seqfile' is resolved through
        get_abspath().

        Arguments :
            seqfile  ( string )
                Input sequences file.
            fileformat  ( string )
                Input file format.

        Raises :
            IOError
                If the path or the file provided doesn't exist.

        * The file format must be supported by Bio.SeqIO.
        * If the file format provided doesn't correspond to the actual file
        format, an empty sequence dictionary will be created.
        * Records sharing the same id overwrite each other (last one wins).
        """
        source_path = get_abspath(seqfile)
        records_by_id = {}
        for seq_record in SeqIO.parse(source_path, fileformat):
            sequence = seq_record.seq
            # Some formats (e.g. FASTA) yield a generic single-letter alphabet
            # that makes the later GENBANK writing fail. Replace it by a
            # specific one: try DNA first and fall back to protein when the
            # sequence doesn't verify against the DNA alphabet
            if isinstance(sequence.alphabet, Alphabet.SingleLetterAlphabet):
                sequence.alphabet = Alphabet.IUPAC.ExtendedIUPACDNA()
                if not Alphabet._verify_alphabet(sequence):
                    sequence.alphabet = Alphabet.IUPAC.ExtendedIUPACProtein()
            records_by_id[seq_record.id] = seq_record
        timestamp = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
        history = [(timestamp, "local", source_path, fileformat)]
        return cls(records_by_id, history)
Exemplo n.º 18
0
def map_seqs(record_list, tree_file, file_format, subset_size, overlapping,
             binary='dcm'):
    """
    Generate a map of the sequences in sets, of at most 'subset_size', with the
    specified overlapping using the padded-Recursive-DCM3 decomposition (PRD)
    from DACTAL system. If 'tree_file' contains a relative path, the current
    working directory will be used to get the absolute path.

    Arguments :
        record_list  ( list )
            List of SeqRecord objects (from Biopython).
        tree_file  ( string )
            Input tree file.
        file_format  ( string )
            Tree file format.
        subset_size  ( int )
            Maximum subset size.
        overlapping  ( int )
            Number of overlapping sequences between any two resultant subsets.
        binary  ( Optional[string] )
            Name or path of the DCM binary file.

    Returns :
        dict
            Dictionary with the set identifiers ("prdset" followed by a
            zero-padded index starting at 1) as keys and the corresponding
            sequences as values in lists of SeqRecord objects.

    Raises :
        ValueError
            When 'subset_size' < (4 * 'overlapping').
        RuntimeError
            If the call to the dcm command raises an exception.
        IOError
            If the dcm tool can't generate a decomposition for the 'subset_size'
            and 'overlapping' values given.
        KeyError
            If a sequence id of the decomposition isn't in 'record_list'.

    * The tree file format must be supported by Bio.Phylo.
    """
    if subset_size < (4 * overlapping):
        raise ValueError('The maximum subset size must be greater than or '
                         'equal to 4 times the overlapping value')
    # else : # subset_size >= (4 * overlapping)
    infile_path = get_abspath(tree_file)
    tmpfile = None
    pool = None
    try:
        # If the input file format is not supported by the PRD process,
        # convert it to a temporary supported file
        if file_format.lower() != 'newick':
            tmpfile = tempfile.NamedTemporaryFile()
            Phylo.convert(infile_path, file_format, tmpfile.name, 'newick')
            infile_path = tmpfile.name
        # The first decomposition process will be always executed, so there
        # is no need to overload this stage with the multiprocess generation
        set_list, further_decomp = _prd_decomposition(infile_path,
                                                      subset_size,
                                                      str(overlapping),
                                                      binary)
        # Parallelization of the recursive decomposition of the different
        # subtrees. All new subtrees are attached to 'further_decomp' file
        # list so we can launch at most one process per core, speeding up the
        # whole process
        pool = multiprocessing.Pool(processes=NUMCORES)
        start = 0
        while start < len(further_decomp):
            end = min(len(further_decomp), start + NUMCORES)
            results = [pool.apply_async(_prd_decomposition,
                                        args=(further_decomp[i], subset_size,
                                              str(overlapping), binary,))
                       for i in range(start, end)]
            # Collect the results of all the processes launched; new files
            # appended here are picked up by the next loop iteration
            for pool_result in results:
                new_sets, new_files = pool_result.get()
                set_list += new_sets
                further_decomp += new_files
            start = end
    finally:
        # Release the worker processes and the temporary converted tree even
        # when a decomposition step fails
        if pool is not None:
            pool.close()
            pool.join()
        if tmpfile is not None:
            tmpfile.close()
    # Remove all the temporary files created for the multiprocessing stage
    for file_path in further_decomp:
        os.remove(file_path)
    record_dict = {record.id: record for record in record_list}
    # Map all the resultant sets with an unique set id and replace the sequence
    # ids by their corresponding Bio.SeqRecord object
    set_dict = {}
    num_zeros = len(str(len(set_list)))
    for index, seq_id_list in enumerate(set_list, 1):
        set_id = 'prdset{}'.format(str(index).zfill(num_zeros))
        set_dict[set_id] = [record_dict[seq_id] for seq_id in seq_id_list]
    return set_dict
Exemplo n.º 19
0
def get_subsets(method, seqfile, fileformat='genbank', *args, **kwargs):
    """
    Division of all the sequences stored in the sequence input file into subsets
    applying the 'method' function. If 'seqfile' contains a relative path, the
    current working directory will be used to get the absolute path.

    Arguments :
        method  ( string )
            Desired partition method (case-insensitive): genes, naive rows or
            cols, padded-Recursive-DCM3.
        seqfile  ( string )
            Input sequences file.
        fileformat  ( string )
            Input file format.
        args & kwargs
            Non-keyworded and keyworded arguments passed to the selected method.

    Returns :
        dict
            Dictionary with the set identifiers as keys and the corresponding
            sequences as values in lists of SeqRecord objects.

    Raises :
        ValueError
            If there is no corresponding method to 'method' value.
        IOError
            If the path or the file provided doesn't exist.
        IOError
            If the file format provided doesn't correspond to the actual one
            (no sequence could be read for a sliced method).

    * The file format must be supported by Bio.SeqIO.
    * For "rows" method, if the number of input sequences is lower than the
    number of sets multiplied by the number of cores, the resulting sets might
    be fewer than the number requested.
    """
    method_key = method.lower()
    if method_key not in _METHOD_TO_FUNC:
        message = 'The method "{}" isn\'t included in ' \
                  'MEvoLib.Cluster'.format(method)
        raise ValueError(message)
    # else : # method_key in _METHOD_TO_FUNC
    # Get the mapping function and the sequence file path
    mapseqs_func = _METHOD_TO_FUNC[method_key]
    filepath = get_abspath(seqfile)
    if method_key in ('prd', 'genes'):
        # Non data-driven (through input slicing) parallelizable methods:
        # the records are streamed to the method in a single call
        seq_iter = (x for x in SeqIO.parse(filepath, fileformat))
        return mapseqs_func(seq_iter, *args, **kwargs)
    # Data-driven (through input slicing) parallelizable methods. Slicing a
    # plain list hands the workers the same data as slicing a managed list,
    # without leaving a manager server process behind
    seq_list = list(SeqIO.parse(filepath, fileformat))
    num_seqs = len(seq_list)
    if num_seqs == 0:
        # A zero slice size would make range() fail with an unrelated error
        raise IOError('No sequences could be read from "{}" using the "{}" '
                      'format'.format(filepath, fileformat))
    # Launch one process per available CPU core (integer ceiling division)
    slice_size = -(-num_seqs // NUMCORES)
    pool = multiprocessing.Pool(processes=NUMCORES)
    try:
        results = [pool.apply_async(mapseqs_func,
                                    args=(seq_list[start:start + slice_size],)
                                         + args,
                                    kwds=kwargs)
                   for start in range(0, num_seqs, slice_size)]
        output = [result.get() for result in results]
    finally:
        # Always release the worker processes
        pool.close()
        pool.join()
    # Build the final sets dictionary merging the results of all executed
    # processes. Every key of every partial result is taken into account, as
    # the slices may not all produce the same set identifiers
    set_dict = {}
    for partial_sets in output:
        for key, seqs in partial_sets.items():
            set_dict.setdefault(key, []).extend(seqs)
    return set_dict
Exemplo n.º 20
0
def get_subsets(method, seqfile, fileformat='genbank', *args, **kwargs):
    """
    Division of all the sequences stored in the sequence input file into subsets
    applying the 'method' function. If 'seqfile' contains a relative path, the
    current working directory will be used to get the absolute path.

    Arguments :
        method  ( string )
            Desired partition method (case-insensitive): genes, naive rows or
            cols, padded-Recursive-DCM3.
        seqfile  ( string )
            Input sequences file.
        fileformat  ( string )
            Input file format.
        args & kwargs
            Non-keyworded and keyworded arguments passed to the selected method.

    Returns :
        dict
            Dictionary with the set identifiers as keys and the corresponding
            sequences as values in lists of SeqRecord objects.

    Raises :
        ValueError
            If there is no corresponding method to 'method' value.
        IOError
            If the path or the file provided doesn't exist.
        IOError
            If the file format provided doesn't correspond to the actual one
            (no sequence could be read for a sliced method).

    * The file format must be supported by Bio.SeqIO.
    * For "rows" method, if the number of input sequences is lower than the
    number of sets multiplied by the number of cores, the resulting sets might
    be fewer than the number requested.
    """
    method_key = method.lower()
    if method_key not in _METHOD_TO_FUNC:
        message = 'The method "{}" isn\'t included in ' \
                  'MEvoLib.Cluster'.format(method)
        raise ValueError(message)
    # else : # method_key in _METHOD_TO_FUNC
    # Get the mapping function and the sequence file path
    mapseqs_func = _METHOD_TO_FUNC[method_key]
    filepath = get_abspath(seqfile)
    if method_key in ('prd', 'genes'):
        # Non data-driven (through input slicing) parallelizable methods:
        # the records are streamed to the method in a single call
        seq_iter = (x for x in SeqIO.parse(filepath, fileformat))
        return mapseqs_func(seq_iter, *args, **kwargs)
    # Data-driven (through input slicing) parallelizable methods. Slicing a
    # plain list hands the workers the same data as slicing a managed list,
    # without leaving a manager server process behind
    seq_list = list(SeqIO.parse(filepath, fileformat))
    num_seqs = len(seq_list)
    if num_seqs == 0:
        # A zero slice size would make range() fail with an unrelated error
        raise IOError('No sequences could be read from "{}" using the "{}" '
                      'format'.format(filepath, fileformat))
    # Launch one process per available CPU core (integer ceiling division)
    slice_size = -(-num_seqs // NUMCORES)
    pool = multiprocessing.Pool(processes=NUMCORES)
    try:
        results = [pool.apply_async(mapseqs_func,
                                    args=(seq_list[start:start + slice_size],)
                                         + args,
                                    kwds=kwargs)
                   for start in range(0, num_seqs, slice_size)]
        output = [result.get() for result in results]
    finally:
        # Always release the worker processes
        pool.close()
        pool.join()
    # Build the final sets dictionary merging the results of all executed
    # processes. Every key of every partial result is taken into account, as
    # the slices may not all produce the same set identifiers
    set_dict = {}
    for partial_sets in output:
        for key, seqs in partial_sets.items():
            set_dict.setdefault(key, []).extend(seqs)
    return set_dict
Exemplo n.º 21
0
def get_alignment(binary,
                  infile,
                  infile_format,
                  args='default',
                  outfile=None,
                  outfile_format='fasta',
                  **kwargs):
    """
    Align the sequences of the input file using the alignment tool and arguments
    given. The resultant alignment is returned as a
    Bio.Align.MultipleSeqAlignment object and saved in the output file (if
    provided). If 'infile' or 'outfile' contain a relative path, the current
    working directory will be used to get the absolute path. If the output file
    already exists, the old file will be overwritten without any warning.

    The alignment tool might not be included in MEvoLib, but it can still be
    used passing in '**kwargs' the keys "informats" and "incmd" with the list
    of supported infile formats and the infile argument, respectively.

    Arguments :
        binary  ( string )
            Name or path of the alignment tool.
        infile  ( string )
            Unaligned input sequence file.
        infile_format  ( string )
            Input file format.
        args  ( Optional[string] )
            Keyword or arguments to use in the call of the alignment tool,
            excluding infile and outfile arguments.  By default, 'default'
            arguments are used.
        outfile  ( Optional[string] )
            Alignment output file.
        outfile_format  ( Optional[string] )
            Output file format. By default, FASTA format.
        **kwargs  ( Optional[dict] )
            Keyworded arguments required to execute alignment tools not included
            in the current version of MEvoLib. It is necessary to pass a list
            of supported infile formats under "informats" key, and the infile
            argument (e.g. "-in") with "incmd" key.

    Returns :
        Bio.Align.MultipleSeqAlignment
            Resultant alignment.

    Raises :
        KeyError
            If the tool isn't included in MEvoLib and "informats" or "incmd"
            is missing from '**kwargs'.
        IOError
            If the input path or the input file provided doesn't exist.
        RuntimeError
            If the call to the alignment tool command raises an exception.

    * The input file format must be supported by Bio.SeqIO.
    * The output file format must be supported by Bio.AlignIO.
    * The standard output of the tool is parsed as a FASTA alignment.
    """
    # The tool is identified by its executable name only (directory part of
    # 'binary' is irrelevant for the lookup)
    bin_name = os.path.basename(binary).lower()
    if (bin_name in _TOOL_TO_LIB):
        tool_lib = _TOOL_TO_LIB[bin_name]
        sprt_infile_formats = tool_lib.SPRT_INFILE_FORMATS
        infile_cmd = tool_lib.INFILE_CMD
        keywords = tool_lib.KEYWORDS
    else:  # bin_name not in _TOOL_TO_LIB
        # Include the required variables through **kwargs dictionary
        sprt_infile_formats = kwargs['informats']
        infile_cmd = kwargs['incmd']
        keywords = dict()
    infile_path = get_abspath(infile)
    # The temporary file (if any) must outlive the tool execution, and it is
    # explicitly closed (hence deleted) afterwards instead of waiting for the
    # garbage collector
    tmpfile = None
    try:
        # If the input file format is not supported by the alignment tool,
        # convert it to a temporary supported file
        if (infile_format.lower() not in sprt_infile_formats):
            tmpfile = tempfile.NamedTemporaryFile()
            SeqIO.convert(infile_path, infile_format, tmpfile.name,
                          sprt_infile_formats[0])
            infile_path = tmpfile.name
        # Get argument list from keyword dictionary or 'args' string
        if (args in keywords):
            arg_list = keywords[args]
        else:  # args not in keywords
            arg_list = args.split(' ')
        # Create full command line list. Empty elements (e.g. produced by
        # consecutive blanks in 'args' or an empty infile argument) are
        # removed here
        command = [
            x for x in [binary] + arg_list + [infile_cmd, infile_path] if x
        ]
        # Run the alignment process handling any Runtime exception
        try:
            output = subprocess.check_output(command,
                                             stderr=DEVNULL,
                                             universal_newlines=True)
        except subprocess.CalledProcessError as e:
            message = 'Running "{}" raised an exception'.format(' '.join(
                e.cmd))
            raise RuntimeError(message)
    finally:
        if (tmpfile is not None):
            tmpfile.close()
    alignment = AlignIO.read(StringIO(output), 'fasta')
    if (outfile):
        # Save the resultant alignment in the given outfile and format
        outfile_path = get_abspath(outfile)
        AlignIO.write(alignment, outfile_path, outfile_format)
    # Return the resultant alignment as a Bio.Align.MultipleSeqAlignment
    # object
    return (alignment)
Exemplo n.º 22
0
def get_phylogeny ( binary, infile, infile_format, args = 'default',
                    outfile = None, outfile_format = 'newick',
                    bootstraps = 0 ) :
    """
    Infer the phylogeny from the input alignment using the phylogenetic
    inference tool and arguments given. The resultant phylogeny is returned as a
    Bio.Phylo.BaseTree object and saved in the output file (if provided). If
    'infile' or 'outfile' contain a relative path, the current working directory
    will be used to get the absolute path. If the output file already exists,
    the old file will be overwritten without any warning.

    Arguments :
        binary  ( string )
            Name or path of the phylogenetic inference tool.
        infile  ( string )
            Sequence alignment file.
        infile_format  ( string )
            Input file format.
        args  ( Optional[string] )
            Keyword or arguments to use in the call of the phylogenetic
            inference tool, excluding infile and outfile arguments. By default,
            'default' arguments are used.
        outfile  ( Optional[string] )
            Phylogenetic tree output file.
        outfile_format  ( Optional[string] )
            Output file format. By default, NEWICK format.
        bootstraps  ( Optional[int] )
            Number of bootstraps to generate. By default, 0 (only use the input
            alignment).

    Returns :
        Bio.Phylo.BaseTree
            Resultant phylogenetic tree.
        float
            Log-likelihood score of the phylogeny.

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.
        IOError
            If the input path or the input file provided doesn't exist.
        RuntimeError
            If the call to the phylogenetic inference tool command raises an
            exception.

    * The input file format must be supported by Bio.AlignIO.
    * The output file format must be supported by Bio.Phylo.
    * The tool's cleanup routine is always executed once the command line has
    been generated, even if reading or saving the results fails.
    """
    # Get the variables associated with the given phylogenetic inference tool
    # (identified by its executable name only)
    bin_name = os.path.basename(binary).lower()
    if ( bin_name not in _PHYLO_TOOL_TO_LIB ) :
        message = 'The phylogenetic inference tool "{}" isn\'t included in ' \
                  'MEvoLib.Inference'.format(bin_name)
        raise ValueError(message)
    # else : # bin_name in _PHYLO_TOOL_TO_LIB
    tool_lib = _PHYLO_TOOL_TO_LIB[bin_name]
    sprt_infile_formats = tool_lib.SPRT_INFILE_FORMATS
    gen_args = tool_lib.gen_args
    get_results = tool_lib.get_results
    cleanup = tool_lib.cleanup
    infile_path = get_abspath(infile)
    # The temporary file (if any) must outlive the tool execution, and it is
    # explicitly closed (hence deleted) afterwards instead of waiting for the
    # garbage collector
    tmpfile = None
    try :
        # If the input file format is not supported by the phylogenetic
        # inference tool, convert it to a temporary supported file
        if ( infile_format.lower() not in sprt_infile_formats ) :
            tmpfile = tempfile.NamedTemporaryFile()
            AlignIO.convert(infile_path, infile_format, tmpfile.name,
                            sprt_infile_formats[0])
            infile_path = tmpfile.name
        # Create full command line list
        command = [binary] + gen_args(args, infile_path, bootstraps)
        # Run the phylogenetic inference process handling any Runtime exception
        try :
            output = subprocess.check_output(command, stderr=DEVNULL,
                                             universal_newlines=True)
        except subprocess.CalledProcessError as e :
            message = 'Running "{}" raised an exception'.format(' '.join(e.cmd))
            raise RuntimeError(message)
        else :
            phylogeny, score = get_results(command, output)
            if ( outfile ) :
                # Save the resultant phylogeny in the given outfile and format
                outfile_path = get_abspath(outfile)
                Phylo.write(phylogeny, outfile_path, outfile_format)
        finally :
            # Executed on success, on tool failure and also when collecting or
            # saving the results raises an exception
            cleanup(command)
    finally :
        if ( tmpfile is not None ) :
            tmpfile.close()
    # Return the resultant phylogeny as a Bio.Phylo.BaseTree object and its
    # log-likelihood score
    return ( phylogeny, score )
Exemplo n.º 23
0
def map_seqs(record_list,
             tree_file,
             file_format,
             subset_size,
             overlapping,
             binary='dcm'):
    """
    Generate a map of the sequences in sets, of at most 'subset_size', with the
    specified overlapping using the padded-Recursive-DCM3 decomposition (PRD)
    from DACTAL system. If 'tree_file' contains a relative path, the current
    working directory will be used to get the absolute path.

    Arguments :
        record_list  ( list )
            List of SeqRecord objects (from Biopython).
        tree_file  ( string )
            Input tree file.
        file_format  ( string )
            Tree file format.
        subset_size  ( int )
            Maximum subset size.
        overlapping  ( int )
            Number of overlapping sequences between any two resultant subsets.
        binary  ( Optional[string] )
            Name or path of the DCM binary file.

    Returns :
        dict
            Dictionary with the set identifiers ("prdsetNN", numbered from 1
            and zero-padded) as keys and the corresponding sequences as values
            in lists of SeqRecord objects.

    Raises :
        ValueError
            When 'subset_size' < (4 * 'overlapping').
        RuntimeError
            If the call to the dcm command raises an exception.
        IOError
            If the dcm tool can't generate a decomposition for the 'subset_size'
            and 'overlapping' values given.
        KeyError
            If a sequence identifier of the decomposition isn't the id of any
            record in 'record_list'.

    * The tree file format must be supported by Bio.Phylo.
    """
    if (subset_size < (4 * overlapping)):
        raise ValueError('The maximum subset size must be greater than or ' \
                         'equal to 4 times the overlapping value')
    # else : # subset_size >= (4 * overlapping)
    infile_path = get_abspath(tree_file)
    # Both the temporary tree file (if any) and the process pool are released
    # explicitly, whether the decomposition succeeds or not
    tmpfile = None
    pool = None
    try:
        # If the input file format is not supported by the PRD process, convert
        # it to a temporary supported file
        if (file_format.lower() != 'newick'):
            tmpfile = tempfile.NamedTemporaryFile()
            Phylo.convert(infile_path, file_format, tmpfile.name, 'newick')
            infile_path = tmpfile.name
        # The first decomposition process will be always executed, so there is
        # no need to overload this stage with the multiprocess generation
        set_list, further_decomp = _prd_decomposition(infile_path, subset_size,
                                                      str(overlapping), binary)
        # Parallelization of the recursive decomposition of the different
        # subtrees. All new subtrees are attached to 'further_decomp' file list
        # so we can launch at most one process per core, speeding up the whole
        # process
        pool = multiprocessing.Pool(processes=NUMCORES)
        start = 0
        while (start < len(further_decomp)):
            end = min(start + NUMCORES, len(further_decomp))
            results = [
                pool.apply_async(_prd_decomposition,
                                 args=(
                                     further_decomp[i],
                                     subset_size,
                                     str(overlapping),
                                     binary,
                                 )) for i in range(start, end)
            ]
            # Collect the results of all the processes launched. The list of
            # pending files may grow here, which is re-checked by the loop
            # condition
            for pool_result in results:
                new_sets, new_files = pool_result.get()
                set_list += new_sets
                further_decomp += new_files
            start = end
    finally:
        if (pool is not None):
            # Don't leave idle worker processes behind
            pool.close()
            pool.join()
        if (tmpfile is not None):
            tmpfile.close()
    # Remove all the temporary files created for the multiprocessing stage
    for file_path in further_decomp:
        os.remove(file_path)
    record_dict = {record.id: record for record in record_list}
    # Map all the resultant sets with an unique set id and replace the sequence
    # ids by their corresponding Bio.SeqRecord object
    set_dict = {}
    num_zeros = len(str(len(set_list)))
    for index, seq_id_list in enumerate(set_list, 1):
        set_id = 'prdset{}'.format(str(index).zfill(num_zeros))
        set_dict[set_id] = [record_dict[seq_id] for seq_id in seq_id_list]
    return (set_dict)
Exemplo n.º 24
0
def get_alignment ( binary, infile, infile_format, args = 'default',
                    outfile = None, outfile_format = 'fasta', **kwargs ) :
    """
    Align the sequences of the input file using the alignment tool and arguments
    given. The resultant alignment is returned as a
    Bio.Align.MultipleSeqAlignment object and saved in the output file (if
    provided). If 'infile' or 'outfile' contain a relative path, the current
    working directory will be used to get the absolute path. If the output file
    already exists, the old file will be overwritten without any warning.

    The alignment tool might not be included in MEvoLib, but it can still be
    used passing in '**kwargs' the keys "informats" and "incmd" with the list
    of supported infile formats and the infile argument, respectively.

    Arguments :
        binary  ( string )
            Name or path of the alignment tool.
        infile  ( string )
            Unaligned input sequence file.
        infile_format  ( string )
            Input file format.
        args  ( Optional[string] )
            Keyword or arguments to use in the call of the alignment tool,
            excluding infile and outfile arguments.  By default, 'default'
            arguments are used.
        outfile  ( Optional[string] )
            Alignment output file.
        outfile_format  ( Optional[string] )
            Output file format. By default, FASTA format.
        **kwargs  ( Optional[dict] )
            Keyworded arguments required to execute alignment tools not included
            in the current version of MEvoLib. It is necessary to pass a list
            of supported infile formats under "informats" key, and the infile
            argument (e.g. "-in") with "incmd" key.

    Returns :
        Bio.Align.MultipleSeqAlignment
            Resultant alignment.

    Raises :
        KeyError
            If the tool isn't included in MEvoLib and "informats" or "incmd"
            is missing from '**kwargs'.
        IOError
            If the input path or the input file provided doesn't exist.
        RuntimeError
            If the call to the alignment tool command raises an exception.

    * The input file format must be supported by Bio.SeqIO.
    * The output file format must be supported by Bio.AlignIO.
    * The standard output of the tool is parsed as a FASTA alignment.
    """
    # The tool is identified by its executable name only (directory part of
    # 'binary' is irrelevant for the lookup)
    bin_name = os.path.basename(binary).lower()
    if ( bin_name in _TOOL_TO_LIB ) :
        tool_lib = _TOOL_TO_LIB[bin_name]
        sprt_infile_formats = tool_lib.SPRT_INFILE_FORMATS
        infile_cmd = tool_lib.INFILE_CMD
        keywords = tool_lib.KEYWORDS
    else : # bin_name not in _TOOL_TO_LIB
        # Include the required variables through **kwargs dictionary
        sprt_infile_formats = kwargs['informats']
        infile_cmd = kwargs['incmd']
        keywords = dict()
    infile_path = get_abspath(infile)
    # The temporary file (if any) must outlive the tool execution, and it is
    # explicitly closed (hence deleted) afterwards instead of waiting for the
    # garbage collector
    tmpfile = None
    try :
        # If the input file format is not supported by the alignment tool,
        # convert it to a temporary supported file
        if ( infile_format.lower() not in sprt_infile_formats ) :
            tmpfile = tempfile.NamedTemporaryFile()
            SeqIO.convert(infile_path, infile_format, tmpfile.name,
                          sprt_infile_formats[0])
            infile_path = tmpfile.name
        # Get argument list from keyword dictionary or 'args' string
        if ( args in keywords ) :
            arg_list = keywords[args]
        else : # args not in keywords
            arg_list = args.split(' ')
        # Create full command line list. Empty elements (e.g. produced by
        # consecutive blanks in 'args' or an empty infile argument) are
        # removed here
        command = [x  for x in [binary] + arg_list + [infile_cmd, infile_path]
                          if x]
        # Run the alignment process handling any Runtime exception
        try :
            output = subprocess.check_output(command, stderr=DEVNULL,
                                             universal_newlines=True)
        except subprocess.CalledProcessError as e :
            message = 'Running "{}" raised an exception'.format(' '.join(e.cmd))
            raise RuntimeError(message)
    finally :
        if ( tmpfile is not None ) :
            tmpfile.close()
    alignment = AlignIO.read(StringIO(output), 'fasta')
    if ( outfile ) :
        # Save the resultant alignment in the given outfile and format
        outfile_path = get_abspath(outfile)
        AlignIO.write(alignment, outfile_path, outfile_format)
    # Return the resultant alignment as a Bio.Align.MultipleSeqAlignment
    # object
    return ( alignment )
Exemplo n.º 25
0
def get_consensus_tree ( binary, infile, infile_format, args = 'default',
                         outfile = None, outfile_format = 'newick' ) :
    """
    Calculate the consensus tree of the input trees file with the given
    arguments. The resultant consensus tree is returned as a Bio.Phylo.BaseTree
    object and saved in the output file (if provided). If 'infile' or 'outfile'
    contain a relative path, the current working directory will be used to get
    the absolute path. If the output file already exists, the old file will be
    overwritten without any warning.

    Arguments :
        binary  ( string )
            Name or path of the consensus tool.
        infile  ( string )
            Input phylogenetic trees file.
        infile_format  ( string )
            Input file format.
        args  ( Optional[string] )
            Keyword or arguments to use in the call of the consensus tool,
            excluding infile and outfile arguments. By default, 'default'
            arguments are used.
            * For Consense, the second character will be used as separator of
            the different arguments.
        outfile  ( Optional[string] )
            Consensus tree output file.
        outfile_format  ( Optional[string] )
            Output file format. By default, NEWICK format.
            NOTE(review): this value is never referenced by this function;
            'outfile' is only forwarded to the tool's argument generator, so
            the output is presumably written by the tool itself. Confirm.

    Returns :
        Bio.Phylo.BaseTree
            Resultant consensus tree.

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.
        IOError
            If the input path or the input file provided doesn't exist.
        RuntimeError
            If the call to the consensus tool command raises an exception.
        IOError
            If the consensus tool didn't generate a consensus tree (indicated by
            user's options/arguments).

    * The input file format must be supported by Bio.Phylo.
    * The output file format must be supported by Bio.Phylo.
    * The tool's cleanup routine is always executed once the command line has
    been generated, even if collecting the results fails.
    """
    # Get the variables associated with the given consensus tool (identified by
    # its executable name only)
    bin_name = os.path.basename(binary).lower()
    if ( bin_name not in _CONS_TOOL_TO_LIB ) :
        message = 'The consensus tool "{}" isn\'t included in ' \
                  'MEvoLib.PhyloAssemble'.format(bin_name)
        raise ValueError(message)
    # else : # bin_name in _CONS_TOOL_TO_LIB
    tool_lib = _CONS_TOOL_TO_LIB[bin_name]
    sprt_infile_formats = tool_lib.SPRT_INFILE_FORMATS
    gen_args = tool_lib.gen_args
    gen_stdin_content = tool_lib.gen_stdin_content
    get_results = tool_lib.get_results
    cleanup = tool_lib.cleanup
    infile_path = get_abspath(infile)
    # The temporary file (if any) must outlive the tool execution, and it is
    # explicitly closed (hence deleted) afterwards instead of waiting for the
    # garbage collector
    tmpfile = None
    try :
        # If the input file format is not supported by the consensus tool,
        # convert it to a temporary supported file
        if ( infile_format.lower() not in sprt_infile_formats ) :
            tmpfile = tempfile.NamedTemporaryFile()
            Phylo.convert(infile_path, infile_format, tmpfile.name,
                          sprt_infile_formats[0])
            infile_path = tmpfile.name
        # Create full command line list
        command = [binary] + gen_args(args, infile_path, outfile)
        try :
            # Generate the standard input file content
            stdin_content = gen_stdin_content(args)
            # Create the input file with the given options
            with tempfile.NamedTemporaryFile(mode='w+') as stdin_file :
                stdin_file.write(stdin_content)
                # Rewind so the tool reads the options from the beginning
                stdin_file.seek(0)
                # Run the consensus process handling any Runtime exception
                try :
                    subprocess.check_call(command, stdin=stdin_file,
                                          stdout=DEVNULL, stderr=DEVNULL,
                                          universal_newlines=True)
                except subprocess.CalledProcessError as e :
                    message = 'Running "{}" raised an exception' \
                              .format(' '.join(e.cmd))
                    raise RuntimeError(message)
                consensus_tree = get_results(command)
        finally :
            # Executed on success, on tool failure and also when collecting the
            # results raises an exception
            cleanup(command)
    finally :
        if ( tmpfile is not None ) :
            tmpfile.close()
    # Return the resultant consensus tree as a Bio.Phylo.BaseTree object
    return ( consensus_tree )
Exemplo n.º 26
0
def get_consensus_tree(binary,
                       infile,
                       infile_format,
                       args='default',
                       outfile=None,
                       outfile_format='newick'):
    """
    Calculate the consensus tree of the input trees file with the given
    arguments. The resultant consensus tree is returned as a Bio.Phylo.BaseTree
    object and saved in the output file (if provided). If 'infile' or 'outfile'
    contain a relative path, the current working directory will be used to get
    the absolute path. If the output file already exists, the old file will be
    overwritten without any warning.

    Arguments :
        binary  ( string )
            Name or path of the consensus tool.
        infile  ( string )
            Input phylogenetic trees file.
        infile_format  ( string )
            Input file format.
        args  ( Optional[string] )
            Keyword or arguments to use in the call of the consensus tool,
            excluding infile and outfile arguments. By default, 'default'
            arguments are used.
            * For Consense, the second character will be used as separator of
            the different arguments.
        outfile  ( Optional[string] )
            Consensus tree output file.
        outfile_format  ( Optional[string] )
            Output file format. By default, NEWICK format.
            NOTE(review): this value is never referenced by this function;
            'outfile' is only forwarded to the tool's argument generator, so
            the output is presumably written by the tool itself. Confirm.

    Returns :
        Bio.Phylo.BaseTree
            Resultant consensus tree.

    Raises :
        ValueError
            If the tool introduced isn't included in MEvoLib.
        IOError
            If the input path or the input file provided doesn't exist.
        RuntimeError
            If the call to the consensus tool command raises an exception.
        IOError
            If the consensus tool didn't generate a consensus tree (indicated by
            user's options/arguments).

    * The input file format must be supported by Bio.Phylo.
    * The output file format must be supported by Bio.Phylo.
    * The tool's cleanup routine is always executed once the command line has
    been generated, even if collecting the results fails.
    """
    # Get the variables associated with the given consensus tool (identified by
    # its executable name only)
    bin_name = os.path.basename(binary).lower()
    if (bin_name not in _CONS_TOOL_TO_LIB):
        message = 'The consensus tool "{}" isn\'t included in ' \
                  'MEvoLib.PhyloAssemble'.format(bin_name)
        raise ValueError(message)
    # else : # bin_name in _CONS_TOOL_TO_LIB
    tool_lib = _CONS_TOOL_TO_LIB[bin_name]
    sprt_infile_formats = tool_lib.SPRT_INFILE_FORMATS
    gen_args = tool_lib.gen_args
    gen_stdin_content = tool_lib.gen_stdin_content
    get_results = tool_lib.get_results
    cleanup = tool_lib.cleanup
    infile_path = get_abspath(infile)
    # The temporary file (if any) must outlive the tool execution, and it is
    # explicitly closed (hence deleted) afterwards instead of waiting for the
    # garbage collector
    tmpfile = None
    try:
        # If the input file format is not supported by the consensus tool,
        # convert it to a temporary supported file
        if (infile_format.lower() not in sprt_infile_formats):
            tmpfile = tempfile.NamedTemporaryFile()
            Phylo.convert(infile_path, infile_format, tmpfile.name,
                          sprt_infile_formats[0])
            infile_path = tmpfile.name
        # Create full command line list
        command = [binary] + gen_args(args, infile_path, outfile)
        try:
            # Generate the standard input file content
            stdin_content = gen_stdin_content(args)
            # Create the input file with the given options
            with tempfile.NamedTemporaryFile(mode='w+') as stdin_file:
                stdin_file.write(stdin_content)
                # Rewind so the tool reads the options from the beginning
                stdin_file.seek(0)
                # Run the consensus process handling any Runtime exception
                try:
                    subprocess.check_call(command,
                                          stdin=stdin_file,
                                          stdout=DEVNULL,
                                          stderr=DEVNULL,
                                          universal_newlines=True)
                except subprocess.CalledProcessError as e:
                    message = 'Running "{}" raised an exception'.format(
                        ' '.join(e.cmd))
                    raise RuntimeError(message)
                consensus_tree = get_results(command)
        finally:
            # Executed on success, on tool failure and also when collecting the
            # results raises an exception
            cleanup(command)
    finally:
        if (tmpfile is not None):
            tmpfile.close()
    # Return the resultant consensus tree as a Bio.Phylo.BaseTree object
    return (consensus_tree)