Exemplo n.º 1
0
def post_alignment(fam_nums, all_genomes, prefix, outdir, dname, prot_ali,
                   quiet):
    """
    After the alignment of all proteins by family:

    - concatenate all alignment files
    - group the concatenated alignment by genome

    Parameters
    ----------
    fam_nums : []
        list of family numbers
    all_genomes : []
        list of all genomes in dataset
    prefix : str
        path to ``aldir/<name of dataset>`` (used to get extraction, alignment and btr files easily)
    outdir : str
        path to output directory, containing Aldir and Listdir, and that will also contain Treedir
    dname : str
        name of dataset (used to name concat and grouped files, as well as tree folder)
    prot_ali : bool
        true: also give concatenated alignment in aa
    quiet : bool
        True if nothing must be sent to sdtout/stderr, False otherwise
    """
    # Mandatory nucleic step: concatenate all family alignments,
    # then group the concatenation by genome.
    all_alns_nucl, status_nucl = concat_alignments(fam_nums, prefix, "nucl",
                                                   quiet)
    treedir = os.path.join(outdir, "Phylo-" + dname)
    os.makedirs(treedir, exist_ok=True)
    outfile_nucl = os.path.join(treedir, dname + ".nucl.grp.aln")
    grouped_nucl = launch_group_by_genome(all_genomes, all_alns_nucl,
                                          status_nucl, outfile_nucl, dname,
                                          "nucleic", quiet)
    if not grouped_nucl:
        # Grouping failed: clean up partial outputs and abort.
        utils.remove(all_alns_nucl)
        utils.remove(outfile_nucl)
        logger.error(
            "An error occurred. We could not group DNA alignments by genome.")
        sys.exit(1)
    # Optional protein step: same pipeline on the aa alignments. A failure
    # here only logs an error (the nucleic result above already succeeded).
    if prot_ali:
        all_alns_aa, status_aa = concat_alignments(fam_nums, prefix, "aa",
                                                   quiet)
        outfile_aa = os.path.join(treedir, dname + ".aa.grp.aln")
        grouped_aa = launch_group_by_genome(all_genomes, all_alns_aa,
                                            status_aa, outfile_aa, dname,
                                            "protein", quiet)
        if not grouped_aa:
            utils.remove(all_alns_aa)
            utils.remove(outfile_aa)
            logger.error(
                "An error occurred. We could not group protein alignments by genome."
            )
    return outfile_nucl
Exemplo n.º 2
0
def align_all_families(prefix, all_fams, ngenomes, dname, quiet, threads):
    """
    For each family:

    - align all its proteins with mafft
    - back-translate to nucleotides
    - add missing genomes

    Parameters
    ----------
    prefix :  str
        path to ``aldir/<name of dataset>`` (used to get extraction, alignment and btr files
        easily)
    all_fams : []
        list of all family numbers
    ngenomes : int
        total number of genomes in dataset
    dname : str
        name of dataset (used to name concat and grouped files, as well as tree folder)
    quiet : bool
        True if nothing must be written in stdout/stderr, False otherwise
    threads : int
        max number of threads that can be used by mafft

    Returns
    -------
    bool
        True if everything went well, False if there was a problem in at least 1 family.
    """
    main_logger.info(("Starting alignment of all families: protein alignment, "
                      "back-translation to nucleotides, and add missing genomes in the family"))
    nbfam = len(all_fams)
    bar = None
    if not quiet:
        # Create progressbar
        widgets = ['Alignment: ', progressbar.Bar(marker='█', left='', right='', fill=' '),
                   ' ', progressbar.Counter(), "/{}".format(nbfam), ' (',
                   progressbar.Percentage(), ') - ', progressbar.Timer(), ' - '
                   ]
        bar = progressbar.ProgressBar(widgets=widgets, max_value=nbfam,
                                      term_width=79).start()
    final = []
    if threads == 1:
        # Sequential processing: no pool, no log queue needed
        for done, num_fam in enumerate(all_fams, start=1):
            final.append(handle_family_1thread((prefix, num_fam, ngenomes)))
            # BUGFIX: bar is None in quiet mode; the previous code called
            # bar.update() unconditionally and crashed with AttributeError
            if bar:
                bar.update(done)
        if bar:
            bar.finish()
    else:
        pool = multiprocessing.Pool(threads)

        # Create a Queue to put logs from processes, and handle them after from a single thread
        m = multiprocessing.Manager()
        q = m.Queue()
        # arguments: one tuple per family, each worker logs through q
        arguments = [(prefix, num_fam, ngenomes, q) for num_fam in all_fams]
        try:
            final = pool.map_async(handle_family, arguments, chunksize=1)
            pool.close()
            # Listen for logs sent by worker processes
            lp = threading.Thread(target=utils.logger_thread, args=(q,))
            lp.start()
            if not quiet:
                # Poll the async result to refresh the progressbar
                while not final.ready():
                    remaining = final._number_left
                    bar.update(nbfam - remaining)
                bar.finish()
            pool.join()
            q.put(None)  # sentinel: tells logger_thread to stop
            lp.join()
            final = final.get()
        # If an error occurs (or user kills with keyboard), terminate pool and exit
        except Exception as excp:  # pragma: no cover
            pool.terminate()
            main_logger.error(excp)
            sys.exit(1)
    # We re-aligned (or added missing genomes to) at least one family
    # -> remove concatenated files and groupby files (if they exist): they are outdated
    if set(final) != {"OK"}:
        aldir = os.path.split(prefix)[0]
        concat_nucl = os.path.join(aldir, f"{dname}-complete.nucl.cat.aln")
        concat_aa = os.path.join(aldir, f"{dname}-complete.aa.cat.aln")
        outdir = os.path.split(aldir)[0]
        treedir = os.path.join(outdir, "Phylo-" + dname)
        grpfile_nucl = os.path.join(treedir, dname + ".nucl.grp.aln")
        grpfile_aa = os.path.join(treedir, dname + ".aa.grp.aln")
        utils.remove(concat_nucl)
        utils.remove(concat_aa)
        utils.remove(grpfile_nucl)
        utils.remove(grpfile_aa)
    return False not in final
Exemplo n.º 3
0
def family_alignment(prt_file, gen_file, miss_file, mafft_file, btr_file,
                     num_fam, ngenomes, logger):
    """
    From a given family, align all its proteins with mafft, back-translate
    to nucleotides, and add missing genomes in this family.

    Existing intermediate files are reused when their sequence counts are
    consistent; inconsistent files are removed so they get regenerated.

    Parameters
    ----------
    prt_file : str
        path to file containing proteins extracted
    gen_file : str
        path to file containing genes extracted
    miss_file : str
        path to file containing list of genomes missing
    mafft_file : str
        path to file which will contain the protein alignment
    btr_file : str
        path to file which will contain the nucleotide alignment back-translated from protein
        alignment
    num_fam : int
        current family number
    ngenomes : int
        total number of genomes in dataset
    logger : logging.Logger
        logger with queueHandler to give logs to main logger

    Returns
    -------
    bool or str
        - False if problem with extractions or with alignment or with backtranslation
        - 'nb_seqs' = number of sequences aligned if everything went well (extractions and
          alignment ok, btr created without problem)
        - "OK" if extractions and alignments went well, and btr already exists and is ok
    """
    # Check number of proteins extracted:
    # nb_prt (or nb_gen, which should be the same) + nb_miss must equal nb_genomes.
    # Returns the number of genomes extracted (i.e. excludes genomes missing from this family).
    nbfprt = check_extractions(num_fam, miss_file, prt_file, gen_file, ngenomes, logger)
    nbfal = None
    # Problem with extractions (0 proteins extracted): remove mafft and btr files
    # if they exist, so that they will be regenerated on a later run.
    if not nbfprt:
        utils.remove(mafft_file)
        utils.remove(btr_file)
        return False
    # If mafft file already exists, check that the number of proteins aligned
    # corresponds to the number of proteins extracted. If not, remove mafft and btr files.
    if os.path.isfile(mafft_file):
        # The alignment can legitimately contain either nbfprt sequences (only
        # extracted proteins) or ngenomes sequences (missing genomes already added as '-').
        nbfal1 = check_nb_seqs(mafft_file, nbfprt, logger, "")
        nbfal2 = check_nb_seqs(mafft_file, ngenomes, logger, "")
        # nbfal1 truthy: missing genomes have not been added yet. Save count for later.
        if nbfal1:
            nbfal = nbfal1
        # nbfal2 truthy: missing genomes already added. Save count for later.
        elif nbfal2:
            nbfal = nbfal2
        # Neither count matches: alignment is inconsistent with the extraction -> redo it.
        else:
            message = (f"fam {num_fam}: Will redo alignment, because found a different number of proteins "
                       f"extracted in {prt_file} ({nbfprt}) and proteins aligned in "
                       f"existing {mafft_file}")
            logger.error(message)
            os.remove(mafft_file)
            utils.remove(btr_file)
    # Mafft file does not exist (removed just above because inconsistent, or never
    # generated): remove btr (it will be regenerated), and align with mafft.
    if not os.path.isfile(mafft_file):
        utils.remove(btr_file)  # remove if exists...
        nbfal = mafft_align(num_fam, prt_file, mafft_file, nbfprt, logger)
    # Problem with alignment: give up on this family.
    if not nbfal:
        return False
    # Btr file already exists (kept because extractions and mafft file are ok).
    # Check its sequence count; if consistent, return "OK" (btr done, next step
    # will check it and add missing genomes). Otherwise remove it for regeneration.
    if os.path.isfile(btr_file):
        # Error message used by check_nb_seqs if the count below does not match.
        message = (f"fam {num_fam}: Will redo back-translation, because found a different number of "
                   f"proteins aligned in {mafft_file} ({nbfal}) and genes back-translated in "
                   f"existing {btr_file}")
        # The btr file may contain either nbfal entries (not yet completed with
        # missing genomes) or ngenomes entries (already completed). Any other
        # count means it is inconsistent and must be regenerated.
        res = check_nb_seqs(btr_file, [nbfal, ngenomes], logger, message)
        if not res:
            utils.remove(btr_file)
        else:
            return "OK"
    # Btr file does not exist (removed above, or never generated yet):
    # do the back-translation, and return:
    # - number of sequences back-translated if it went well,
    # - False otherwise
    return back_translate(num_fam, mafft_file, gen_file, btr_file, nbfal, logger)
Exemplo n.º 4
0
def launch_group_by_genome(all_genomes, all_alns, status, outfile, dname,
                           type_ali, quiet):
    """
    Run group_by_genome in a single-worker pool, showing elapsed time to the
    user while it runs.

    Parameters
    ----------
    all_genomes : []
        list of all genomes in the dataset
    all_alns : str
        path to file containing all alignments concatenated
    status : str
        "OK" if concatenation file already existed before running, "Done" if just did concatenation
    outfile : str
        file containing all families align by genome
    dname : str
        name of dataset
    type_ali : str
        nucleic or protein
    quiet : bool
        True if nothing must be sent to sdtout/stderr, False otherwise

    Returns
    -------
    bool
        - True if everything went well or was already done
        - False if error occurred in at least one step
    """
    # "Done" means the concatenation was just (re)done, so any previously
    # grouped file is outdated and must be removed.
    if status == "Done" and os.path.isfile(outfile):
        utils.remove(outfile)
    # Concatenation already existed (status "OK") and the grouped file also
    # exists: nothing to do, just warn the user.
    if os.path.isfile(outfile):
        logger.info(f"{type_ali} alignments already grouped by genome")
        logger.warning(
            (f"{type_ali} alignments already grouped by genome in {outfile}. "
             "Program will end. "))
        return True
    logger.info(f"Grouping {type_ali} alignments per genome")
    bar = None
    if not quiet:
        # Bouncing bar: total duration cannot be estimated, only show elapsed time
        spinner = progressbar.RotatingMarker(markers="◐◓◑◒")
        bar = progressbar.ProgressBar(
            widgets=[progressbar.BouncingBar(marker=spinner), "  -  ",
                     progressbar.Timer()],
            max_value=20,
            term_width=50)
    worker_pool = multiprocessing.Pool(1)
    async_res = worker_pool.map_async(group_by_genome,
                                      [[all_genomes, all_alns, outfile]],
                                      chunksize=1)
    worker_pool.close()
    if not quiet:
        # Refresh the bar until the worker is done
        while not async_res.ready():
            bar.update()
        bar.finish()
    worker_pool.join()
    return False not in async_res.get()
Exemplo n.º 5
0
def check_existing_extract(all_fams, aldir, dname):
    """
    Check, for each family, whether its prt and gen extraction files already exist.

    Families with both files present need no re-extraction. When one (or both)
    files are missing, both are scheduled for (re-)extraction and the family's
    alignment files are deleted so they will be rebuilt.

    Parameters
    ----------
    all_fams : list
        list of all family numbers
    aldir : str
        path to directory where extraction files must be saved
    dname : str
        name of the dataset

    Returns
    -------
    []
        list of files that must be generated (prt and gen files)
    """
    extract_fams = []
    for num_fam in all_fams:
        genfile = os.path.join(aldir, "{}-current.{}.gen".format(dname, num_fam))
        prtfile = os.path.join(aldir, "{}-current.{}.prt".format(dname, num_fam))
        if os.path.isfile(genfile) and os.path.isfile(prtfile):
            # Both extraction files present: nothing to redo for this family
            continue
        # At least one file missing: re-extract all proteins and all genes
        utils.remove(genfile)
        utils.remove(prtfile)
        # Extractions will change, so this family's alignments must be redone too
        utils.remove(os.path.join(aldir, "{}-mafft-align.{}.aln".format(dname, num_fam)))
        utils.remove(os.path.join(aldir, "{}-mafft-prt2nuc.{}.aln".format(dname, num_fam)))
        extract_fams.extend([genfile, prtfile])
    # At least one family re-extracted: the final files (concatenation and
    # group-by-genome) are outdated and must be rebuilt
    if extract_fams:
        outdir = os.path.split(aldir)[0]
        treedir = os.path.join(outdir, "Phylo-" + dname)
        utils.remove(os.path.join(aldir, "{}-complete.cat.aln".format(dname)))
        utils.remove(os.path.join(treedir, dname + ".grp.aln"))
    return extract_fams
Exemplo n.º 6
0
def do_pangenome(outdir,
                 prt_bank,
                 mmseqdb,
                 mmseqclust,
                 tmpdir,
                 logmmseq,
                 min_id,
                 clust_mode,
                 just_done,
                 threads,
                 panfile,
                 quiet=False):
    """
    Use mmseqs to cluster proteins

    Parameters
    ----------
    outdir : str
        directory where output files are saved
    prt_bank : str
        name of the file containing all proteins to cluster, without path
    mmseqdb : str
        path to base filename of output of mmseqs createdb
    mmseqclust : str
        path to base filename of output of mmseqs cluster
    tmpdir : str
        path to tmp directory
    logmmseq : str
        path to file for mmseqs logs
    min_id : float
        min percentage of identity to be considered in the same family (between 0 and 1)
    clust_mode : [0, 1, 2]
        0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit'
    just_done : bool
        True if mmseqs db was just (re)created -> remove mmseqs clust.
        False if mmseqs db was kept from previous run -> no need to rerun mmseqs clust if already exists
    threads : int
        max number of threads to use
    panfile : str
        if a pangenome file is specified. Otherwise, default pangenome name will be used
    quiet : bool
        true if nothing must be print on stdout/stderr, false otherwise (show progress bar)

    Returns
    -------
    (families, outfile) : tuple

        - families : {fam_num: [all members]}
        - outfile : pangenome filename
    """
    mmseqstsv = mmseqclust + ".tsv"
    # If we just (re)made the database, all downstream files are outdated and
    # must be removed so they get regenerated:
    # - mmseqclust (created by run_mmseqs_clust)
    # - mmseqstsv (created by mmseqs_to_pangenome)
    # - pangenome file
    # BUGFIX: parenthesize the 'or' chain. The former condition parsed as
    # '(just_done and A) or B or C', which deleted existing clustering results
    # even when the database was kept from a previous run (just_done False),
    # forcing a useless re-clustering.
    if just_done and (os.path.isfile(mmseqclust) or os.path.isfile(mmseqstsv)
                      or os.path.isfile(panfile)):
        logger.details("Removing existing clustering and/or pangenome files.")
        utils.remove(mmseqclust)
        utils.remove(mmseqstsv)
        utils.remove(panfile)
    logger.debug(mmseqclust)
    if os.path.isfile(mmseqclust):
        # Clustering kept from a previous run: skip it, only convert to pangenome
        logger.warning((
            f"mmseqs clustering {mmseqclust} already exists. The program will now convert "
            "it to a pangenome file."))
    else:
        logger.info("Clustering proteins...")
        try:
            stop_bar = False
            if quiet:
                widgets = []
            # If not quiet, start a progress bar while clustering proteins. We cannot guess
            # how much time it will take, so we start an "infinite" bar, and send it a signal
            # when it has to stop. If quiet, we start a thread that will immediately stop
            else:
                widgets = [
                    progressbar.BouncingBar(marker=progressbar.RotatingMarker(
                        markers="◐◓◑◒")), "  -  ",
                    progressbar.Timer()
                ]
            x = threading.Thread(target=utils.thread_progressbar,
                                 args=(
                                     widgets,
                                     lambda: stop_bar,
                                 ))
            x.start()
            args = (mmseqdb, mmseqclust, tmpdir, logmmseq, min_id, threads,
                    clust_mode)
            run_mmseqs_clust(args)
        # Catch everything, including KeyboardInterrupt, so the bar thread is
        # always stopped and joined before exiting
        except BaseException:  # pragma: no cover
            stop_bar = True
            x.join()
            sys.exit(1)
        # Clustering done, stop bar and join (if quiet, it was already finished, so we just join it)
        stop_bar = True
        x.join()
    # Convert clustering output to a tsv file (one line per comparison done),
    # then to the pangenome file -> returns families; panfile was written
    families = mmseqs_to_pangenome(mmseqdb, mmseqclust, logmmseq, panfile)
    return families, panfile