def post_alignment(fam_nums, all_genomes, prefix, outdir, dname, prot_ali, quiet):
    """
    Finalize the per-family alignments:

    - concatenate all alignment files
    - group the concatenated alignment by genome

    Parameters
    ----------
    fam_nums : []
        list of family numbers
    all_genomes : []
        list of all genomes in dataset
    prefix : str
        path to ``aldir/<name of dataset>`` (used to get extraction, alignment
        and btr files easily)
    outdir : str
        path to output directory, containing Aldir and Listdir, and that will
        also contain Treedir
    dname : str
        name of dataset (used to name concat and grouped files, as well as
        tree folder)
    prot_ali : bool
        true: also give concatenated alignment in aa
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    str
        path to the file with nucleotide alignments grouped by genome
    """
    all_alns_nucl, status_nucl = concat_alignments(fam_nums, prefix, "nucl", quiet)
    treedir = os.path.join(outdir, "Phylo-" + dname)
    os.makedirs(treedir, exist_ok=True)
    outfile_nucl = os.path.join(treedir, dname + ".nucl.grp.aln")
    res_nucl = launch_group_by_genome(all_genomes, all_alns_nucl, status_nucl,
                                      outfile_nucl, dname, "nucleic", quiet)
    # Grouping the DNA alignments is mandatory: on failure, clean up the
    # partial files and abort the whole run.
    if not res_nucl:
        utils.remove(all_alns_nucl)
        utils.remove(outfile_nucl)
        logger.error(
            "An error occurred. We could not group DNA alignments by genome.")
        sys.exit(1)
    # The amino-acid version is optional: on failure, clean up and log the
    # error, but do not abort (the nucleotide result is already available).
    if prot_ali:
        all_alns_aa, status_aa = concat_alignments(fam_nums, prefix, "aa", quiet)
        outfile_aa = os.path.join(treedir, dname + ".aa.grp.aln")
        res_aa = launch_group_by_genome(all_genomes, all_alns_aa, status_aa,
                                        outfile_aa, dname, "protein", quiet)
        if not res_aa:
            utils.remove(all_alns_aa)
            utils.remove(outfile_aa)
            logger.error(
                "An error occurred. We could not group protein alignments by genome."
            )
    return outfile_nucl
def align_all_families(prefix, all_fams, ngenomes, dname, quiet, threads):
    """
    For each family:

    - align all its proteins with mafft
    - back-translate to nucleotides
    - add missing genomes

    Parameters
    ----------
    prefix : str
        path to ``aldir/<name of dataset>`` (used to get extraction, alignment
        and btr files easily)
    all_fams : []
        list of all family numbers
    ngenomes : int
        total number of genomes in dataset
    dname : str
        name of dataset (used to name concat and grouped files, as well as
        tree folder)
    quiet : bool
        True if nothing must be written in stdout/stderr, False otherwise
    threads : int
        max number of threads that can be used by mafft

    Returns
    -------
    bool
        True if everything went well, False if there was a problem in
        at least 1 family.
    """
    main_logger.info(("Starting alignment of all families: protein alignment, "
                      "back-translation to nucleotides, and add missing genomes in the family"))
    nbfam = len(all_fams)
    bar = None
    if not quiet:
        # Create progressbar
        widgets = ['Alignment: ',
                   progressbar.Bar(marker='█', left='', right='', fill=' '),
                   ' ', progressbar.Counter(), "/{}".format(nbfam), ' (',
                   progressbar.Percentage(), ') - ', progressbar.Timer(), ' - '
                   ]
        bar = progressbar.ProgressBar(widgets=widgets, max_value=nbfam,
                                      term_width=79).start()
    final = []
    if threads == 1:
        update_bar = 1
        for num_fam in all_fams:
            f = handle_family_1thread((prefix, num_fam, ngenomes))
            final.append(f)
            # BUGFIX: bar is None in quiet mode; the original called
            # bar.update() unconditionally, crashing with AttributeError
            # when quiet=True and threads=1.
            if bar:
                bar.update(update_bar)
            update_bar += 1
        if bar:
            bar.finish()
    else:
        pool = multiprocessing.Pool(threads)
        # Create a Queue to put logs from processes, and handle them after
        # from a single thread
        m = multiprocessing.Manager()
        q = m.Queue()
        # arguments: (prefix, num_fam, ngenomes, q) for each family
        arguments = [(prefix, num_fam, ngenomes, q) for num_fam in all_fams]
        try:
            final = pool.map_async(handle_family, arguments, chunksize=1)
            pool.close()
            # Listen for logs in processes
            lp = threading.Thread(target=utils.logger_thread, args=(q,))
            lp.start()
            if not quiet:
                while True:
                    if final.ready():
                        break
                    # NOTE: _number_left is a private multiprocessing attribute,
                    # used only to feed the progress bar.
                    remaining = final._number_left
                    bar.update(nbfam - remaining)
                bar.finish()
            pool.join()
            q.put(None)
            lp.join()
            final = final.get()
        # If an error occurs (or user kills with keyboard), terminate pool and exit
        except Exception as excp:  # pragma: no cover
            pool.terminate()
            main_logger.error(excp)
            sys.exit(1)
    # We re-aligned (or added missing genomes) at least one family
    # -> remove concatenated files and groupby files (if they exist),
    # as they are now stale.
    if set(final) != {"OK"}:
        aldir = os.path.split(prefix)[0]
        concat_nucl = os.path.join(aldir, f"{dname}-complete.nucl.cat.aln")
        concat_aa = os.path.join(aldir, f"{dname}-complete.aa.cat.aln")
        outdir = os.path.split(aldir)[0]
        treedir = os.path.join(outdir, "Phylo-" + dname)
        grpfile_nucl = os.path.join(treedir, dname + ".nucl.grp.aln")
        grpfile_aa = os.path.join(treedir, dname + ".aa.grp.aln")
        utils.remove(concat_nucl)
        utils.remove(concat_aa)
        utils.remove(grpfile_nucl)
        utils.remove(grpfile_aa)
    return False not in final
def family_alignment(prt_file, gen_file, miss_file, mafft_file, btr_file,
                     num_fam, ngenomes, logger):
    """
    From a given family, align all its proteins with mafft, back-translate
    to nucleotides, and add missing genomes in this family.

    Parameters
    ----------
    prt_file : str
        path to file containing proteins extracted
    gen_file : str
        path to file containing genes extracted
    miss_file : str
        path to file containing list of genomes missing
    mafft_file : str
        path to file which will contain the protein alignment
    btr_file : str
        path to file which will contain the nucleotide alignment
        back-translated from protein alignment
    num_fam : int
        current family number
    ngenomes : int
        total number of genomes in dataset
    logger : logging.Logger
        logger with queueHandler to give logs to main logger

    Returns
    -------
    bool or str
        - False if problem with extractions or with alignment or with
          backtranslation
        - 'nb_seqs' = number of sequences aligned if everything went well
          (extractions and alignment ok, btr created without problem)
        - "OK" if extractions and alignments went well, and btr already
          exists and is ok
    """
    # Check number of proteins extracted
    # = check that nb_prt (or nb_gen, which should be the same) + nb_miss = nb_genomes
    # returns number of genomes extracted (so, excludes missing genomes for the family)
    nbfprt = check_extractions(num_fam, miss_file, prt_file, gen_file, ngenomes, logger)
    nbfal = None
    # If problem with extractions (0 proteins extracted), remove mafft and btr
    # files if they exist, so that they will be regenerated
    if not nbfprt:
        utils.remove(mafft_file)
        utils.remove(btr_file)
        return False
    # If mafft file already exists, check the number of proteins aligned
    # corresponds to number of proteins extracted. If not, remove mafft and btr files.
    if os.path.isfile(mafft_file):
        # There can be nbfprt (number of proteins extracted)
        # or nb_genomes (proteins extracted + missing added with '-')
        nbfal1 = check_nb_seqs(mafft_file, nbfprt, logger, "")
        nbfal2 = check_nb_seqs(mafft_file, ngenomes, logger, "")
        # if nbfal1: missing genomes have not been added yet. Save this value for later
        if nbfal1:
            nbfal = nbfal1
        # if nbfal2: missing genomes already there, save for later
        elif nbfal2:
            nbfal = nbfal2
        # If not any of those 2 numbers: error -> the existing alignment is
        # inconsistent with the extraction; remove it to force re-alignment.
        else:
            message = (f"fam {num_fam}: Will redo alignment, because found a different number of proteins "
                       f"extracted in {prt_file} ({nbfprt}) and proteins aligned in "
                       f"existing {mafft_file}")
            logger.error(message)
            os.remove(mafft_file)
            utils.remove(btr_file)
    # If mafft file does not exist (removed because problem in its alignment,
    # or just not generated yet), remove btr (will be regenerated), and do
    # alignment with mafft
    if not os.path.isfile(mafft_file):
        utils.remove(btr_file)  # remove if exists...
        nbfal = mafft_align(num_fam, prt_file, mafft_file, nbfprt, logger)
    # If problem with alignment, return False
    if not nbfal:
        return False
    # If btr file already exists, means that it was already done before, and
    # not removed because extractions and mafft files are ok. So, return "OK",
    # saying that btr file is done; next step will be to check it, add missing
    # genomes etc.
    if os.path.isfile(btr_file):
        # message is only emitted by check_nb_seqs when the count check fails
        message = (f"fam {num_fam}: Will redo back-translation, because found a different number of "
                   f"proteins aligned in {mafft_file} ({nbfal}) and genes back-translated in "
                   f"existing {btr_file}")
        # btr file contains either nbfal entries (number of proteins extracted)
        # if it was not completed with missing genomes, or ngenomes if it was
        # completed. If it is not the case, remove it (will be regenerated)
        res = check_nb_seqs(btr_file, [nbfal, ngenomes], logger, message)
        if not res:
            utils.remove(btr_file)
        else:
            return "OK"
    # If btr file does not exist (removed because problem with mafft generated
    # before, or just not generated yet), do back-translation, and return:
    # - number of sequences back-translated if it went well,
    # - False otherwise
    return back_translate(num_fam, mafft_file, gen_file, btr_file, nbfal, logger)
def launch_group_by_genome(all_genomes, all_alns, status, outfile, dname, type_ali, quiet):
    """
    Run group_by_genome in a (single-worker) pool, keeping the user informed
    of elapsed time while it runs.

    Parameters
    ----------
    all_genomes : []
        list of all genomes in the dataset
    all_alns : str
        path to file containing all alignments concatenated
    status : str
        "OK" if concatenation file already existed before running,
        "Done" if just did concatenation
    outfile : str
        file containing all families align by genome
    dname : str
        name of dataset
    type_ali : str
        nucleic or protein
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    bool
        - True if everything went well or was already done
        - False if error occurred in at least one step
    """
    # Status == "Done" means the concatenation was just redone, so any
    # previously grouped-by-genome file is stale: remove it.
    if status == "Done" and os.path.isfile(outfile):
        utils.remove(outfile)
    # If the grouped file still exists here, the concat file was kept from a
    # previous run ("OK") and grouping is already done: warn the user and stop.
    if os.path.isfile(outfile):
        logger.info(f"{type_ali} alignments already grouped by genome")
        logger.warning(
            (f"{type_ali} alignments already grouped by genome in {outfile}. "
             "Program will end. "))
        return True
    logger.info(f"Grouping {type_ali} alignments per genome")
    bar = None
    if not quiet:
        spinner = progressbar.RotatingMarker(markers="◐◓◑◒")
        bar = progressbar.ProgressBar(
            widgets=[progressbar.BouncingBar(marker=spinner), "  -  ", progressbar.Timer()],
            max_value=20, term_width=50)
    worker = multiprocessing.Pool(1)
    task = worker.map_async(group_by_genome, [[all_genomes, all_alns, outfile]], chunksize=1)
    worker.close()
    if not quiet:
        # Busy-wait on the async result, spinning the bar until it is ready.
        while not task.ready():
            bar.update()
        bar.finish()
    worker.join()
    return False not in task.get()
def check_existing_extract(all_fams, aldir, dname):
    """
    For each family, check if its prt and gen extraction files already exist.
    If both exist, no need to re-extract for those families. If only one or
    neither exists, add the family's files to the list to extract (and remove
    any partial/derived files so they are regenerated).

    Parameters
    ----------
    all_fams : list
        list of all family numbers
    aldir : str
        path to directory where extraction files must be saved
    dname : str
        name of the dataset

    Returns
    -------
    []
        list of files that must be generated (prt and gen files)
    """
    extract_fams = []
    for fam in all_fams:
        genfile = os.path.join(aldir, "{}-current.{}.gen".format(dname, fam))
        prtfile = os.path.join(aldir, "{}-current.{}.prt".format(dname, fam))
        if not os.path.isfile(genfile) or not os.path.isfile(prtfile):
            # At least 1 file missing: re-extract all proteins and all genes
            utils.remove(genfile)
            utils.remove(prtfile)
            # As we re-extract proteins and genes, redo alignments
            mafft_file = os.path.join(aldir, "{}-mafft-align.{}.aln".format(dname, fam))
            btr_file = os.path.join(aldir, "{}-mafft-prt2nuc.{}.aln".format(dname, fam))
            utils.remove(mafft_file)
            utils.remove(btr_file)
            extract_fams.append(genfile)
            extract_fams.append(prtfile)
    # If we re-extract at least 1 family, redo the final files (concatenation
    # and group by genome).
    # BUGFIX (consistency): the concatenation/grouping steps in this module
    # produce "<dname>-complete.<nucl|aa>.cat.aln" and "<dname>.<nucl|aa>.grp.aln"
    # (see align_all_families / post_alignment); the previous un-suffixed names
    # ("-complete.cat.aln", ".grp.aln") are never created, so removing them was
    # a no-op and stale final files survived a re-extraction. Remove both the
    # nucl and aa variants instead.
    if extract_fams:
        outdir = os.path.split(aldir)[0]
        treedir = os.path.join(outdir, "Phylo-" + dname)
        for seq_type in ("nucl", "aa"):
            utils.remove(os.path.join(aldir, "{}-complete.{}.cat.aln".format(dname, seq_type)))
            utils.remove(os.path.join(treedir, "{}.{}.grp.aln".format(dname, seq_type)))
    return extract_fams
def do_pangenome(outdir, prt_bank, mmseqdb, mmseqclust, tmpdir, logmmseq, min_id,
                 clust_mode, just_done, threads, panfile, quiet=False):
    """
    Use mmseqs to cluster proteins

    Parameters
    ----------
    outdir : str
        directory where output files are saved
    prt_bank : str
        name of the file containing all proteins to cluster, without path
    mmseqdb : str
        path to base filename of output of mmseqs createdb
    mmseqclust : str
        path to base filename of output of mmseqs clust
    tmpdir : str
        path to tmp directory
    logmmseq : str
        path to file for mmseqs logs
    min_id : float
        min percentage of identity to be considered in the same family
        (between 0 and 1)
    clust_mode : [0, 1, 2]
        0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit'
    just_done : bool
        True if mmseqs db was just (re)created -> remove mmseqs clust.
        False if mmseqs db was kept from previous run -> no need to rerun
        mmseqs clust if it already exists
    threads : int
        max number of threads to use
    panfile : str
        if a pangenome file is specified. Otherwise, default pangenome name
        will be used
    quiet : bool
        true if nothing must be printed on stdout/stderr, false otherwise
        (show progress bar)

    Returns
    -------
    (families, outfile) : tuple
        - families : {fam_num: [all members]}
        - outfile : pangenome filename
    """
    mmseqstsv = mmseqclust + ".tsv"
    # If we just made the database, we must redo all next steps
    # -> if existing, remove
    # mmseqclust (created by run_mmseqs_clust)
    # mmseqstsv (created by mmseqs_to_pangenome)
    # pangenome file
    # BUGFIX: the 'or' chain must be parenthesized. Without parentheses,
    # 'and' binds tighter than 'or', so the files were removed whenever the
    # tsv or pangenome file existed, even when just_done was False —
    # defeating the "keep results from a previous run" behavior described
    # in the comment above.
    if just_done and (os.path.isfile(mmseqclust) or os.path.isfile(mmseqstsv)
                      or os.path.isfile(panfile)):
        logger.details("Removing existing clustering and/or pangenome files.")
        utils.remove(mmseqclust)
        utils.remove(mmseqstsv)
        utils.remove(panfile)
    bar = None
    logger.debug(mmseqclust)
    if os.path.isfile(mmseqclust):
        # Clustering already done: only the conversion to pangenome remains.
        logger.warning((f"mmseqs clustering {mmseqclust} already exists. The program will now convert "
                        "it to a pangenome file."))
    else:
        logger.info("Clustering proteins...")
        try:
            stop_bar = False
            if quiet:
                widgets = []
            # If not quiet, start a progress bar while clustering proteins. We cannot guess
            # how many time it will take, so we start an "infinite" bar, and send it a signal
            # when it has to stop. If quiet, we start a thread that will immediately stop
            else:
                widgets = [progressbar.BouncingBar(marker=progressbar.RotatingMarker(markers="◐◓◑◒")),
                           "  -  ", progressbar.Timer()]
            x = threading.Thread(target=utils.thread_progressbar, args=(widgets, lambda: stop_bar,))
            x.start()
            args = (mmseqdb, mmseqclust, tmpdir, logmmseq, min_id, threads, clust_mode)
            run_mmseqs_clust(args)
        # except KeyboardInterrupt: # pragma: no cover
        except:  # pragma: no cover
            # NOTE: deliberately bare to also catch KeyboardInterrupt
            # (see the commented line above); the bar thread must be stopped
            # before exiting.
            stop_bar = True
            x.join()
            sys.exit(1)
        # Clustering done, stop bar and join (if quiet, it was already finished,
        # so we just join it)
        stop_bar = True
        x.join()
    # Convert output to tsv file (one line per comparison done)
    # -> returns (families, outfile)
    families = mmseqs_to_pangenome(mmseqdb, mmseqclust, logmmseq, panfile)
    return families, panfile