def eukcc_folder(args):
    state = eukcc_state(workdir=os.path.join(args.out, "refine_workdir"), options=vars(args))
    file.isdir(state["workdir"])

    if state["db"] is None:
        if os.environ.get("EUKCC2_DB") is not None:
            state["db"] = os.environ.get("EUKCC2_DB")
            logging.debug("Defined db via env variable EUKCC2_DB as '{}'".format(state["db"]))
        else:
            logging.error("No database was provided via --db or EUKCC2_DB env variable")
            exit(202)

    logging.info("EukCC version {}".format(version.__version__))

    # discover all bins
    logging.debug("Loading all bins")
    state["input_bins"] = glob.glob(os.path.join(state["binfolder"], "*{}".format(state["suffix"])))
    logging.info("Found {} bins".format(len(state["input_bins"])))

    if len(state["input_bins"]) == 0:
        logging.error("Stopping as no bins in folder")
        exit(222)

    refine(state)
def __init__(self, program, workdir, infiles, outfiles, cores=1, touch=False, **kwargs):
    # check software is in path:
    if which(program) is None:
        raise EnvironmentError("Could not find executable: {}".format(program))
    self.success = None
    if not isinstance(infiles, list):
        infiles = [infiles]
    if not isinstance(outfiles, list):
        outfiles = [outfiles]
    # create workdir
    file.isdir(workdir)
    # check for all infiles
    for infile in infiles:
        if infile is None or (
            not os.path.exists(infile) and not os.path.exists("{}.h3f".format(infile))
        ):
            raise OSError("File not found: {}".format(infile))
    # make sure all infiles are abspath
    infiles = [os.path.abspath(x) for x in infiles]
    # check if running is required
    if self.run_needed(workdir, infiles, outfiles):
        logging.debug("Running {}".format(program))
        self.run(program, workdir, infiles, outfiles, cores, kwargs)
def align_marker_genes(self, fasta, markers, workdir, output):
    """
    Align marker genes identified using hmmsearch, to be passed on to EPA

    :param fasta: path to a protein fasta file
    :param markers: dictionary of marker genes obtained by read_hmmer
    :param workdir: folder to write tmp files to
    :param output: path to the final expected output file
    """
    wd = workdir
    file.isdir(wd)
    md = defaultdict(list)
    for row in markers:
        md[row["query"]].append(row["target"])
    # prots file
    prots = {}
    all_profiles = set(self.state["dbinfo"]["files"]["hmm_placement_place"].keys())
    for profile, ids in md.items():
        if profile not in self.state["dbinfo"]["files"]["hmm_placement_place"].keys():
            logging.error("Profile {} misses the hmm and alignment files".format(profile))
            exit(200)
        hmmfile = self.state["dbinfo"]["files"]["hmm_placement_place"][profile]["hmm"]
        alnfile = self.state["dbinfo"]["files"]["hmm_placement_place"][profile]["aln"]
        prot_file = os.path.join(wd, "{}.faa".format(profile))
        prot_file = Fasta.reduce_fasta(fasta, prot_file, ids)
        aligned = os.path.join(wd, "{}.fasta".format(profile))
        hmmalign(
            "hmmalign",
            wd,
            [alnfile, hmmfile, prot_file],
            [aligned],
        )
        # check that the alignment also has non-gap sites, else epa-ng will fail
        if check_alignment(aligned, prot_file):
            prots[profile] = aligned
            all_profiles.remove(profile)
        else:
            logging.debug("Skipped alignment, as it is only gaps")

    if len(prots.keys()) == 0:
        logging.debug("No alignments could be done")
        return None

    for profile in all_profiles:
        prots[profile] = self.state["dbinfo"]["files"]["hmm_placement_place"][profile]["aln"]

    logging.debug("Concatenating alignments")
    return horizontal_concatenate(output, [v for k, v in prots.items()], list(prots.keys()))
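# Hedged example of the rows align_marker_genes() consumes (shape assumed from the
# read_hmmer output used above): each row maps a database profile ("query") to a
# predicted protein ("target"), so md groups protein ids per profile before the
# per-profile hmmalign step. Names below are hypothetical.
#
#   markers = [
#       {"query": "PTHR10024", "target": "contig_12_gene_4"},
#       {"query": "PTHR10024", "target": "contig_3_gene_9"},
#       {"query": "PTHR11258", "target": "contig_7_gene_1"},
#   ]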
def __init__(self, state):
    # state is kept internally
    self.state = state
    self.state.checkpoint("launch")

    for key in ["db", "fasta", "out"]:
        if key in state.keys():
            self.state[key] = make_absolute(state[key])

    logging.debug("Using database located at: {}".format(self.state["db"]))

    if self.state["workdir"] is None or self.state["workdir"] == "":
        self.state["workdir"] = os.path.join(state["out"], "workdir")
    file.isdir(self.state["workdir"])

    # check if the database is accessible
    self.state["loaded_dbs"] = {}
    self.state["dbinfo"] = self.load_db(state["db"], "base")
    if state["clade"] != "base":
        self.state["dbinfo"] = self.load_db(state["db"], state["clade"])
    logging.debug(
        "Using database '{}' with PANTHER version {}".format(
            self.state["dbinfo"]["name"], state["dbinfo"]["panther_version"]
        )
    )

    # validate the sequence type
    self.state["seqtype"] = state["seqtype"].upper()
    if self.state["seqtype"] not in ["AA", "DNA"]:
        raise ValueError("seqtype should be either DNA or AA")

    if Fasta.validate_fasta(self.state["fasta"]) is False:
        raise ValueError("The provided fasta is corrupt or maybe gzipped?")

    if self.state["set_number_species"] < 3:
        logging.warning(
            "You included less than 3 species in the set creation, this might lead to unstable results"
        )
    if self.state["marker_prevalence"] < 95:
        logging.warning(
            "You selected a marker prevalence below 95%, this can lead to wrong results"
        )
    if self.state["set_size"] > state["max_set_size"]:
        self.state["max_set_size"] = state["set_minimal_size"]
        logging.warning(
            "Your minimal set size is bigger than the maximal set size. Thus we increased the "
            "maximal set size to fit the minimal set size. Both are now {}".format(
                self.state["max_set_size"]
            )
        )

    logging.debug("Using {threads} threads".format(threads=self.state["threads"]))

    if self.state["rerun"]:
        logging.debug("Deleting workdir to rerun all of the analysis")
        self.remove_workdir(self.state["workdir"])

    # check dependencies
    self.check_dependencies()
def merge_bins(parent, children, workdir):
    """
    Merge a single parent with n children of class bin

    Returns an object of eukcc_state
    """
    # create an all_bins container
    all_bins = [parent]
    for c in children:
        all_bins.append(c)

    # define a name for this merged bin, this is used in
    # file paths and later for the output
    names = [x.name for x in all_bins]
    name = "merged" + "_".join(names)
    wd = os.path.join(workdir, name)

    # we use the children's faa files to search for missing parent markers
    faa_path = os.path.abspath(os.path.join(wd, "{}.faa".format(name)))
    if not os.path.exists(faa_path):
        faas = [x.state["faa"] for x in children]
        file.isdir(wd)
        merge_fasta(faas, faa_path, seperator=None)

    # initialize a new eukcc state using the parent as a starting point
    state = eukcc_state(workdir, parent.state)
    state["faa"] = faa_path
    state["name"] = name
    state["workdir"] = os.path.abspath(wd)

    # caching is done for relaunches
    svp = os.path.join(state["workdir"], "save.json.gz")
    if os.path.exists(svp):
        state.load_state(svp)
        return state
    else:
        # initialize a eukcc class
        E = eukcc(state)
        # hmmsearch is done using the parent's pressed hmms
        E.state["scmg_data"] = E.hmmsearch_scmg(
            E.state["workdir"],
            E.state["faa"],
            E.state["marker_set"]["profiles"],
            use_hmm=parent.state["estimate_hmm_path"],
        )
        # merge parent results with the kids
        E.state["scmg_data"].extend(parent.state["scmg_data"])
        E.compute_quality(E.state["scmg_data"], E.state["marker_set"]["profiles"])
        # save this computation for a quick reload
        E.state.save_state(svp)
        return E.state
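# Hedged usage sketch (not part of the pipeline): how merge_bins() could be called
# for one large parent bin and two small candidate bins. The bin indices are
# hypothetical; the "mergers" workdir mirrors how refine() uses this function.
#
#   merged_state = merge_bins(
#       parent=bins[parent_idx],
#       children=[bins[3], bins[7]],
#       workdir=os.path.join(state["workdir"], "mergers"),
#   )
#   print(merged_state["quality"]["completeness"], merged_state["quality"]["contamination"])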
def split_contig_faa(path, workdir, delim="_binsep_"):
    faa_dir = os.path.join(workdir, "faa")
    if os.path.exists(faa_dir):
        logging.info(
            "Faa folder exists, remove it if you want to rerun this analysis, will reuse it for now"
        )
        return [os.path.abspath(os.path.join(faa_dir, x)) for x in os.listdir(faa_dir)]
    elif file.isdir(faa_dir):
        fls = {}
        for seq in Fasta(path):
            b = seq.name.split(delim, 1)[0]
            contig = seq.name.split(delim, 1)[1]
            if b not in fls.keys():
                fls[b] = open(os.path.join(faa_dir, b), "w")
            fls[b].write(">{name}\n{seq}\n".format(name=contig, seq=seq.seq))
        # close all files
        for key, fl in fls.items():
            fl.close()
        return [os.path.abspath(os.path.join(faa_dir, x)) for x in os.listdir(faa_dir)]
    else:
        logging.warning("Could not create fasta split dir: {}".format(faa_dir))
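# Hedged example of the naming convention split_contig_faa() assumes: every record in
# the combined protein fasta is named "<bin><delim><contig>", so with the default
# delimiter a header like ">bin1.fa_binsep_contig42_gene3" (hypothetical) is written
# to workdir/faa/bin1.fa as ">contig42_gene3". The exact header layout depends on how
# merge_fasta() built the combined file.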
def refine(state):
    """
    Main refinement pipeline

    It takes a state argument and does the rest. It is basically a per-bin
    EukCC run with added merging features.
    """
    state["contigs"] = os.path.join(state["workdir"], "contigs.fasta")
    if not os.path.exists(state["contigs"]):
        logging.debug("Merging bins into temp contig file")
        state["contigs"] = merge_fasta(state["input_bins"], state["contigs"], seperator="_binsplit_")
    else:
        logging.debug("Using existing merged contigs, delete the output folder if that's not what you want.")

    # initialize output tables
    result_table = os.path.join(state["out"], "eukcc.csv")
    write_table(None, result_table, header=True)
    merged_table = os.path.join(state["out"], "merged_bins.csv")
    note_merges(None, None, merged_table, header=True)

    # predict proteins using metaeuk
    state["fasta"] = state["contigs"]
    logging.debug("Initialize EukCC")
    E = eukcc(state)
    E.predict_protein()

    # split proteins into bin files
    # this also gets rid of bins with zero proteins
    state["faas"] = split_contig_faa(state["faa"], state["workdir"], delim="_binsplit_")
    if state["links"] is not None:
        link_table = read_link_table(state["links"])

    # for each bin create a EukCC run and try to place and estimate its completeness
    bins = []
    for i, path in enumerate(state["faas"]):
        logging.debug("Running EukCC on: {} ({}/{})".format(os.path.basename(path), i, len(state["faas"])))
        wd = os.path.join(state["workdir"], "refine", "bin_{}".format(os.path.basename(path)))
        bins.append(bin(state, wd, path, protein=True))

    smallbins = []
    for i, b in enumerate(bins):
        if b.state["quality"] is None:
            smallbins.append(i)
            continue
        elif b.state["quality"]["completeness"] < 50:
            smallbins.append(i)
            write_table(b.state, result_table)
            continue
        else:
            write_table(b.state, result_table)

    # get all combinations of small bins
    # s_cmb = n_combi(smallbins, state["n_combine"])
    # logging.info("Created {} possible combinations of small bins".format(len(s_cmb)))
    n_large_bins = len(bins) - len(smallbins)
    logging.info("Found {} large bins to merge with".format(n_large_bins))

    refined = []

    def get_name(i):
        return bins[i].name.split("_", 1)[1]

    def combine_bins(idx, children, smallbins, stop=0):
        all_comb = []
        possible_kids = []
        if state["links"] is not None:
            possible_kids = []
            connected_kids = connected_bins(get_name(idx), link_table, state["min_links"])
            for k in children:
                connected_kids = connected_kids.union(connected_bins(get_name(k), link_table, state["min_links"]))
            # convert names back to indices
            for i, b in enumerate(bins):
                if get_name(i) in connected_kids:
                    possible_kids.append(i)
        else:
            possible_kids = smallbins
        possible_kids = set(possible_kids) - set(children)

        # add a single kid and evaluate again
        if stop < 1:
            s = list(set(children) | set([idx]))
            s.sort()
            return [tuple(s)]
        else:
            for kid in possible_kids:
                k = children.copy()
                k.append(kid)
                all_comb.extend(combine_bins(idx, k, smallbins, stop=stop - 1))
            return all_comb

    # mergers = defaultdict(lambda: {'parent': None, "children": [], "gain": []})
    parent_bins = [i for i, b in enumerate(bins) if i not in smallbins]
    for parent_bin in parent_bins:
        parent_name = bins[parent_bin].name.split("_", 1)[1]
        # we find all combinations that are valid to merge with
        combies = []
        for i in range(state["n_combine"]):
            ks = combine_bins(parent_bin, [], smallbins, stop=i)
            combies.extend(ks)
        combies = list(set(combies))
        # sort combinations by length
        combies = sorted(combies, key=len, reverse=False)
        logging.info("For bin {} we found {} merging combinations".format(parent_name, len(combies)))

        for kid_ids in combies:
            # turn kid indices into a sorted list, so they are reproducible
            kid_ids = list(kid_ids)
            kid_ids.sort()
            kid_ids = [i for i in kid_ids if i != parent_bin]
            children = [bins[i] for i in kid_ids]
            if len(children) == 0:
                continue

            logging.debug("Testing bin combination for bin {}".format(parent_name))
            logging.debug("Adding in bins {}".format(",".join([get_name(i) for i in kid_ids])))
            # make all merges and see if they are great or not
            merged = merge_bins(bins[parent_bin], children, os.path.join(state["workdir"], "mergers"))

            # identify the parent bin: if [A, B, C] then the parent can be [A, B] or [A, C];
            # if none of them exists it should be [A]
            compare_state = bins[parent_bin].state
            if len(children) > 1:
                for c in combinations(kid_ids, len(kid_ids) - 1):
                    for potential_parent in refined:
                        if set(c) == set(potential_parent["children_idx"]):
                            compare_state = potential_parent
                            break

            gain_cp = round(
                merged["quality"]["completeness"] - compare_state["quality"]["completeness"],
                2,
            )
            gain_ct = round(
                merged["quality"]["contamination"] - compare_state["quality"]["contamination"],
                2,
            )
            if gain_cp >= state["improve_percent"] and gain_cp > (state["improve_ratio"] * gain_ct):
                logging.warning("Successful merge: +{}% Compl. +{}% Cont.".format(gain_cp, gain_ct))
                merged["children_idx"] = kid_ids
                merged["parent_idx"] = parent_bin
                refined.append(merged)

    # logging.info("Iterated all possible combinations")
    # refined = remove_double_kids(refined, bins)

    # Here we just have to export the bins at this point. That's easy and quick.
    # For each new bin we rerun EukCC with a new metaeuk run, so we get a final verdict.
    ref_dir = os.path.join(state["out"], "merged_bins")
    if os.path.exists(ref_dir):
        logging.warning("Folder merged_bins already exists. Will not overwrite. Please remove the folder.")
        merged_fnas = [os.path.join(ref_dir, x) for x in os.listdir(ref_dir)]
    else:
        file.isdir(ref_dir)
        merged_fnas = []
        for merged_i, merged in enumerate(refined):
            new_name = "{}{}{}".format(state["prefix"], merged_i, state["suffix"])
            merged_fna = os.path.join(ref_dir, new_name)
            # combined index list
            idxs = [merged["parent_idx"]]
            for i in merged["children_idx"]:
                idxs.append(i)
            names = [os.path.basename(bins[i].state["faa"]) for i in idxs]
            names = [n.split("_", 1)[1] for n in names]
            # write that to file
            note_merges(new_name, names, merged_table)

            fnas = [os.path.join(state["binfolder"], name) for name in names]
            merged_fna = merge_fasta(fnas, merged_fna, seperator=None)
            merged_fnas.append(merged_fna)
            logging.info("Created combined fasta {}".format(merged_fna))

    evaluate_multiples(merged_fnas, result_table, state)
    logging.info("Created {} merged bins".format(len(refined)))
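# Hedged illustration of what combine_bins() enumerates (hypothetical indices): with
# parent_bin = 0, smallbins = [3, 7] and state["n_combine"] = 3, the stop = 0, 1, 2
# passes yield the tuples (0,), (0, 3), (0, 7) and (0, 3, 7). Duplicates are removed
# by the set() call, combinations are tested smallest first, and after the parent
# index is filtered out the merges actually evaluated add bins {3}, {7} and {3, 7}.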
def run_EPA(self):
    wd = os.path.join(self.state["workdir"], "epa-ng", self.state["clade"])
    file.isdir(wd)
    aligned_file = os.path.join(wd, "alignment.fasta")
    if file.isnewer(self.state["faa"], aligned_file):
        align_wd = os.path.join(wd, "align")
        aligned_file = self.align_marker_genes(self.state["faa"], self.state["marker_genes"], align_wd, aligned_file)
        if aligned_file is None:
            logging.debug("No marker genes aligned")
            return None
        # once the horizontal alignment is created we can delete
        # all small alignments
        file.remove(align_wd)

    # then split into query and reference sequences
    RS = epa_split(
        "epa-ng",
        wd,
        [self.state["dbinfo"]["files"]["backbone_alignment"], aligned_file],
        ["query.fasta", "reference.fasta"],
    )
    # return None if splitting failed
    if RS.success is False:
        return None

    # then place the queries with epa-ng
    R = epa_ng(
        "epa-ng",
        wd,
        [
            os.path.join(wd, "reference.fasta"),
            self.state["dbinfo"]["files"]["backbone_tree"],
            os.path.join(wd, "query.fasta"),
        ],
        ["epa_result.jplace"],
        model=self.state["dbinfo"]["iqtree_model"],
        cores=self.state["threads_epa"],
    )
    # return None if placement failed
    if R.success is False:
        return None

    # load the results
    try:
        with open(os.path.join(wd, "epa_result.jplace"), "r") as info_file:
            data = info_file.read()
            info = json.loads(data)
    except json.decoder.JSONDecodeError:
        jfile = os.path.join(wd, "epa_result.jplace")
        logging.error(
            "Malformed epa-ng result file. Did you terminate ungracefully? Remove the file {} and try again".format(jfile)
        )
        exit(203)

    info["tog"] = self.guppy_tree(self.state["workdir"], os.path.join(wd, "epa_result.jplace"))
    logging.debug("Placed {} markers in the reference tree".format(len(info["placements"])))
    return info
def hmmsearch_scmg(self, workdir, fasta_faa, profiles, keep_hmm=False, use_hmm=None, cut_ga=True):
    """
    Fetch, press and search the markers specified by the profiles list.
    Will call hmmfetch, hmmpress and hmmsearch internally.

    :param workdir: folder to create tmp files under
    :param fasta_faa: predicted proteome
    :param profiles: list or set of profiles, must be in the reference database
    :param keep_hmm: if True, keep the fetched hmm file and remember its path in the state
    :param use_hmm: path to an already fetched and pressed hmm file, used instead of fetching
    :param cut_ga: if True, apply the GA gathering thresholds during hmmsearch
    """
    # creating a hash from the profiles
    # so set adjustments will lead to new results
    profiles = list(profiles)
    profiles.sort()
    m = hashlib.sha256()
    # concatenate all parameters to a single hash
    hash_str = "-".join(profiles)
    hash_str = hash_str + str(cut_ga)
    m.update(hash_str.encode())
    key = m.hexdigest()[0:12]

    logging.debug("Searching for selected markers")
    wd = os.path.join(workdir, "hmm", "singlecopy", self.state["clade"])
    file.isdir(wd)
    outfile = "found_markers_{}.tbl".format(key)
    cutoff_file = "found_markers_{}_GA.csv".format(key)
    # only run if outfile is older than fasta_faa
    if file.isnewer(fasta_faa, os.path.join(wd, outfile)) or file.isnewer(fasta_faa, os.path.join(wd, cutoff_file)):
        if use_hmm is None:
            # fetch and press hmms from the database
            hmmfile = self.hmmfetch(wd, profiles)
            logging.debug("Pressing hmms")
            hmmpress("hmmpress", wd, hmmfile, "selected_hmms.hmm.h3m")
        else:
            hmmfile = use_hmm
        logging.info("Searching fasta for selected markers")
        hmmsearch(
            "hmmsearch",
            workdir=wd,
            infiles=[hmmfile, fasta_faa],
            outfiles=[outfile],
            cores=self.state["threads"],
            cut_ga=cut_ga,
        )
        # read GA values from the fetched hmm file, needed for some stats in the end
        cutoff_file = hmmer_cutoffs(hmmfile, os.path.join(wd, cutoff_file))
        if keep_hmm:
            self.state["estimate_hmm_path"] = hmmfile
        else:
            # we can remove hmms and pressed hmms
            file.delete_but(
                wd,
                keep=[os.path.basename(x) for x in glob(os.path.join(wd, "found_markers_*"))],
            )
    return read_hmmer(os.path.join(wd, outfile), os.path.join(wd, cutoff_file))
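# Minimal sketch of the caching key used above: the sorted profile names plus the
# cut_ga flag are hashed, so changing the marker set (or toggling GA cutoffs) leads
# to a new "found_markers_<key>.tbl" instead of silently reusing stale results.
# Profile names below are hypothetical.
#
#   import hashlib
#   profiles = sorted(["PTHR10004", "PTHR10024"])
#   key = hashlib.sha256(("-".join(profiles) + str(True)).encode()).hexdigest()[:12]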
def main():
    # set arguments
    # arguments are passed to classes
    parser = argparse.ArgumentParser(description="Evaluate completeness and contamination of a MAG.")
    parser.add_argument("genomes", type=str, help="Find marker for these genomes", nargs="+")
    parser.add_argument(
        "--out",
        "-o",
        type=str,
        required=False,
        help="Path to output folder (Default: .)",
        default=".",
    )
    parser.add_argument("--db", type=str, default=None, help="Path to EukCC DB")
    parser.add_argument("--threads", type=int, help="Number of threads to use (Default: 1)", default=1)
    parser.add_argument(
        "--tree",
        type=int,
        help="Number of profiles to use as target for tree profiles (Default: 30)",
        default=30,
    )
    parser.add_argument(
        "--clade",
        default="base",
        type=str,
        help="Define clade as base, fungi, protozoa or plants (Default: base)",
    )
    parser.add_argument(
        "--quiet",
        "-q",
        dest="quiet",
        action="store_true",
        default=False,
        help="Silence most output",
    )
    parser.add_argument(
        "--debug",
        "-d",
        action="store_true",
        default=False,
        help="Debug and thus ignore safety",
    )
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version="EukCC version {}".format(version.__version__),
    )
    args = parser.parse_args()
    state = eukcc_state(workdir=os.path.join(args.out, "refine_workdir"), options=vars(args))
    file.isdir(state["workdir"])

    # define logging
    logLevel = logging.INFO
    if state["quiet"]:
        logLevel = logging.WARNING
    elif state["debug"]:
        logLevel = logging.DEBUG
    logging.basicConfig(
        format="%(asctime)s %(message)s",
        datefmt="%d-%m-%Y %H:%M:%S: ",
        level=logLevel,
    )

    # if db is not set, we check for the env variable
    if state["db"] is None:
        if os.environ.get("EUKCC2_DB") is not None:
            state["db"] = os.environ.get("EUKCC2_DB")
            logging.debug("Defined db via env variable EUKCC2_DB as '{}'".format(state["db"]))
        else:
            logging.error("No database was provided via --db or EUKCC2_DB env variable")
            exit(202)

    logging.info("EukCC version {}".format(version.__version__))
    logging.info("Looking for shared markers across {} genomes".format(len(state["genomes"])))

    n_per_worker = 4  # using more threads for hmmer makes no sense, so we parallelize across genomes
    if state["threads"] > (2 * n_per_worker):
        # multiprocessing pool
        n_processes = math.floor(state["threads"] / n_per_worker)
        logging.info("Launching {} processes with {} threads each".format(n_processes, n_per_worker))
        pool = Pool(processes=n_processes)
        # copy the options and give each worker its own thread count
        opt = {k: v for k, v in state.opt.items()}
        opt["threads"] = n_per_worker
        search_genome_p = partial(search_genome, state=opt)
        data = pool.map(search_genome_p, state["genomes"])
        pool.close()
        pool.join()
    else:
        data = []
        for genome in state["genomes"]:
            data.append(search_genome(genome, state))

    tree_profiles = define_tree_set(data, n_target=args.tree)
    result = find_intersection(data, missing=3)

    outfile = os.path.join(state["out"], "profiles.txt")
    with open(outfile, "w") as fout:
        for key, profiles in result.items():
            for profile in profiles:
                fout.write("{}\t{}\n".format(key, profile))
        for profile in tree_profiles:
            fout.write("{}\t{}\n".format("tree", profile))
    logging.info("Wrote profiles to {}".format(outfile))
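# Hedged CLI sketch: the entry-point name "shared_markers" and the paths are
# placeholders, but the flags are the ones defined by the parser above. Looks for
# markers shared across a set of genomes and writes profiles.txt into --out.
#
#   EUKCC2_DB=/path/to/eukcc_db shared_markers genome1.fa genome2.fa genome3.fa \
#       --out marker_workdir --threads 8 --tree 30 --clade base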
def test_file_isdir(self):
    self.assertTrue(file.isdir(TESTDATA_dir))