def runPlacedHMM(self, hmmfile, proteinfaa, bedfile):
    """
    Run hmmsearch with the given HMMs and post-process the hits.

    The search is only launched when the runner reports that it has to
    run, when ``fplace`` is set in the configuration, or when ``hmmfile``
    is newer than the existing result table. The cleaning step is always
    executed.

    :param hmmfile: hmm file handed to hmmsearch
    :param proteinfaa: protein fasta used as input of the runner
    :param bedfile: bed file handed to the cleaning step
    :return: the value returned by ``h.clean`` for the hits table
    """
    cfg = self.cfg

    # all results of this step live in workfiles/hmmer/estimations
    workdir = os.path.join(cfg["outdir"], "workfiles", "hmmer", "estimations")
    file.isdir(workdir)
    table_path = os.path.join(workdir, "placement.tsv")
    raw_path = os.path.join(workdir, "placement.out")
    hits_path = os.path.join(workdir, "hits.tsv")

    h = hmmer(
        "hmmsearch",
        proteinfaa,
        table_path,
        cfg["debug"],
        touch=cfg["touch"],
    )

    # evaluated left to right, so the timestamp comparison is only
    # reached when neither the runner nor fplace already demand a run
    needs_run = (
        h.doIneedTorun(cfg["force"])
        or cfg["fplace"]
        or file.isnewer(hmmfile, table_path)
    )
    if needs_run:
        logging.info("Running hmmer for chosen locations")
        h.run(
            raw_path,
            hmmfiles=hmmfile,
            modus=cfg["dbinfo"]["modus"],
            evalue=cfg["evalue"],
            cores=cfg["ncores"],
            training=cfg["training"],
        )

    # clean hmmer output
    logging.info("Processing Hmmer results")
    return h.clean(table_path, bedfile, hits_path, cfg["mindist"])
def gmes(self, fasta):
    """
    predict proteins using gmes

    Runs GeneMark-ES on ``fasta`` unless the runner decides the existing
    output can be reused, then converts the resulting GTF into a bed
    file. The path of the renamed input fasta is stored in
    ``self._clean_fasta``.

    If the GTF file does not exist after the run step, the outfile is
    written and the process exits with status 1.

    :param fasta: fasta file of the bin
    :return: tuple of (protein fasta path, bed file)
    """
    logging.debug("Starting gmes function")
    cfg = self.cfg

    gmesDir = os.path.join(cfg["outdir"], "workfiles", "gmes")
    file.isdir(gmesDir)
    protein_path = os.path.join(gmesDir, "prot_seq.faa")
    gtf_path = os.path.join(gmesDir, "genemark.gtf")
    renamed_fasta = os.path.abspath(os.path.join(gmesDir, "input.fna"))

    # GeneMark-ES; `gmes` here is the module level runner, not this method
    g = gmes("runGMES", fasta, [gtf_path, protein_path], touch=cfg["touch"])
    logging.debug("Defined gmes run")

    if not g.doIneedTorun(cfg["force"]):
        logging.debug("I do not need to run gmes, output exists:")
        logging.debug(gtf_path)
    else:
        # fasta entries are renamed so they contain no white spaces;
        # in touch mode only the target path is handed to the runner
        if cfg["touch"]:
            g.input = renamed_fasta
        else:
            g.input = base.clearFastaNames(fasta, renamed_fasta)
        logging.info("Running GeneMark-ES")
        g.run(cores=cfg["ncores"])

    # a missing gtf file means GeneMark-ES failed, so we stop here
    if not file.exists(gtf_path):
        logging.error("GeneMark-ES failed on this bin")
        self.write_outfile()
        exit(1)
    elif cfg["clean"]:
        # remove temporary directories of the GeneMark-ES run
        leftovers = []
        for sub in ("data", "run", "info", "output/data", "output/gmhmm"):
            leftovers.append(os.path.join(gmesDir, sub))
        g.cleanup(leftovers)

    # make a bed file from GTF
    bedf = os.path.join(gmesDir, "proteins.bed")
    # NOTE(review): `and` binds tighter than `or`, so force triggers the
    # extraction even in touch mode; the parentheses only make the
    # existing behaviour explicit - confirm this is intended
    if cfg["force"] or (file.isnewer(gtf_path, bedf) and not cfg["touch"]):
        logging.info("Extracting protein locations")
        bedf = base.gmesBED(gtf_path, bedf)

    # touch files expected for next step
    if cfg["touch"]:
        g.touch([bedf, protein_path])

    self._clean_fasta = renamed_fasta
    return (protein_path, bedf)
def concatHMM(self):
    """
    Concatenate and press all hmm profiles needed for the placements.

    The profile names are read from the ``<node>.set`` files of every
    placement of the configured placement method. A sha256 hash of the
    sorted profile names is stored next to the concatenated file; when
    the stored hash matches and the concatenated file is still present,
    the previous result is reused.

    In touch mode the target file is only touched.
    Exits with status 1 if no profile could be collected.

    :return: path of the concatenated hmm file
    """
    # create a dir for this
    hmmdir = os.path.join(self.cfg["outdir"], "workfiles", "hmmer", "estimations")
    file.isdir(hmmdir)
    hmmconcat = os.path.join(hmmdir, "all.hmm")

    if self.cfg["touch"]:
        file.touch(hmmconcat)
        return hmmconcat

    # collect the unique profile names over all placements
    profiles = set()
    for p in self.placements[self.cfg["placementMethod"]]:
        localpath = os.path.join(self.cfg["db"], "sets", "{}.set".format(p["node"]))
        with open(localpath) as f:
            profiles.update(line.strip() for line in f)
    # sorted, so the hash below does not depend on set ordering
    profiles = sorted(profiles)

    # create all paths for all hmms
    hmmerpaths = [
        os.path.join(self.cfg["db"], "hmms", "panther", "{}.hmm".format(profile))
        for profile in profiles
    ]

    # check if we already have the hmm for this set of profiles
    profilehash = hashlib.sha256("_".join(profiles).encode()).hexdigest()
    hashpath = os.path.join(hmmdir, "all.hash")
    if file.exists(hashpath):
        with open(hashpath) as f:
            # readline() yields "" for an empty file, which can never
            # match a hex digest, so an empty hash file simply triggers
            # a rebuild instead of an unbound variable error
            prevhash = f.readline().strip()
        # a matching hash is only useful if the concatenated file is
        # still there
        if prevhash == profilehash and file.exists(hmmconcat):
            # we can use the existing file, so no need to continue
            log("Using pressed hmms from last run")
            return hmmconcat

    # concatenate
    if not profiles:
        logging.error("We have no profiles to evaluate")
        exit(1)
    log("{} hmm profiles need to be used for estimations".format(len(profiles)))
    log("Concatenating hmms, this might take a while (IO limited)")
    hmmconcat = base.concatenate(hmmconcat, hmmerpaths)

    # press
    log("Pressing hmms")
    hp = hmmpress("hmmpress", hmmconcat, None, touch=self.cfg["touch"])
    hp.run()

    # save profile hash, written last so it only exists after the
    # concatenation and pressing steps were reached
    with open(hashpath, "w") as f:
        f.write(profilehash)
    return hmmconcat
def __init__(self, program, inf, outf, debug=False, touch=False):
    """
    Set up a wrapper around an external program.

    :param program: name of the executable, looked up via ``run.which``
    :param inf: input of the program, stored as ``self.input``
    :param outf: a single output or a list of outputs; with a list the
        first entry becomes ``self.output`` while the whole list is
        kept in ``self.output_test``
    :param debug: stored as ``self.debug``
    :param touch: stored as ``self.touchonly``
    """
    # a missing executable is only reported, construction continues
    if run.which(program) is None:
        print("{} is not installed".format(program))

    self.debug = debug
    self.touchonly = touch
    self.program = program
    self.input = inf

    # several outputs may be defined: the first one is the main output,
    # all of them are kept for testing whether the rule has to be run
    all_outputs = outf if isinstance(outf, list) else [outf]
    self.output = all_outputs[0]
    self.output_test = all_outputs

    if outf is not None:
        # create output dir
        file.isdir(os.path.dirname(self.output))
def checkIO(self, fastapath, outdir):
    """
    Placeholder for an input/output accessibility check.

    Only makes sure the configured output directory is handled by
    ``file.isdir``; the actual check is not implemented yet.

    NOTE(review): ``fastapath`` and ``outdir`` are not used, the
    directory is taken from ``self.cfg["outdir"]`` - confirm intended.

    :param fastapath: currently unused
    :param outdir: currently unused
    :return: always False
    """
    # create outdir if not exists
    configured_outdir = self.cfg["outdir"]
    file.isdir(configured_outdir)

    # the real input/output access check is still missing
    logging.debug("Warning: IO check not yet implemented")
    return False
def place(self, fasta, bedfile):
    """
    main function to place a bin in the tree.
    will subsequently run hmmer

    Steps, in order:
      1. check that ``fasta`` and ``bedfile`` can be found
      2. hmmsearch with the placement HMMs and cleaning of the hits
      3. pplacer on the prepared alignment
      4. reduction of the jplace file by ``minPlacementLikelyhood``
      5. tree extraction with guppy
      6. placements for the methods LCA and HPA, stored in
         ``self.placements``

    ``self.cfg["togtreepath"]`` and ``self.cfg["togjson"]`` are set for
    later plotting. On missing inputs, missing marker genes or a failed
    pplacer run the outfile is written and the process exits with 1.

    :param fasta: protein fasta to be placed
    :param bedfile: bed file used when cleaning the hmmer hits
    """
    # keys of self.placements; shared by the real and the touch branch
    # so both always offer every selectable placement method
    placement_methods = ("LCA", "HPA")

    # test if we can open the input files first
    if not base.exists(fasta):
        logging.error("Could not open fasta file")
        self.write_outfile()
        exit(1)
    if not base.exists(bedfile):
        logging.error("Could not open bed file")
        self.write_outfile()
        exit(1)

    # define output files
    hmmDir = os.path.join(self.cfg["outdir"], "workfiles", "hmmer")
    file.isdir(hmmDir)
    hmmOut = os.path.join(hmmDir, "placement.tsv")
    hmmOus = os.path.join(hmmDir, "placement.out")
    hitOut = os.path.join(hmmDir, "hits.tsv")

    # run hmmer if the runner asks for it or placement is forced
    h = hmmer("hmmsearch", fasta, hmmOut, touch=self.cfg["touch"])
    if h.doIneedTorun(self.cfg["force"]) or self.cfg["fplace"]:
        logging.info("Searching for proteins to place in the tree")
        h.run(
            hmmOus,
            hmmfiles=self.config.placementHMMs,
            modus=self.cfg["dbinfo"]["modus"],
            evalue=self.cfg["evalue"],
            cores=self.cfg["ncores"],
        )

    # clean hmmer output
    logging.info("Processing Hmmer results")
    hitOut = h.clean(hmmOut, bedfile, hitOut, self.cfg["mindist"])
    self.updateStep("findprots", "looked for proteins")

    # pplacer paths
    placerDir = os.path.join(self.cfg["outdir"], "workfiles", "pplacer")
    placerDirTmp = os.path.join(placerDir, "tmp")
    pplaceAlinment = os.path.join(placerDir, "horizontalAlignment.fasta")
    pplaceOut = os.path.join(placerDir, "placement.jplace")
    pplaceLog = os.path.join(placerDir, "placement.log")
    pplaceOutReduced = os.path.join(placerDir, "placementReduced.jplace")
    file.isdir(placerDirTmp)

    # pplacer
    logging.debug("Preparing pplacer")
    pp = pplacer("pplacer", fasta, pplaceOut, touch=self.cfg["touch"])
    if pp.doIneedTorun(self.cfg["force"]) or self.cfg["fplace"]:
        logging.debug("Preparing alignments")
        pp.prepareAlignment(
            pplaceAlinment,
            hitOut,
            os.path.join(self.cfg["db"], "profile.list"),
            fasta,
            self.config,
            self.cfg,
            placerDirTmp,
        )
        if pp.lenscmgs == 0 and not self.cfg["touch"]:
            logging.error("Could not find any marker genes")
            self.write_outfile()
            exit(1)
        else:
            logging.info("Placing proteins in tree")
            self.updateStep("pplacer", "starting")
            pplacer_success = pp.run(
                os.path.join(self.cfg["db"], "refpkg", "concat.refpkg"),
                logfile=pplaceLog,
                cores=self.cfg["ncorespplacer"],
            )
            if pplacer_success is False:
                logging.warning("Pplacer could not finish. Exiting now")
                self.write_outfile()
                exit(1)

    # reduce placements to the placements with at least posterior of p
    logging.debug("Reducing placements")
    if not self.cfg["touch"]:
        pplaceOutReduced = pp.reduceJplace(
            pplaceOut, pplaceOutReduced, self.cfg["minPlacementLikelyhood"]
        )
    else:
        pp.touch([pplaceOutReduced])
    logging.debug("Reducing placements done")

    # run TOG to get a tree
    togTree = os.path.join(placerDir, "placement.tree")
    tg = tog("guppy", pplaceOutReduced, togTree, touch=self.cfg["touch"])
    if tg.doIneedTorun(self.cfg["force"]):
        logging.debug("Fetching tree")
        r = tg.run()
        if r is False:
            # NOTE(review): unlike the other failure paths this one does
            # not stop the pipeline after writing the outfile - confirm
            # that continuing without a tree is intended
            logging.debug("No placement found")
            self.write_outfile()

    logging.debug("Getting best placements")
    # save path to togtree for plotting later
    self.cfg["togtreepath"] = togTree
    self.cfg["togjson"] = pplaceOutReduced

    # now we can place the bin using the tree
    if not self.cfg["touch"]:
        t = treelineage.treeHandler(togTree, annotate=False)
        t2 = treelineage.treeHandler(self.config.tree, annotate=False)
        sets = self.getSets()
        # get HPA and LCA placements
        self.placements = {}
        for method in placement_methods:
            self.placements[method] = t.getPlacement(
                method,
                sets,
                t2,
                self.cfg["nPlacements"],
                self.cfg["minSupport"],
                maximum=self.cfg["nEvals"],
                debug=self.cfg["debug"],
            )
    else:
        # placeholder entries under the same keys as a real run; the
        # former "HCA" key made lookups of the HPA method fail
        self.placements = {method: "touch" for method in placement_methods}
    logging.info("MAG succesfully placed in tree")
default=False, help='silcence most output') parser.add_argument('--debug', '-d', action='store_true', default=False, help='debug and thus ignore safety') args = parser.parse_args() ############################################### # starting the analysis log("Running eukcc for {} bin{}".format(len(args.fasta), "s" if len(args.fasta) > 1 else "")) # create output if not exists if not file.isdir(args.outdir): exit() # check if a protein fasta was passed (implied ) if args.bed is not None: # set no glob args.noglob = True args.isprotein = True else: args.isprotein = False # check if we can expand glob: if len(args.fasta) == 1 and not args.noglob: log("Expanding paths using glob", not args.quiet) args.fasta = glob.glob(args.fasta[0])
def main():
    """
    Command line entry point of EukCC.

    Parses the command line (optionally backed by a config file), sets
    up logging and drives the workflow object:

      1. gene prediction with GeneMark-ES or pygmes, unless proteins
         and/or a bed file were supplied
      2. optional early exit after gene prediction (``--gmes``)
      3. optional training run on a custom hmm file, then exit
      4. placement, hmm concatenation, hmmer on the placed sets,
         lineage inference and completeness/contamination estimation
      5. optional plotting

    The process exits with status 1 when ``--pygmes`` is requested
    without ``--diamond``.
    """
    # set arguments
    # arguments are passed to classes
    parser = configargparse.ArgumentParser(
        description="Evaluate completeness and contamination of a MAG."
    )
    parser.add_argument("fasta", type=str, help="Run script on this bin (fasta file)")
    parser.add_argument("--db", type=str, required=True, help="Path to EukCC DB")
    parser.add_argument(
        "--outdir",
        "-o",
        type=str,
        default="./",
        help="Location for the output. Names will be prefixed using the bin filenames",
    )
    parser.add_argument(
        "--config",
        "-c",
        type=str,
        required=False,
        is_config_file=True,
        help="Config file to define parameters, YAML",
    )
    parser.add_argument(
        "--ncores",
        "-n",
        metavar="int",
        type=int,
        default=1,
        help="set number of cores for GeneMark-ES, pplacer and Hmmer",
    )
    # long help texts are built from adjacent literals; a backslash
    # continuation inside the literal leaks into the help message
    parser.add_argument(
        "--ncorespplacer",
        metavar="int",
        type=int,
        default=0,
        help="Pplacer requires a lot of memory. If you want "
        "you can set less cores for pplacer, "
        "which improves memory consumption significantly",
    )
    parser.add_argument(
        "--hmm",
        dest="hmm",
        type=str,
        default=None,
        help="run hmmer on all these HMMs instead",
    )
    parser.add_argument(
        "--training",
        dest="training",
        action="store_true",
        default=False,
        help="Run EukCC in training mode (needed to create a new release of the DB)",
    )
    parser.add_argument(
        "--proteins",
        default=False,
        action="store_true",
        dest="proteins",
        help="Input fasta is proteins",
    )
    parser.add_argument(
        "--bed",
        "-b",
        metavar="file.bed",
        type=str,
        default=None,
        help="You can pass a bedfile of the protein location to omit fragmented proteins being detected twice",
    )
    parser.add_argument(
        "--force",
        "-f",
        dest="force",
        action="store_true",
        default=False,
        help="Force rerun of computation even if "
        "output is newer than input. "
        "Don't resume previous run.",
    )
    parser.add_argument(
        "--keeptemp",
        dest="clean",
        action="store_false",
        default=True,
        help="Keep all temporary files, by default EukCC will remove some temp files",
    )
    parser.add_argument(
        "--fplace",
        "-p",
        dest="fplace",
        action="store_true",
        default=False,
        help="Force rerun of placement and subsequent steps",
    )
    parser.add_argument(
        "--noglob",
        "-g",
        dest="noglob",
        action="store_true",
        default=False,
        help="Do not expand paths using glob",
    )
    parser.add_argument(
        "--quiet",
        "-q",
        dest="quiet",
        action="store_true",
        default=False,
        help="Silcence most output",
    )
    parser.add_argument(
        "--debug",
        "-d",
        action="store_true",
        default=False,
        help="Debug and thus ignore safety",
    )
    parser.add_argument(
        "--HPA",
        default=False,
        action="store_true",
        help="Set placement method to HPA",
    )
    parser.add_argument(
        "--nPlacements",
        type=int,
        default=2,
        metavar="n",
        help="Set number of proteins to support location "
        "in tree (default: 2)",
    )
    parser.add_argument(
        "--minGenomes",
        type=int,
        default=3,
        metavar="n",
        help="Minimal number of genomes to support a set (default: 3)",
    )
    parser.add_argument(
        "--fullineage",
        default=False,
        action="store_true",
        help="Output full lineage for MAGs",
    )
    parser.add_argument(
        "--minPlacementLikelyhood",
        default=0.4,
        type=float,
        metavar="float",
        help="minimal pplacer likelyhood (default: 0.4)",
    )
    parser.add_argument(
        "--mindist",
        type=int,
        default=2000,
        metavar="n",
        help="Distance to collapse hits (default: 2000)",
    )
    parser.add_argument(
        "--touch",
        default=False,
        action="store_true",
        help="Do not run, but touch all output files",
    )
    parser.add_argument(
        "--gmes",
        default=False,
        action="store_true",
        help="only run GeneMark-ES",
    )
    parser.add_argument(
        "--pygmes",
        default=False,
        action="store_true",
        help="Use pygmes, will improve eukccs capability of running on highly fragmented bins but will take longer",
    )
    parser.add_argument(
        "--diamond",
        default=None,
        type=str,
        help="required to use pygmes option",
    )
    parser.add_argument(
        "--plot",
        default=False,
        action="store_true",
        help="produce plots",
    )
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version=f"EukCC version {version.__version__}",
    )
    options = parser.parse_args()

    # define logging; --quiet takes precedence over --debug
    logLevel = logging.INFO
    if options.quiet:
        logLevel = logging.WARNING
    elif options.debug:
        logLevel = logging.DEBUG
    logging.basicConfig(
        format="%(asctime)s %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S: ",
        level=logLevel,
    )

    # for pygmes we need a diamond DB
    if options.pygmes and options.diamond is None:
        logging.error(
            "For pygmes you need to provide a diamond database with taxonomic information"
        )
        exit(1)

    logging.debug("Launching EukCC in debug mode")
    logging.info("Starting EukCC")

    # Now we start the run with EukCC
    # All magic numbers should be defined in info.py if they are not
    # part of the configuration options
    m = workflow.eukcc(options)

    # skip gene prediction if this is already protein sequences
    if options.bed is None and options.proteins is False and options.pygmes is False:
        # run gmes
        proteinfaa, bedfile = m.gmes(options.fasta)
    elif options.bed is None and options.proteins is False and options.pygmes is True:
        proteinfaa, bedfile = m.pygmes(options.fasta, options.diamond)
    else:
        proteinfaa = options.fasta
        if options.bed is None:
            # create bed file
            bedpath = os.path.join(options.outdir, "workfiles", "proteins_tmp.bed")
            file.isdir(os.path.join(options.outdir, "workfiles"))
            bedfile = faabed(proteinfaa, bedpath)
        else:
            bedfile = options.bed

    # terminate if only gmes step was to be run
    if m.cfg["gmes"]:
        logging.info("Finished running GeneMark-ES")
        logging.info("Terminating as requested")
        exit(0)

    # run hmm file if we are asked to
    # this is needed for training
    # NOTE(review): with --training but without --hmm the value handed
    # to runPlacedHMM is presumably None - confirm this is handled
    if m.cfg["training"] or m.cfg["hmm"]:
        logging.info("Running on custom hmm for training mode")
        m.runPlacedHMM(m.cfg["hmm"], proteinfaa, bedfile)
        logging.info("Stopping now as we are only doing training")
        exit(0)

    # place using pplacer and hmmer
    m.place(proteinfaa, bedfile)

    # concat hmms for hmmer
    hmmfile = m.concatHMM()
    # run Hmmer for sets of placement
    hits = m.runPlacedHMM(hmmfile, proteinfaa, bedfile)

    # infer lineage
    _ = m.inferLineage(m.placements[m.cfg["placementMethod"]])

    # estimate completeness and contamination
    outputfile = os.path.join(m.cfg["outdir"], "eukcc.tsv")
    m.estimate(hits, outputfile, m.placements[m.cfg["placementMethod"]])

    if m.cfg["plot"]:
        _ = m.plot()