def createMikadoGTFlist(self, out_file, out_dir, searchPath, searchQuery="*.gtf", strand=False):
    """Create a list file to be used by mikado configure.

    out_file: str
        Output file name (without extension); ".txt" is appended.
    out_dir: str
        Directory where the list file is written (created if missing).
    searchPath: str
        Path under which gtf/gff files are searched.
    searchQuery: str
        Search pattern passed to pe.find_files. Default: "*.gtf"
    strand: bool
        Stranded flag written for every gtf entry. Default: False

    :return: Path to the created list file
    :rtype: string
    """
    files = pe.find_files(searchPath, searchQuery)
    #create out dir
    if not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    outFilePath = os.path.join(out_dir, out_file + ".txt")
    #one tab-separated line per gtf: path, basename, strand flag
    gtfs = []
    for l in files:
        thisName = pu.get_file_basename(l)
        if thisName:
            gtfs.append("\t".join([l, thisName, str(strand)]))
    #use a context manager so the handle is closed even if the write fails
    with open(outFilePath, "w") as f:
        f.write("\n".join(gtfs))
    pu.print_green("Mikado list file written to:" + outFilePath)
    return outFilePath
def create_lock(self, target_list, message):
    """Create a temporary .Lock file for each target file and write a message in it.

    Parameters
    ----------
    target_list : List
        List of target files.
    message : Str
        Message to write in file.

    Returns
    -------
    templist : List
        A list of .Lock file names corresponding to the target files.
    """
    templist = []
    for f in target_list:
        temp_path = pu.get_file_directory(f)
        if not pu.check_paths_exist(temp_path):
            pu.mkdir(temp_path)
        prefix = pu.get_filename(f) + '_'
        #delete=False: the lock must outlive this function; it is removed elsewhere
        temp = tempfile.NamedTemporaryFile(prefix=prefix, suffix='.Lock', dir=temp_path, delete=False)
        #TODO: dump command in lock
        timestamp = pu.get_timestamp()
        #close the handle via the context manager; previously the file was
        #never closed, leaking the descriptor and possibly leaving the
        #message unflushed on disk
        with temp:
            temp.write(str.encode(timestamp + '\t' + message))
        templist.append(temp.name)
    return templist
def download_gtex_bams(manifest_file, outdir):
    """Download GTEx bam files listed in a gen3 JSON manifest via a data-transfer node.

    Parameters
    ----------
    manifest_file : str
        Path to a gen3 JSON manifest; each entry must contain "file_name"
        (an unused "md5sum" is also expected in the commented-out check).
    outdir : str
        Download directory; each file finally lives at <outdir>/<gid>/<file_name>,
        where gid is the part of the file name before '.Aligned'.

    NOTE(review): profile, m, cwd, dtn_ssh and threads are not defined in this
    function -- presumably module-level globals; 'm' looks like it should be
    manifest_file. Confirm before relying on this function.
    """
    #load list of bam files
    with open(manifest_file, 'r') as fi:
        thisdata = json.load(fi)
    flist = []
    #check existing files
    for d in thisdata:
        f = d["file_name"]
        gid = f.split('.Aligned')[0]
        outfile = os.path.join(outdir, gid, f)
        #md5 verification is currently disabled; only existence is checked
        #if pu.check_files_exist(outfile) and pu.get_mdf(outfile)==d["md5sum"]:
        if pu.check_files_exist(outfile):
            print("Outfile {} exists. Skipping...".format(outfile))
            #copy it back to out dir so gen3-client --skip-completed detects it
            os.rename(outfile, os.path.join(outdir, f))
        flist.append(d["file_name"])
    #run gen3-client on the data-transfer node over ssh
    cmd = 'gen3-client download-multiple --profile={} --manifest={} --download-path={} --protocol=s3 --numparallel={} --skip-completed --no-prompt'.format( profile, m, outdir, threads)
    cdcmd = 'cd {}'.format(cwd)
    sshcmd = dtn_ssh + " '{}; {}'".format(cdcmd, cmd)
    out = pe.get_shell_output(sshcmd, verbose=True)
    #move the files into per-sample subdirectories
    for f in flist:
        source = os.path.join(outdir, f)
        gid = f.split('.Aligned')[0]
        destdir = os.path.join(outdir, gid)
        pu.mkdir(destdir)
        dest = os.path.join(destdir, f)
        #print('Moving {}-->{}'.format(source,dest))
        os.rename(source, dest)
def generate_multiqc_from_log(logFile, filterList, tempDir, outDir="", coverage='a', verbose=False, cleanup=False):
    """Dump stdout captured in a pyrpipe log to temp files and run multiqc on them.

    logFile : str
        Path to the pyrpipe log file.
    filterList : list
        Program names used to filter log entries (passed to getStdoutFromLog).
    tempDir : str
        Directory where per-command stdout .txt files are written.
    outDir : str
        multiqc output directory. Default "".
    coverage : char
        Which commands to use: fa(i)led, (p)assed or (a)ll. Default 'a'.
    verbose : bool
        Unused here; kept for interface compatibility.
    cleanup : bool
        Remove the temp .txt files after multiqc has run.
    """
    #dump stdout from logs to temp directory
    stdout = getStdoutFromLog(logFile, filterList, coverage)
    #create tmpdir
    pu.mkdir(tempDir)
    flist = []
    for o in stdout:
        tempFile = os.path.join(tempDir, o + ".txt")
        #context manager guarantees the handle is closed even on write errors
        with open(tempFile, "w") as f:
            f.write(stdout[o])
        flist.append(tempFile)
    #run multiqc; tempDir stores the .txt files for MQC to read
    mc.run(analysis_dir=tempDir, outdir=outDir)
    #cleanup
    if cleanup:
        for f in flist:
            pu.print_blue("Removing {}".format(f))
            os.remove(f)
def build_index(self, in_fasta, dbname, out_dir=None, threads=None, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Build a diamond index and store its path in self.

    in_fasta: str
        Input fasta file for diamond makedb.
    dbname: str
        Name of the database to create inside out_dir.
    out_dir: str
        Output directory; defaults to the current working directory.
    threads: int
        Threads for makedb; defaults to self.threads.
    verbose/quiet/logs/objectid:
        Passed through to run_diamond.
    kwargs: dict
        Extra diamond options; override the internally computed ones.

    :return: True on success, False otherwise
    :rtype: bool
    """
    #the input fasta must exist
    if not pu.check_files_exist(in_fasta):
        pu.print_boldred(
            "Input fasta: {} not found...\n diamond makedb failed".format(
                in_fasta))
        return False
    #default to CWD and create the output directory on demand
    if not out_dir:
        out_dir = os.getcwd()
    if not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    #reuse a pre-built index when one is already present
    index_path = os.path.join(out_dir, dbname)
    self.index = index_path
    if self.check_index():
        pu.print_green("Diamond index: {} exists, using it...".format(
            self.index))
        self.index = index_path
        return True
    if not threads:
        threads = self.threads
    #caller-supplied kwargs win over the computed options
    base_opts = {"--in": in_fasta, "-d": index_path, "--threads": str(threads)}
    merged = {**base_opts, **kwargs}
    #run diamond makedb
    ok = self.run_diamond("makedb",
                          verbose=verbose,
                          quiet=quiet,
                          logs=logs,
                          objectid=objectid,
                          **merged)
    if not ok:
        return False
    self.index = index_path
    return True
def perform_qc(self, sra_object, out_dir="", out_suffix="_bbduk", objectid="NA"):
    """Run bbduk on the fastq files referenced by sra_object.

    sra_object: SRA
        An SRA object whose fastq files will be used
    out_dir: str
        Path to output directory; defaults to sra_object.directory
    out_suffix: string
        Suffix for the output fastq file(s)
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.

    :return: Paths of the QC'd fastq files; one item for single end,
        two for paired. ("",) on missing output.
    :rtype: tuple
    """
    #default output location is the SRA object's own directory
    if not out_dir:
        out_dir = sra_object.directory
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    if sra_object.layout == 'PAIRED':
        fq1 = sra_object.fastq_path
        fq2 = sra_object.fastq2_path
        out_file1Path = os.path.join(out_dir, pu.get_file_basename(fq1) + out_suffix + ".fastq")
        out_file2Path = os.path.join(out_dir, pu.get_file_basename(fq2) + out_suffix + ".fastq")
        opts = {"in": fq1, "in2": fq2, "out": out_file1Path, "out2": out_file2Path}
        #run bbduk with both mates as targets
        ok = self.run(objectid=objectid, target=[out_file1Path, out_file2Path], **opts)
        if ok:
            if not pu.check_files_exist(out_file1Path, out_file2Path) and not _dryrun:
                return ("",)
            return (out_file1Path, out_file2Path)
    else:
        fq = sra_object.fastq_path
        out_filePath = os.path.join(out_dir, pu.get_file_basename(fq) + out_suffix + ".fastq")
        opts = {"in": fq, "out": out_filePath}
        #run bbduk on the single fastq
        ok = self.run(objectid=objectid, target=out_filePath, **opts)
        if ok:
            if not pu.check_files_exist(out_filePath) and not _dryrun:
                return ("",)
            return (out_filePath,)
def perform_alignment(self, sra_object, out_suffix="_bowtie2", out_dir="", objectid="NA"):
    """Align reads from sra_object with bowtie2 and return a sorted bam.

    Parameters
    ----------
    sra_object SRA object
        An object of type SRA; fastq paths are read from it.
    out_suffix: string
        Suffix for the output sam file
    out_dir: string
        Directory to save the results. Default: sra_object.directory
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.

    :return: Path to the sorted bam produced from the sam, "" on failure
    :rtype: string
    """
    if not out_dir:
        out_dir = sra_object.directory
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    #path of the intermediate sam file
    outSamFile = os.path.join(out_dir, sra_object.srr_accession + out_suffix + ".sam")
    #select bowtie2 options for the read layout
    if sra_object.layout == 'PAIRED':
        opts = {"-1": sra_object.fastq_path, "-2": sra_object.fastq2_path, "-S": outSamFile}
    else:
        opts = {"-U": sra_object.fastq_path, "-S": outSamFile}
    ok = self.run(None, objectid=sra_object.srr_accession, target=outSamFile, **opts)
    if not ok:
        return ""
    if not pu.check_files_exist(outSamFile) and not _dryrun:
        return ""
    #convert the sam to a sorted bam and hand back its path
    return tools.Samtools().sam_sorted_bam(outSamFile)
def run_transdecoder_predict(self, infasta, longorfs_dir, out_dir=None, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Run TransDecoder.Predict and collect its outputs.

    infasta: str
        Input transcript fasta (the same one given to TransDecoder.LongOrfs).
    longorfs_dir: str
        Directory produced by TransDecoder.LongOrfs (passed via -O).
    out_dir: str
        Where to move the .bed/.cds/.gff3/.pep outputs; if None, outputs are
        left in the current working directory.
    verbose/quiet/logs/objectid:
        Passed through to run_transdecoder.
    kwargs: dict
        Extra TransDecoder.Predict options; override internal ones.

    :return: out_dir on success, "" on failure
    :rtype: string
    """
    #validate inputs; previously these errors were only printed and
    #execution continued with bad input -- now we bail out
    if not pu.check_files_exist(infasta):
        pu.print_boldred("Please check input file:" + infasta)
        return ""
    if not pu.check_paths_exist(longorfs_dir):
        pu.print_boldred("Path {} doesn't exist".format(longorfs_dir))
        return ""
    move_flag = True
    if not out_dir:
        out_dir = os.getcwd()
        #outputs are already in CWD; nothing to move
        move_flag = False
    if not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    newOpts = {"-t": infasta, "-O": longorfs_dir}
    mergedOpts = {**newOpts, **kwargs}
    #execute Predict
    status = self.run_transdecoder('TransDecoder.Predict',
                                   verbose=verbose,
                                   quiet=quiet,
                                   logs=logs,
                                   objectid=objectid,
                                   **mergedOpts)
    if not status:
        pu.print_boldred("Transdecoder failed")
        return ""
    #TransDecoder writes to CWD; move outputs to out_dir when one was given
    if move_flag:
        outfile_prefix = pu.get_filename(infasta) + ".transdecoder"
        for ext in (".bed", ".cds", ".gff3", ".pep"):
            pe.move_file(outfile_prefix + ext,
                         os.path.join(out_dir, outfile_prefix + ext), verbose)
    return out_dir
def generate_multiqc(directory, tempDir, outDir="", coverage='a', verbose=False, cleanup=False):
    """
    Generate reports using multiqc

    Parameters
    ----------
    directory : str
        path to directory containing logs.
    tempDir : str
        temp dir.
    outDir : str, optional
        output dir. The default is "".
    coverage : char, optional
        commands to use in pyrpipe log: fa(i)led (p)assed or (a)ll. The default is 'a'.
    verbose : bool, optional
        print messages. The default is False.
    cleanup : bool, optional
        remove temp files. The default is False.

    Returns
    -------
    None.
    """
    #search all _pyrpipe.log files under the given directory
    #raw string so '\.' is a regex escape, not a (deprecated) string escape
    files = pu.find_files(directory, r".*_pyrpipe\.log$", recursive=True)
    if not outDir:
        outDir = 'MultiQC_out'
    #create tempdir
    if not pu.check_paths_exist(tempDir):
        pu.mkdir(tempDir)
    for logfile in files:
        #dump stdout from logs to temp directory
        stdout = getStdoutFromLog(logfile, None, coverage)
        #short id derived from the log file name, used to disambiguate temp files
        fid = logfile.split('_pyrpipe')[0].split('_')[-1]
        for o in stdout:
            tempFile = os.path.join(tempDir, o + "_" + fid + ".txt")
            #distinct handle name avoids shadowing the loop variable;
            #with-block closes the handle even on write errors
            with open(tempFile, "w") as fh:
                fh.write(stdout[o])
    #run multiqc
    mc.run(analysis_dir=directory, outdir=outDir)
def perform_assembly(self, bam_file, out_dir=None, out_suffix="_stringtie", objectid="NA"):
    """Assemble transcripts from a bam file with stringtie.

    Parameters
    ----------
    bam_file: string
        path to the bam file
    out_dir: string
        Output directory; defaults to the bam file's directory
    out_suffix: string
        Suffix for the output gtf file
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.

    :return: Path to the output GTF file, "" on failure
    :rtype: string
    """
    basename = pu.get_file_basename(bam_file)
    #default output location is next to the bam file
    if not out_dir:
        out_dir = pu.get_file_directory(bam_file)
    if not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    out_gtf_file = os.path.join(out_dir, basename + out_suffix + ".gtf")
    #the bam file is the positional argument, passed via the '--' key
    opts = {"-o": out_gtf_file, "--": (bam_file,)}
    #run stringtie
    ok = self.run(None, objectid=objectid, target=out_gtf_file, **opts)
    if not ok:
        return ""
    #make sure the gtf actually materialized (except in dry runs)
    if not pu.check_files_exist(out_gtf_file) and not _dryrun:
        return ""
    return out_gtf_file
def perform_quant(self, sra_object, out_suffix="", out_dir="", objectid="NA"):
    """Run kallisto quant on the fastq files of sra_object.

    sra_object: SRA
        SRA object containing paths to fastq files
    out_suffix: str
        suffix for the renamed abundance file
    out_dir: str
        path to output directory; defaults to <sra_object.directory>/kallisto_out
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.

    :return: Path to the (renamed) abundance.tsv, "" on failure
    :rtype: string
    """
    if not out_dir:
        out_dir = os.path.join(sra_object.directory, "kallisto_out")
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    #positional fastq args and options depend on the read layout
    if sra_object.layout == 'PAIRED':
        fastqs = (sra_object.fastq_path, sra_object.fastq2_path)
        opts = {"-o": out_dir, "-i": self.index}
    else:
        fastqs = (sra_object.fastq_path,)
        opts = {"-o": out_dir, "--single": "", "-i": self.index}
    #kallisto writes abundance.tsv; we rename it with the suffix afterwards
    outfile = os.path.join(out_dir, "abundance.tsv")
    newfile = os.path.join(out_dir, "abundance" + out_suffix + ".tsv")
    #skip work when the final file is already there (unless forced)
    if not _force and pu.check_files_exist(newfile):
        pu.print_green('Target files {} already exist.'.format(newfile))
        return newfile
    #run kallisto quant
    ok = self.run(*fastqs, subcommand='quant', objectid=sra_object.srr_accession, target=outfile, **opts)
    if not ok:
        return ""
    if not _dryrun:
        pe.move_file(outfile, newfile)
        if not pu.check_files_exist(newfile):
            return ""
    return newfile
def perform_quant(self, sra_object, out_suffix="", out_dir="", objectid="NA"):
    """Run salmon quant on the fastq files of sra_object.

    sra_object: SRA
        An SRA object with valid fastq files
    out_suffix: str
        suffix for the renamed quant file
    out_dir: str
        path to output directory; defaults to <sra_object.directory>/salmon_out
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.

    :return: Path to the (renamed) quant.sf file, "" on failure
    :rtype: string
    """
    if not out_dir:
        out_dir = os.path.join(sra_object.directory, "salmon_out")
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    #build options for the read layout; -l A lets salmon infer library type
    if sra_object.layout == 'PAIRED':
        opts = {"-o": out_dir, "-l": "A", "-1": sra_object.fastq_path, "-2": sra_object.fastq2_path, "-i": self.index}
    else:
        opts = {"-o": out_dir, "-l": "A", "-r": sra_object.fastq_path, "-i": self.index}
    #salmon writes quant.sf; we rename it with the suffix afterwards
    outfile = os.path.join(out_dir, "quant.sf")
    newfile = os.path.join(out_dir, "quant" + out_suffix + ".sf")
    #skip work when the final file is already there (unless forced)
    if not _force and pu.check_files_exist(newfile):
        pu.print_green('Target files {} already exist.'.format(newfile))
        return newfile
    #run salmon quant
    ok = self.run(None, subcommand='quant', objectid=sra_object.srr_accession, target=newfile, **opts)
    if not ok:
        return ""
    if not _dryrun:
        pe.move_file(outfile, newfile)
        if not pu.check_files_exist(newfile):
            return ""
    return newfile
def multiqc():
    """CLI entry point: parse 'multiqc' subcommand args and generate a multiqc report."""
    print("Generating html report with multiqc")
    parser = argparse.ArgumentParser(
        description='pyrpipe diagnostic utility\nGenerate report with multiqc.',
        usage='''pyrpipe_diagnostic multiqc [<args>] <logfile> ''')
    parser.add_argument('-o', help='out directory \ndefault: <./>', action="store")
    parser.add_argument('-c', help='Dump command options [(a)ll,fa(i)l,(p)ass]\ndefault: a', default='a', action="store")
    parser.add_argument('-v', help='verbose', action="store_true")
    parser.add_argument('-f', help='Filter by programs. Provide a comma-separated list e.g., prefetch,STAR,bowtie2 \ndefault None')
    parser.add_argument('-t', help='Temporary directory. \ndefault ./tmp', action="store")
    parser.add_argument('-r', help='Remove stdout files after processing. \ndefault ./tmp', action="store_true")
    parser.add_argument('logfile', help='The log file generated by pyrpipe', action="store")
    #skip the program name and subcommand
    args = parser.parse_args(sys.argv[2:])
    logFile = args.logfile
    if args.v:
        print("Generating MutiQC report")
    #resolve defaults for out dir, filters and temp dir
    outDir = os.getcwd() if args.o is None else args.o
    filters = args.f.split(',') if args.f is not None else []
    tempDir = args.t if args.t is not None else os.path.join(os.getcwd(), "tmp")
    #create tmp dir
    if not pu.check_paths_exist(tempDir):
        pu.mkdir(tempDir)
    #run multiqc
    generateMultiqcReport(logFile, filters, tempDir, outDir=outDir, coverage=args.c, verbose=args.v, cleanup=args.r)
def benchmark():
    """CLI entry point: parse 'benchmark' subcommand args and generate a benchmark report."""
    print("Generating benchmarks")
    parser = argparse.ArgumentParser(
        description='pyrpipe diagnostic utility\nGenerate benchmark report.',
        usage='''pyrpipe_diagnostic report [<args>] <logfile> ''')
    parser.add_argument('-o', help='out file \ndefault: same as input logfile', action="store")
    parser.add_argument('-e', help='report output type: [MD,PDF,HTML] \ndefault: PDF', default='PDF', action="store")
    parser.add_argument('-v', help='verbose', action="store_true")
    parser.add_argument('-f', help='Filter by programs. Provide a comma-separated list e.g., prefetch,STAR,bowtie2 \ndefault None')
    parser.add_argument('-t', help='Temporary directory. \ndefault ./tmp', action="store")
    parser.add_argument('logfile', help='The log file generated by pyrpipe', action="store")
    #skip the program name and subcommand
    args = parser.parse_args(sys.argv[2:])
    logFile = args.logfile
    #locate the environment log that accompanies the pyrpipe log
    envLog = checkEnvLog(logFile)
    if args.v:
        print("Generating benchmarks")
    #output file: defaults to the log's basename, extension from -e
    outFile = pu.get_file_basename(args.logfile) if args.o is None else args.o
    outFile += '.' + args.e
    filters = args.f.split(',') if args.f is not None else []
    tempDir = args.t if args.t is not None else os.path.join(os.getcwd(), "tmp")
    #create tmp dir
    if not pu.check_paths_exist(tempDir):
        pu.mkdir(tempDir)
    generateBenchmarkReport(logFile, envLog, filters, tempDir, outFile=outFile, verbose=args.v)
def createMikadoGTFlist(self, out_file, out_dir, searchPath, searchQuery="*.gtf", strand=False):
    """Create a file to be used by mikado configure

    out_file: str
        outfile name
    out_dir: str
        path to out_dir
    searchPath: str
        Path where gtf/gff files will be searched
    searchQuery: str
        Query to perform search. Default: "*.gtf"
    strand: bool
        Stranded flag: Default false

    :return: Path to the created list file
    :rtype: string
    """
    files = pe.find_files(searchPath, searchQuery, recursive=True)
    #create out dir
    if not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    outFilePath = os.path.join(out_dir, out_file + ".txt")
    #one tab-separated line per gtf: path, basename, strand flag
    gtfs = []
    for l in files:
        thisName = pu.get_file_basename(l)
        if thisName:
            gtfs.append("\t".join([l, thisName, str(strand)]))
    #context manager guarantees the handle is closed even if the write fails
    with open(outFilePath, "w") as f:
        f.write("\n".join(gtfs))
    pu.print_green("Mikado list file written to:" + outFilePath)
    return outFilePath
def build_index(self,index_path,index_name,fasta,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs):
    """
    build salmon index and store the path to index in self

    index_path: str
        path to the output directory
    index_name: str
        index name
    fasta: str
        path to the transcriptome fasta file
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
    kwargs: dict
        Options to pass to salmon. This will override the existing options

    :return: status of salmon index
    :rtype: bool
    """
    #check input
    if not pu.check_files_exist(fasta):
        pu.print_boldred("{} does not exist. Exiting".format(fasta))
        return False
    #create out dir
    if not pu.check_paths_exist(index_path):
        if not pu.mkdir(index_path):
            #fixed copy-paste bug: this message previously said "hisat2"
            print("ERROR in building salmon index. Failed to create index directory.")
            return False
    indexOut=os.path.join(index_path,index_name)
    newOpts={"-t":fasta,"-i":indexOut}
    #internal options win over caller-supplied kwargs
    mergedOpts={**kwargs,**newOpts}
    #call salmon
    status=self.run_salmon("index",verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**mergedOpts)
    if status:
        #check if index dir is present in the expected location
        #if check_files_exist(os.path.join(indexOut,"versionInfo.json")): #not sure if this is reliable
        if pu.check_paths_exist(indexOut):
            self.salmon_index=indexOut
            self.passedArgumentDict['-i']=self.salmon_index
            pu.print_green("salmon index is:"+self.salmon_index)
            return True
    pu.print_boldred("Failed to create salmon index")
    return False
def build_index(self,index_path,index_name,fasta,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs): """Function to build kallisto index index_path: str path to the output directory index_name: str index name verbose: bool Print stdout and std error quiet: bool Print nothing logs: bool Log this command to pyrpipe logs objectid: str Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports. kwargs: dict Options to pass to kallisto. This will override the existing options in self.passed_args_dict (only replace existing arguments and not replace all the arguments). :return: Status of kallisto index :rtype: bool """ #check input if not pu.check_files_exist(fasta): pu.print_boldred("{} does not exist. Exiting".format(fasta)) return False #create out dir if not pu.check_paths_exist(index_path): if not pu.mkdir(index_path): print("ERROR in building kallisto index. Failed to create index directory.") return False indexOut=os.path.join(index_path,index_name) newOpts={"--":(fasta,),"-i":indexOut} mergedOpts={**kwargs,**newOpts} #call salmon status=self.run_kallisto("index",verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**mergedOpts) if status: #check if sam file is present in the location directory of sra_object if pu.check_files_exist(indexOut): self.kallisto_index=indexOut self.passedArgumentDict['-i']=self.kallisto_index pu.print_green("kallisto_index is:"+self.kallisto_index) return True else: pu.print_boldred("Failed to create kallisto index") return False
def __init__(self, log_file, env_log, out_dir=""):
    """Initialize the benchmark report from a pyrpipe log and its env log.

    log_file: str
        Path to the pyrpipe log file.
    env_log: str
        Path to the matching environment log file.
    out_dir: str
        Where to create the 'benchmark_reports' directory; defaults to CWD.

    Raises Exception when the inputs are missing or the report
    directory cannot be created.
    """
    #both log files must exist
    if not pu.check_files_exist(log_file, env_log):
        raise Exception("Please check input for benchmark report. {} {}".format(log_file, env_log))
    self.log_file = log_file
    self.env_log = env_log
    #runtime tables filled by parse_logs()
    self.runtimes_by_prog = {}
    self.runtimes_by_object = {}
    if not out_dir:
        out_dir = os.getcwd()
    #parse the logs up front
    pu.print_blue("parsing log...")
    self.parse_logs()
    pu.print_blue("done.")
    #all reports are written under <out_dir>/benchmark_reports
    self.benchmark_dir = os.path.join(out_dir, 'benchmark_reports')
    if not pu.check_paths_exist(self.benchmark_dir):
        if not pu.mkdir(self.benchmark_dir):
            raise Exception("Error running benchmarks. Can not create output directory {}".format(self.benchmark_dir))
def perform_assembly(self, bam_file, out_dir="", out_suffix="_cufflinks", reference_gtf=None, threads=None, overwrite=True, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Run cufflinks with a BAM file as input.

    Parameters
    ----------
    bam_file: string
        path to bam file
    out_dir:
        output directory; defaults to the bam file's directory
    out_suffix: string
        Suffix for the output gtf file
    reference_gtf: str
        Path to reference gtf
    threads: int
        Number of threads to use; defaults to self.threads
    overwrite: bool
        Overwrite if output file already exists.
    verbose/quiet/logs/objectid:
        Passed through to run_cufflinks.
    kwargs: dict
        Options to pass to cufflinks.

    :return: Path to the output GTF file, "" on failure
    :rtype: string
    """
    basename = pu.get_file_basename(bam_file)
    if not out_dir:
        out_dir = pu.get_file_directory(bam_file)
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    out_gtf_file = os.path.join(out_dir, basename + out_suffix + ".gtf")
    #honor overwrite=False: keep the existing gtf and return it
    if not overwrite and os.path.isfile(out_gtf_file):
        print("The file " + out_gtf_file + " already exists. Exiting..")
        return out_gtf_file
    if not threads:
        threads = self.threads
    #bam file is the positional argument; -o is the cufflinks out directory
    opts = {"-o": out_dir, "--": (bam_file,), "-p": str(threads)}
    if reference_gtf:
        if not pu.check_files_exist(reference_gtf):
            pu.print_boldred("Error: Provided reference GTF {} doesn't exist. Exiting...".format(reference_gtf))
            return ""
        opts["-g"] = reference_gtf
    merged = {**opts, **kwargs}
    #call cufflinks
    ok = self.run_cufflinks(verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **merged)
    if ok:
        #cufflinks writes transcripts.gtf inside out_dir; rename to the target
        pe.move_file(os.path.join(out_dir, "transcripts.gtf"), out_gtf_file)
        if pu.check_files_exist(out_gtf_file):
            return out_gtf_file
        return ""
def runMikadoConfigure(self, listFile, genome, mode, scoring, junctions, out_file, out_dir=os.getcwd(), verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Wrapper to run mikado configure.
    Make sure the paths in list file are global.

    listFile/genome/junctions/scoring: str
        Input files for mikado configure; all must exist.
    mode: str
        Mikado mode passed via --mode.
    out_file: str
        Output configuration name (without extension); ".yaml" is appended.
    out_dir: str
        Output directory (created on demand).

    :return: Path to the created configuration file, "" on failure
    :rtype: string
    """
    #every input file must exist
    if not pu.check_files_exist(listFile, genome, junctions, scoring):
        print("Please check mikado input")
        return ""
    #create out dir
    if not pu.check_paths_exist(out_dir):
        if not pu.mkdir(out_dir):
            raise Exception("Exception in mikado configure.")
    outFilePath = os.path.join(out_dir, out_file + ".yaml")
    #internal options win over caller kwargs; config path is positional
    opts = {
        "--list": listFile,
        "--reference": genome,
        "--mode": mode,
        "--scoring": scoring,
        "--junctions": junctions,
        "--": (outFilePath, )
    }
    merged = {**kwargs, **opts}
    ok = self.runMikado("configure",
                        verbose=verbose,
                        quiet=quiet,
                        logs=logs,
                        objectid=objectid,
                        **merged)
    if not ok:
        pu.print_boldred(
            "Mikado configure failed.\nPlease make sure the paths in list file are global."
        )
        return ""
    #confirm the yaml was actually produced
    if not pu.check_files_exist(outFilePath):
        return ""
    return outFilePath
def perform_qc(self, sra_object, out_dir="", out_suffix="_bbduk", overwrite=True, threads=None, max_memory=None, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Run bbduk on fastq files specified by the sra_object.

    Parameters
    ----------
    sra_object: SRA
        an SRA object
    out_dir: string
        Path to out dir. Default: sra_object.location
    out_suffix: string
        Suffix for output file name
    overwrite: bool
        overwrite existing files
        NOTE(review): this flag is accepted but never used in the body -- confirm
    threads: int
        Num threads to use; defaults to self.threads
    max_memory: float
        Max memory to use in GB; defaults to self.max_memory
    verbose/quiet/logs/objectid:
        Passed through to run_bbduk.
    kwargs: dict
        options passed to bbduk

    :return: Paths of fastq files after QC; one item for single end,
        two for paired. ("",) on failure.
    :rtype: tuple
    """
    #default output location is the SRA object's own location
    if not out_dir:
        out_dir = sra_object.location
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    if not threads:
        threads = self.threads
    if not max_memory:
        max_memory = self.max_memory
    #JVM heap flag, passed positionally via '--'
    memory_flag = "-Xmx" + str(max_memory) + "g"
    #optimize parameters
    #if optimize:
    #    print("generating suggested parameters XXX TD")
    if sra_object.layout == 'PAIRED':
        fq1 = sra_object.localfastq1Path
        fq2 = sra_object.localfastq2Path
        out_file1Path = os.path.join(out_dir, pu.get_file_basename(fq1) + out_suffix + ".fastq")
        out_file2Path = os.path.join(out_dir, pu.get_file_basename(fq2) + out_suffix + ".fastq")
        opts = {
            "in": fq1,
            "in2": fq2,
            "out": out_file1Path,
            "out2": out_file2Path,
            "--": (memory_flag, ),
            "threads": str(threads)
        }
        merged = {**opts, **kwargs}
        #run bbduk on both mates
        if self.run_bbduk(verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **merged):
            if pu.check_files_exist(out_file1Path, out_file2Path):
                return (out_file1Path, out_file2Path)
        return ("", )
    else:
        fq = sra_object.localfastqPath
        out_filePath = os.path.join(out_dir, pu.get_file_basename(fq) + out_suffix + ".fastq")
        opts = {"in": fq, "out": out_filePath, "--": (memory_flag, )}
        merged = {**opts, **kwargs}
        #run bbduk on the single fastq
        if self.run_bbduk(verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **merged):
            if pu.check_files_exist(out_filePath):
                return (out_filePath, )
        return ("", )
def sam_to_bam(self, sam_file, out_dir="", out_suffix="", delete_sam=False, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Convert a sam file to a bam file via samtools view.
    The output bam keeps the input sam's basename (plus out_suffix).

    out_suffix: string
        Suffix for the output bam file
    delete_sam: bool
        delete the sam file after conversion
    verbose/quiet/logs/objectid:
        Passed through to run_samtools.
    kwargs: dict
        Extra samtools options; internal options win.

    :return: Path to the bam file, "" on failure
    :rtype: string
    """
    if not out_dir:
        out_dir = pu.get_file_directory(sam_file)
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    stem = pu.get_file_basename(sam_file)
    #target bam path
    out_bam = os.path.join(out_dir, stem + out_suffix + '.bam')
    #-b requests bam output; sam file is the positional argument
    opts = {"--": (sam_file, ), "-o": out_bam, "-b": ""}
    merged = {**kwargs, **opts}
    ok = self.run_samtools("view",
                           verbose=verbose,
                           quiet=quiet,
                           logs=logs,
                           objectid=objectid,
                           **merged)
    if not ok:
        print("Sam to bam failed for:" + sam_file)
        return ""
    #confirm the bam materialized
    if not pu.check_files_exist(out_bam):
        return ""
    #optionally remove the source sam
    if delete_sam and not pe.deleteFileFromDisk(sam_file):
        print("Error deleting sam file:" + sam_file)
    return out_bam
def merge_bam(self, *args, out_file="merged", out_dir="", delete_bams=False, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Merge multiple bam files into a single file.

    Parameters
    ----------
    args: tuple
        Paths to bam files to combine (at least two).
    out_file: string
        Output file name; ".bam" is appended.
    out_dir: string
        Where to save the merged bam; defaults to the first bam's directory.
    delete_bams: bool
        Delete the source bams after a successful merge.
    verbose/quiet/logs/objectid:
        Passed through to run_samtools.
    kwargs: dict
        Extra samtools options; internal options win.

    :return: Path to the merged bam file, "" on failure
    :rtype: string
    """
    #need at least two inputs to merge
    if len(args) < 2:
        print("Please supply at least 2 files to merge")
        return ""
    if not out_dir:
        out_dir = pu.get_file_directory(args[0])
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)
    outMergedFile = os.path.join(out_dir, out_file + ".bam")
    #samtools merge takes target first, then the sources, all positional
    opts = {"--": (outMergedFile, ) + args}
    merged = {**kwargs, **opts}
    ok = self.run_samtools("merge",
                           verbose=verbose,
                           quiet=quiet,
                           logs=logs,
                           objectid=objectid,
                           **merged)
    if not ok:
        print("Bam merge failed for:" + outMergedFile)
        return ""
    #confirm the merged bam materialized
    if not pu.check_files_exist(outMergedFile):
        return ""
    #optionally remove the source bams
    if delete_bams:
        for bam_file in args:
            if not pe.deleteFileFromDisk(bam_file):
                print("Error deleting sam file:" + bam_file)
    return outMergedFile
def build_index(self,index_path,transcriptome,objectid="NA"): """ Parameters ---------- index_path : TYPE DESCRIPTION. transcriptome : TYPE DESCRIPTION. objectid : TYPE, optional DESCRIPTION. The default is "NA". Raises ------ OSError DESCRIPTION. Returns ------- bool DESCRIPTION. """ #if index already exists then exit if not _force: #check if files exists if pu.check_salmonindex(index_path): pu.print_green("Salmon index {} already exists.".format(index_path)) self.index=index_path return True #check input if not pu.check_files_exist(transcriptome): pu.print_boldred("{} does not exist. Exiting".format(transcriptome)) return False #create out dir indexdir=pu.get_file_directory(index_path) #create the out dir if not pu.check_paths_exist(indexdir): if not pu.mkdir(indexdir): raise OSError("Error creating salmon index. Failed to create index directory.") validArgsIndex=valid_args._args_SALMON_INDEX internal_kwargs={"--threads":_threads,"-t":transcriptome,"-i":index_path} #read build parameters yamlfile=os.path.join(_params_dir,'salmon_index.yaml') if pu.check_files_exist(yamlfile): yaml_params=pl.YAML_loader(yamlfile) yaml_kwargs=yaml_params.get_kwargs() internal_kwargs={**yaml_kwargs,**internal_kwargs} salmon_cmd=['salmon','index'] salmon_cmd.extend(pu.parse_unix_args(validArgsIndex,internal_kwargs)) #call salmon status=pe.execute_command(salmon_cmd,objectid=objectid) if status: if pu.check_salmonindex(index_path) and not _dryrun: #update object's index self.index=index_path if self.check_index(): return True else: raise OSError("Error building salmon index") return False
def build_index(self,index_path,transcriptome,objectid="NA"): """Function to build kallisto index index_path: str path to the index transcriptome: str Path to transcriptome objectid: str Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports. :return: Status of kallisto index :rtype: bool """ #if index already exists then exit if not _force: #check if files exists if pu.check_files_exist(index_path): pu.print_green("Kallisto index {} already exists.".format(index_path)) self.index=index_path return True #check input if not pu.check_files_exist(transcriptome): pu.print_boldred("{} does not exist. Exiting".format(transcriptome)) raise ValueError("Please check input to kallisto index") #create out dir indexdir=pu.get_file_directory(index_path) #create the out dir if not pu.check_paths_exist(indexdir): if not pu.mkdir(indexdir): raise OSError("Error creating kallisto index. Failed to create index directory.") args=(transcriptome,) internal_kwargs={"-i":index_path} #read build parameters yamlfile=os.path.join(_params_dir,'kallisto_index.yaml') if pu.check_files_exist(yamlfile): yaml_params=pl.YAML_loader(yamlfile) yaml_kwargs=yaml_params.get_kwargs() internal_kwargs={**yaml_kwargs,**internal_kwargs} #add positional args internal_kwargs['--']=args validArgsIndex=valid_args._args_KALLISTO_INDEX kallisto_cmd=['kallisto','index'] kallisto_cmd.extend(pu.parse_unix_args(validArgsIndex,internal_kwargs)) #call kallisto status=pe.execute_command(kallisto_cmd,objectid=objectid) if status: if pu.check_files_exist(index_path) and not _dryrun: #update object's index self.index=index_path if self.check_index(): return True else: raise OSError("Error building kallisto index") return False
def perform_qc(self,sra_object,out_dir="",out_suffix="_trimgalore",objectid="NA"):
    """Function to perform qc using trimgalore.
    The function perform_qc() is consistent for all QC classess.

    Parameters
    ----------
    sra_object: SRA
        An SRA object whose fastq files will be used
    out_dir: str
        Path to output directory
    out_suffix: string
        Suffix for the output sam file
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.

    :return: Returns the path of fastq files after QC. tuple has one item
        for single end files and two for paired. On failure a tuple
        containing a single empty string is returned.
    :rtype: tuple
    """
    if not out_dir:
        out_dir=sra_object.directory
    else:
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

    if sra_object.layout=='PAIRED':
        fq1=sra_object.fastq_path
        fq2=sra_object.fastq2_path
        internal_args=(fq1,fq2)
        internal_kwargs={"--paired":"","-o":out_dir}
        #running trim galore will create two files named <input>_val_1.fq
        #and <input>_val_2.fq; these are moved to the specified out files
        file1=os.path.join(out_dir,pu.get_file_basename(fq1)+"_val_1.fq")
        file2=os.path.join(out_dir,pu.get_file_basename(fq2)+"_val_2.fq")
        #targets
        out_file1=os.path.join(out_dir,pu.get_file_basename(fq1)+out_suffix+".fastq")
        out_file2=os.path.join(out_dir,pu.get_file_basename(fq2)+out_suffix+".fastq")
        #check if final files already exist
        if not _force and pu.check_files_exist(out_file1,out_file2):
            pu.print_green('Target files {}, {} already exist.'.format(out_file1,out_file2))
            return out_file1,out_file2
        #run trimgalore
        status=self.run(*internal_args,objectid=objectid,target=[file1,file2],**internal_kwargs)
        if status:
            #rename the trimmed files and return their paths
            if not _dryrun:
                pe.move_file(file1,out_file1,verbose=False)
                pe.move_file(file2,out_file2,verbose=False)
                if not pu.check_files_exist(out_file1,out_file2):
                    #BUGFIX: previously returned a bare "" here; return a
                    #tuple so the documented tuple return type holds on
                    #every exit path (matches the other failure returns)
                    return ("",)
            return out_file1,out_file2
        return ("",)

    #single end
    fq=sra_object.fastq_path
    internal_args=(fq,)
    internal_kwargs={"-o":out_dir}
    #running trim galore will create one file named <input>_trimmed.fq;
    #it is moved to the specified out file
    trimmed_file=os.path.join(out_dir,pu.get_file_basename(fq)+"_trimmed.fq")
    #target
    out_file=os.path.join(out_dir, pu.get_file_basename(fq)+out_suffix+".fastq")
    #check if final file already exists
    if not _force and pu.check_files_exist(out_file):
        pu.print_green('Target files {} already exist.'.format(out_file))
        return (out_file,)
    #run trimgalore
    status=self.run(*internal_args,objectid=objectid,target=trimmed_file,**internal_kwargs)
    if status:
        #rename the trimmed file and return its path
        if not _dryrun:
            pe.move_file(trimmed_file,out_file)
            if not pu.check_files_exist(out_file):
                #BUGFIX: previously returned a bare ""; return a tuple
                return ("",)
        return (out_file,)
    return ("",)
def perform_cleaning(self,sra_object,bbsplit_index,out_dir="",out_suffix="_bbsplit",objectid="NA",**kwargs):
    """
    Remove contaminated reads mapping to given reference using bbsplit

    Parameters
    ----------
    sra_object: SRA
        an SRA object
    bbsplit_index: string
        Path to bbsplit index or fasta file which will generate index
    out_dir: string
        Path to output dir. Default: sra_object.directory
    out_suffix: string
        Suffix for output file name
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.
    kwargs: dict
        options passed to bbsplit

    :return: Returns the path of fastq files after QC. tuple has one item
        for single end files and 2 for paired. A tuple containing a single
        empty string is returned on failure.
    :rtype: tuple
    """
    #check index
    indexPath=""
    if not pu.check_paths_exist(bbsplit_index):
        #index folder doesn't exist
        #check if input is path to fasta
        if not pu.check_files_exist(bbsplit_index):
            print("Error: Please check bbsplit index")
            return ("",)
        #check if index folder "ref" exists in this directory
        indexPath=os.path.join(pu.get_file_directory(bbsplit_index),"ref")
        if pu.check_paths_exist(indexPath):
            print("Using bbsplit index: "+indexPath)
        else:
            #create new index from the supplied fasta
            print("Creating new index"+indexPath)
            #NOTE(review): 'ref_x' is presumably mapped to bbsplit's ref=
            #option inside run_bbsplit — confirm against run_bbsplit
            newOpts={"ref_x":bbsplit_index,"path": pu.get_file_directory(bbsplit_index)}
            #internal options override caller-supplied kwargs
            mergedOpts={**kwargs,**newOpts}
            #run bbduk
            if not self.run_bbsplit(objectid=objectid,**mergedOpts):
                print("Error creating bbsplit index.")
                return ("",)
            if not pu.check_paths_exist(indexPath):
                print("Error creating bbsplit index.")
                return ("",)
    else:
        indexPath=bbsplit_index
    #indexPath point to the ref directory, go one directory higher
    indexPath=os.path.dirname(indexPath)
    #make out_dir
    if not out_dir:
        out_dir=sra_object.directory
    else:
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
    if sra_object.layout=='PAIRED':
        fq1=sra_object.fastq_path
        fq2=sra_object.fastq2_path
        #append input and output options; outu1/outu2 receive the reads
        #kept after removing those mapping to the reference
        out_fileName1=pu.get_file_basename(fq1)+out_suffix+".fastq"
        out_fileName2=pu.get_file_basename(fq2)+out_suffix+".fastq"
        out_file1Path=os.path.join(out_dir,out_fileName1)
        out_file2Path=os.path.join(out_dir,out_fileName2)
        newOpts={"in1":fq1,"in2":fq2,"outu1":out_file1Path,"outu2":out_file2Path,"path":indexPath}
        #internal options override caller-supplied kwargs
        mergedOpts={**kwargs,**newOpts}
        #run bbsplit
        if self.run_bbsplit(objectid=objectid,target=[out_file1Path,out_file2Path],**mergedOpts):
            if pu.check_files_exist(out_file1Path,out_file2Path):
                return(out_file1Path,out_file2Path)
        return("",)
    else:
        fq=sra_object.fastq_path
        #append input and output options
        out_fileName=pu.get_file_basename(fq)+out_suffix+".fastq"
        out_filePath=os.path.join(out_dir,out_fileName)
        newOpts={"in":fq,"outu":out_filePath,"path":indexPath}
        #internal options override caller-supplied kwargs
        mergedOpts={**kwargs,**newOpts}
        #run bbsplit
        if self.run_bbsplit(objectid=objectid,target=out_filePath,**mergedOpts):
            if pu.check_files_exist(out_filePath):
                return(out_filePath,)
        return("",)
from pyrpipe import sra, mapping, assembly, qc, tools
from pyrpipe import pyrpipe_utils as pu
from pyrpipe import pyrpipe_engine as pe

#SRR accessions of the maize RNA-Seq runs to download and process
maizeRun = [
    'SRR1573523', 'SRR999058', 'SRR520999', 'SRR1168424', 'SRR1621015',
    'SRR3084882', 'SRR1620828', 'SRR3053545', 'SRR1620949', 'SRR1620947'
]

#all downloads and outputs go under this directory
workingDir = "maize_out"
if not pu.check_paths_exist(workingDir):
    pu.mkdir(workingDir)

#download the maize B73 reference genome fasta if not already present
GENOME = workingDir + "/Zm-B73-REFERENCE-NAM-5.0.fa"
if not pu.check_files_exist(GENOME):
    print("Downloading genome fasta file")
    wget = "wget https://download.maizegdb.org/Zm-B73-REFERENCE-NAM-5.0/Zm-B73-REFERENCE-NAM-5.0.fa.gz -q -O " + GENOME + ".gz"
    pe.execute_command(wget.split(), verbose=True, logs=False)
    pe.execute_command(['gunzip', GENOME + ".gz"], verbose=True, logs=False)

#download fastq files for each run; keep only the runs that succeeded
sraObjects = []
for x in maizeRun:
    thisSraOb = sra.SRA(x, workingDir)
    if thisSraOb.download_fastq():
        sraObjects.append(thisSraOb)
    else:
        print("Download failed:" + x)

print("Following runs downloaded:")
for ob in sraObjects:
    print(ob.srr_accession)
def perform_qc(self, sra_object, out_dir="", out_suffix="_trimgalore", verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Function to perform qc using trimgalore.
    The function perform_qc() is consistent for all QC classess.

    Parameters
    ----------
    sra_object: SRA
        An SRA object whose fastq files will be used
    out_suffix: string
        Suffix for the output sam file
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.
    kwargs: dict
        Options to pass to trimgalore. This will override the existing options

    :return: Returns the path of fastq files after QC. tuple has one item
        for single end files and two for paired.
    :rtype: tuple
    """
    # Default to the SRA object's own location; otherwise make sure the
    # requested output directory exists.
    if not out_dir:
        out_dir = sra_object.location
    else:
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

    if sra_object.layout == 'PAIRED':
        fq1 = sra_object.localfastq1Path
        fq2 = sra_object.localfastq2Path
        base1 = pu.get_file_basename(fq1)
        base2 = pu.get_file_basename(fq2)
        target1 = os.path.join(out_dir, base1 + out_suffix + ".fastq")
        target2 = os.path.join(out_dir, base2 + out_suffix + ".fastq")
        # Internal options override anything supplied via kwargs; input
        # fastq files are passed as positional args under the '--' key.
        opts = {**kwargs, **{"--paired": "", "--": (fq1, fq2), "-o": out_dir}}
        self.run_trimgalore(verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **opts)
        # trim galore writes <input>_val_1.fq and <input>_val_2.fq; move
        # them to the requested target names.
        pe.move_file(os.path.join(out_dir, base1 + "_val_1.fq"), target1)
        pe.move_file(os.path.join(out_dir, base2 + "_val_2.fq"), target2)
        if not pu.check_files_exist(target1, target2):
            print("Trimgalore failed")
            return ("", )
        return target1, target2

    # Single-end layout.
    fq = sra_object.localfastqPath
    base = pu.get_file_basename(fq)
    target = os.path.join(out_dir, base + out_suffix + ".fastq")
    # The single fastq goes in as the one positional argument "--":(fq,).
    opts = {**kwargs, **{"--": (fq, ), "-o": out_dir}}
    self.run_trimgalore(verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **opts)
    # trim galore writes <input>_trimmed.fq; move it to the target name.
    pe.move_file(os.path.join(out_dir, base + "_trimmed.fq"), target)
    if not pu.check_files_exist(target):
        print("Trimgalore failed")
        return ("", )
    return (target, )
def perform_qc(self, sra_object, out_dir="", out_suffix="_bbduk", overwrite=True, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Run bbduk on fastq files specified by the sra_object

    Parameters
    ----------
    sra_object: SRA
        an SRA object
    out_suffix: string
        Suffix for output file name
    overwrite: bool
        overwrite existing files
    verbose (bool): Print stdout and std error
    quiet (bool): Print nothing
    logs (bool): Log this command to pyrpipe logs
    objectid (str): Provide an id to attach with this command e.g. the SRR
        accession. This is useful for debugging, benchmarking and reports.
    kwargs: dict
        options passed to bbduk

    Returns
    -------
    tuple
        Returns the path of fastq files after QC. tuple has one item for
        single end files and 2 for paired.
    """
    # Resolve the output directory, creating it on demand.
    if not out_dir:
        out_dir = sra_object.location
    elif not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)

    if sra_object.layout == 'PAIRED':
        fq1 = sra_object.localfastq1Path
        fq2 = sra_object.localfastq2Path
        clean1 = os.path.join(out_dir, pu.get_file_basename(fq1) + out_suffix + ".fastq")
        clean2 = os.path.join(out_dir, pu.get_file_basename(fq2) + out_suffix + ".fastq")
        # Internal i/o options override caller-supplied kwargs.
        opts = {**kwargs, **{"in": fq1, "in2": fq2, "out": clean1, "out2": clean2}}
        ok = self.run_bbduk(verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **opts)
        if ok and pu.check_files_exist(clean1, clean2):
            return (clean1, clean2)
        return ("", )

    # Single-end layout.
    fq = sra_object.localfastqPath
    clean = os.path.join(out_dir, pu.get_file_basename(fq) + out_suffix + ".fastq")
    opts = {**kwargs, **{"in": fq, "out": clean}}
    ok = self.run_bbduk(verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **opts)
    if ok and pu.check_files_exist(clean):
        return (clean, )
    return ("", )