def perform_quant(self, sra_object, out_suffix="", out_dir="", objectid="NA"):
    """Quantify the reads of an SRA object with ``kallisto quant``.

    Parameters
    ----------
    sra_object: SRA
        SRA object containing the paths to the fastq files.
    out_suffix: str
        Text inserted between ``abundance`` and ``.tsv`` in the final file name.
    out_dir: str
        Path to the output directory; created if it does not exist. When
        empty, ``<sra_object.directory>/kallisto_out`` is used.
    objectid: str
        Accepted for interface compatibility. This method always tags the
        command with ``sra_object.srr_accession``.

    :return: Path to the final abundance file, or "" if the run failed or the
             final file is missing
    :rtype: string
    """
    if out_dir:
        # only a user supplied directory is created here
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
    else:
        out_dir = os.path.join(sra_object.directory, "kallisto_out")

    # reads are passed positionally; single end data additionally needs --single
    if sra_object.layout == 'PAIRED':
        reads = (sra_object.fastq_path, sra_object.fastq2_path)
        quant_opts = {"-o": out_dir, "-i": self.index}
    else:
        reads = (sra_object.fastq_path,)
        quant_opts = {"-o": out_dir, "--single": "", "-i": self.index}

    # file written by kallisto and the name it is renamed to afterwards
    produced = os.path.join(out_dir, "abundance.tsv")
    final_path = os.path.join(out_dir, "abundance" + out_suffix + ".tsv")

    # nothing to do when the final file is already there (unless forced)
    if not _force and pu.check_files_exist(final_path):
        pu.print_green('Target files {} already exist.'.format(final_path))
        return final_path

    succeeded = self.run(*reads,
                         subcommand='quant',
                         objectid=sra_object.srr_accession,
                         target=produced,
                         **quant_opts)
    if not succeeded:
        return ""

    # in a dry run no file is produced, so there is nothing to rename or verify
    if not _dryrun:
        pe.move_file(produced, final_path)
        if not pu.check_files_exist(final_path):
            return ""
    return final_path
def perform_quant(self, sra_object, out_suffix="", out_dir="", objectid="NA"):
    """Quantify the reads of an SRA object with ``salmon quant``.

    Parameters
    ----------
    sra_object: SRA
        An SRA object with valid fastq files.
    out_suffix: str
        Text inserted between ``quant`` and ``.sf`` in the final file name.
    out_dir: str
        Path to the output directory; created if it does not exist. When
        empty, ``<sra_object.directory>/salmon_out`` is used.
    objectid: str
        Accepted for interface compatibility. This method always tags the
        command with ``sra_object.srr_accession``.

    :return: Path to the final salmon quant file, or "" if the run failed or
             the final file is missing
    :rtype: string
    """
    if out_dir:
        # only a user supplied directory is created here
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
    else:
        out_dir = os.path.join(sra_object.directory, "salmon_out")

    # library type "A" lets salmon detect the library type automatically
    if sra_object.layout == 'PAIRED':
        internal_kwargs = {"-o": out_dir,
                           "-l": "A",
                           "-1": sra_object.fastq_path,
                           "-2": sra_object.fastq2_path,
                           "-i": self.index}
    else:
        internal_kwargs = {"-o": out_dir,
                           "-l": "A",
                           "-r": sra_object.fastq_path,
                           "-i": self.index}

    # file written by salmon and the name it is renamed to afterwards
    outfile = os.path.join(out_dir, "quant.sf")
    newfile = os.path.join(out_dir, "quant" + out_suffix + ".sf")

    # nothing to do when the final file is already there (unless forced)
    if not _force and pu.check_files_exist(newfile):
        pu.print_green('Target files {} already exist.'.format(newfile))
        return newfile

    # The target handed to run() must be the file salmon itself creates
    # (quant.sf). The suffixed name only exists after the move below, so
    # passing it as target would point at a file the command never writes.
    status = self.run(None,
                      subcommand='quant',
                      objectid=sra_object.srr_accession,
                      target=outfile,
                      **internal_kwargs)
    if not status:
        return ""

    # in a dry run no file is produced, so there is nothing to rename or verify
    if not _dryrun:
        pe.move_file(outfile, newfile)
        if not pu.check_files_exist(newfile):
            return ""
    return newfile
def run_transdecoder_predict(self, infasta, longorfs_dir, out_dir=None, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Run ``TransDecoder.Predict`` and optionally collect its output files.

    Parameters
    ----------
    infasta: str
        Path to the input fasta file (passed as ``-t``).
    longorfs_dir: str
        Path to an existing directory, passed as ``-O``.
    out_dir: str
        Directory to move the ``.bed``, ``.cds``, ``.gff3`` and ``.pep``
        result files to; created if missing. When not given the files are
        left where TransDecoder wrote them and the current working directory
        is returned.
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
    kwargs: dict
        Additional options for TransDecoder.Predict. These take precedence
        over the ``-t`` and ``-O`` values built by this method.

    :return: The output directory, or "" if the inputs are missing or the
             command failed
    :rtype: string
    """
    # Bail out on bad inputs instead of launching a command that cannot succeed.
    if not pu.check_files_exist(infasta):
        pu.print_boldred("Please check input file:" + infasta)
        return ""
    if not pu.check_paths_exist(longorfs_dir):
        pu.print_boldred("Path {} doesn't exist".format(longorfs_dir))
        return ""

    # results are only relocated when the caller asked for a specific directory
    move_flag = bool(out_dir)
    if not out_dir:
        out_dir = os.getcwd()
    if not pu.check_paths_exist(out_dir):
        pu.mkdir(out_dir)

    merged_opts = {"-t": infasta, "-O": longorfs_dir, **kwargs}

    # execute Predict
    status = self.run_transdecoder('TransDecoder.Predict',
                                   verbose=verbose,
                                   quiet=quiet,
                                   logs=logs,
                                   objectid=objectid,
                                   **merged_opts)
    if not status:
        pu.print_boldred("Transdecoder failed")
        return ""

    if move_flag:
        # result files are referenced by bare name, i.e. relative to the
        # current working directory
        outfile_prefix = pu.get_filename(infasta) + ".transdecoder"
        for extension in (".bed", ".cds", ".gff3", ".pep"):
            result_name = outfile_prefix + extension
            pe.move_file(result_name, os.path.join(out_dir, result_name), verbose)

    return out_dir
def perform_assembly(self, bam_file, out_dir="", out_suffix="_cufflinks", reference_gtf=None, threads=None, overwrite=True, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Function to run cufflinks with BAM file as input.

    Parameters
    ----------
    bam_file: string
        path to bam file
    out_dir: string
        Output directory; created if missing. Defaults to the directory of
        ``bam_file``.
    out_suffix: string
        Suffix for the output gtf file
    reference_gtf: str
        Path to reference gtf (passed as ``-g``)
    threads: int
        Number of threads to use; falls back to ``self.threads``
    overwrite: bool
        When False and the output GTF already exists, cufflinks is not run
        and the existing path is returned.
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
    kwargs: dict
        Options to pass to cufflinks. These take precedence over the options
        built by this method.

    :return: Returns the path to output GTF file, or "" on failure
    :rtype: string
    """
    # create path to output file
    fname = pu.get_file_basename(bam_file)
    if out_dir:
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
    else:
        out_dir = pu.get_file_directory(bam_file)
    out_gtf_file = os.path.join(out_dir, fname + out_suffix + ".gtf")

    # honour overwrite=False: reuse an existing result
    if not overwrite and os.path.isfile(out_gtf_file):
        print("The file "+out_gtf_file+" already exists. Exiting..")
        return out_gtf_file

    if not threads:
        threads = self.threads

    # output directory, input bam (positional, under "--") and thread count
    new_opts = {"-o": out_dir, "--": (bam_file,), "-p": str(threads)}

    # add ref gtf
    if reference_gtf:
        if not pu.check_files_exist(reference_gtf):
            pu.print_boldred("Error: Provided reference GTF {} doesn't exist. Exiting...".format(reference_gtf))
            return ""
        new_opts["-g"] = reference_gtf

    merged_opts = {**new_opts, **kwargs}

    # call cufflinks
    status = self.run_cufflinks(verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **merged_opts)
    if not status:
        # A failed run must not be reported as success just because a GTF
        # from an earlier run is still present at the output path.
        return ""

    # move out_dir/transcripts.gtf to the suffixed output file
    pe.move_file(os.path.join(out_dir, "transcripts.gtf"), out_gtf_file)

    # only report the file if it really exists after the move
    if pu.check_files_exist(out_gtf_file):
        return out_gtf_file
    return ""
def perform_qc(self, sra_object, out_dir="", out_suffix="_trimgalore", objectid="NA"):
    """Function to perform qc using trimgalore.

    The function perform_qc() is consistent for all QC classes.

    Parameters
    ----------
    sra_object: SRA
        An SRA object whose fastq files will be used
    out_dir: str
        Path to output directory; created if missing. Defaults to
        ``sra_object.directory``.
    out_suffix: string
        Suffix for the output fastq files
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.

    :return: Paths of the fastq files after QC: a one item tuple for single
             end data and two items for paired data. If the command fails
             ``("",)`` is returned; if the renamed files are missing after a
             successful command the plain string ``""`` is returned.
    :rtype: tuple
    """
    if out_dir:
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
    else:
        out_dir = sra_object.directory

    if sra_object.layout != 'PAIRED':
        # ---- single end ----
        reads = sra_object.fastq_path
        trim_opts = {"-o": out_dir}

        # trim galore writes <input>_trimmed.fq; it is renamed to the target below
        trimmed = os.path.join(out_dir, pu.get_file_basename(reads) + "_trimmed.fq")
        final_fq = os.path.join(out_dir, pu.get_file_basename(reads) + out_suffix + ".fastq")

        # nothing to do when the final file is already there (unless forced)
        if not _force and pu.check_files_exist(final_fq):
            pu.print_green('Target files {} already exist.'.format(final_fq))
            return (final_fq,)

        if not self.run(reads, objectid=objectid, target=trimmed, **trim_opts):
            return ("",)

        # in a dry run no file is produced, so nothing is renamed or verified
        if not _dryrun:
            pe.move_file(trimmed, final_fq)
            if not pu.check_files_exist(final_fq):
                return ""
        return (final_fq,)

    # ---- paired end ----
    mate1 = sra_object.fastq_path
    mate2 = sra_object.fastq2_path
    trim_opts = {"--paired": "", "-o": out_dir}

    # trim galore writes <input>_val_1.fq and <input>_val_2.fq;
    # these are renamed to the targets below
    trimmed1 = os.path.join(out_dir, pu.get_file_basename(mate1) + "_val_1.fq")
    trimmed2 = os.path.join(out_dir, pu.get_file_basename(mate2) + "_val_2.fq")
    final_fq1 = os.path.join(out_dir, pu.get_file_basename(mate1) + out_suffix + ".fastq")
    final_fq2 = os.path.join(out_dir, pu.get_file_basename(mate2) + out_suffix + ".fastq")

    # nothing to do when the final files are already there (unless forced)
    if not _force and pu.check_files_exist(final_fq1, final_fq2):
        pu.print_green('Target files {}, {} already exist.'.format(final_fq1, final_fq2))
        return final_fq1, final_fq2

    if not self.run(mate1, mate2, objectid=objectid, target=[trimmed1, trimmed2], **trim_opts):
        return ("",)

    # in a dry run no file is produced, so nothing is renamed or verified
    if not _dryrun:
        pe.move_file(trimmed1, final_fq1, verbose=False)
        pe.move_file(trimmed2, final_fq2, verbose=False)
        if not pu.check_files_exist(final_fq1, final_fq2):
            return ""
    return final_fq1, final_fq2
def perform_qc(self, sra_object, out_dir="", out_suffix="_trimgalore", verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Function to perform qc using trimgalore.

    The function perform_qc() is consistent for all QC classes.

    Parameters
    ----------
    sra_object: SRA
        An SRA object whose fastq files will be used
    out_dir: str
        Path to output directory; created if missing. Defaults to
        ``sra_object.location``.
    out_suffix: string
        Suffix for the output fastq files
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
        This is useful for debugging, benchmarking and reports.
    kwargs: dict
        Options to pass to trimgalore. The input files, ``-o`` and (for
        paired data) ``--paired`` set by this method take precedence over
        entries given here.

    :return: Returns the path of fastq files after QC. tuple has one item for
             single end files and two for paired. ``("",)`` is returned when
             the expected files are missing.
    :rtype: tuple
    """
    if out_dir:
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
    else:
        out_dir = sra_object.location

    if sra_object.layout != 'PAIRED':
        # ---- single end ----
        reads = sra_object.localfastqPath
        final_fq = os.path.join(out_dir, pu.get_file_basename(reads) + out_suffix + ".fastq")

        # input files are handed over as a tuple under the "--" key
        trim_opts = {**kwargs, "--": (reads, ), "-o": out_dir}
        self.run_trimgalore(verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **trim_opts)

        # trim galore writes <input>_trimmed.fq; rename it to the target name
        trimmed = os.path.join(out_dir, pu.get_file_basename(reads) + "_trimmed.fq")
        pe.move_file(trimmed, final_fq)

        if pu.check_files_exist(final_fq):
            return (final_fq, )
        print("Trimgalore failed")
        return ("", )

    # ---- paired end ----
    mate1 = sra_object.localfastq1Path
    mate2 = sra_object.localfastq2Path
    final_fq1 = os.path.join(out_dir, pu.get_file_basename(mate1) + out_suffix + ".fastq")
    final_fq2 = os.path.join(out_dir, pu.get_file_basename(mate2) + out_suffix + ".fastq")

    trim_opts = {**kwargs, "--paired": "", "--": (mate1, mate2), "-o": out_dir}
    self.run_trimgalore(verbose=verbose, quiet=quiet, logs=logs, objectid=objectid, **trim_opts)

    # trim galore writes <input>_val_1.fq and <input>_val_2.fq;
    # rename them to the target names
    trimmed1 = os.path.join(out_dir, pu.get_file_basename(mate1) + "_val_1.fq")
    trimmed2 = os.path.join(out_dir, pu.get_file_basename(mate2) + "_val_2.fq")
    pe.move_file(trimmed1, final_fq1)
    pe.move_file(trimmed2, final_fq2)

    if pu.check_files_exist(final_fq1, final_fq2):
        return final_fq1, final_fq2
    print("Trimgalore failed")
    return ("", )
def download_sra(self, **kwargs):
    """Download the .sra file of this object with the prefetch command.

    NCBI sra-toolkit 2.9 or higher must be installed on the system in order to
    use prefetch. The expected location of the file,
    ``<self.directory>/<srr_accession>.sra``, is stored in ``self.sra_path``.
    If prefetch placed the file in a sub-folder named after the accession, it
    is moved to that location. On success the file size is stored in
    ``self.sraFileSize`` and ``self.layout`` is set to "PAIRED" or "SINGLE".

    Parameters
    ----------
    kwargs: dict
        dict containing additional prefetch arguments. ``-O`` and ``-o`` are
        ignored because directory and file name are fixed by this object.

    :return: Return status of the prefetch command. True if successful
             download (or file already present, or dry run) and False if failed.
    :rtype: bool

    Examples
    --------
    >>> object.download_sra()
    True
    """

    def _record_size_and_layout():
        # save .sra file size and detect whether the run is paired or single end
        self.sraFileSize = pu.get_file_size(self.sra_path)
        self.layout = "PAIRED" if pe.is_paired(self.sra_path) else "SINGLE"

    # store path to the downloaded sra file
    self.sra_path = os.path.join(self.directory, self.srr_accession + ".sra")

    # reuse a file that is already on disk
    if pu.check_files_exist(self.sra_path):
        _record_size_and_layout()
        return True

    # options recognised when scanning kwargs for prefetch arguments
    known_prefetch_opts = ['-f', '-t', '-l', '-n', '-s', '-R', '-N', '-X', '-o', '-a',
                           '--ascp-options', '-p', '--eliminate-quals', '-c', '-o',
                           '-O', '-h', '-V', '-L', '-v', '-q']

    # ignore directory and file name arguments if given
    if '-O' in kwargs:
        print("Ignoring -O flag." + " directory is: " + self.directory)
        del kwargs['-O']
    if '-o' in kwargs:
        print("Ignoring -o flag." + " File name is: " + self.srr_accession)
        del kwargs['-o']

    command = ['prefetch']
    command.extend(pu.parse_unix_args(known_prefetch_opts, kwargs))
    command.extend(['-O', self.directory])
    command.append(self.srr_accession)

    succeeded = pe.execute_command(command, objectid=self.srr_accession)

    # nothing was downloaded in a dry run, so there is nothing to verify
    if _dryrun:
        return True

    if not succeeded:
        pu.print_boldred("prefetch failed for:" + self.srr_accession)
        return False

    # prefetch may have put the file into <directory>/<SRR>/<SRR>.sra; move it up
    if not pu.check_files_exist(self.sra_path):
        nested_path = os.path.join(self.directory, self.srr_accession, self.srr_accession + ".sra")
        if pu.check_files_exist(nested_path):
            pe.move_file(nested_path, self.sra_path)

    # validate path exists
    if not pu.check_files_exist(self.sra_path):
        pu.print_boldred("Error downloading file. File " + self.sra_path + " does not exist!!!\n Please check you SRA-Tools config")
        return False

    _record_size_and_layout()
    return True

# Disabled stub kept from the original source:
# def destroy(self):
#     """Delete everything for this object from memory and disk"""
#     pass
def perform_assembly(self, bam_file, out_dir="", out_suffix="_cufflinks", overwrite=True, verbose=False, quiet=False, logs=True, objectid="NA", **kwargs):
    """Function to run cufflinks with BAM file as input.

    Parameters
    ----------
    bam_file: string
        path to bam file
    out_dir: string
        Output directory; created if missing. Defaults to the directory of
        ``bam_file``.
    out_suffix: string
        Suffix for the output gtf file
    overwrite: bool
        When False and the output GTF already exists, cufflinks is not run
        and the existing path is returned.
    verbose: bool
        Print stdout and std error
    quiet: bool
        Print nothing
    logs: bool
        Log this command to pyrpipe logs
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.
    kwargs: dict
        Options to pass to cufflinks. The ``-o`` and input file entries set
        by this method take precedence over entries given here.

    :return: Returns the path to output GTF file, or "" on failure
    :rtype: string
    """
    # create path to output file
    fname = pu.get_file_basename(bam_file)
    if out_dir:
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
    else:
        out_dir = pu.get_file_directory(bam_file)
    out_gtf_file = os.path.join(out_dir, fname + out_suffix + ".gtf")

    # honour overwrite=False: reuse an existing result
    if not overwrite and os.path.isfile(out_gtf_file):
        print("The file " + out_gtf_file + " already exists. Exiting..")
        return out_gtf_file

    # output directory and input bam (positional, under "--")
    new_opts = {"-o": out_dir, "--": (bam_file, )}
    merged_opts = {**kwargs, **new_opts}

    # call cufflinks
    status = self.run_cufflinks(verbose, quiet, logs, objectid, **merged_opts)
    if not status:
        # A failed run must not be reported as success just because a GTF
        # from an earlier run is still present at the output path.
        return ""

    # move out_dir/transcripts.gtf to the suffixed output file
    pe.move_file(os.path.join(out_dir, "transcripts.gtf"), out_gtf_file)

    # only report the file if it really exists after the move
    if pu.check_files_exist(out_gtf_file):
        return out_gtf_file
    return ""
def perform_assembly(self, bam_file, out_dir=None, out_suffix="_cufflinks", objectid="NA"):
    """Function to run cufflinks with BAM file as input.

    Parameters
    ----------
    bam_file: string
        path to bam file
    out_dir: string
        Output directory; created if missing. Defaults to the directory of
        ``bam_file``.
    out_suffix: string
        Suffix for the output gtf file
    objectid: str
        Provide an id to attach with this command e.g. the SRR accession.

    :return: Returns the path to output GTF file, or "" if the run failed or
             the renamed file is missing
    :rtype: string
    """
    # base name of the bam is reused for the output file name
    stem = pu.get_file_basename(bam_file)

    if out_dir:
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
    else:
        out_dir = pu.get_file_directory(bam_file)

    # output directory plus the input bam as positional argument (under "--")
    cufflinks_opts = {"-o": out_dir, "--": (bam_file, )}

    # file written by cufflinks and the name it is renamed to afterwards
    raw_gtf = os.path.join(out_dir, "transcripts.gtf")
    final_gtf = os.path.join(out_dir, stem + out_suffix + ".gtf")

    # nothing to do when the final file is already there (unless forced)
    if not _force and pu.check_files_exist(final_gtf):
        pu.print_green('Target files {} already exist.'.format(final_gtf))
        return final_gtf

    # call cufflinks
    if not self.run(None, objectid=objectid, target=raw_gtf, **cufflinks_opts):
        return ""

    # in a dry run no file is produced, so nothing is renamed or verified
    if not _dryrun:
        pe.move_file(raw_gtf, final_gtf)
        if not pu.check_files_exist(final_gtf):
            return ""
    return final_gtf
def perform_alignment(self, sra_object, out_suffix="_star", out_dir="", objectid="NA"):
    """Function to perform STAR alignment using sra_object.

    Parameters
    ----------
    sra_object: SRA
        An object of type SRA. The path to fastq files will be obtained from
        this object.
    out_suffix: string
        Suffix for the output bam file
    out_dir: string
        Directory to save the results; created if missing. Default value is
        sra_object.directory
    objectid: str
        Accepted for interface compatibility. This method always tags the
        command with ``sra_object.srr_accession``.

    Notes
    -----
    If ``--outSAMtype`` is not present in ``self._kwargs`` it is set there to
    ``'BAM SortedByCoordinate'``, so the default persists on this object.

    :return: Returns the path to output bam, or "" if the run failed or the
             renamed file is missing
    :rtype: string
    """
    if out_dir:
        # create out_dir if not exists
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
    else:
        out_dir = sra_object.directory

    # find layout and fq file paths
    if sra_object.layout == 'PAIRED':
        internal_kwargs = {
            "--readFilesIn": sra_object.fastq_path + " " + sra_object.fastq2_path
        }
    else:
        internal_kwargs = {"--readFilesIn": sra_object.fastq_path}

    # STAR treats the prefix literally, hence the trailing separator
    internal_kwargs["--outFileNamePrefix"] = out_dir + "/"

    # if outSAMtype is not specified make it a coordinate sorted bam by default
    if '--outSAMtype' not in self._kwargs:
        self._kwargs['--outSAMtype'] = 'BAM SortedByCoordinate'

    # expected output: sorted bam when requested, unsorted bam otherwise
    if 'SortedByCoordinate' in self._kwargs['--outSAMtype']:
        bam = os.path.join(out_dir, 'Aligned.sortedByCoord.out.bam')
    else:
        bam = os.path.join(out_dir, 'Aligned.out.bam')

    # Strip only the trailing ".bam". Splitting the whole path on ".bam"
    # would cut it at the first occurrence, which breaks when a directory
    # name contains ".bam".
    finalbam = os.path.splitext(bam)[0] + out_suffix + '.bam'

    # check if final bam already exists
    if not _force and pu.check_files_exist(finalbam):
        pu.print_green('Target files {} already exist.'.format(finalbam))
        return finalbam

    # call star
    status = self.run(None,
                      objectid=sra_object.srr_accession,
                      target=bam,
                      **internal_kwargs)
    if not status:
        return ""

    # in a dry run no file is produced, so nothing is renamed or verified
    if not _dryrun:
        pe.move_file(bam, finalbam)
        if not pu.check_files_exist(finalbam):
            return ""
    return finalbam