示例#1
0
def runtest():
    """Smoke-test ``sra.SRA`` with one paired-end and one single-end accession.

    For each accession an ``sra.SRA`` object is created under
    ``./pyrpipe_sratest``, the presence of its fastq files is checked, and the
    fastq files plus the per-accession directory are removed again.

    Failures are accumulated across both cases, so the "All Tests Passed"
    banner is printed only when every case succeeded. The shared test
    directory is removed in either case.
    """
    test_dir = './pyrpipe_sratest'
    # (accession, label used in the per-case failure message)
    cases = (
        ('ERR726985', 'Paired end'),
        ('SRR2134545', 'Single end'),
    )

    any_failed = False
    for accession, label in cases:
        sraob = sra.SRA(accession, directory=test_dir)
        case_failed = not sraob.fastq_exists()
        if case_failed:
            pu.print_boldred('Test failed')

        # always clean up, even when the check failed
        pu.print_notification('Cleaning up...')
        sraob.delete_fastq()
        os.rmdir(sraob.directory)

        if case_failed:
            pu.print_boldred(label + ' test failed')
            any_failed = True

    if not any_failed:
        pu.print_green(
            '\n#####################All Tests Passed#####################\n')
    os.rmdir(test_dir)
示例#2
0
    def search_sra(self, path):
        """Look for a ``.sra`` file under ``path`` and record it on this object.

        When at least one match is found, the first entry returned by
        ``pe.find_files`` is used to set ``srr_accession``,
        ``localSRAFilePath``, ``sraFileSize`` and ``layout``.

        :return: True if a .sra file was found, otherwise False
        :rtype: bool
        """
        candidates = pe.find_files(path, "*.sra")
        n_found = len(candidates)

        # nothing to register
        if n_found < 1:
            return False

        if n_found > 1:
            pu.print_boldred(
                "Found multiple .sra files. Using the first entry...")

        chosen = candidates[0]
        self.srr_accession = pu.get_file_basename(chosen)
        self.localSRAFilePath = chosen
        self.sraFileSize = pu.get_file_size(self.localSRAFilePath)

        # layout is decided by inspecting the .sra file itself
        paired = pe.is_paired(self.localSRAFilePath)
        self.layout = "PAIRED" if paired else "SINGLE"

        pu.print_green("Found .sra " + self.localSRAFilePath)
        return True
示例#3
0
    def search_fastq(self, path):
        """Look for ``.fastq`` files under ``path`` and record them on this object.

        A single file is treated as single-end data and stored in
        ``localfastqPath``. Two files are treated as a pair and stored, in
        sorted order, in ``localfastq1Path`` and ``localfastq2Path``.
        ``layout`` is set to "SINGLE" or "PAIRED" accordingly.

        :return: True if one or two fastq files were found; False when none
            or more than two were found
        :rtype: bool
        """
        found = pe.find_files(path, "*.fastq")
        count = len(found)

        if count < 1:
            return False
        if count > 2:
            # more than a pair is ambiguous
            pu.print_boldred("Can not determine .fastq. Exiting...")
            return False

        found.sort()
        if count == 1:
            self.localfastqPath = found[0]
            pu.print_green("Found .fastq " + self.localfastqPath)
            self.layout = "SINGLE"
        else:
            self.localfastq1Path, self.localfastq2Path = found
            pu.print_green("Found .fastq " + self.localfastq1Path + " " +
                           self.localfastq2Path)
            self.layout = "PAIRED"

        return True
示例#4
0
    def createMikadoGTFlist(self,
                            out_file,
                            out_dir,
                            searchPath,
                            searchQuery="*.gtf",
                            strand=False):
        """Create a list file to be used by mikado configure.

        Files matching ``searchQuery`` under ``searchPath`` are written one
        per line as ``<path>\\t<basename>\\t<strand>`` to
        ``<out_dir>/<out_file>.txt``. Entries whose basename is empty are
        skipped. ``out_dir`` is created if it does not exist.

        out_file: str
            name of the list file, without the ".txt" extension
        out_dir: str
            directory the list file is written to
        searchPath: str
            path searched for input files
        searchQuery: str
            pattern passed to ``pe.find_files``. Default: "*.gtf"
        strand: bool
            value written (as text) in the third column. Default: False

        :return: path to the written list file
        :rtype: str
        """
        files = pe.find_files(searchPath, searchQuery)

        # create out dir
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
        outFilePath = os.path.join(out_dir, out_file + ".txt")

        gtfs = []
        for gtf_path in files:
            label = pu.get_file_basename(gtf_path)
            if label:
                gtfs.append("\t".join([gtf_path, label, str(strand)]))

        # context manager guarantees the handle is closed even if write fails
        with open(outFilePath, "w") as handle:
            handle.write("\n".join(gtfs))

        pu.print_green("Mikado list file written to:" + outFilePath)
        return outFilePath
示例#5
0
    def build_index(self,
                    in_fasta,
                    dbname,
                    out_dir=None,
                    threads=None,
                    verbose=False,
                    quiet=False,
                    logs=True,
                    objectid="NA",
                    **kwargs):
        """Build a diamond database via ``makedb`` and store its path in self.index.

        in_fasta: str
            input fasta file; must exist
        dbname: str
            name of the database, joined to ``out_dir``
        out_dir: str
            output directory; defaults to the current working directory and
            is created when missing
        threads: int
            number of threads; falls back to ``self.threads`` when not given
        verbose, quiet, logs, objectid:
            forwarded unchanged to ``self.run_diamond``
        kwargs: dict
            extra diamond options; these override the ``--in``, ``-d`` and
            ``--threads`` values assembled here

        :return: True if the index already exists or was built, otherwise False
        :rtype: bool
        """
        # the input must be present before anything else happens
        if not pu.check_files_exist(in_fasta):
            pu.print_boldred(
                "Input fasta: {} not found...\n diamond makedb failed".format(
                    in_fasta))
            return False

        # fall back to the current working directory
        out_dir = out_dir or os.getcwd()
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

        index_path = os.path.join(out_dir, dbname)
        # self.index is set before check_index() is called
        # NOTE(review): presumably check_index() inspects self.index - confirm
        self.index = index_path
        if self.check_index():
            pu.print_green("Diamond index: {} exists, using it...".format(
                self.index))
            self.index = index_path
            return True

        n_threads = threads if threads else self.threads

        diamond_opts = {
            "--in": in_fasta,
            "-d": index_path,
            "--threads": str(n_threads)
        }
        # caller-supplied options take precedence over the defaults above
        diamond_opts.update(kwargs)

        built = self.run_diamond("makedb",
                                 verbose=verbose,
                                 quiet=quiet,
                                 logs=logs,
                                 objectid=objectid,
                                 **diamond_opts)
        if not built:
            return False

        self.index = index_path
        return True
示例#6
0
def generateBenchmarkReport(logFile,envLog,filterList,tempDir,outFile="",verbose=False):
    """Generate per-object and per-program timing plots from pyrpipe logs.

    A ``bm.Benchmark`` object is built from ``logFile`` and ``envLog`` with
    ``tempDir`` as its output directory, and both plotting methods are run.

    ``filterList``, ``outFile`` and ``verbose`` are accepted but not used by
    this function.

    NOTE(review): the previous docstring stated that failed commands
    (exit code != 0) are ignored; that filtering is not visible here and
    presumably happens inside ``bm.Benchmark`` - confirm.
    """
    benchmark=bm.Benchmark(logFile,envLog,out_dir=tempDir)

    # produce the two benchmark plots
    benchmark.plot_time_perobject()
    benchmark.plot_time_perprogram()

    message="Benchmark report saved to:"+tempDir+"/benchmark_reports"
    pu.print_green(message)
示例#7
0
 def perform_quant(self,sra_object,out_suffix="",out_dir="",objectid="NA"):
     """Quantify the reads of an SRA object with ``kallisto quant``.

     sra_object: SRA
         SRA object providing ``layout``, ``fastq_path`` (and ``fastq2_path``
         for paired data), ``directory`` and ``srr_accession``
     out_suffix: str
         suffix inserted into the final file name: abundance<out_suffix>.tsv
     out_dir: str
         output directory; created when given and missing. Defaults to
         <sra_object.directory>/kallisto_out
     objectid: str
         accepted but not used here; the command is always tagged with
         ``sra_object.srr_accession``

     :return: Path to the (renamed) abundance file, or "" on failure
     :rtype: string
     """
     if out_dir:
         # only a user-supplied directory is created here
         if not pu.check_paths_exist(out_dir):
             pu.mkdir(out_dir)
     else:
         out_dir=os.path.join(sra_object.directory,"kallisto_out")

     # positional reads and options depend on the library layout
     if sra_object.layout == 'PAIRED':
         reads=(sra_object.fastq_path,sra_object.fastq2_path)
         quant_opts={"-o":out_dir,"-i":self.index}
     else:
         reads=(sra_object.fastq_path,)
         quant_opts={"-o":out_dir,"--single":"","-i":self.index}

     raw_result=os.path.join(out_dir,"abundance.tsv")
     final_result=os.path.join(out_dir,"abundance"+out_suffix+".tsv")

     # reuse an earlier result unless a rerun is forced
     if not _force and pu.check_files_exist(final_result):
         pu.print_green('Target files {} already exist.'.format(final_result))
         return final_result

     finished=self.run(*reads,subcommand='quant',objectid=sra_object.srr_accession,target=raw_result,**quant_opts)
     if not finished:
         return ""

     # in a dry run nothing was produced, so there is nothing to rename
     if _dryrun:
         return final_result

     pe.move_file(raw_result,final_result)
     if not pu.check_files_exist(final_result):
         return ""
     return final_result
示例#8
0
 def build_index(self,index_path,index_name,fasta,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs):
     """
     Build a salmon index and store the path to the index in self.

     index_path: str
         path to the output directory; created when missing
     index_name: str
         index name, joined to ``index_path``
     fasta: str
         path to the input fasta file; must exist
     verbose: bool
         Print stdout and std error
     quiet: bool
         Print nothing
     logs: bool
         Log this command to pyrpipe logs
     objectid: str
         Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
     kwargs: dict
         Options to pass to salmon. The ``-t`` and ``-i`` values built from
         ``fasta`` and ``index_path``/``index_name`` take precedence over
         any ``-t``/``-i`` given here.

     :return: True if salmon index succeeded and the index path exists, otherwise False
     :rtype: bool
     """

     #check input
     if not pu.check_files_exist(fasta):
         pu.print_boldred("{} does not exist. Exiting".format(fasta))
         return False

     #create out dir
     if not pu.check_paths_exist(index_path):
         if not pu.mkdir(index_path):
             print("ERROR in building salmon index. Failed to create index directory.")
             return False

     indexOut=os.path.join(index_path,index_name)
     newOpts={"-t":fasta,"-i":indexOut}
     #newOpts come last so they win over same-named keys in kwargs
     mergedOpts={**kwargs,**newOpts}

     #call salmon
     status=self.run_salmon("index",verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**mergedOpts)

     #the index counts as built only if the command succeeded and the index path exists
     if status and pu.check_paths_exist(indexOut):
         self.salmon_index=indexOut
         self.passedArgumentDict['-i']=self.salmon_index
         pu.print_green("salmon index is:"+self.salmon_index)
         return True

     pu.print_boldred("Failed to create salmon index")
     return False
示例#9
0
 def build_index(self,index_path,index_name,fasta,verbose=False,quiet=False,logs=True,objectid="NA",**kwargs):
     """Build a kallisto index and store the path to the index in self.

     index_path: str
         path to the output directory; created when missing
     index_name: str
         index name, joined to ``index_path``
     fasta: str
         path to the input fasta file; must exist
     verbose: bool
         Print stdout and std error
     quiet: bool
         Print nothing
     logs: bool
         Log this command to pyrpipe logs
     objectid: str
         Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
     kwargs: dict
         Options to pass to kallisto. The ``--`` and ``-i`` entries built
         here take precedence over same-named keys given in kwargs.

     :return: True if kallisto index succeeded and the index file exists, otherwise False
     :rtype: bool
     """

     #check input
     if not pu.check_files_exist(fasta):
         pu.print_boldred("{} does not exist. Exiting".format(fasta))
         return False

     #create out dir
     if not pu.check_paths_exist(index_path):
         if not pu.mkdir(index_path):
             print("ERROR in building kallisto index. Failed to create index directory.")
             return False

     indexOut=os.path.join(index_path,index_name)
     #the fasta is supplied under the "--" key
     #NOTE(review): presumably "--" marks positional arguments - confirm in run_kallisto
     newOpts={"--":(fasta,),"-i":indexOut}
     mergedOpts={**kwargs,**newOpts}

     #call kallisto
     status=self.run_kallisto("index",verbose=verbose,quiet=quiet,logs=logs,objectid=objectid,**mergedOpts)

     #success requires both a successful command and an index file on disk
     if status and pu.check_files_exist(indexOut):
         self.kallisto_index=indexOut
         self.passedArgumentDict['-i']=self.kallisto_index
         pu.print_green("kallisto_index is:"+self.kallisto_index)
         return True

     #every failure path reports the error and returns an explicit False
     pu.print_boldred("Failed to create kallisto index")
     return False
示例#10
0
 def perform_quant(self,sra_object,out_suffix="",out_dir="",objectid="NA"):
     """Run salmon quant on the fastq files of an SRA object.

     sra_object: SRA
         SRA object providing ``layout``, ``fastq_path`` (and ``fastq2_path``
         for paired data), ``directory`` and ``srr_accession``
     out_suffix: str
         suffix inserted into the final file name: quant<out_suffix>.sf
     out_dir: str
         output directory; created when given and missing. Defaults to
         <sra_object.directory>/salmon_out
     objectid: str
         accepted but not used here; the command is always tagged with
         ``sra_object.srr_accession``

     :return: Path to the (renamed) salmon quant file, or "" on failure
     :rtype: string
     """

     if not out_dir:
         out_dir=os.path.join(sra_object.directory,"salmon_out")
     else:
         #create out_dir if not exists
         if not pu.check_paths_exist(out_dir):
             pu.mkdir(out_dir)

     if sra_object.layout == 'PAIRED':
         internal_kwargs={"-o":out_dir,"-l":"A","-1":sra_object.fastq_path,"-2":sra_object.fastq2_path,"-i":self.index}
     else:
         internal_kwargs={"-o":out_dir,"-l":"A","-r":sra_object.fastq_path,"-i":self.index}

     #targets: outfile is what the command is expected to write, newfile is its final name
     outfile=os.path.join(out_dir,"quant.sf")
     newfile=os.path.join(out_dir,"quant"+out_suffix+".sf")
     #check if final files already exists
     if not _force and pu.check_files_exist(newfile):
         pu.print_green('Target files {} already exist.'.format(newfile))
         return newfile

     #call salmon; the target must be the file the command itself writes
     #(outfile). The rename to newfile happens only afterwards, so passing
     #newfile would make the target check fail whenever out_suffix is set.
     status=self.run(None,subcommand='quant',objectid=sra_object.srr_accession,target=outfile,**internal_kwargs)

     if status:
         #rename the quant file and return its new path
         if not _dryrun:
             pe.move_file(outfile,newfile)
             if not pu.check_files_exist(newfile):
                 return ""
         return newfile

     return ""
示例#11
0
    def createMikadoGTFlist(self,
                            out_file,
                            out_dir,
                            searchPath,
                            searchQuery="*.gtf",
                            strand=False):
        """Create a list file to be used by mikado configure.

        Files matching ``searchQuery`` are searched under ``searchPath``
        (with ``recursive=True``) and written one per line as
        ``<path>\\t<basename>\\t<strand>`` to ``<out_dir>/<out_file>.txt``.
        Entries whose basename is empty are skipped.

        out_file: str
            outfile name, without the ".txt" extension
        out_dir: str
            path to out_dir; created when missing
        searchPath: str
            Path where gtf/gff files will be searched
        searchQuery: str
            Query to perform search. Default: "*.gtf"
        strand: bool
            Stranded flag, written as text in the third column. Default: False

        :return: path to the written list file
        :rtype: str
        """
        files = pe.find_files(searchPath, searchQuery, recursive=True)

        # create out dir
        if not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)
        outFilePath = os.path.join(out_dir, out_file + ".txt")

        gtfs = []
        for gtf_path in files:
            label = pu.get_file_basename(gtf_path)
            if label:
                gtfs.append("\t".join([gtf_path, label, str(strand)]))

        # context manager guarantees the handle is closed even if write fails
        with open(outFilePath, "w") as handle:
            handle.write("\n".join(gtfs))

        pu.print_green("Mikado list file written to:" + outFilePath)
        return outFilePath
示例#12
0
文件: sra.py 项目: shinyfluba/pyrpipe
 def search_fastq(self,path):
     """Locate fastq files for this accession under ``path``.

     Lookup order:
       1. <srr_accession>_1.fastq and <srr_accession>_2.fastq (paired)
       2. <srr_accession>.fastq (single)
       3. any files found by ``pu.find_files(path, ".fastq$")``; exactly one
          is taken as single-end, exactly two (sorted) as a pair

     On success ``fastq_path`` (and ``fastq2_path`` for pairs) and ``layout``
     are set.

     :return: True if fastq files were found, otherwise False
     :rtype: bool
     """
     accession=self.srr_accession

     #conventional paired-end names first
     mate1=os.path.join(path,accession+'_1.fastq')
     mate2=os.path.join(path,accession+'_2.fastq')
     if pu.check_files_exist(mate1,mate2):
         self.fastq_path=mate1
         self.fastq2_path=mate2
         pu.print_green("Found .fastq "+self.fastq_path+" "+self.fastq2_path)
         self.layout="PAIRED"
         return True

     #then the conventional single-end name
     single=os.path.join(path,accession+'.fastq')
     if pu.check_files_exist(single):
         self.fastq_path=single
         pu.print_green("Found .fastq "+self.fastq_path)
         self.layout="SINGLE"
         return True

     #finally fall back to whatever fastq files are present
     hits=pu.find_files(path,".fastq$")
     n_hits=len(hits)
     if n_hits<1 or n_hits>2:
         return False

     hits.sort()
     if n_hits==1:
         self.fastq_path=hits[0]
         pu.print_green("Found .fastq "+self.fastq_path)
         self.layout="SINGLE"
     else:
         self.fastq_path,self.fastq2_path=hits
         pu.print_green("Found .fastq "+self.fastq_path+" "+self.fastq2_path)
         self.layout="PAIRED"

     return True
示例#13
0
    def init_from_accession(self, srr_accession, location):
        """Initialise this SRA object from an accession and a base directory.

        The data directory is ``<location>/<srr_accession>``; ``location``
        defaults to the current working directory when None. If
        ``<srr_accession>.sra`` already exists there, its path, size and
        layout are recorded. Finally the directory is searched for fastq
        files via ``self.search_fastq``.

        :raises Exception: if ``pe.check_dependencies`` fails for
            prefetch/fasterq-dump, or if ``srr_accession`` is None
        """
        self.dep_list = ['prefetch', "fasterq-dump"]
        if not pe.check_dependencies(self.dep_list):
            raise Exception("ERROR: Please install missing programs.")

        if srr_accession is None:
            raise Exception("Please provide a valid accession")

        base_dir = os.getcwd() if location is None else location
        self.srr_accession = srr_accession
        # every accession gets its own sub-directory
        self.location = os.path.join(base_dir, self.srr_accession)

        # pick up an already downloaded .sra file
        expected_sra = os.path.join(self.location, self.srr_accession + ".sra")
        if pu.check_files_exist(expected_sra):
            pu.print_green(self.srr_accession + ".sra exists.")
            self.localSRAFilePath = expected_sra
            self.sraFileSize = pu.get_file_size(self.localSRAFilePath)
            paired = pe.is_paired(self.localSRAFilePath)
            self.layout = "PAIRED" if paired else "SINGLE"

        # pick up already extracted fastq files
        self.search_fastq(self.location)
示例#14
0
    def build_index(self, index_path, genome, objectid="NA"):
        """Build a bowtie2 index with given parameters and saves the new index to self.index.

        An existing index at ``index_path`` is reused unless the module-level
        ``_force`` flag is set. Build options are ``--threads`` from
        ``self._threads`` plus, when present, the contents of
        ``bowtie2_index.yaml`` in the parameter directory; the ``--threads``
        value set here overrides one given in the yaml file.

        Parameters
        ----------

        index_path: string
            Path where the index will be created
        genome: string
            Path to the reference genome
        objectid : string
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.

        Raises
        ------
        ValueError
            If the genome fasta file does not exist.
        OSError
            If the index directory can not be created.

        :return: Returns the status of bowtie2-build
        :rtype: bool
        """

        #reuse an existing index unless a rebuild is forced
        if not _force and pu.check_bowtie2index(index_path):
            pu.print_green(
                "bowtie index {} already exists.".format(index_path))
            self.index = index_path
            return True

        #check input files
        if not pu.check_files_exist(genome):
            pu.print_boldred(
                "Please provide a valid input fasta file to build bowtie2 index"
            )
            raise ValueError("Please check input to bowtie2 build index")

        #options recognised by bowtie2-build
        bowtie2_build_args = [
            '-f', '-c', '--large-index', '--debug', '--sanitized', '--verbose',
            '-a', '--noauto', '-p', '--packed', '--bmax', '--bmaxdivn',
            '--dcv', '--nodc', '-r', '--noref', '-3', '--justref', '-o',
            '--offrate', '-t', '--ftabchars', '--threads', '--seed', '-q',
            '--quiet'
        ]

        #create the out dir
        indexdir = pu.get_file_directory(index_path)
        if not pu.check_paths_exist(indexdir):
            if not pu.mkdir(indexdir):
                raise OSError(
                    "Error creating bowtie2 index. Failed to create index directory."
                )

        args = (genome, index_path)
        internal_kwargs = {"--threads": self._threads}

        #read build parameters; values set above win over the yaml file
        yamlfile = os.path.join(_params_dir, 'bowtie2_index.yaml')
        if pu.check_files_exist(yamlfile):
            yaml_params = pl.YAML_loader(yamlfile)
            yaml_kwargs = yaml_params.get_kwargs()
            internal_kwargs = {**yaml_kwargs, **internal_kwargs}

        #add positional args
        internal_kwargs['--'] = args

        bowtie2Build_Cmd = ['bowtie2-build']
        #add options
        bowtie2Build_Cmd.extend(
            pu.parse_unix_args(bowtie2_build_args, internal_kwargs))

        #start execution
        status = pe.execute_command(bowtie2Build_Cmd, objectid=objectid)
        if not status:
            pu.print_boldred("bowtie2-build failed")
            return False

        #update object's index only when a real (non dry-run) build left an
        #index on disk; the command status is returned either way
        if pu.check_bowtie2index(index_path) and not _dryrun:
            self.index = index_path

        return True
示例#15
0
    def run_fasterqdump(self,
                        delete_sra=False,
                        verbose=False,
                        quiet=False,
                        logs=True,
                        **kwargs):
        """Convert the local .sra file to fastq file(s) with fasterq-dump.

        Output is written to ``self.location`` and named after
        ``self.srr_accession``; ``-O`` and ``-o`` given in kwargs are
        therefore ignored. Nothing is run when fastq files already exist.

        Parameters
        ----------

        delete_sra: bool
            delete sra file after completion
        verbose: bool
            accepted but not used by this method
        quiet: bool
            accepted but not used by this method
        logs: bool
            accepted but not used by this method
        kwargs: dict
            A dict containing fasterq-dump arguments

        :return: True if the fastq files exist already or were created,
            False if the .sra file is missing, the command failed or the
            expected fastq files are absent afterwards.
        :rtype: bool

        Examples
        --------
        >>> object.run_fasterqdump()
        True
        """
        # nothing to do when the fastq files are already there
        if self.fastqFilesExistsLocally():
            pu.print_green("Fastq files exist already")
            return True

        # a local .sra file is required as input
        if not self.sraFileExistsLocally():
            pu.print_boldred(
                "Error executing fasterq-dump: .sra file not found. Please run download_sra()."
            )
            return False

        # options passed on to pu.parse_unix_args together with kwargs
        fasterqdumpArgsList = [
            '-f', '-t', '-s', '-N', '-X', '-a', '-p', '-c', '-o', '-O', '-h',
            '-V', '-L', '-v', '-q', '-b', '-m', '-e', '-x', '-S', '-3', '-P',
            '-M', '-B', '--option-file', '--strict', '--table',
            '--include-technical', '--skip-technical', '--concatenate-reads'
        ]

        # output directory and file name are dictated by this object
        if '-O' in kwargs:
            print("Ignoring -O flag." + " location is: " + self.location)
            kwargs.pop('-O')
        if '-o' in kwargs:
            print("Ignoring -o flag." + " File name is: " + self.srr_accession)
            kwargs.pop('-o')

        # assemble: user options, output dir, output name, input file
        command = ['fasterq-dump']
        command += pu.parse_unix_args(fasterqdumpArgsList, kwargs)
        command += ['-O', self.location]
        command += ['-o', self.srr_accession + ".fastq"]
        command.append(self.localSRAFilePath)

        succeeded = pe.execute_command(command, objectid=self.srr_accession)
        if not succeeded:
            print("fasterqdump failed for:" + self.srr_accession)
            return False

        # record the expected output paths and make sure they were produced
        if self.layout == "SINGLE":
            self.localfastqPath = os.path.join(self.location,
                                               self.srr_accession + ".fastq")
            expected = (self.localfastqPath,)
        else:
            self.localfastq1Path = os.path.join(
                self.location, self.srr_accession + "_1.fastq")
            self.localfastq2Path = os.path.join(
                self.location, self.srr_accession + "_2.fastq")
            expected = (self.localfastq1Path, self.localfastq2Path)

        if not pu.check_files_exist(*expected):
            # only the first expected file is named in the message
            pu.print_boldred("Error running fasterq-dump file. File " +
                             expected[0] + " does not exist!!!")
            return False

        if delete_sra:
            self.delete_sra()

        return True
示例#16
0
    def download_sra(self, verbose=False, quiet=False, logs=True, **kwargs):
        """Download the .sra file of this accession with the prefetch command.

        The file is expected at ``<self.location>/<srr_accession>.sra``; that
        path is stored in ``self.localSRAFilePath``. When the file is already
        present no download is started. In both cases the file size and the
        layout (PAIRED/SINGLE) are recorded on the object.
        ``-O`` and ``-o`` given in kwargs are ignored because location and
        file name are fixed by this object.

        Parameters
        ----------

        verbose, quiet, logs:
            accepted but not used by this method
        kwargs: dict
            dict containing additional prefetch arguments

        :return: Return status of the prefetch command. True if successful download and False if failed.
        :rtype: bool

        Examples
        --------
        >>> object.download_sra()
        True
        """

        def record_size_and_layout():
            # store size and layout of the .sra file that is now on disk
            self.sraFileSize = pu.get_file_size(self.localSRAFilePath)
            if pe.is_paired(self.localSRAFilePath):
                self.layout = "PAIRED"
            else:
                self.layout = "SINGLE"

        # where the downloaded file is expected
        self.localSRAFilePath = os.path.join(self.location,
                                             self.srr_accession + ".sra")

        # skip the download when the file is already present
        if pu.check_files_exist(self.localSRAFilePath):
            pu.print_green("File already exists:" + self.localSRAFilePath)
            record_size_and_layout()
            return True

        pu.print_info("Downloading " + self.srr_accession + " ...")

        # options passed on to pu.parse_unix_args together with kwargs
        prefetchArgsList = [
            '-f', '-t', '-l', '-n', '-s', '-R', '-N', '-X', '-o', '-a',
            '--ascp-options', '-p', '--eliminate-quals', '-c', '-o', '-O',
            '-h', '-V', '-L', '-v', '-q'
        ]

        # location and file name are dictated by this object
        if '-O' in kwargs:
            print("Ignoring -O flag." + " location is: " + self.location)
            kwargs.pop('-O')
        if '-o' in kwargs:
            print("Ignoring -o flag." + " File name is: " + self.srr_accession)
            kwargs.pop('-o')

        command = ['prefetch']
        command += pu.parse_unix_args(prefetchArgsList, kwargs)
        command += ['-O', self.location]
        command.append(self.srr_accession)

        succeeded = pe.execute_command(command, objectid=self.srr_accession)
        if not succeeded:
            pu.print_boldred("prefetch failed for:" + self.srr_accession)
            return False

        # the command may succeed without producing the expected file
        if not pu.check_files_exist(self.localSRAFilePath):
            pu.print_boldred("Error downloading file. File " +
                             self.localSRAFilePath + " does not exist!!!")
            return False

        print("Downloaded file: " + self.localSRAFilePath +
              " {0} ".format(pu.get_file_size(self.localSRAFilePath)))
        record_size_and_layout()

        return True
示例#17
0
    def download_fastq(self,
                       verbose=False,
                       quiet=False,
                       logs=True,
                       procs=2,
                       **kwargs):
        """Obtain fastq files for this accession with fasterq-dump.

        The local .sra file is used as input when it exists, otherwise the
        accession itself is passed to fasterq-dump. Output goes to
        ``self.location``. If ``self.layout`` is not set yet, it is derived
        from the number of ``<srr_accession>*.fastq`` files found afterwards
        (exactly one file means SINGLE, anything else PAIRED).

        verbose, quiet, logs:
            accepted but not used by this method
        procs: int
            value passed to fasterq-dump via ``-e``
        kwargs: dict
            additional fasterq-dump arguments

        :return: True if the fastq files exist already or were created,
            otherwise False
        :rtype: bool
        """
        # nothing to do when the fastq files are already there
        if self.fastqFilesExistsLocally():
            pu.print_green("Fastq files exist already")
            return True

        # options passed on to pu.parse_unix_args together with kwargs
        fasterqdumpArgsList = [
            '-f', '-t', '-s', '-N', '-X', '-a', '-p', '-c', '-o', '-O', '-h',
            '-V', '-L', '-v', '-q', '-b', '-m', '-x', '-S', '-3', '-P', '-M',
            '-B', '--option-file', '--strict', '--table',
            '--include-technical', '--skip-technical', '--concatenate-reads'
        ]

        # assemble: user options, output dir, output name, -e value, input
        command = ['fasterq-dump']
        command += pu.parse_unix_args(fasterqdumpArgsList, kwargs)
        command += ['-O', self.location]
        command += ['-o', self.srr_accession + ".fastq"]
        command += ['-e', str(procs)]
        if self.sraFileExistsLocally():
            command.append(self.localSRAFilePath)
        else:
            command.append(self.srr_accession)

        succeeded = pe.execute_command(command, objectid=self.srr_accession)
        if not succeeded:
            print("fasterqdump failed for:" + self.srr_accession)
            return False

        # derive the layout from the produced files when it is still unknown
        if not hasattr(self, 'layout'):
            produced = pe.find_files(self.location,
                                     self.srr_accession + "*.fastq")
            self.layout = 'SINGLE' if len(produced) == 1 else 'PAIRED'

        # record the expected output paths and make sure they were produced
        if self.layout == "SINGLE":
            self.localfastqPath = os.path.join(self.location,
                                               self.srr_accession + ".fastq")
            expected = (self.localfastqPath,)
        else:
            self.localfastq1Path = os.path.join(
                self.location, self.srr_accession + "_1.fastq")
            self.localfastq2Path = os.path.join(
                self.location, self.srr_accession + "_2.fastq")
            expected = (self.localfastq1Path, self.localfastq2Path)

        if not pu.check_files_exist(*expected):
            # only the first expected file is named in the message
            pu.print_boldred("Error running fasterq-dump file. File " +
                             expected[0] + " does not exist!!!")
            return False

        return True
示例#18
0
    def run(self,
            *args,
            subcommand=None,
            target=None,
            requires=None,
            objectid=None,
            verbose=None,
            logs=None,
            **kwargs):
        """Assemble the command line for this runnable object and execute it.

        Parameters
        ----------
        *args : tuple
            Positional arguments passed to the command. When supplied they
            completely REPLACE self._args created during initialization.
        subcommand : str or list of str, optional
            Subcommand(s) appended directly after the command. The default is None.
        target : str or list of str, optional
            The expected output/target files produced by the command. When
            _force is not set and self.verify_target_list accepts them, the
            command is not run at all. The default is None.
        requires : str or list of str, optional
            Files required to start the command. They must pass
            self.verify_target_list and must not have lock files.
            The default is None.
        objectid : str, optional
            An id forwarded to pe.execute_command to identify this run in the
            logs. The default is None.
        verbose : optional
            Forwarded unchanged to pe.execute_command. The default is None.
        logs : optional
            Forwarded unchanged to pe.execute_command. The default is None.
        **kwargs
            Options passed to the command. These OVERRIDE options with the
            same key in self._kwargs.

        Raises
        ------
        TypeError
            If target or requires is neither a string nor a list.
        FileNotFoundError
            If the required files are not verified or one of them has lock files.
        OSError
            If self._command is None or empty.
        ValueError
            If self._args_style is something other than 'LINUX' or 'JAVA'.

        Returns
        -------
        bool
            True when the targets already existed. False when the command
            failed (the target files are deleted in that case, except in a
            dry run). Otherwise the result of self.verify_target_list on the
            targets, or the command status when there are no targets or this
            is a dry run.

        """

        target_list = []
        locks = []
        requires_list = []

        #normalize target to a list
        if target:
            if isinstance(target, str):
                target_list = [target]
            elif isinstance(target, list):
                target_list = target
            else:
                raise TypeError("target must be a string or a list object")

        #check for locks and remove previous locks and associated targets if exist
        #(a distinct loop name is used so the target parameter is not shadowed)
        for target_file in target_list:
            self.verify_integrity(target_file)

        #if targets are already present and overwrite is not forced then return
        if not _force and target_list:
            if self.verify_target_list(target_list):
                pu.print_green('Target files {} already exist.'.format(
                    ', '.join(target_list)))
                return True

        #normalize requires to a list
        if requires:
            if isinstance(requires, str):
                requires_list = [requires]
            elif isinstance(requires, list):
                requires_list = requires
            else:
                raise TypeError("requires must be a string or a list object")

        #Raise exception if requirements not satisfied
        if requires_list:
            if not self.verify_target_list(requires_list):
                pu.print_boldred('Required files {} not found.'.format(
                    ', '.join(requires_list)))
                raise FileNotFoundError("FilesNotFound")
            #check if any required file had lock
            for required_file in requires_list:
                if len(self.get_lock_files(required_file)):
                    #report the file that actually has the lock, not the whole list
                    pu.print_boldred(
                        'Required file {} is corrupt. Please verify file is correct and remove any .Lock files'
                        .format(required_file))
                    raise FileNotFoundError("FilesNotFound")

        #override class kwargs by passed kwargs
        kwargs = {**self._kwargs, **kwargs}
        #if no args provided use constructor's args
        if not args:
            args = self._args
        #positional args travel in kwargs under the special key '--';
        #a leading None (e.g. run(None, ...)) means "no positional args"
        if args and args[0]:
            kwargs['--'] = args

        if not self._command:
            pu.print_boldred("Error: command can not be None or empty")
            raise OSError("CommandNotFoundException")

        #work on a copy so self._command is never modified
        cmd = []
        if isinstance(self._command, list):
            cmd = self._command.copy()
        elif isinstance(self._command, str):
            cmd = [self._command]

        #get valid args for the subcommand (looked up before it is wrapped in a list)
        valid_args_subcommand = self.get_valid_parameters(subcommand)

        if subcommand:
            if isinstance(subcommand, str):
                subcommand = [subcommand]
            #add to command
            cmd.extend(subcommand)

        #parse and add parameters
        if self._args_style == 'LINUX':
            cmd.extend(pu.parse_unix_args(valid_args_subcommand, kwargs))
        elif self._args_style == 'JAVA':
            cmd.extend(pu.parse_java_args(valid_args_subcommand, kwargs))
        else:
            pu.print_boldred("Unknown args style: {}".format(self._args_style))
            raise ValueError("Unknown args style")

        #create locks on target; locks indicate incomplete commands
        if not _dryrun:
            locks = self.create_lock(target_list, ' '.join(cmd))

        #execute command
        cmd_status = pe.execute_command(cmd,
                                        objectid=objectid,
                                        verbose=verbose,
                                        logs=logs)

        # if command finished remove locks
        self.remove_locks(locks)

        if not cmd_status:
            pu.print_boldred("{} failed: {}".format(self._command,
                                                    " ".join(cmd)))
            #remove target files
            if not _dryrun and target_list:
                pu.print_boldred("Removing target files {}: ".format(
                    ', '.join(target_list)))
                pe.delete_files(*target_list)
            return False

        #command succeeded: confirm the targets were really produced
        if cmd_status and target_list and not _dryrun:
            return self.verify_target_list(target_list, verbose=True)

        #return status
        return cmd_status
示例#19
0
def _log_command_exception(message, log_message, time_start, starttime_str,
                           objectid, command_name):
    """Print message in red and log the command as failed with exit code -1."""
    pu.print_boldred(message)
    timeDiff = round(time.time() - time_start)
    logDict = {
        'cmd': log_message,
        'exitcode': '-1',
        'runtime': str(timedelta(seconds=timeDiff)),
        'starttime': str(starttime_str),
        'stdout': "",
        'stderr': message,
        'objectid': objectid,
        'commandname': command_name
    }
    pyrpipeLoggerObject.cmd_logger.debug(json.dumps(logDict))


def execute_command(cmd,
                    verbose=False,
                    quiet=False,
                    logs=True,
                    dryrun=False,
                    objectid="NA",
                    command_name=""):
    """Function to execute commands using popen.
    All commands executed by this function can be logged and saved to pyrpipe logs.

    Parameters
    ----------

    cmd: list
        command to execute via popen in a list
    verbose: bool
        Whether to print the captured output. Default: False.
    quiet: bool
        Do not print the command line or the time taken. The output of a
        command that exits with a non-zero code is still printed.
    logs: bool
        Log the execution. Dry runs and commands that raise an exception are
        logged regardless of this flag.
    dryrun: bool
        If True, perform a dry run i.e. print the command to screen and log and exit
    objectid: string
        An id to be attached with the command. This is useful for storing logs for SRA objects where object id is the SRR id.
    command_name: string
        Name of command to be saved in log. If empty it is determined as the first element of the cmd list.

    :return: Return status. True if returncode is 0 (always True for a dry run).
        False if the command exits with a non-zero code or an exception is
        raised while running it. KeyboardInterrupt and SystemExit are not
        caught and propagate to the caller.
    :rtype: bool
    """
    if not command_name:
        command_name = cmd[0]
    log_message = " ".join(cmd)

    #dryrun: print and exit
    if dryrun:
        pu.print_blue("$ " + log_message)
        #create a dict and dump as json
        logDict = {
            'cmd': log_message,
            'exitcode': "0",
            'runtime': "0",
            'starttime': "0",
            'stdout': "dryrun",
            'stderr': "",
            'objectid': objectid,
            'commandname': command_name
        }
        pyrpipeLoggerObject.cmd_logger.debug(json.dumps(logDict))
        return True

    if not quiet:
        pu.print_blue("$ " + log_message)
    time_start = time.time()
    starttime_str = time.strftime("%y-%m-%d %H:%M:%S",
                                  time.localtime(time.time()))
    try:
        #stderr is merged into stdout, so communicate() returns None for stderr
        result = subprocess.Popen(cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT)
        stdout, stderr = result.communicate()
        #convert to string; undecodable bytes are replaced instead of raising,
        #otherwise a successful command with non UTF-8 output is reported as failed
        stdout = stdout.decode("utf-8", errors="replace") if stdout else ""
        stderr = stderr.decode("utf-8", errors="replace") if stderr else ""

        timeDiff = round(time.time() -
                         time_start)  #round to remove microsecond term

        if verbose:
            if stdout:
                pu.print_blue("STDOUT:\n" + stdout)
            if stderr:
                pu.print_boldred("STDERR:\n" + stderr)
        if not quiet:
            pu.print_green("Time taken:" + str(timedelta(seconds=timeDiff)))

        exitCode = result.returncode

        ##Add to logs
        if logs:

            ##log the version and path of each program once
            if command_name not in pyrpipeLoggerObject.logged_programs:
                #if subcommands are present use parent command
                parent_command = cmd[0]
                progDesc = {
                    'name': command_name,
                    'version': getProgramVersion(parent_command).strip(),
                    'path': getProgramPath(parent_command).strip()
                }
                pyrpipeLoggerObject.env_logger.debug(json.dumps(progDesc))
                pyrpipeLoggerObject.logged_programs.append(command_name)

            #create a dict and dump as json
            logDict = {
                'cmd': log_message,
                'exitcode': exitCode,
                'runtime': str(timedelta(seconds=timeDiff)),
                'starttime': str(starttime_str),
                'stdout': stdout,
                'stderr': stderr,
                'objectid': objectid,
                'commandname': command_name
            }
            pyrpipeLoggerObject.cmd_logger.debug(json.dumps(logDict))

        if exitCode == 0:
            return True

        #print the output
        print(
            "Following error occured executing above command (return code={}):"
            .format(str(exitCode)))
        print("STDOUT:\n" + stdout)
        print("STDERR:\n" + stderr)
        return False
    #handle exceptions
    except OSError as e:
        _log_command_exception("OSError exception occured.\n" + str(e),
                               log_message, time_start, starttime_str,
                               objectid, command_name)
        return False
    except subprocess.CalledProcessError as e:
        _log_command_exception(
            "CalledProcessError exception occured.\n" + str(e), log_message,
            time_start, starttime_str, objectid, command_name)
        return False
    except Exception as e:
        #Exception (not a bare except) so Ctrl-C and sys.exit() are not swallowed
        _log_command_exception(
            "Fatal error occured during execution.\n" + str(type(e)),
            log_message, time_start, starttime_str, objectid, command_name)
        return False
示例#20
0
def generate_summary(cmdLog, envLog, coverage='a'):
    """Print a summary at the end of a run. Similar to generateHTMLReport.

    Parameters
    ----------
    cmdLog: string
        path to the command log file; one JSON record per line, lines
        starting with "#" and blank lines are ignored
    envLog: string
        path to the env log file, parsed with parseEnvLog
    coverage: string
        type of report: full, summary, fail, pass. Currently not used by
        this function.

    """
    #parse envLog
    sysInfo, progList = parseEnvLog(envLog)

    #get starttime #end time is calculated from log below
    startTime = dt.datetime.strptime(sysInfo['now'], "%y-%m-%d %H:%M:%S")
    #total progs used
    progNames = progList.keys()
    numPrograms = len(progNames)

    with open(cmdLog) as f:
        data = f.read().splitlines()

    #keep only the real command records so that a trailing comment or blank
    #line can not be mistaken for the last executed command
    records = [
        json.loads(line) for line in data
        if line.strip() and not line.startswith("#")
    ]
    numCommands = len(records)
    passedCommands = sum(1 for rec in records if int(rec['exitcode']) == 0)
    failedCommands = numCommands - passedCommands

    #if nothing in logs exit
    if numCommands < 1:
        pu.print_message('\n=========Summary=========')
        pu.print_message('No commands were executed via pyrpipe')
        return

    #get start and runtime of last command and compute end time
    lastDict = records[-1]
    lastST = dt.datetime.strptime(lastDict['starttime'], "%y-%m-%d %H:%M:%S")
    try:
        #runtime shorter than a day, e.g. "0:05:12"
        lastruntime = dt.datetime.strptime(lastDict['runtime'], "%H:%M:%S")
        days = 0
    except ValueError:
        #days format, e.g. "1 day, 0:05:12"
        timeString = lastDict['runtime'].split(",")
        days = int(timeString[0].split(" ")[0].strip())
        lastruntime = dt.datetime.strptime(timeString[1].strip(), "%H:%M:%S")

    deltaTime = dt.timedelta(days=days,
                             hours=lastruntime.hour,
                             minutes=lastruntime.minute,
                             seconds=lastruntime.second)
    endTime = lastST + deltaTime

    pu.print_message('\n=========Summary=========')
    pu.print_message(
        'Time start: {}     \nTime end: {}      \nTotal runtime: {}'.format(
            str(startTime), str(endTime), str(endTime - startTime)))
    pu.print_message('Total commands run: {}'.format(numCommands))
    pu.print_green('Passed commands: {}'.format(passedCommands))
    pu.print_boldred('Failed commands: {}'.format(failedCommands))
    pu.print_message('Total unique commands/tools: {}'.format(numPrograms))
    pu.print_message('Command/tools list: {}'.format(",".join(progNames)))
示例#21
0
 def build_index(self,index_path,transcriptome,objectid="NA"):
     """Build a kallisto index and remember it in self.index

     index_path: str
         path of the index file to create
     transcriptome: str
         Path to the transcriptome used as input
     objectid: str
         An id passed to the command executor with this command e.g. the SRR accession.

     Raises ValueError if the transcriptome is missing and OSError if the
     index directory can not be created or the kallisto command fails.

     :return: True if the index already existed or was built and passed self.check_index(), otherwise False
     :rtype: bool
     """
     #reuse an existing index unless a rebuild is forced
     if not _force and pu.check_files_exist(index_path):
         pu.print_green("Kallisto index {} already exists.".format(index_path))
         self.index=index_path
         return True

     #the input must exist
     if not pu.check_files_exist(transcriptome):
         pu.print_boldred("{} does not exist. Exiting".format(transcriptome))
         raise ValueError("Please check input to kallisto index")

     #make sure the directory that will hold the index exists
     index_parent=pu.get_file_directory(index_path)
     if not pu.check_paths_exist(index_parent) and not pu.mkdir(index_parent):
         raise OSError("Error creating kallisto index. Failed to create index directory.")

     #options from the yaml file are overridden by the internal ones
     options={"-i":index_path}
     params_file=os.path.join(_params_dir,'kallisto_index.yaml')
     if pu.check_files_exist(params_file):
         options={**pl.YAML_loader(params_file).get_kwargs(),**options}
     #positional args go under the special key '--'
     options['--']=(transcriptome,)

     command=['kallisto','index',
              *pu.parse_unix_args(valid_args._args_KALLISTO_INDEX,options)]

     #call kallisto
     if not pe.execute_command(command,objectid=objectid):
         raise OSError("Error building kallisto index")

     if pu.check_files_exist(index_path) and not _dryrun:
         #update object's index
         self.index=index_path
         if self.check_index():
             return True

     return False
示例#22
0
    def perform_alignment(self,
                          sra_object,
                          out_suffix="_hisat2",
                          out_dir="",
                          objectid="NA"):
        """Align the fastq files of sra_object and return a sorted bam.

        Parameters
        ----------

        sra_object SRA object
            Its fastq_path/fastq2_path, layout, srr_accession and directory attributes are read.
        out_suffix: string
            Suffix added to the accession to name the output sam/bam files
        out_dir: string
            Directory to save the results. Default value is sra_object.directory
        objectid: str
            Accepted for a uniform interface; the run is logged under sra_object.srr_accession.

        :return: Path returned by the sam to sorted bam conversion, or an empty string on failure
        :rtype: string
        """
        #resolve the output directory, creating a user supplied one if needed
        if out_dir:
            if not pu.check_paths_exist(out_dir):
                pu.mkdir(out_dir)
        else:
            out_dir = sra_object.directory

        #intermediate sam and final sorted bam share the same name stem
        name_stem = sra_object.srr_accession + out_suffix
        sam_path = os.path.join(out_dir, name_stem + ".sam")
        sorted_bam_path = os.path.join(out_dir, name_stem + "_sorted.bam")

        #nothing to do if the final bam is already there
        if not _force and pu.check_files_exist(sorted_bam_path):
            pu.print_green(
                'Target files {} already exist.'.format(sorted_bam_path))
            return sorted_bam_path

        #input options depend on the layout
        if sra_object.layout == 'PAIRED':
            read_options = {
                "-1": sra_object.fastq_path,
                "-2": sra_object.fastq2_path
            }
        else:
            read_options = {"-U": sra_object.fastq_path}
        read_options["-S"] = sam_path

        #run the aligner
        aligned = self.run(None,
                           objectid=sra_object.srr_accession,
                           target=sam_path,
                           **read_options)
        if not aligned:
            return ""

        #outside a dry run the sam file must really exist
        if not (pu.check_files_exist(sam_path) or _dryrun):
            return ""

        #convert to bam before returning
        return tools.Samtools().sam_sorted_bam(sam_path)
示例#23
0
    def build_index(self,index_path,transcriptome,objectid="NA"):
        """Build a salmon index and remember it in self.index.

        Parameters
        ----------
        index_path : str
            Path of the salmon index to create.
        transcriptome : str
            Path to the transcriptome passed to salmon with -t.
        objectid : str, optional
            An id passed to the command executor. The default is "NA".

        Raises
        ------
        OSError
            If the index directory can not be created or the salmon command fails.

        Returns
        -------
        bool
            True if the index already existed or was built and passed
            self.check_index(). False if the transcriptome is missing or the
            index could not be verified.

        """

        #reuse an existing index unless a rebuild is forced
        if not _force and pu.check_salmonindex(index_path):
            pu.print_green("Salmon index {} already exists.".format(index_path))
            self.index=index_path
            return True

        #the input must exist
        if not pu.check_files_exist(transcriptome):
            pu.print_boldred("{} does not exist. Exiting".format(transcriptome))
            return False

        #make sure the directory that will hold the index exists
        index_parent=pu.get_file_directory(index_path)
        if not pu.check_paths_exist(index_parent) and not pu.mkdir(index_parent):
            raise OSError("Error creating salmon index. Failed to create index directory.")

        #options from the yaml file are overridden by the internal ones
        options={"--threads":_threads,"-t":transcriptome,"-i":index_path}
        params_file=os.path.join(_params_dir,'salmon_index.yaml')
        if pu.check_files_exist(params_file):
            options={**pl.YAML_loader(params_file).get_kwargs(),**options}

        command=['salmon','index',
                 *pu.parse_unix_args(valid_args._args_SALMON_INDEX,options)]

        #call salmon
        if not pe.execute_command(command,objectid=objectid):
            raise OSError("Error building salmon index")

        if pu.check_salmonindex(index_path) and not _dryrun:
            #update object's index
            self.index=index_path
            if self.check_index():
                return True

        return False
示例#24
0
文件: sra.py 项目: shinyfluba/pyrpipe
 def download_fastq(self,*args,**kwargs):
     """Download the fastq files of this accession with fasterq-dump.

     *args
         extra positional arguments for fasterq-dump; they are placed before
         pyrpipe's own positional argument (the local .sra path if
         self.sra_exists(), otherwise the accession)
     **kwargs
         extra options for fasterq-dump; the options set by pyrpipe
         (-O, -o, -e, -f) always replace options passed with the same key.
         If neither args nor kwargs are given, options are read from
         fasterq-dump.yaml in the parameter directory instead.

     On success self.fastq_path, self.fastq2_path and self.layout are updated
     and self.delete_sra() is called.

     :return: True if the fastq files already exist, if the command succeeded
         in a dry run, or if the expected fastq files are found after the
         command; otherwise False
     :rtype: bool
     """

     #check if fastq files exists already
     if self.fastq_exists():
         pu.print_green("Fastq files exist already")
         return True
     #internal_args are created by pyrpipe and will always replace external passed args
     #add the positional args
     if self.sra_exists():
         internal_args=(self.sra_path,)
     else:
         internal_args=(self.srr_accession,)
     #keyword args; boolean flags have empty values
     internal_kwargs={'-O':self.directory,
                      '-o':self.srr_accession+".fastq",
                      '-e':_threads,
                      '-f':""
                      }

     #merge args, kwargs, internal_args, internal_kwargs
     if args or kwargs:
         internal_kwargs={**kwargs,**internal_kwargs}
         #drop duplicates but keep the order in which the arguments were given;
         #a set would hand the positional arguments over in arbitrary order
         internal_args=tuple(dict.fromkeys(args+internal_args))
     else:
         #check for yaml parameters
         filepath=os.path.join(_params_dir,'fasterq-dump.yaml')
         yaml_params=pl.YAML_loader(filepath)
         internal_kwargs={**yaml_params.get_kwargs(),**internal_kwargs}
     #append the args to the kwargs using special key '--'
     internal_kwargs['--']=internal_args

     #build the command
     fstrqd_Cmd=['fasterq-dump']
     fstrqd_Cmd.extend(pu.parse_unix_args(valid_args._args_FASTERQDUMP,internal_kwargs))

     #execute command
     cmdStatus=pe.execute_command(fstrqd_Cmd,objectid=self.srr_accession)

     if not cmdStatus:
         pu.print_boldred("fasterqdump failed for:"+self.srr_accession)
         return False

     #assume paired layout first: files named <SRR>_1.fastq and <SRR>_2.fastq
     self.layout='PAIRED'
     self.fastq_path=os.path.join(self.directory,self.srr_accession+'_1.fastq')
     self.fastq2_path=os.path.join(self.directory,self.srr_accession+'_2.fastq')

     #in a dry run no files are produced; keep the paired assumption
     if _dryrun: return True

     if pu.check_files_exist(self.fastq_path,self.fastq2_path):
         #remove SRA
         self.delete_sra()
         return True

     #check single end file
     fq=os.path.join(self.directory,self.srr_accession+'.fastq')
     if pu.check_files_exist(fq):
         self.fastq_path=fq
         self.layout="SINGLE"
         #NOTE(review): self.fastq2_path still holds the <SRR>_2.fastq path here - confirm callers ignore it for SINGLE layout
         #remove SRA
         self.delete_sra()
         return True

     return False
示例#25
0
    def perform_alignment(self,
                          sra_object,
                          out_suffix="_star",
                          out_dir="",
                          objectid="NA"):
        """Run STAR on the fastq files of sra_object and return the bam path.

        Parameters
        ----------

        sra_object SRA object
            Its fastq_path/fastq2_path, layout, srr_accession and directory attributes are read.
        out_suffix: string
            Suffix inserted before ".bam" in the name of the returned file
        out_dir: string
            Directory to save the results. Default value is sra_object.directory
        objectid: str
            Accepted for a uniform interface; the run is logged under sra_object.srr_accession.

        Note: if '--outSAMtype' is not yet in self._kwargs it is set there to
        'BAM SortedByCoordinate'.

        :return: Returns the path to output bam, or an empty string on failure
        :rtype: string
        """

        #resolve the output directory, creating a user supplied one if needed
        if out_dir:
            if not pu.check_paths_exist(out_dir):
                pu.mkdir(out_dir)
        else:
            out_dir = sra_object.directory

        #paired reads are passed as one space separated value
        if sra_object.layout == 'PAIRED':
            reads = sra_object.fastq_path + " " + sra_object.fastq2_path
        else:
            reads = sra_object.fastq_path
        star_options = {
            "--readFilesIn": reads,
            "--outFileNamePrefix": out_dir + "/"
        }

        #if outSAMtype is not specified make it bam by default
        if '--outSAMtype' not in self._kwargs:
            self._kwargs['--outSAMtype'] = 'BAM SortedByCoordinate'

        #expected STAR output depends on whether sorting was requested
        if 'SortedByCoordinate' in self._kwargs['--outSAMtype']:
            star_bam = os.path.join(out_dir, 'Aligned.sortedByCoord.out.bam')
        else:
            star_bam = os.path.join(out_dir, 'Aligned.out.bam')
        renamed_bam = star_bam.partition('.bam')[0] + out_suffix + '.bam'

        #check if final bam already exists
        if not _force and pu.check_files_exist(renamed_bam):
            pu.print_green(
                'Target files {} already exist.'.format(renamed_bam))
            return renamed_bam

        #call star
        succeeded = self.run(None,
                             objectid=sra_object.srr_accession,
                             target=star_bam,
                             **star_options)
        if not succeeded:
            return ""

        #in a dry run there is nothing to rename
        if _dryrun:
            return renamed_bam

        #rename the bam file and return its path
        pe.move_file(star_bam, renamed_bam)
        if not pu.check_files_exist(renamed_bam):
            return ""
        return renamed_bam
示例#26
0
    def perform_assembly(self,
                         bam_file,
                         out_dir=None,
                         out_suffix="_cufflinks",
                         objectid="NA"):
        """Run cufflinks on a BAM file and return the renamed GTF.

        Parameters
        ----------
        bam_file: string
            path to bam file
        out_dir:
            output directory; defaults to the directory of bam_file
        out_suffix: string
            Suffix added to the bam base name to name the output gtf file
        objectid: str
            An id attached to this command in the logs e.g. the SRR accession.

        :return: Returns the path to output GTF file, or an empty string on failure
        :rtype: string
        """

        bam_stem = pu.get_file_basename(bam_file)

        #resolve the output directory, creating a user supplied one if needed
        if out_dir:
            if not pu.check_paths_exist(out_dir):
                pu.mkdir(out_dir)
        else:
            out_dir = pu.get_file_directory(bam_file)

        #output dir option plus the input bam as positional arg under '--'
        cufflinks_options = {"-o": out_dir, '--': (bam_file, )}

        #the target is transcripts.gtf in out_dir; it is renamed afterwards
        raw_gtf = os.path.join(out_dir, "transcripts.gtf")
        final_gtf = os.path.join(out_dir, bam_stem + out_suffix + ".gtf")

        #if final file already exists
        if not _force and pu.check_files_exist(final_gtf):
            pu.print_green('Target files {} already exist.'.format(final_gtf))
            return final_gtf

        #call cufflinks
        succeeded = self.run(None,
                             objectid=objectid,
                             target=raw_gtf,
                             **cufflinks_options)
        if not succeeded:
            return ""

        #in a dry run there is nothing to rename
        if _dryrun:
            return final_gtf

        pe.move_file(raw_gtf, final_gtf)
        if not pu.check_files_exist(final_gtf):
            return ""
        return final_gtf
示例#27
0
    def build_index(self, index_path, genome, objectid="NA"):
        """Build a STAR index with given parameters and saves the new index to self.index.

        Parameters
        ----------

        index_path: string
            Path (a directory) where the index will be created
        genome: string
            Path to the reference genome fasta file
        objectid : string
            An id passed to the command executor with this command e.g. the SRR accession.

        Raises
        ------
        ValueError
            If the genome file does not exist.
        OSError
            If the index directory can not be created or the STAR command fails.

        :return: True if the index already existed or the STAR command succeeded
        :rtype: bool
        """

        #if index already exists then exit
        if not _force:
            if pu.check_starindex(index_path):
                pu.print_green(
                    "STAR index {} already exists.".format(index_path))
                self.index = index_path
                return True

        #check input files
        if not (pu.check_files_exist(genome)):
            pu.print_boldred(
                "Please provide a valid input fasta file to build STAR index")
            raise ValueError("Please check input to build star index")

        #create index path if doesnt exist
        if not pu.check_paths_exist(index_path):
            if not pu.mkdir(index_path):
                raise OSError(
                    "Error creating STAR index. Failed to create index directory."
                )

        #internal options; they override options read from the yaml file
        internal_kwargs = {
            "--runMode": "genomeGenerate",
            "--genomeDir": index_path,
            "--genomeFastaFiles": genome,
            "--runThreadN": self._threads
        }

        #read build parameters
        yamlfile = os.path.join(_params_dir, 'star_index.yaml')
        if pu.check_files_exist(yamlfile):
            yaml_params = pl.YAML_loader(yamlfile)
            yaml_kwargs = yaml_params.get_kwargs()
            internal_kwargs = {**yaml_kwargs, **internal_kwargs}

        starbuild_Cmd = ['STAR']
        starbuild_Cmd.extend(
            pu.parse_unix_args(valid_args._args_STAR, internal_kwargs))

        #execute command
        status = pe.execute_command(starbuild_Cmd, objectid=objectid)
        if status:
            if pu.check_paths_exist(index_path) and not _dryrun:
                #update object's index
                self.index = index_path
                if self.check_index():
                    return True
        else:
            raise OSError("Error building STAR index")

        #NOTE(review): True is returned even when the index could not be verified above - confirm this is intended
        return True
示例#28
0
    def perform_qc(self,sra_object,out_dir="",out_suffix="_trimgalore",objectid="NA"):
        """Function to perform qc using trimgalore.
        The function perform_qc() is consistent for all QC classes.

        Parameters
        ----------

        sra_object: SRA
            An SRA object whose fastq files will be used
        out_dir: str
            Path to output directory. Defaults to sra_object.directory when empty.
        out_suffix: string
            Suffix for the output fastq file(s)
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.

        :return: Returns the path of fastq files after QC. tuple has one item for single end files and two for paired.
            On any failure the one-item tuple ("",) is returned.
        :rtype: tuple
        """
        # every failure path returns the same tuple so callers can always
        # index the result (an empty string would raise IndexError on [0])
        failed = ("",)

        if not out_dir:
            out_dir = sra_object.directory
        elif not pu.check_paths_exist(out_dir):
            pu.mkdir(out_dir)

        if sra_object.layout == 'PAIRED':
            fq1 = sra_object.fastq_path
            fq2 = sra_object.fastq2_path
            internal_args = (fq1, fq2)
            internal_kwargs = {"--paired": "", "-o": out_dir}

            # running trim galore will create two files named
            # <input>_val_1.fq and <input>_val_2.fq;
            # these are moved to the final target files below
            trimmed1 = os.path.join(out_dir, pu.get_file_basename(fq1) + "_val_1.fq")
            trimmed2 = os.path.join(out_dir, pu.get_file_basename(fq2) + "_val_2.fq")
            # targets
            out_file1 = os.path.join(out_dir, pu.get_file_basename(fq1) + out_suffix + ".fastq")
            out_file2 = os.path.join(out_dir, pu.get_file_basename(fq2) + out_suffix + ".fastq")

            # check if final files already exist
            if not _force and pu.check_files_exist(out_file1, out_file2):
                pu.print_green('Target files {}, {} already exist.'.format(out_file1, out_file2))
                return out_file1, out_file2

            # run trimgalore
            status = self.run(*internal_args, objectid=objectid,
                              target=[trimmed1, trimmed2], **internal_kwargs)
            if not status:
                return failed

            # rename the trimmed fastq files and return their paths;
            # when _dryrun is set nothing is moved or verified
            if not _dryrun:
                pe.move_file(trimmed1, out_file1, verbose=False)
                pe.move_file(trimmed2, out_file2, verbose=False)
                if not pu.check_files_exist(out_file1, out_file2):
                    return failed

            return out_file1, out_file2

        # any layout other than 'PAIRED' is handled as single end
        fq = sra_object.fastq_path
        internal_args = (fq,)
        internal_kwargs = {"-o": out_dir}

        # running trim galore will create one file named <input>_trimmed.fq;
        # it is moved to the final target file below
        trimmed = os.path.join(out_dir, pu.get_file_basename(fq) + "_trimmed.fq")
        # target
        out_file = os.path.join(out_dir, pu.get_file_basename(fq) + out_suffix + ".fastq")

        # check if final file already exists
        if not _force and pu.check_files_exist(out_file):
            pu.print_green('Target files {} already exist.'.format(out_file))
            return (out_file,)

        # run trimgalore
        status = self.run(*internal_args, objectid=objectid, target=trimmed, **internal_kwargs)
        if not status:
            return failed

        # rename the trimmed fastq file and return its path;
        # when _dryrun is set nothing is moved or verified
        if not _dryrun:
            pe.move_file(trimmed, out_file)
            if not pu.check_files_exist(out_file):
                return failed

        return (out_file,)
示例#29
0
    def build_index(self, index_path, genome, objectid="NA"):
        """Build a hisat index with given parameters and save the new index to self.index.

        Parameters
        ----------

        index_path: string
            Path where the index will be created
        genome: string
            Path to the reference genome
        objectid : string
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.

        :return: True when an existing index is reused, when the module-level
            _dryrun flag is set and the command reported success, or when the
            freshly built index passes self.check_index(). False when the
            command reported success but the index could not be verified.
        :rtype: bool
        :raises ValueError: if the genome fasta file does not exist
        :raises OSError: if the index directory cannot be created or hisat2-build fails
        """

        # reuse an existing index unless a rebuild is forced
        if not _force and pu.check_hisatindex(index_path):
            pu.print_green(
                "Hisat2 index {} already exists.".format(index_path))
            self.index = index_path
            return True

        # check input files
        if not pu.check_files_exist(genome):
            pu.print_boldred(
                "Please provide a valid input fasta file to build Hisat2 index"
            )
            raise ValueError("Please check input to hisat2 build index")

        # create the out dir; if index_path has no directory component there
        # is no directory to create, so the step is skipped
        indexdir = pu.get_file_directory(index_path)
        if indexdir and not pu.check_paths_exist(indexdir):
            if not pu.mkdir(indexdir):
                raise OSError(
                    "Error creating hisat2 index. Failed to create index directory."
                )

        hisat2Buildvalid_args = valid_args._args_HISAT2BUILD

        args = (genome, index_path)
        internal_kwargs = {"-p": self._threads}

        # read optional build parameters; the thread option above is merged
        # last so it takes precedence over a value from the yaml file
        yamlfile = os.path.join(_params_dir, 'hisat2_index.yaml')
        if pu.check_files_exist(yamlfile):
            yaml_params = pl.YAML_loader(yamlfile)
            yaml_kwargs = yaml_params.get_kwargs()
            internal_kwargs = {**yaml_kwargs, **internal_kwargs}

        # add positional args
        internal_kwargs['--'] = args

        hisat2Build_Cmd = ['hisat2-build']
        hisat2Build_Cmd.extend(
            pu.parse_unix_args(hisat2Buildvalid_args, internal_kwargs))

        # execute command
        status = pe.execute_command(hisat2Build_Cmd, objectid=objectid)
        if not status:
            raise OSError("Error building Hisat2 index")

        # nothing to verify when _dryrun is set; self.index is left untouched
        if _dryrun:
            return True

        # a successful command is not enough: only report success if the
        # index files are found and pass the object's own check
        if not pu.check_hisatindex(index_path):
            return False

        # update object's index before check_index() is called on it
        self.index = index_path
        if self.check_index():
            return True
        return False
示例#30
0
    def build_index(self,
                    index_path,
                    *args,
                    threads=None,
                    overwrite=False,
                    verbose=False,
                    quiet=False,
                    logs=True,
                    objectid="NA",
                    **kwargs):
        """Build a star index with given parameters and saves the new index to self.star_index.
        
        Parameters
        ----------
        
        index_path: string
            Path where the index will be created
        args: tuple
            Path to reference input files
        threads: int
            Num threads to use. Falls back to self.threads when not given.
        overwrite: bool
            Overwrite if index already exists
        verbose: bool
            Print stdout and std error
        quiet: bool
            Print nothing
        logs: bool
            Log this command to pyrpipe logs
        objectid: str
            Provide an id to attach with this command e.g. the SRR accession. This is useful for debugging, benchmarking and reports.
        
        kwargs: dict
            Parameters for the star command. These are merged after the
            built-in options, so a key given here replaces the built-in value.

        :return: True if an existing index is reused or the new index passes
            self.check_index(); False if no input files are given or the
            command fails. NOTE(review): in the code visible here, when the
            command succeeds but the path check or self.check_index() fails,
            no return statement is reached and the result is None.
        :rtype: bool
        :raises Exception: if any input file does not exist or the index
            directory cannot be created
        """

        #reuse an existing index unless overwrite is requested
        if not overwrite:
            if pu.check_starindex(index_path):
                pu.print_green("STAR index already exists. Using it...")
                self.star_index = index_path
                return True

        #check input files: at least one reference file is required
        if len(args) < 1:
            pu.print_boldred(
                "Please provide input fasta file to build STAR index")
            return False

        if not pu.check_files_exist(*args):
            raise Exception("Please check input to star index")
            #NOTE(review): unreachable, the raise above always exits first
            return False

        #create path if doesnt exist
        if not pu.check_paths_exist(index_path):
            if not pu.mkdir(index_path):
                raise Exception("Error creating STAR index. Exiting.")
                #NOTE(review): unreachable, the raise above always exits first
                return False

        #fall back to the object's thread count
        if not threads:
            threads = self.threads

        #built-in options for index generation; all input files are joined
        #into one space-separated string for --genomeFastaFiles
        newOpts = {
            "--runMode": "genomeGenerate",
            "--genomeDir": index_path,
            "--genomeFastaFiles": " ".join(args),
            "--runThreadN": str(threads)
        }

        #kwargs come last so caller-supplied options win over the built-in ones
        mergedOpts = {**newOpts, **kwargs}

        starbuild_Cmd = ['STAR']
        #NOTE(review): None is passed where a valid-args list might be expected;
        #presumably this disables option filtering - confirm in parse_unix_args
        starbuild_Cmd.extend(pu.parse_unix_args(None, mergedOpts))

        #execute command
        status = pe.execute_command(starbuild_Cmd,
                                    verbose=verbose,
                                    quiet=quiet,
                                    logs=logs,
                                    objectid=objectid)

        if status:
            if pu.check_paths_exist(index_path):
                #update object's index before check_index() is called
                self.star_index = index_path
                if self.check_index():
                    return True
        else:
            return False