def main():
    # Smoke-test for spiper's change-detection API (get_changed_files /
    # get_all_files / cache_run) against the module-level `test_import`
    # and `simple_flow` pipelines.
    import spiper
    from path import Path
    from spiper.runner import get_all_files, get_changed_files, cache_run
    from pprint import pprint
    spiper.rcParams['dir_layout'] = 'flat'
    prefix = Path('/tmp/test_import/root')
    # Start from a clean working directory so every run is reproducible.
    prefix.dirname().rmtree_p()
    print('#### [Note] currently the package change is not recorded')
    fs = get_changed_files(test_import, prefix)
    pprint(fs)
    fs = get_all_files(test_import, prefix)
    pprint(fs)
    cache_run(test_import, prefix)
    # NOTE(review): this re-prints the pre-run `fs` list (cache_run's result
    # is not captured) — presumably intentional for comparison; confirm.
    pprint(fs)
    print('#### [Note] Remote workflow should detect file changes')
    fs = get_changed_files(simple_flow, prefix)
    pprint(fs)
    assert fs != []
    print('### run actual workflow')
    cache_run(simple_flow, prefix)
    fs = get_changed_files(simple_flow, prefix)
    pprint(fs)
    #### The Flow() execution will never be skipped
    #### hence The self.output.log will always be changed
    assert fs == [
        File('/tmp/test_import/root.workflow.log'),
        File('/tmp/test_import/root_backup.output..log')
    ]
def job_stringtie_count(
        self,
        prefix,
        BAM_FILE=File,
        GTF_FILE=File,
        THREADS_=int,
        _IMAGE=Depend(
            'docker://quay.io/biocontainers/stringtie:2.1.1--hc900ff6_0'),
        _output=['count', 'cmd']):
    """Estimate transcript abundance with stringtie inside a container.

    The abundance table goes to ``self.output.count``; the executed command
    line is logged to ``self.output.cmd``.
    """
    _ = '''
    Example run:
    stringtie -p 4 --rf 809_S1.bam -G /home/feng/ref/Arabidopsis_thaliana_TAIR10/annotation/genes.gtf -o 809_S1.stringtie.gtf -A 809_S1.stringtie.count &> 809_S1.stringtie.log
    '''
    # Assemble the argument vector in named pieces, then hand it to the
    # containerised shell runner.
    base_args = ['stringtie', '-p', str(THREADS_), File(BAM_FILE)]
    annotation_args = ['--rf', '-G', File(GTF_FILE)]
    output_args = ['-A', File(self.output.count)]
    res = SingularityShellCommand(base_args + annotation_args + output_args,
                                  _IMAGE, self.output.cmd)
def job_picard_dedup(
        self,
        prefix,
        bam_file=File,
        THREADS_=int,
        _IMAGE=Depend('docker://quay.io/biocontainers/picard:2.21.9--0'),
        _IMAGE_SAMTOOLS=Depend(
            "docker://quay.io/biocontainers/samtools:1.10--h9402c20_2"),
        _output=['bam', 'log', 'cmd_log'],
):
    # Remove PCR duplicates with picard MarkDuplicates, then index the
    # deduplicated BAM with samtools. Both containerised commands write to
    # the same self.output.cmd_log (the second opens it with mode='a' so the
    # picard log is not clobbered).
    CMD = [
        'picard', 'MarkDuplicates',
        Concat('I=', File(bam_file)),
        Concat('O=', File(self.output.bam)),
        Concat('M=', File(self.output.log)),
        # Concat('TMP_DIR=',File(self.output.bam+'.picard_temp').makedirs_p().check_writable()),
        'REMOVE_DUPLICATES=true',
    ]
    res = LoggedSingularityCommand(
        self.prefix_named,
        CMD,
        _IMAGE,
        self.output.cmd_log,
    )
    # samtools index creates a .bai that is not declared in _output, so it is
    # registered via extra_files for the cache/bookkeeping layer.
    res = LoggedSingularityCommand(
        self.prefix_named,
        # prefix,
        ['samtools', 'index', self.output.bam],
        _IMAGE_SAMTOOLS,
        self.output.cmd_log,
        mode='a',
        extra_files=[self.output.bam + '.bai'])
def job_hisat2_index(
        self,
        prefix,
        FASTA_FILE=File,
        THREADS_=int,
        _IMAGE=Depend(
            "docker://quay.io/biocontainers/hisat2:2.1.0--py36hc9558a2_4"),
        _output=[
            Prefix('index_prefix'),
            File('log'),
            File('cmd'),
        ],
):
    """Build a hisat2 genome index from ``FASTA_FILE``.

    Index files share the stem ``self.output.index_prefix``; the builder's
    combined stdout/stderr is redirected into ``self.output.log``.
    """
    # Compose the command line from named fragments for readability.
    builder = ['hisat2-build', '-p', str(THREADS_)]
    io_pair = [File(FASTA_FILE), Prefix(self.output.index_prefix)]
    redirect = ['&>', File(self.output.log)]
    full_cmd = builder + io_pair + redirect
    res = LoggedSingularityCommand(self.prefix_named, full_cmd, _IMAGE,
                                   self.output.cmd)
    return self
def tarball_dangerous_cache(self, prefix,
        input_prefix=File,
        _output=[
            File('tar_gz'),
            File('cmd')]):
    """Tar-gzip every file matching ``input_prefix.basename() + '*'``.

    Runs inside the directory of ``input_prefix`` — the path object doubles
    as a chdir context manager — so the archive stores relative names.
    """
    workdir = input_prefix.dirname()
    glob_pattern = input_prefix.basename() + '*'
    with workdir:
        stdout = LoggedShellCommand(
            ['tar', '-cvzf', self.output.tar_gz, glob_pattern],
            self.output.cmd)
    return self
def gen_files(self, prefix,
        _seq='AGCTTCGTC',
        _output=[
            File('out_txt')]):
    """Write ten repeats of ``_seq`` to ``self.output.out_txt``."""
    payload = _seq * 10
    with open(self.output.out_txt, 'w') as handle:
        handle.write(payload)
    return self
def workflow(self, prefix,
        seed=int,
        L=int,
        _output=[
            File('log'),
        ]):
    # Toy flow: random_seq -> transcribe -> mutate, plus a constant-seed
    # branch and a copy of this source file for provenance.
    # self.runner.func is the active execution backend (e.g. cache_run or
    # mock_run); its name tells us which mode this flow is running in.
    print('\n[Flow running] mock=%s' % getattr(self.runner.func, '__name__', 'None'))
    curr = self.runner(random_seq, prefix, seed, L)
    # A tagged runner gives the node a distinct cache identity ('const'),
    # so this branch does not collide with the seeded random_seq above.
    curr1 = self.config_runner(tag='const')(random_seq, prefix, 0, 100)
    curr = self.runner(transcribe, prefix, curr.output.seq,)
    curr = self.runner(mutate, prefix, curr.output.fasta)
    # Capture a directory listing into the flow's own log output.
    stdout = LoggedShellCommand(['ls -lhtr', prefix.dirname()], self.output.log).rstrip()
    self.runner(copy_file, prefix + '.source.py', __file__)
    return self
def workflow(self, prefix,
        seed=int,
        L=int,
        _output=[
            File('log'),
        ]):
    _ = '''
    A workflow is not a Node()
    '''
    # Variant of the toy flow with a 'temp'-tagged constant branch.
    print('\n[Flow running] mock=%s' % getattr(self.runner.func, '__name__', 'None'))
    ### [ToDo] (func, prefix) must be unique within each workflow
    # self.data = {}
    curr = self.runner(random_seq, prefix, seed, L)
    curr1 = self.config_runner(tag='temp')(random_seq, prefix, 0, 100)
    curr = self.runner(transcribe, prefix, curr.output.seq,)
    curr = self.runner(mutate, prefix, curr.output.fasta)
    # NOTE(review): unlike the sibling workflow, no output file is passed to
    # LoggedShellCommand, so the declared _output 'log' is never written —
    # confirm whether this is intentional.
    stdout = LoggedShellCommand(['ls -lhtr', prefix.dirname()],).rstrip()
    return self
def test_mock_overwrite(self):
    # End-to-end check of mock_run/cache_run overwrite semantics on the
    # tarball_main flow: mock_run leaves '*.mock' sentinel files, cache_run
    # replaces them with real outputs, and a manual edit of one output
    # triggers a selective re-run (visible as fresh '*.old.mock' sentinels).
    # assert 0
    prefix = None
    if prefix is None:
        prefix = File('/tmp/spiper.symbolic/root')
    prefix.dirname().rmtree_p()   # start from a clean slate
    _d = spiper.rcParams.copy()   # save global config; restored at the end
    spiper.rcParams['dir_layout'] = 'clean'
    tarball_main(mock_run, prefix)
    # NOTE(review): pprint() returns None, so print(pprint(fs)) emits an
    # extra "None" line — harmless but probably unintended.
    fs = sorted(prefix.fileglob('*', 0, 0)); print(pprint(fs))
    # A pure mock run produces empty outputs plus .empty.mock/.old.mock sentinels.
    assert fs == [
        File('/tmp/spiper.symbolic/root.gen_files.out_txt'),
        File('/tmp/spiper.symbolic/root.gen_files.out_txt.empty.mock'),
        File('/tmp/spiper.symbolic/root.gen_files.out_txt.old.mock'),
        File('/tmp/spiper.symbolic/root.tarball_dangerous_cache.cmd'),
        File('/tmp/spiper.symbolic/root.tarball_dangerous_cache.cmd.empty.mock'),
        File('/tmp/spiper.symbolic/root.tarball_dangerous_cache.cmd.old.mock'),
        File('/tmp/spiper.symbolic/root.tarball_dangerous_cache.tar_gz'),
        File('/tmp/spiper.symbolic/root.tarball_dangerous_cache.tar_gz.empty.mock'),
        File('/tmp/spiper.symbolic/root.tarball_dangerous_cache.tar_gz.old.mock'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.cmd'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.cmd.empty.mock'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.cmd.old.mock'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.tar_gz'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.tar_gz.empty.mock'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.tar_gz.old.mock')]
    # tarball_main( lambda *a:cache_run(*a,check_changed=2), prefix)
    # A real cache_run replaces the sentinels with the actual outputs.
    tarball_main(cache_run, prefix)
    fs = sorted(prefix.fileglob('*', 0, 0)); print(pprint(fs))
    assert fs == [
        File('/tmp/spiper.symbolic/root.gen_files.out_txt'),
        File('/tmp/spiper.symbolic/root.tarball_dangerous_cache.cmd'),
        File('/tmp/spiper.symbolic/root.tarball_dangerous_cache.tar_gz'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.cmd'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.tar_gz')]
    # Mocking again over a fully cached run must not disturb the outputs.
    tarball_main(mock_run, prefix)
    fs = sorted(prefix.fileglob('*', 0, 0)); print(pprint(fs))
    assert fs == [
        File('/tmp/spiper.symbolic/root.gen_files.out_txt'),
        File('/tmp/spiper.symbolic/root.tarball_dangerous_cache.cmd'),
        File('/tmp/spiper.symbolic/root.tarball_dangerous_cache.tar_gz'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.cmd'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.tar_gz')]
    # Manually corrupt one cached output; downstream nodes should be
    # re-mocked (hence the extra .old.mock sentinels below).
    with open((prefix + '.gen_files.out_txt'), 'w') as f:
        f.write('100' * 2000)
    tarball_main(mock_run, prefix)
    fs = sorted(prefix.fileglob('*', 0, 0)); print(pprint(fs))
    assert fs == [
        File('/tmp/spiper.symbolic/root.gen_files.out_txt'),
        File('/tmp/spiper.symbolic/root.gen_files.out_txt.old.mock'),
        File('/tmp/spiper.symbolic/root.tarball_dangerous_cache.cmd'),
        File('/tmp/spiper.symbolic/root.tarball_dangerous_cache.tar_gz'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.cmd'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.cmd.old.mock'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.tar_gz'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.tar_gz.old.mock')]
    # A final cache_run heals the corrupted state back to clean outputs.
    tarball_main(cache_run, prefix)
    fs = sorted(prefix.fileglob('*', 0, 0)); print(pprint(fs))
    assert fs == [
        File('/tmp/spiper.symbolic/root.gen_files.out_txt'),
        File('/tmp/spiper.symbolic/root.tarball_dangerous_cache.cmd'),
        File('/tmp/spiper.symbolic/root.tarball_dangerous_cache.tar_gz'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.cmd'),
        File('/tmp/spiper.symbolic/root.tarball_prefix_cache.tar_gz')]
    # res = LoggedShellCommand(['ls -lhtr',prefix+'*'])
    # .fileglob('*'),])
    # print(res)
    # prefix
    # tarball_main( mock_run , prefix)
    # assert 0
    # (prefix.dirname()/'root.tarball_dangerous_cache.tar_gz').touch()
    # self.assertRaises(spiper._types.OverwriteError, tarball_main, mock_run, prefix)
    spiper.rcParams.update(_d)   # restore the saved global config
def main(self=None, prefix=None):
    # Walk through spiper's dry-run API on the workflow/backup pair:
    # get_changed_files previews proposed writes, mock_run materialises a
    # symbolic result, and cache_run executes for real.
    #
    # NOTE(review): the hard-coded '/home/user/...' expectations below assume
    # HOME=/home/user (backup_prefix is '~/...'.expand()) — confirm the test
    # environment pins HOME accordingly.
    from spiper.runner import cache_run, mock_run, get_changed_files, get_all_files
    from spiper.shell import LoggedShellCommand
    from spiper.types import File, CacheFile
    from pprint import pprint
    spiper.rcParams['dir_layout'] = 'clean'
    #
    # if prefix is None:
    prefix = Path('/tmp/spiper.symbolic/root')
    # backup_prefix = File('/home/user/.temp/backup_03_mock_flow/root')
    backup_prefix = File('~/.temp/backup_03_mock_flow/root').expand()
    prefix.dirname().rmtree_p()
    backup_prefix.dirname().rmtree_p()
    print('\n...[start]%r' % prefix)
    #### once a workflow is defined, we can view the proposed file changes
    fs = get_changed_files(workflow, prefix, 1, 100, verbose=0)
    pprint(fs)
    assert fs == [
        File('/tmp/spiper.symbolic/root.workflow.log'),
        File('/tmp/spiper.symbolic/root.random_seq.seq'),
        File('/tmp/spiper.symbolic/root.random_seq_const.seq'),
        File('/tmp/spiper.symbolic/root.transcribe.fasta'),
        File('/tmp/spiper.symbolic/root.mutate.fasta'),
        File('/tmp/spiper.symbolic/root.source.py'),
        # File('/home/user/.temp/backup_03_mock_flow/root.source.py')
    ]
    ### backup is conveniently defined as a workflow taking an executed workflow as an input.
    ### To check the proposed backup, mock_run() the workflow first.
    workflow_out = mock_run(workflow, prefix, 1, 100)
    fs = get_changed_files(backup, backup_prefix, workflow_out)
    pprint(fs)
    assert fs == [
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.random_seq.output.seq'),
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.random_seq_const.output.seq'),
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.transcribe.output.fasta'),
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.mutate.output.fasta'),
        File('/home/user/.temp/backup_03_mock_flow/root.output.log'),
        # File('/tmp/spiper.symbolic/root.source.py')
        File('/home/user/.temp/backup_03_mock_flow/root.source.py')
    ]
    ### a convenient Flow may be defined to execute the two in chain
    ### If there is certain change to the workflow,
    ### the backup can also be runned
    fs = get_changed_files(run_and_backup, prefix, 1, 100, backup_prefix, verbose=0)
    pprint(fs)
    # The combined flow proposes the union of workflow + backup outputs,
    # plus the dependency-graph plots emitted by its plot_graph node.
    assert fs == [
        File('/tmp/spiper.symbolic/root.workflow.log'),
        File('/tmp/spiper.symbolic/root.random_seq.seq'),
        File('/tmp/spiper.symbolic/root.random_seq_const.seq'),
        File('/tmp/spiper.symbolic/root.transcribe.fasta'),
        File('/tmp/spiper.symbolic/root.mutate.fasta'),
        File('/tmp/spiper.symbolic/root.source.py'),
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.random_seq.output.seq'),
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.random_seq_const.output.seq'),
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.transcribe.output.fasta'),
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.mutate.output.fasta'),
        File('/home/user/.temp/backup_03_mock_flow/root.output.log'),
        File('/home/user/.temp/backup_03_mock_flow/root.source.py'),
        File('/home/user/.temp/backup_03_mock_flow/root.plot_graph.deptree_json'),
        File('/home/user/.temp/backup_03_mock_flow/root.plot_graph.deptree_dot_txt'),
    ]
    ###### constants that are preserved between runs should be detected unchanged
    _ = cache_run(run_and_backup, prefix, 1, 100, backup_prefix, verbose=0)
    fs = get_changed_files(run_and_backup, prefix, 2, 200, backup_prefix, verbose=0)
    pprint(fs)
    # After a real run with seed=1/L=100, re-proposing with seed=2/L=200
    # changes the seeded branches but not the constant-seed ones (the
    # commented entries below are the unchanged files).
    assert fs == [File('/tmp/spiper.symbolic/root.workflow.log'),
        File('/tmp/spiper.symbolic/root.random_seq.seq'),
        # File('/tmp/spiper.symbolic/root.random_seq_const.seq'),
        File('/tmp/spiper.symbolic/root.transcribe.fasta'),
        File('/tmp/spiper.symbolic/root.mutate.fasta'),
        # File('/tmp/spiper.symbolic/root.source.py'),
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.random_seq.output.seq'),
        # File('/home/user/.temp/backup_03_mock_flow/root.subflow.random_seq_const.output.seq'),
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.transcribe.output.fasta'),
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.mutate.output.fasta'),
        File('/home/user/.temp/backup_03_mock_flow/root.output.log'),
        # File('/home/user/.temp/backup_03_mock_flow/root.source.py'),
        File('/home/user/.temp/backup_03_mock_flow/root.plot_graph.deptree_json'),
        File('/home/user/.temp/backup_03_mock_flow/root.plot_graph.deptree_dot_txt'),
    ]
    ##### get_all_files() return a leaf file regardless of whether is is changed
    fs = get_all_files(run_and_backup, prefix, 2, 200, backup_prefix, verbose=0)
    pprint(fs)
    assert fs == [
        File('/tmp/spiper.symbolic/root.workflow.log'),
        File('/tmp/spiper.symbolic/root.random_seq.seq'),
        File('/tmp/spiper.symbolic/root.random_seq_const.seq'),
        File('/tmp/spiper.symbolic/root.transcribe.fasta'),
        File('/tmp/spiper.symbolic/root.mutate.fasta'),
        File('/tmp/spiper.symbolic/root.source.py'),
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.random_seq.output.seq'),
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.random_seq_const.output.seq'),
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.transcribe.output.fasta'),
        File('/home/user/.temp/backup_03_mock_flow/root.subflow.mutate.output.fasta'),
        File('/home/user/.temp/backup_03_mock_flow/root.output.log'),
        File('/home/user/.temp/backup_03_mock_flow/root.source.py'),
        File('/home/user/.temp/backup_03_mock_flow/root.plot_graph.deptree_json'),
        File('/home/user/.temp/backup_03_mock_flow/root.plot_graph.deptree_dot_txt'),
    ]
    _ = cache_run(run_and_backup, prefix, 2, 200, backup_prefix, verbose=0)
def job_trimmomatic( self, prefix, FASTQ_FILE_1=InputFile, FASTQ_FILE_2=InputFile, THREADS_=int, _IMAGE=Depend('docker://quay.io/biocontainers/trimmomatic:0.35--6'), _output=[ File('fastq1'), File('fastq2'), File('log'), File('cmd'), ], ): _ = ''' trimmomatic PE -threads 4 -phred33 /home/feng/temp/187R/187R-S1-2018_06_27_14:02:08/809_S1_R1_raw.fastq /home/feng/temp /187R/187R-S1-2018_06_27_14:02:08/809_S1_R2_raw.fastq 809_S1_R1_raw_pass.fastq 809_S1_R1_raw_fail.fastq 809_S1_R2_raw_pass.fastq 809_S1_R2_raw_fail.fastq ILLUMINACLIP:/home/Program_NGS_sl-pw-srv01/Trimmomatic-0.32/adapters/TruSeq3-PE-2.fa :6:30:10 LEADING:3 TRAILING:3 MINLEN:36 SLIDINGWINDOW:4:15 ''' # _out = get_output_files(self, prefix, _output) CMD = [ 'trimmomatic', 'PE', '-threads', str(THREADS_), '-phred33', File(FASTQ_FILE_1), File(FASTQ_FILE_2), File(self.output.fastq1), File(self.output.fastq1 + '.fail'), File(self.output.fastq2), File(self.output.fastq2 + '.fail'), 'ILLUMINACLIP:' '/usr/local/share/trimmomatic-0.35-6/adapters/TruSeq3-PE-2.fa' ':6:30:10', 'LEADING:3', 'TRAILING:3', 'MINLEN:36', 'SLIDINGWINDOW:4:15', '&>', File(self.output.log) ] res = SingularityShellCommand(CMD, _IMAGE, self.output.cmd) return self
def job_hisat2_align(
        self,
        prefix,
        INDEX_PREFIX=Prefix,
        FASTQ_FILE_1=InputFile,
        FASTQ_FILE_2=InputFile,
        THREADS_=int,
        _IMAGE=Depend(
            "docker://quay.io/biocontainers/hisat2:2.1.0--py36hc9558a2_4"),
        _IMAGE_SAMTOOLS=Depend(
            "docker://quay.io/biocontainers/samtools:1.10--h9402c20_2"),
        _output=[
            File('bam'),
            File('log'),
            File('cmd'),
        ]):
    # Align a paired-end FASTQ pair with hisat2, then convert SAM -> BAM and
    # coordinate-sort with samtools. Intermediates live next to
    # self.output.bam as '.sam' and '.unsorted'.
    # _out = get_output_files(self,prefix,_output)
    results = []
    CMD = [
        'hisat2',
        '-x', Prefix(INDEX_PREFIX),
        '-1', File(FASTQ_FILE_1),
        '-2', File(FASTQ_FILE_2),
        # '-U', InputFile( FASTQ_FILE_1),
        # ['-2',InputFile( FASTQ_FILE_2) ] if FASTQ_FILE_2 else [],
        '-S', File(self.output.bam + '.sam'),
        '--threads', str(THREADS_),
        '--no-mixed',
        '--rna-strandness', 'RF',
        '--dta', '--fr',
        '&>', File(self.output.log),
    ]
    res = SingularityShellCommand(CMD, _IMAGE, self.output.cmd)
    # results.append(job_result( None, CMD, self.output))
    _ = '''
    samtools view /home/feng/temp/187R/187R-S1-2018_06_27_14:02:08/809_S1.sam -b --threads 4 -o 809_S1.bam
    '''
    # SAM -> unsorted BAM.
    CMD = [
        'samtools', 'view',
        File(self.output.bam + '.sam'),
        '--threads', str(THREADS_),
        '-o', File(self.output.bam + '.unsorted'),
    ]
    res = SingularityShellCommand(CMD, _IMAGE_SAMTOOLS, self.output.cmd)
    # Coordinate-sort into the final BAM.
    CMD = [
        'samtools', 'sort',
        File(self.output.bam + '.unsorted'),
        '--threads', str(THREADS_),
        '-o', File(self.output.bam),
    ]
    res = SingularityShellCommand(CMD, _IMAGE_SAMTOOLS, self.output.cmd)
    return self
def workflow(self,
        prefix,
        hisat2_cache_prefix=str,
        genome_fasta=File,
        genome_gtf_file=File,
        fastq1=File,
        fastq2=File,
        THREADS_=int,
        _output=[]):
    # RNA-seq pipeline: hisat2 index -> trimmomatic -> align -> picard dedup
    # -> BAM QC -> CPM bigwig (+ '.cpm_bw' symlink) -> stringtie counts,
    # plus a copy of this source file for provenance. Downstream nodes fetch
    # upstream outputs via self.subflow['<job_name>[-<tag>]'].
    # print
    # assert 0,repr((hisat2_cache_prefix))
    # self.data = {}
    # self.data['index'] =
    curr = self.runner(
        job_hisat2_index,
        hisat2_cache_prefix,   # index cached under its own prefix, reusable across samples
        genome_fasta,
        THREADS_,
    )
    # self.data['trimmed'] =
    curr = self.runner(
        job_trimmomatic,
        prefix,
        fastq1,
        fastq2,
        THREADS_,
    )
    curr = self.runner(
        job_hisat2_align,
        prefix,
        self.subflow['job_hisat2_index'].output.index_prefix,
        self.subflow['job_trimmomatic'].output.fastq1,
        self.subflow['job_trimmomatic'].output.fastq2,
        [],   # hisat2_args: empty list -> aligner falls back to its built-in defaults
        THREADS_,
    )
    self.runner(
        job_picard_dedup,
        prefix,
        self.subflow['job_hisat2_align'].output.bam,
        THREADS_,
    )
    # The 'picard_dedup_bam' tag distinguishes these QC/coverage nodes in
    # the subflow registry (keys become 'job_...-picard_dedup_bam').
    self.config_runner(tag='picard_dedup_bam')(
        job_bam_qc,
        prefix,
        self.subflow['job_picard_dedup'].output.bam,
        THREADS_,
    )
    last = self.config_runner(tag='picard_dedup_bam')(
        job_bam2bw_cpm,
        prefix,
        self.subflow['job_picard_dedup'].output.bam,
        self.subflow['job_bam_qc-picard_dedup_bam'].output.data_json,
        1,
    )
    # Expose the bigwig under a '.cpm_bw' alias via a symlink node.
    self.runner(
        LinkFile,
        # last.output.output.bw[:-len('.bw')] + '.cpm_bw',
        # last.output.bw,
        # )
        self.subflow['job_bam2bw_cpm-picard_dedup_bam'].output.bw[:-len('.bw')] + '.cpm_bw',
        self.subflow['job_bam2bw_cpm-picard_dedup_bam'].output.bw)
    self.runner(
        job_stringtie_count,
        prefix,
        self.subflow['job_picard_dedup'].output.bam,
        genome_gtf_file,
        THREADS_,
    )
    assert File(__file__).isfile(
    ), 'Cannot find source file using __file__:%r' % __file__
    self.runner(CopyFile, self.prefix_named + '.source.py', __file__)
    return self
def job_hisat2_align(
        self,
        prefix,
        INDEX_PREFIX=Prefix,
        FASTQ_FILE_1=File,
        FASTQ_FILE_2=File,
        hisat2_args=list,
        THREADS_=int,
        _IMAGE=Depend(
            "docker://quay.io/biocontainers/hisat2:2.1.0--py36hc9558a2_4"),
        _IMAGE_SAMTOOLS=Depend(
            "docker://quay.io/biocontainers/samtools:1.10--h9402c20_2"),
        _output=[
            File('bam'),
            File('log'),
            File('cmd'),
        ]):
    # Streamed variant of the aligner: hisat2 writes SAM to stdout, which is
    # piped straight into 'samtools view' (no intermediate .sam on disk);
    # the unsorted BAM is then coordinate-sorted by a third command. The
    # three containerised fragments are joined with '|' and '&&' and run as
    # one logged shell pipeline.
    # _out = get_output_files(self,prefix,_output)
    results = []
    cmd1 = CMD = [
        'hisat2',
        # hisat2_args,
        '-x', Prefix(INDEX_PREFIX),
        '-1', File(FASTQ_FILE_1),
        '-2', File(FASTQ_FILE_2),
        # '-U', File( FASTQ_FILE_1),
        # ['-2',File( FASTQ_FILE_2) ] if FASTQ_FILE_2 else [],
        '-S', '/dev/stdout',
        # one thread is implicitly left for the downstream samtools consumer
        '--threads', str(max(1, THREADS_ - 1)),
        # caller-supplied extra args take precedence; otherwise use the
        # stranded paired-end defaults
        hisat2_args or ['--no-mixed', '--rna-strandness', 'RF', '--dta', '--fr'],
        '2>', File(self.output.log),
    ]
    '''
    singularity --verbose --debug exec docker://python:2.7.17-alpine python -V
    singularity shell docker://python:2.7.17-alpine python -V
    '''
    # res = LoggedSingularityCommand(CMD, _IMAGE, self.output.cmd)
    # results.append(job_result( None, CMD, self.output))
    # _ = '''
    # samtools view /home/feng/temp/187R/187R-S1-2018_06_27_14:02:08/809_S1.sam -b --threads 4 -o 809_S1.bam
    # '''
    # stdin (hisat2's SAM stream) -> unsorted BAM
    cmd2 = CMD = [
        'samtools', 'view', '-bS', '/dev/stdin',
        '--threads', str(1),
        '-o', (self.output.bam + '.unsorted'),
    ]
    # res = LoggedSingularityCommand(CMD, _IMAGE_SAMTOOLS, self.output.cmd)
    # Coordinate-sort into the final BAM; -T points at a scratch directory
    # created (and checked writable) up front.
    cmd3 = CMD = [
        'samtools', 'sort',
        (self.output.bam + '.unsorted'),
        '--threads', str(THREADS_),
        '-o', (self.output.bam),
        '-T', File(self.output.bam + '.sort_temp/').makedirs_p().check_writable(),
    ]
    CMD = [
        # 'PIPE=$(mktemp -u);mkfifo $PIPE;exec 3<>$PIPE ;rm $PIPE;',
        LoggedSingularityCommandList(
            self.prefix_named,
            cmd1,
            _IMAGE,
        ),
        '|',
        LoggedSingularityCommandList(self.prefix_named, cmd2, _IMAGE_SAMTOOLS),
        '&&',
        LoggedSingularityCommandList(self.prefix_named, cmd3, _IMAGE_SAMTOOLS),
        # extra_files = [File(self.output.bam.dirname())]),
        # LoggedSingularityCommandList(cmd3, _IMAGE_SAMTOOLS, extra_files = [File(self.output.bam.dirname())]),
        # LoggedSingularityCommandList([cmd3,'&&','df',File(self.output.bam.dirname())], _IMAGE_SAMTOOLS,
        # extra_files = [File(self.output.bam.dirname())]),
    ]
    res = LoggedShellCommand(CMD, self.output.cmd)
    # (self.output.bam+'.sam').unlink_p()
    # (self.output.bam+'.unsorted').unlink_p()
    # res = LoggedSingularityCommand(CMD, _IMAGE_SAMTOOLS, self.output.cmd)
    return self