def test_pipes():
    ParslPipeline._pipeline_run_temp_dir = tempfile.TemporaryDirectory(
        dir='/tmp', suffix='__operon')
    one = Software('one', path='/path/to/one')
    pipe = Pipe(
        one.prep(Parameter('-a', '1'),
                 Parameter('-b', '2'),
                 Redirect(stream='>', dest='/path/to/log')))
    assert isinstance(pipe.piped_software_blueprint, dict)
def test_parameters():
    p_single = Parameter('one')
    p_multiple = Parameter('-a', 'one', '-b', 'two')
    p_equals = Parameter('--one', 'a', sep='=')
    p_fused = Parameter('-a one -b', 'two')
    p_with_data1 = Parameter(Data('/path/to/data'))
    p_with_data2 = Parameter('-a', Data('/path/to/data1'), Data('/path/to/data2'))

    # Parameters turn into strings as expected
    assert str(p_single) == 'one'
    assert str(p_multiple) == '-a one -b two'
    assert str(p_equals) == '--one=a'  # Use with different separators
    assert str(p_fused) == '-a one -b two'  # Spaces in single string handled correctly

    # Holds Data objects correctly
    assert len(p_single.data) == 0 and not p_single.data
    assert len(p_with_data1.data) == 1
    assert len(p_with_data2.data) == 2
    assert str(p_with_data1.data[0]) == '/path/to/data'
    assert str(p_with_data2) == '-a /path/to/data1 /path/to/data2'
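# The string semantics asserted above are what end up on the command line when
# Parameters are handed to Software.register(). A minimal usage sketch, never
# invoked by the tests (the `aligner` instance and its flags are hypothetical):
def _example_parameter_usage():
    aligner = Software('aligner', path='/path/to/aligner')
    aligner.register(
        Parameter('-t', '4'),  # renders as '-t 4'
        Parameter('--mode', 'fast', sep='='),  # renders as '--mode=fast'
        Parameter(Data('/path/to/reads.fastq').as_input()),  # tracked as an input
        Redirect(stream='>', dest='/path/to/aligned.sam'))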
def pipeline(self, pipeline_args, pipeline_config):
    # chunky run RiboSeq_pipe.py --fastqs
    #   /mnt/cinder/thomas/RiboSeq/Lane5/AWS-3_S3_L005_R1_001.fastq.gz
    #   --output /mnt/cinder/thomas/RiboSeq/test --threads

    # Create variables from parser if wanted
    bamFiles = pipeline_args['bam:lib']
    outputDir = pipeline_args['output']
    numThreads = pipeline_args['threads']

    # Create output directory
    subprocess.call(['mkdir', outputDir])

    # Software
    bedtools = Software('bedtools', pipeline_config['bedtools']['path'])
    samtools = Software('samtools', pipeline_config['samtools']['path'])
    samtools_header = Software('samtools', pipeline_config['samtools']['path'])
    samtools_uniq = Software('samtools', pipeline_config['samtools']['path'])
    samtools_sort = Software('samtools sort', pipeline_config['samtools']['path'])
    read_distribution = Software('read_distribution',
                                 pipeline_config['read_distribution']['path'])
    featureCounts = Software('featureCounts',
                             pipeline_config['featureCounts']['path'])
    fastQC = Software('FastQC', pipeline_config['FastQC']['path'])
    picard = Software(
        'picard',
        'java -Xms8g -Xmx9g -jar {}'.format(pipeline_config['picard']['path']))

    # Change these to just be done in a Python script?
    # Common software tools
    awk = Software('awk', 'awk')
    sort = Software('sort', 'sort')
    uniq = Software('uniq', 'uniq')
    paste = Software('paste', 'paste')
    cat = Software('cat', 'cat')
    grep = Software('grep', 'grep')

    # Directories and files
    pathToGtf = pipeline_config['star']['GTF_ref']
    pathTo_hg19_bed = pipeline_config['read_distribution']['hg19_bed']
    pathTo_hg19_bed_start100 = pipeline_config['bedtools']['hg19_start100']
    pathTo_grch37_bed = pipeline_config['bedtools']['grch37_bed']
    pathTo_genomeFasta = pipeline_config['picard']['genomeFasta']
    pathTo_ref_flat = pipeline_config['picard']['refFlat']

    # bid_list = []
    # for fastqlib in fastqFiles:
    #     bid_list.append(fastqlib.split(':')[-1])

    for bamlib in bamFiles:
        bam, bid = bamlib.split(':')
        newDir = os.path.join(outputDir, bid)
        subprocess.call(['mkdir', newDir])

        # Keep only uniquely mapped reads (NH:i:1), re-attach the header,
        # and sort into a BAM
        uniq_bam = '{}/{}.uniq_sorted.bam'.format(newDir, bid)
        samtools_uniq.register(
            Parameter('view'),
            Parameter(Data(bam).as_input()),  # input BAM from the command line
            Pipe(
                grep.prep(
                    Parameter('-w'),
                    Parameter('NH:i:1'),
                    Pipe(
                        cat.prep(
                            Parameter(
                                os.path.join(newDir, '{}.header.sam'.format(bid)),
                                '-'),
                            Pipe(
                                samtools.prep(
                                    Parameter('view'),
                                    Parameter('-bS', '-'),
                                    Pipe(
                                        samtools.prep(
                                            Parameter('sort'),
                                            Parameter('-', '-o',
                                                      Data(uniq_bam).as_output())
                                        )))))))))

        # QC
        # Codon periodicity
        intersectBed_filepath = '{}/{}.intersect_start100.bed'.format(newDir, bid)
        relative_pos_table_filepath = '{}/{}_relative_pos_aggregate.table'.format(
            newDir, bid)
        bedtools.register(
            Parameter('intersect'),
            Parameter('-a {}'.format(pathTo_hg19_bed_start100)),
            Parameter('-b', Data(uniq_bam).as_input()),
            Parameter('-s'),
            Parameter('-bed'),
            Parameter('-wa'),
            Parameter('-wb'),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir,
                                       '{}.intersect_start100.bed'.format(bid))),
            extra_outputs=[Data(intersectBed_filepath).as_output()])

        CodeBlock.register(
            func=relative_pos_table,
            args=[],
            kwargs={
                'intersect_bedtools_filepath': intersectBed_filepath,
                'relativePos_filepath': relative_pos_table_filepath
            },
            inputs=[Data(intersectBed_filepath).as_input()],
            outputs=[Data(relative_pos_table_filepath).as_output()])

        codon_periodicity_filepath = '{}/{}_codon_periodicity_plot.png'.format(
            newDir, bid)
        CodeBlock.register(
            func=create_codon_periodicity,
            args=[],
            kwargs={
                'relativePos_filepath': relative_pos_table_filepath,
                'codon_periodicity_filepath': codon_periodicity_filepath
            },
            inputs=[Data(relative_pos_table_filepath).as_input()],
            outputs=[Data(codon_periodicity_filepath).as_output()])

        # Picard
        picard.register(
            Parameter('CollectMultipleMetrics'),
            Parameter('I={}'.format(uniq_bam)),  # input
            Parameter('O={}/{}.multiple_metrics'.format(newDir, bid)),  # output
            Parameter('R={}'.format(pathTo_genomeFasta)),  # genome reference
            extra_inputs=[Data(uniq_bam)])
        picard.register(
            Parameter('CollectGcBiasMetrics'),
            Parameter('I={}'.format(uniq_bam)),
            Parameter('O={}/{}.gc_bias_metrics'.format(newDir, bid)),  # output
            Parameter('CHART={}/{}.gc_bias_metrics.pdf'.format(newDir, bid)),  # chart
            Parameter('S={}/{}.summary_metrics'.format(newDir, bid)),  # summary metrics
            Parameter('R={}'.format(pathTo_genomeFasta)),  # genome reference
            extra_inputs=[Data(uniq_bam)])
        picard.register(
            Parameter('CollectRnaSeqMetrics'),
            Parameter('I={}'.format(uniq_bam)),
            Parameter('O={}/{}.RNA_Metrics'.format(newDir, bid)),  # output
            Parameter('REF_FLAT={}'.format(pathTo_ref_flat)),  # ref_flat
            Parameter('STRAND=FIRST_READ_TRANSCRIPTION_STRAND'),  # strandedness
            extra_inputs=[Data(uniq_bam)])
        picard.register(
            Parameter('MarkDuplicates'),
            Parameter('I={}'.format(uniq_bam)),
            Parameter('O={}/{}.marked_duplicates.bam'.format(newDir, bid)),  # output
            Parameter('M={}/{}.marked_dup_metrics.txt'.format(newDir, bid)),  # marked dup metrics
            Parameter('ASSUME_SORTED=true'),  # it is sorted
            extra_inputs=[Data(uniq_bam)])

        # featureCounts
        featureCounts.register(
            Parameter('-a', Data(pathToGtf).as_input()),  # gtf
            Parameter('-s', '1'),  # strand-specific read counting
            Parameter('-o', '{}/{}.featureCounts'.format(newDir, bid)),  # output
            Parameter(Data(uniq_bam).as_input()))  # input

        # fastQC (disabled: this variant takes BAMs, so there is no fastq input)
        # fastQC.register(
        #     Parameter('--outdir={}'.format(newDir)),  # output
        #     Parameter('--t', numThreads),
        #     Parameter(Data(fastq).as_input()))

        # read_distribution
        read_distribution.register(
            Parameter('-r'),
            Parameter(Data(pathTo_hg19_bed).as_input()),
            Parameter('-i'),
            Parameter(Data(uniq_bam).as_input()),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir,
                                       '{}.read_distribution.log'.format(bid))))
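# `relative_pos_table` and `create_codon_periodicity` are defined elsewhere in
# the pipeline. As a hedged sketch of the kind of plain function that
# CodeBlock.register() expects, here is a hypothetical `relative_pos_table`
# (the column layout of the bedtools intersect output is an assumption, not
# taken from the real implementation):
def _relative_pos_table_sketch(intersect_bedtools_filepath, relativePos_filepath):
    from collections import Counter
    counts = Counter()
    with open(intersect_bedtools_filepath) as bedfile:
        for line in bedfile:
            fields = line.rstrip('\n').split('\t')
            # Assumed columns: feature start in field 1, read start in field 7
            counts[int(fields[7]) - int(fields[1])] += 1
    with open(relativePos_filepath, 'w') as table:
        for pos in sorted(counts):
            table.write('{}\t{}\n'.format(pos, counts[pos]))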
def pipeline(self, pipeline_args, pipeline_config):
    tmpdir = tempfile.mkdtemp()
    os.chdir(tmpdir)

    wget = Software('wget')
    ersatz = Software('ersatz')
    star = Software('STAR', subprogram='twopass')

    wget.register(
        Parameter('https://lh3.googleusercontent.com/'
                  '8gaEOU2p30N4Up-KMUl4MQBtnn0F5DyH5bqKKr0QqptnQgPk4lxXaWLJhi8Dcu9i8qE=w170'),
        Parameter('-O', Data(os.path.join(tmpdir, 'image.png')).as_output()))

    master = ersatz.register()
    ersatz.register(wait_on=[master])
    ersatz.register(wait_on=[master])
    ersatz.register(wait_on=[master])
    ersatz.register(wait_on=[master])

    # Ersatz2 (2)
    ersatz.register(
        Parameter('--inputs', Data(os.path.join(tmpdir, 'image.png')).as_input()),
        Parameter('--outputs', Data('21.out').as_output(), Data('22.out').as_output()))

    # Ersatz1 (3)
    ersatz.register(Parameter('--outputs', Data('11.out').as_output()))

    # Ersatz3 (4)
    ersatz.register(Parameter('--outputs', Data('31.out').as_output()))

    # Ersatz4 (5)
    ersatz4 = ersatz.register(
        Parameter('--inputs', Data('11.out').as_input(), Data('21.out').as_input()),
        Parameter('--outputs', Data('41.out').as_output(), Data('42.out').as_output()))

    # Ersatz5 (6)
    ersatz.register(
        Parameter('--inputs', Data('22.out').as_input(), Data('31.out').as_input()),
        Parameter('--outputs', Data('51.out').as_output()),
        wait_on=[ersatz4]  # TODO Software as dependencies
    )

    # Ersatz6 (7)
    ersatz.register(
        Parameter('--inputs', Data('41.out').as_input(), Data('42.out').as_input(),
                  Data('51.out').as_input()),
        Parameter('--outputs', Data('61.out').as_output()))
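# For orientation, the Data and wait_on links above imply this dependency
# graph (numbers match the inline comments):
#
#   wget              -> Ersatz2 (2)               via image.png
#   Ersatz2 (2)       -> Ersatz4 (5), Ersatz5 (6)  via 21.out / 22.out
#   Ersatz1 (3)       -> Ersatz4 (5)               via 11.out
#   Ersatz3 (4)       -> Ersatz5 (6)               via 31.out
#   Ersatz4 (5)       -> Ersatz5 (6)               via wait_on
#   Ersatz4, Ersatz5  -> Ersatz6 (7)               via 41.out, 42.out / 51.out
#   master            -> four anonymous apps       via wait_on only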
def test_software():
    # Reset app_id counter
    _ParslAppBlueprint._id_counter = 0
    # Set temp dir
    ParslPipeline._pipeline_run_temp_dir = tempfile.TemporaryDirectory(
        dir='/tmp', suffix='__operon')
    # Spoof pipeline config
    Software._pipeline_config = {
        'software1': {
            'path': '/path/to/soft1'
        },
        'software2': {
            'path': '/path/to/soft2',
            'one': 'two'
        },
        'bwa_mem': {
            'path': '/path/to/bwa mem'
        }
    }

    # Create Software instances
    software1 = Software('software1')
    software2 = Software('software2', subprogram='sub')
    assert software2.basename == 'soft2'
    software3 = Software('software3', path='/path/to/soft3')
    software4 = Software('software4', path='/path/to/soft4', subprogram='four')
    assert software4.path == '/path/to/soft4 four'
    bwa_mem = Software('bwa_mem')
    assert bwa_mem.path == '/path/to/bwa mem'
    assert bwa_mem.basename == 'bwa_mem'

    # Raise ValueError when software path cannot be inferred
    with pytest.raises(ValueError):
        software5 = Software('software5')

    # With different success codes
    software6 = Software('software6', path='/path/to/soft6',
                         success_on=['0', '1', '128'])

    # Registering properly adds to blueprints
    software1.register(Parameter('-a', 'one'),
                       Redirect(stream='>', dest='/path/to/dest'))
    assert len(_ParslAppBlueprint._blueprints) == 1
    assert list(_ParslAppBlueprint._blueprints.keys()) == ['soft1_1']

    # .prep() generates a unique ID per call
    assert software2.prep()['id'] != software2.prep()['id']

    # inputs, outputs, wait_on, stdout, stderr lists properly populated
    # Proper handling of Data objects
    _def_soft3 = software3.register(Parameter('-a', 'one'))
    assert isinstance(_def_soft3, _DeferredApp)
    _soft4_blueprint = software4.prep(
        Parameter('-a', Data('/input1.txt').as_input()),
        Parameter('-b', Data('/input2.txt').as_input()),
        Parameter('--outfile', Data('/output1.txt').as_output()),
        Redirect(stream='>', dest='/stdout.log'),
        Redirect(stream=Redirect.STDERR, dest=Data('/stderr.log')),
        extra_inputs=[Data('/input3.txt'), Data('/input4.txt')],
        extra_outputs=[Data('/output2.txt'), Data('/output3.txt')],
        wait_on=[_def_soft3])
    assert sorted(_soft4_blueprint['inputs']) == [
        '/input{}.txt'.format(i) for i in range(1, 5)
    ]
    assert sorted(_soft4_blueprint['outputs']) == [
        '/output{}.txt'.format(i) for i in range(1, 4)
    ] + ['/stderr.log']
    assert len(_soft4_blueprint['wait_on']) == 1
    assert _soft4_blueprint['wait_on'][0] == _def_soft3.app_id
    assert _soft4_blueprint['stdout'] == '/stdout.log'
    assert _soft4_blueprint['stderr'] == '/stderr.log'

    # Proper handling of Pipe
    _piped_soft1_soft6 = software1.prep(
        Parameter('-a', '1'),
        Redirect(stream='>', dest='/ignored.out'),  # This should be overridden by Pipe
        Pipe(
            software6.prep(Parameter('-b', '2'),
                           Parameter('-c', Data('/piped.out').as_input()),
                           Redirect(stream='>', dest='/piped.log'))))
    assert _piped_soft1_soft6['cmd'].count('|') == 1
    assert _piped_soft1_soft6['stdout'] == '/piped.log'
    assert len(_piped_soft1_soft6['inputs']) == 1
    assert _piped_soft1_soft6['inputs'][0] == '/piped.out'

    # Ignore extra Redirects
    _ignore_redir = software2.prep(
        Parameter('-a', '1'),
        Redirect(stream='>', dest='/real.out'),
        Redirect(stream='2>', dest='/real.err'),
        Redirect(stream='>', dest='/ignored.out'),
        Redirect(stream='>', dest='/also_ignored.out'),
        Redirect(stream='2>', dest='/also_ignored.err'))
    assert _ignore_redir['stdout'] == '/real.out'
    assert _ignore_redir['stderr'] == '/real.err'

    # Ignore extra Pipes
    _ignore_pipe = software2.prep(
        Parameter('-d', 'honeysucklerose'),
        Pipe(software1.prep(Parameter('-a', 'real'))),
        Pipe(software4.prep(Parameter('-b', 'ignored'))))
    assert _ignore_pipe['cmd'].count('|') == 1
    assert 'ignored' not in _ignore_pipe['cmd']
    assert 'real' in _ignore_pipe['cmd']

    # Send stdout/stderr to a temporary file when not explicitly redirected
    _stdout_stderr_tmp = software6.prep(Parameter('-a', 'ella'))
    assert _stdout_stderr_tmp['stdout'] is not None
    assert _stdout_stderr_tmp['stdout'].endswith('.stdout')
    assert _stdout_stderr_tmp['stderr'] is not None
    assert _stdout_stderr_tmp['stderr'].endswith('.stderr')
def pipeline_components_for_tests():
    # Instantiate software
    petrichor = Software(
        'petrichor',
        '/home/dfitzgerald/workspace/PycharmProjects/Operon/tests/petrichor')
    bash_sleep = Software('bash_sleep', '/bin/sleep')

    # Define python app
    def notos(sleep=None, outs=None):
        import time
        import random
        id_ = random.randint(1, 1000)
        if sleep:
            time.sleep(sleep)
        if outs:
            for out in outs:
                with open(out, 'w') as outfile:
                    outfile.write('{}\n'.format(id_))

    # App a, start=0, end=2
    petrichor.register(Parameter('--sleep', '2'),
                       Redirect(stream='>', dest=Data('a.out')))
    # App b, start=0, end=3
    petrichor.register(Parameter('--sleep', '3'),
                       Redirect(stream='>', dest=Data('b.out')))
    # App c, start=0, end=5
    petrichor.register(Parameter('--sleep', '5'),
                       Redirect(stream='>', dest=Data('c.out')))
    # App d, start=5, end=8
    CodeBlock.register(func=notos,
                       kwargs={
                           'sleep': 3,
                           'outs': ['d1.out', 'd2.out']
                       },
                       inputs=[Data('a.out'), Data('b.out'), Data('c.out')],
                       outputs=[Data('d1.out'), Data('d2.out')])
    # App e, start=0, end=10
    app_e = bash_sleep.register(Parameter('10'))
    # App g, start=10, end=12
    CodeBlock.register(func=notos,
                       kwargs={
                           'sleep': 2,
                           'outs': ['g1.out', 'g2.out']
                       },
                       inputs=[Data('d2.out')],
                       outputs=[Data('g1.out'), Data('g2.out')],
                       wait_on=[app_e])
    # App f, start=8, end=11
    petrichor.register(Parameter('--sleep', '3'),
                       Parameter('--outfile', Data('f.out').as_output(tmp=True)),
                       extra_inputs=[Data('d1.out')])
    # App h, start=12, end=18
    app_h = bash_sleep.register(Parameter('6'), extra_inputs=[Data('g2.out')])
    # App i, start=18, end=20
    petrichor.register(Parameter('--sleep', '2'),
                       Parameter('--outfile', Data('i.final').as_output()),
                       extra_inputs=[Data('g1.out'), Data('f.out')],
                       wait_on=[app_h])
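# For reference when reading the start/end comments above, the Data and
# wait_on links imply this schedule (each app starts at the max end time of
# its dependencies):
#
#   a, b, c -> d   via a.out, b.out, c.out     (d starts at 5, c's end)
#   d       -> f   via d1.out                  (f starts at 8)
#   d, e    -> g   via d2.out + wait_on        (g starts at 10, e's end)
#   g       -> h   via g2.out                  (h starts at 12)
#   g, f, h -> i   via g1.out, f.out + wait_on (i starts at 18, h's end)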
def multipipeline_components_for_tests():
    # Instantiate software
    petrichor = Software(
        'petrichor',
        '/home/dfitzgerald/workspace/PycharmProjects/Operon/tests/petrichor')
    bash_sleep = Software('bash_sleep', '/bin/sleep')

    # Define python app
    def notos(sleep=None, outs=None):
        import time
        import random
        id_ = random.randint(1, 1000)
        if sleep:
            time.sleep(sleep)
        if outs:
            for out in outs:
                with open(out, 'w') as outfile:
                    outfile.write('{}\n'.format(id_))

    Meta.define_executor(label='small', resources={'cpu': '1', 'mem': '1G'})
    Meta.define_executor(label='med', resources={'cpu': '2', 'mem': '2G'})
    Meta.define_executor(label='large', resources={'cpu': '3', 'mem': '3G'})

    # App a, start=0, end=2
    # petrichor_1
    petrichor.register(Parameter('--sleep', '2'),
                       Redirect(stream='>', dest=Data('a.out')),
                       meta={'executor': 'small'})
    # App b, start=0, end=3
    # petrichor_2
    petrichor.register(Parameter('--sleep', '3'),
                       Redirect(stream='>', dest=Data('b.out')),
                       meta={'executor': 'med'})
    # App c, start=0, end=5
    # petrichor_3
    petrichor.register(Parameter('--sleep', '5'),
                       Redirect(stream='>', dest=Data('c.out')),
                       meta={'executor': 'large'})
    # App d, start=5, end=8
    # notos_4
    CodeBlock.register(func=notos,
                       kwargs={
                           'sleep': 3,
                           'outs': ['d1.out', 'd2.out']
                       },
                       inputs=[Data('a.out'), Data('b.out'), Data('c.out')],
                       outputs=[Data('d1.out'), Data('d2.out')],
                       meta={'executor': 'small'})
    # App e, start=0, end=10
    # sleep_5
    app_e = bash_sleep.register(Parameter('10'), meta={'executor': 'med'})
    # App g, start=10, end=12
    # notos_6
    CodeBlock.register(func=notos,
                       kwargs={
                           'sleep': 2,
                           'outs': ['g1.out', 'g2.out']
                       },
                       inputs=[Data('d2.out')],
                       outputs=[Data('g1.out'), Data('g2.out')],
                       wait_on=[app_e],
                       meta={'executor': 'small'})
    # App f, start=8, end=11
    # petrichor_7
    petrichor.register(Parameter('--sleep', '3'),
                       Parameter('--outfile', Data('f.out').as_output(tmp=True)),
                       extra_inputs=[Data('d1.out')],
                       meta={'executor': 'med'})
    # App h, start=12, end=18
    # sleep_8
    app_h = bash_sleep.register(Parameter('6'),
                                extra_inputs=[Data('g2.out')],
                                meta={'executor': 'large'})
    # App i, start=18, end=20
    # petrichor_9
    petrichor.register(Parameter('--sleep', '2'),
                       Parameter('--outfile', Data('i.final').as_output()),
                       extra_inputs=[Data('g1.out'), Data('f.out')],
                       wait_on=[app_h],
                       meta={'executor': 'small'})
def pipeline(self, pipeline_args, pipeline_config):
    # chunky run RiboSeq_pipe.py --fastqs
    #   /mnt/cinder/thomas/RiboSeq/Lane5/AWS-3_S3_L005_R1_001.fastq.gz
    #   --output /mnt/cinder/thomas/RiboSeq/test --threads

    # Create variables from parser if wanted
    fastqFiles = pipeline_args['fastq:lib']
    outputDir = pipeline_args['output']
    adapter = pipeline_args['adapter']
    numThreads = pipeline_args['threads']

    # Create output directory
    subprocess.call(['mkdir', outputDir])

    # Software
    cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
    star = Software('star', pipeline_config['star']['path'])
    bedtools = Software('bedtools', pipeline_config['bedtools']['path'])
    bowtie2 = Software('bowtie2', pipeline_config['bowtie2']['path'])
    samtools = Software('samtools', pipeline_config['samtools']['path'])
    samtools_header = Software('samtools', pipeline_config['samtools']['path'])
    samtools_uniq = Software('samtools', pipeline_config['samtools']['path'])
    samtools_sort = Software('samtools sort', pipeline_config['samtools']['path'])
    read_distribution = Software('read_distribution',
                                 pipeline_config['read_distribution']['path'])
    featureCounts = Software('featureCounts',
                             pipeline_config['featureCounts']['path'])
    fastQC = Software('FastQC', pipeline_config['FastQC']['path'])
    picard = Software(
        'picard',
        'java -Xms8g -Xmx9g -jar {}'.format(pipeline_config['picard']['path']))

    # Change these to just be done in a Python script?
    # Common software tools
    awk = Software('awk', 'awk')
    sort = Software('sort', 'sort')
    uniq = Software('uniq', 'uniq')
    paste = Software('paste', 'paste')
    cat = Software('cat', 'cat')
    grep = Software('grep', 'grep')

    # Directories and files
    pathToGenomeDir = pipeline_config['star']['genomeDir']
    pathToGenome = pipeline_config['bowtie2']['genome_ref']
    pathToGtf = pipeline_config['star']['GTF_ref']
    pathTo_hg19_bed = pipeline_config['read_distribution']['hg19_bed']
    pathTo_hg19_bed_start100 = pipeline_config['bedtools']['hg19_start100']
    pathTo_grch37_bed = pipeline_config['bedtools']['grch37_bed']
    pathTo_genomeFasta = pipeline_config['picard']['genomeFasta']
    pathTo_ref_flat = pipeline_config['picard']['refFlat']

    # bid_list = []
    # for fastqlib in fastqFiles:
    #     bid_list.append(fastqlib.split(':')[-1])

    for fastqlib in fastqFiles:
        fastq, bid = fastqlib.split(':')
        # Make new directories to store data
        newDir = os.path.join(outputDir, bid)
        subprocess.call(['mkdir', newDir])

        # Trim adapters
        trimmed_read_filename = '{}/{}.trimmed.fastq.gz'.format(newDir, bid)
        cutadapt.register(
            Parameter('--quality-base=33'),
            Parameter('--minimum-length=25'),
            Parameter('--discard-untrimmed'),
            Parameter('--output={}'.format(trimmed_read_filename)),
            # Parameter('--cores', numThreads),
            Parameter('-a', adapter),
            Parameter(Data(fastq).as_input()),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir,
                                       '{}.cutadapt.summary.log'.format(bid))),
            extra_outputs=[Data(trimmed_read_filename).as_output(tmp=True)])

        # Align against rtsRNA_seqs; unaligned reads go to the filtered fastq
        bowtie2.register(
            Parameter('--seedlen=23'),
            Parameter('--threads', numThreads),
            Parameter('--un-gz',
                      Data('{}/{}.filtered.fastq.gz'.format(newDir, bid)).as_output()),
            Parameter('-x', Data(pathToGenome).as_input()),  # Path to rtsRNA_seqs files
            Parameter('-U', Data(trimmed_read_filename).as_input()),
            Parameter('-S'),
            Parameter(Data('{}/{}.rts.sam'.format(newDir, bid)).as_output(tmp=True)),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir, '{}.bowtie2.log'.format(bid))),
            Redirect(stream=Redirect.STDERR,
                     dest=os.path.join(newDir, '{}.bowtie2.log2'.format(bid))))

        samtools.register(
            Parameter('view'),
            Parameter('-Sb'),
            Parameter(Data('{}/{}.rts.sam'.format(newDir, bid)).as_input()),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir, '{}.rts.bam'.format(bid))),
        )

        star.register(
            Parameter('--runThreadN', numThreads),  # Change to command line parameter --threads
            Parameter('--sjdbGTFfile', pathToGtf),
            Parameter('--outSAMtype', 'BAM', 'Unsorted'),
            Parameter('--outFileNamePrefix', '{}/{}_'.format(newDir, bid)),
            Parameter('--genomeDir', pathToGenomeDir),
            # Parameter('--genomeLoad', genomeLoad),  # broken
            Parameter('--readFilesIn',
                      Data('{}/{}.filtered.fastq.gz'.format(newDir, bid)).as_input()),
            Parameter('--readFilesCommand zcat'),  # reads gzipped files
            extra_outputs=[Data('{}/{}.Aligned.bam'.format(newDir, bid)).as_output()])

        samtools_header.register(
            Parameter('view'),
            Parameter('-H'),
            Parameter(Data('{}/{}.Aligned.bam'.format(newDir, bid)).as_input()),  # star outfile name
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir, '{}.header.sam'.format(bid))),
            extra_outputs=[Data('{}/{}.header.sam'.format(newDir, bid)).as_output()])

        # Keep only uniquely mapped reads (NH:i:1), re-attach the header,
        # and sort into a BAM
        uniq_bam = '{}/{}.uniq_sorted.bam'.format(newDir, bid)
        samtools_uniq.register(
            Parameter('view'),
            Parameter(Data('{}/{}.Aligned.bam'.format(newDir, bid)).as_input()),  # star outfile name
            Pipe(
                grep.prep(
                    Parameter('-w'),
                    Parameter('NH:i:1'),
                    Pipe(
                        cat.prep(
                            Parameter(
                                os.path.join(newDir, '{}.header.sam'.format(bid)),
                                '-'),
                            Pipe(
                                samtools.prep(
                                    Parameter('view'),
                                    Parameter('-bS', '-'),
                                    Pipe(
                                        samtools.prep(
                                            Parameter('sort'),
                                            Parameter('-', '-o',
                                                      Data(uniq_bam).as_output())
                                        )))))))))

        # QC
        # Codon periodicity
        intersectBed_filepath = '{}/{}.intersect_start100.bed'.format(newDir, bid)
        relative_pos_table_filepath = '{}/{}_relative_pos_aggregate.table'.format(
            newDir, bid)
        bedtools.register(
            Parameter('intersect'),
            Parameter('-a {}'.format(pathTo_hg19_bed_start100)),
            Parameter('-b', Data(uniq_bam).as_input()),
            Parameter('-s'),
            Parameter('-bed'),
            Parameter('-wa'),
            Parameter('-wb'),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir,
                                       '{}.intersect_start100.bed'.format(bid))),
            extra_outputs=[Data(intersectBed_filepath).as_output()])

        CodeBlock.register(
            func=relative_pos_table,
            args=[],
            kwargs={
                'intersect_bedtools_filepath': intersectBed_filepath,
                'relativePos_filepath': relative_pos_table_filepath
            },
            inputs=[Data(intersectBed_filepath).as_input()],
            outputs=[Data(relative_pos_table_filepath).as_output()])

        codon_periodicity_filepath = '{}/{}_codon_periodicity_plot.png'.format(
            newDir, bid)
        CodeBlock.register(
            func=create_codon_periodicity,
            args=[],
            kwargs={
                'relativePos_filepath': relative_pos_table_filepath,
                'codon_periodicity_filepath': codon_periodicity_filepath
            },
            inputs=[Data(relative_pos_table_filepath).as_input()],
            outputs=[Data(codon_periodicity_filepath).as_output()])

        # Picard
        picard.register(
            Parameter('CollectMultipleMetrics'),
            Parameter('I={}'.format(uniq_bam)),  # input
            Parameter('O={}/{}.multiple_metrics'.format(newDir, bid)),  # output
            Parameter('R={}'.format(pathTo_genomeFasta)),  # genome reference
            extra_inputs=[Data(uniq_bam)])
        picard.register(
            Parameter('CollectGcBiasMetrics'),
            Parameter('I={}'.format(uniq_bam)),
            Parameter('O={}/{}.gc_bias_metrics'.format(newDir, bid)),  # output
            Parameter('CHART={}/{}.gc_bias_metrics.pdf'.format(newDir, bid)),  # chart
            Parameter('S={}/{}.summary_metrics'.format(newDir, bid)),  # summary metrics
            Parameter('R={}'.format(pathTo_genomeFasta)),  # genome reference
            extra_inputs=[Data(uniq_bam)])
        picard.register(
            Parameter('CollectRnaSeqMetrics'),
            Parameter('I={}'.format(uniq_bam)),
            Parameter('O={}/{}.RNA_Metrics'.format(newDir, bid)),  # output
            Parameter('REF_FLAT={}'.format(pathTo_ref_flat)),  # ref_flat
            Parameter('STRAND=FIRST_READ_TRANSCRIPTION_STRAND'),  # strandedness
            extra_inputs=[Data(uniq_bam)])
        picard.register(
            Parameter('MarkDuplicates'),
            Parameter('I={}'.format(uniq_bam)),
            Parameter('O={}/{}.marked_duplicates.bam'.format(newDir, bid)),  # output
            Parameter('M={}/{}.marked_dup_metrics.txt'.format(newDir, bid)),  # marked dup metrics
            Parameter('ASSUME_SORTED=true'),  # it is sorted
            extra_inputs=[Data(uniq_bam)])

        # featureCounts
        featureCounts.register(
            Parameter('-a', Data(pathToGtf).as_input()),  # gtf
            Parameter('-s', '1'),  # strand-specific read counting
            Parameter('-o', '{}/{}.featureCounts'.format(newDir, bid)),  # output
            Parameter(Data(uniq_bam).as_input()))  # input

        # fastQC
        fastQC.register(
            Parameter('--outdir={}'.format(newDir)),  # output
            Parameter('--t', numThreads),
            Parameter(Data(fastq).as_input()))

        # read_distribution
        read_distribution.register(
            Parameter('-r'),
            Parameter(Data(pathTo_hg19_bed).as_input()),
            Parameter('-i'),
            Parameter(Data(uniq_bam).as_input()),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir,
                                       '{}.read_distribution.log'.format(bid))))