def test_redirects():
    """Redirect objects stringify correctly and expose mode/dest as expected."""
    dest = '/path/to/dest'

    # Symbolic stream tokens and their class-constant equivalents render the same
    assert str(Redirect(stream='>', dest=dest)) == '1>/path/to/dest'
    assert str(Redirect(stream=Redirect.STDOUT, dest=dest)) == '1>/path/to/dest'
    assert str(Redirect(stream='2>', dest=dest)) == '2>/path/to/dest'
    assert str(Redirect(stream=Redirect.STDERR, dest=dest)) == '2>/path/to/dest'
    assert str(Redirect(stream='&>', dest=dest)) == '&>/path/to/dest'
    assert str(Redirect(stream=Redirect.BOTH, dest=dest)) == '&>/path/to/dest'

    # A plain string dest is kept as a string, and the default mode is write
    stdout_redirect = Redirect(stream='>', dest=dest)
    assert stdout_redirect.mode == 'w'
    assert isinstance(stdout_redirect.dest, str)

    # The NULL constant expands to /dev/null
    assert str(Redirect(stream='>', dest=Redirect.NULL)) == '1>/dev/null'

    # A Data dest is stored as-is and stringifies into the redirect
    data_redirect = Redirect(stream='>', dest=Data('/path/to/data'))
    assert isinstance(data_redirect.dest, Data)
    assert str(data_redirect) == '1>/path/to/data'
def test_codeblocks():
    """CodeBlock.register records one blueprint per call with args, kwargs,
    file dependencies, stream targets, and wait_on links."""
    def func1(one, two, three):
        return one**two + three

    # First registration: positional args plus an input dependency
    deferred_a = CodeBlock.register(
        func=func1,
        args=(1, 2, 3),
        inputs=[Data('/wait_one.txt')])
    assert isinstance(deferred_a, _DeferredApp)

    # Second registration: kwargs, output, streams, and a dependency on the first
    deferred_b = CodeBlock.register(
        func=func1,
        kwargs={'one': 1, 'two': 2, 'three': 3},
        outputs=[Data('/output_one.txt')],
        stdout='/reg2.out',
        stderr='/reg2.err',
        wait_on=[deferred_a])

    blueprint_a = _ParslAppBlueprint._blueprints[deferred_a.app_id]
    blueprint_b = _ParslAppBlueprint._blueprints[deferred_b.app_id]

    # Each call gets a unique ID, both reference the same function object
    assert blueprint_a['id'] != blueprint_b['id']
    assert blueprint_a['func'] is func1
    assert blueprint_b['func'] is func1

    # args and kwargs are stored exactly as supplied (empty defaults otherwise)
    assert blueprint_a['args'] == (1, 2, 3)
    assert blueprint_a['kwargs'] == dict()
    assert blueprint_b['args'] == list()
    assert blueprint_b['kwargs'] == {'one': 1, 'two': 2, 'three': 3}

    # inputs, outputs, wait_on, stdout, and stderr are populated per call
    assert blueprint_a['inputs'] == ['/wait_one.txt']
    assert blueprint_b['outputs'] == ['/output_one.txt']
    assert len(blueprint_b['wait_on']) == 1
    assert blueprint_b['wait_on'][0] == blueprint_a['id']
    assert blueprint_b['stdout'] == '/reg2.out'
    assert blueprint_b['stderr'] == '/reg2.err'
def test_parameters():
    """Parameter objects stringify with the right separators and collect Data args."""
    bare = Parameter('one')
    flags = Parameter('-a', 'one', '-b', 'two')
    equals_sep = Parameter('--one', 'a', sep='=')
    fused = Parameter('-a one -b', 'two')
    single_data = Parameter(Data('/path/to/data'))
    double_data = Parameter('-a', Data('/path/to/data1'), Data('/path/to/data2'))

    # String forms, with default and custom separators
    assert str(bare) == 'one'
    assert str(flags) == '-a one -b two'
    assert str(equals_sep) == '--one=a'
    # Spaces inside a single argument string are handled correctly
    assert str(fused) == '-a one -b two'

    # Data objects passed as arguments are tracked on the parameter
    assert len(bare.data) == 0 and not bare.data
    assert len(single_data.data) == 1
    assert len(double_data.data) == 2
    assert str(single_data.data[0]) == '/path/to/data'
    assert str(double_data) == '-a /path/to/data1 /path/to/data2'
def test_data():
    """Data is a path-keyed flyweight: the same path always yields the same
    instance, so mode changes are visible through every reference."""
    # Reset the class-level registry so instance counts below are deterministic
    Data._data = {}

    # Stringifies to its path
    d_norm = Data('/path/to/data')
    assert str(d_norm) == '/path/to/data'

    # Multiple instantiations of the same path return the same instance
    d_same1 = Data('/path/to/same')
    d_same2 = Data('/path/to/same')
    assert str(d_same1) == str(d_same2)
    assert d_same1 is d_same2

    # Data class should have two stored paths
    assert len(Data._data) == 2

    # Mark Data object as input/output
    assert d_norm.mode is None
    d_norm_input = Data('/path/to/data').as_input()
    assert d_norm_input.mode == Data.INPUT
    d_norm_output = Data('/path/to/data').as_output()
    assert d_norm_output.mode == Data.OUTPUT
    # d_norm should also be output now, since paths were the same
    assert d_norm.mode == Data.OUTPUT
    # Re-instantiating an existing path must not create a new entry
    assert len(Data._data) == 2

    # Mark Data object as temporary
    d_tmp = Data('/path/to/tmp').as_output(tmp=True)
    # `is True` (identity) rather than `== True`: pins the flag to a real bool
    assert d_tmp.tmp is True
    assert d_tmp.mode == Data.OUTPUT
    assert len(Data._data) == 3

    # Passing '' returns a plain empty string, not a Data instance
    d_blank = Data('')
    assert isinstance(d_blank, str)
    assert d_blank == ''
    assert len(Data._data) == 3  # Ensure new Data object was not created
def pipeline(self, pipeline_args, pipeline_config):
    """RiboSeq QC pipeline starting from aligned BAM files (bam:lib pairs):
    extracts uniquely-mapped reads, then runs codon-periodicity QC, Picard
    metrics, featureCounts, and read_distribution per library."""
    # chunky run RiboSeq_pipe.py --fastqs
    # /mnt/cinder/thomas/RiboSeq/Lane5/AWS-3_S3_L005_R1_001.fastq.gz
    # --output /mnt/cinder/thomas/RiboSeq/test --threads
    # create variables from parser if wanted
    bamFiles = pipeline_args['bam:lib']
    outputDir = pipeline_args['output']
    numThreads = pipeline_args['threads']

    # Create output directory
    # NOTE(review): os.makedirs(outputDir, exist_ok=True) would avoid the
    # subprocess round-trip here
    subprocess.call(['mkdir', outputDir])

    # Software
    # Separate Software instances per samtools role so each registration is
    # a distinct app in the dependency graph
    bedtools = Software('bedtools', pipeline_config['bedtools']['path'])
    samtools = Software('samtools', pipeline_config['samtools']['path'])
    samtools_header = Software('samtools', pipeline_config['samtools']['path'])
    samtools_uniq = Software('samtools', pipeline_config['samtools']['path'])
    samtools_sort = Software('samtools sort', pipeline_config['samtools']['path'])
    read_distribution = Software('read_distribution',
                                 pipeline_config['read_distribution']['path'])
    featureCounts = Software('featureCounts',
                             pipeline_config['featureCounts']['path'])
    fastQC = Software('FastQC', pipeline_config['FastQC']['path'])
    picard = Software(
        'picard',
        "java -Xms8g -Xmx9g -jar {}".format(pipeline_config['picard']['path']))
    # Change these to just be done in python script?
    # Common software tools
    # NOTE(review): awk/sort/uniq/paste and samtools_sort/fastQC are created
    # but never registered in this variant of the pipeline
    awk = Software('awk', 'awk')
    sort = Software('sort', 'sort')
    uniq = Software('uniq', 'uniq')
    paste = Software('paste', 'paste')
    cat = Software('cat', 'cat')
    grep = Software('grep', 'grep')

    # Directories and Files
    pathToGtf = pipeline_config['star']['GTF_ref']
    pathTo_hg19_bed = pipeline_config['read_distribution']['hg19_bed']
    pathTo_hg19_bed_start100 = pipeline_config['bedtools']['hg19_start100']
    pathTo_grch37_bed = pipeline_config['bedtools']['grch37_bed']
    pathTo_genomeFasta = pipeline_config['picard']['genomeFasta']
    pathTo_ref_flat = pipeline_config['picard']['refFlat']

    # bid_list = []
    # for fastqlib in fastqFiles:
    #     bid_list.append(fastqlib.split(':')[-1])

    for bamlib in bamFiles:
        # Each entry is '<bam path>:<library id>'
        bam, bid = bamlib.split(':')
        newDir = os.path.join(outputDir, bid)
        subprocess.call(['mkdir', newDir])

        # Keep only uniquely-mapped reads (NH:i:1), re-attach the header,
        # and pipe through samtools view/sort into a sorted BAM
        uniq_bam = '{}/{}.uniq_sorted.bam'.format(newDir, bid)
        samtools_uniq.register(
            Parameter('view'),
            Parameter(Data(bam).as_input()),  # star outfile name
            Pipe(
                grep.prep(
                    Parameter('-w'),
                    Parameter('NH:i:1'),
                    Pipe(
                        cat.prep(
                            Parameter(os.path.join(newDir, '{}.header.sam'.format(bid)), '-'),
                            Pipe(
                                samtools.prep(
                                    Parameter('view'),
                                    Parameter('-bS', '-'),
                                    Pipe(
                                        samtools.prep(
                                            Parameter('sort'),
                                            Parameter('-', '-o', Data(uniq_bam).as_output())
                                        )))))))))

        # QC
        # Codon_periodicity
        intersectBed_filepath = '{}/{}.intersect_start100.bed'.format(newDir, bid)
        relative_pos_table_filepath = '{}/{}_relative_pos_aggregate.table'.format(newDir, bid)
        bedtools.register(
            Parameter('intersect'),
            Parameter('-a {}'.format(pathTo_hg19_bed_start100)),
            Parameter('-b', Data(uniq_bam).as_input()),
            Parameter('-s'),
            Parameter('-bed'),
            Parameter('-wa'),
            Parameter('-wb'),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir, '{}.intersect_start100.bed'.format(bid))),
            extra_outputs=[Data(intersectBed_filepath).as_output()]
        )
        # Aggregate relative positions from the intersect output (python app)
        CodeBlock.register(
            func=relative_pos_table,
            args=[],
            kwargs={'intersect_bedtools_filepath': intersectBed_filepath,
                    'relativePos_filepath': relative_pos_table_filepath},
            inputs=[Data(intersectBed_filepath).as_input()],
            outputs=[Data(relative_pos_table_filepath).as_output()]
        )
        # Plot codon periodicity from the aggregate table (python app)
        codon_periodicity_filepath = '{}/{}_codon_periodicity_plot.png'.format(newDir, bid)
        CodeBlock.register(
            func=create_codon_periodicity,
            args=[],
            kwargs={'relativePos_filepath': relative_pos_table_filepath,
                    'codon_periodicity_filepath': codon_periodicity_filepath},
            inputs=[Data(relative_pos_table_filepath).as_input()],
            outputs=[Data(codon_periodicity_filepath).as_output()]
        )

        # Picard
        picard.register(
            Parameter('CollectMultipleMetrics'),
            Parameter('I={}'.format(uniq_bam)),  # input
            Parameter('O={}/{}.multiple_metrics'.format(newDir, bid)),  # output
            Parameter('R={}'.format(pathTo_genomeFasta)),  # genomeReference
            extra_inputs=[Data(uniq_bam)]
        )
        picard.register(
            Parameter('CollectGcBiasMetrics'),
            Parameter('I={}'.format(uniq_bam)),
            Parameter('O={}/{}.gc_bias_metrics'.format(newDir, bid)),  # output
            Parameter('CHART={}/{}.gc_bias_metrics.pdf'.format(newDir, bid)),  # chart
            Parameter('S={}/{}.summary_metrics'.format(newDir, bid)),  # summary metrics
            Parameter('R={}'.format(pathTo_genomeFasta)),  # genome reference
            extra_inputs=[Data(uniq_bam)]
        )
        picard.register(
            Parameter('CollectRnaSeqMetrics'),
            Parameter('I={}'.format(uniq_bam)),
            Parameter('O={}/{}.RNA_Metrics'.format(newDir, bid)),  # output
            Parameter('REF_FLAT={}'.format('{}'.format(pathTo_ref_flat))),  # ref_flat
            Parameter('STRAND=FIRST_READ_TRANSCRIPTION_STRAND'),  # strandedness
            extra_inputs=[Data(uniq_bam)]
        )
        picard.register(
            Parameter('MarkDuplicates'),
            Parameter('I={}'.format(uniq_bam)),
            Parameter('O={}/{}.marked_duplicates.bam'.format(newDir, bid)),  # output
            Parameter('M={}/{}.marked_dup_metrics.txt'.format(newDir, bid)),  # marked dup metrics
            Parameter('ASSUME_SORTED=true'),  # It is sorted
            extra_inputs=[Data(uniq_bam)]
        )

        # featureCounts
        featureCounts.register(
            Parameter('-a', Data('{}'.format(pathToGtf)).as_input()),  # gtf
            Parameter('-s', '1'),  # strand-specific read counting
            Parameter('-o', '{}/{}.featureCounts'.format(newDir, bid)),  # output
            Parameter(Data(uniq_bam).as_input())  # input
        )

        # fastQC
        # fastQC.register(
        #     Parameter('--outdir={}'.format(newDir)),  # output
        #     Parameter('--t', numThreads),
        #     Parameter(Data(fastq).as_input())
        # )

        # read_distribution
        read_distribution.register(
            Parameter('-r'),
            Parameter(Data(pathTo_hg19_bed).as_input()),
            Parameter('-i'),
            Parameter(Data(uniq_bam).as_input()),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir, '{}.read_distribution.log'.format(bid)))
        )
def pipeline(self, pipeline_args, pipeline_config):
    """Demo pipeline: downloads a small image with wget, then builds a
    fan-out/fan-in graph of ersatz apps whose ordering is expressed through
    Data file dependencies and explicit wait_on links."""
    # Run inside a fresh temp dir so relative Data paths ('21.out', ...) are isolated
    tmpdir = tempfile.mkdtemp()
    os.chdir(tmpdir)

    wget = Software('wget')
    ersatz = Software('ersatz')
    # NOTE(review): star is constructed but never registered in this pipeline
    star = Software('STAR', subprogram='twopass')

    # Download a test image; its output Data feeds Ersatz2 below
    wget.register(
        Parameter(
            'https://lh3.googleusercontent.com/'
            '8gaEOU2p30N4Up-KMUl4MQBtnn0F5DyH5bqKKr0QqptnQgPk4lxXaWLJhi8Dcu9i8qE=w170'
        ),
        Parameter('-O', Data(os.path.join(tmpdir, 'image.png')).as_output()))

    # One master app with four dependents linked purely via wait_on
    master = ersatz.register()
    ersatz.register(wait_on=[master])
    ersatz.register(wait_on=[master])
    ersatz.register(wait_on=[master])
    ersatz.register(wait_on=[master])

    # Ersatz2 (2)
    ersatz.register(
        Parameter('--inputs', Data(os.path.join(tmpdir, 'image.png')).as_input()),
        Parameter('--outputs', Data('21.out').as_output(), Data('22.out').as_output()))
    # Ersatz1 (3)
    ersatz.register(Parameter('--outputs', Data('11.out').as_output()))
    # Ersatz3 (4)
    ersatz.register(Parameter('--outputs', Data('31.out').as_output()))
    # Ersatz4 (5)
    ersatz4 = ersatz.register(
        Parameter('--inputs', Data('11.out').as_input(), Data('21.out').as_input()),
        Parameter('--outputs', Data('41.out').as_output(), Data('42.out').as_output()))
    # Ersatz5 (6) — depends on files from (2)/(4) plus an explicit wait on Ersatz4
    ersatz.register(
        Parameter('--inputs', Data('22.out').as_input(), Data('31.out').as_input()),
        Parameter('--outputs', Data('51.out').as_output()),
        wait_on=[ersatz4]  # TODO Software as dependencies
    )
    # Ersatz6 (7) — final fan-in over all upstream outputs
    ersatz.register(
        Parameter('--inputs', Data('41.out').as_input(), Data('42.out').as_input(),
                  Data('51.out').as_input()),
        Parameter('--outputs', Data('61.out').as_output()))
def test_software():
    """End-to-end checks of Software.register()/.prep(): path inference from
    config, blueprint bookkeeping, Data/Redirect collection, Pipe composition,
    extra-Redirect/Pipe precedence, and default stdout/stderr temp files."""
    # Reset app_id counter
    _ParslAppBlueprint._id_counter = 0
    # Set temp dir
    ParslPipeline._pipeline_run_temp_dir = tempfile.TemporaryDirectory(
        dir='/tmp', suffix='__operon')
    # Spoof pipeline config so paths can be inferred by software name
    Software._pipeline_config = {
        'software1': {
            'path': '/path/to/soft1'
        },
        'software2': {
            'path': '/path/to/soft2',
            'one': 'two'
        },
        'bwa_mem': {
            'path': '/path/to/bwa mem'
        }
    }
    # Create Software instances
    software1 = Software('software1')
    software2 = Software('software2', subprogram='sub')
    assert software2.basename == 'soft2'
    software3 = Software('software3', path='/path/to/soft3')
    software4 = Software('software4', path='/path/to/soft4', subprogram='four')
    # Subprogram is appended to the path with a space
    assert software4.path == '/path/to/soft4 four'
    bwa_mem = Software('bwa_mem')
    assert bwa_mem.path == '/path/to/bwa mem'
    assert bwa_mem.basename == 'bwa_mem'
    # Raise ValueError when software path cannot be inferred
    with pytest.raises(ValueError):
        software5 = Software('software5')
    # With different success codes
    software6 = Software('software6', path='/path/to/soft6',
                         success_on=['0', '1', '128'])
    # Registering properly adds to blueprints; IDs follow '<basename>_<counter>'
    software1.register(Parameter('-a', 'one'),
                       Redirect(stream='>', dest='/path/to/dest'))
    assert len(_ParslAppBlueprint._blueprints) == 1
    assert list(_ParslAppBlueprint._blueprints.keys()) == ['soft1_1']
    # .prep() unique ID per call
    assert software2.prep()['id'] != software2.prep()['id']
    # inputs, outputs, wait_on, stdout, stderr lists properly populated
    # Proper handling of Data objects
    _def_soft3 = software3.register(Parameter('-a', 'one'))
    assert isinstance(_def_soft3, _DeferredApp)
    _soft4_blueprint = software4.prep(
        Parameter('-a', Data('/input1.txt').as_input()),
        Parameter('-b', Data('/input2.txt').as_input()),
        Parameter('--outfile', Data('/output1.txt').as_output()),
        Redirect(stream='>', dest='/stdout.log'),
        Redirect(stream=Redirect.STDERR, dest=Data('/stderr.log')),
        extra_inputs=[Data('/input3.txt'), Data('/input4.txt')],
        extra_outputs=[Data('/output2.txt'), Data('/output3.txt')],
        wait_on=[_def_soft3])
    # Parameter Data and extra_inputs are merged into one inputs list;
    # a Data stderr Redirect also counts as an output
    assert sorted(_soft4_blueprint['inputs']) == [
        '/input{}.txt'.format(i) for i in range(1, 5)
    ]
    assert sorted(_soft4_blueprint['outputs']) == [
        '/output{}.txt'.format(i) for i in range(1, 4)
    ] + ['/stderr.log']
    assert len(_soft4_blueprint['wait_on']) == 1
    assert _soft4_blueprint['wait_on'][0] == _def_soft3.app_id
    assert _soft4_blueprint['stdout'] == '/stdout.log'
    assert _soft4_blueprint['stderr'] == '/stderr.log'
    # Proper handling of Pipe: downstream app's redirect wins, inputs bubble up
    _piped_soft1_soft6 = software1.prep(
        Parameter('-a', '1'),
        Redirect(stream='>', dest='/ignored.out'),  # This should be overridden by Pipe
        Pipe(
            software6.prep(Parameter('-b', '2'),
                           Parameter('-c', Data('/piped.out').as_input()),
                           Redirect(stream='>', dest='/piped.log'))))
    assert _piped_soft1_soft6['cmd'].count('|') == 1
    assert _piped_soft1_soft6['stdout'] == '/piped.log'
    assert len(_piped_soft1_soft6['inputs']) == 1
    assert _piped_soft1_soft6['inputs'][0] == '/piped.out'
    # Ignore extra Redirects: only the first of each stream kind is kept
    _ignore_redir = software2.prep(
        Parameter('-a', '1'),
        Redirect(stream='>', dest='/real.out'),
        Redirect(stream='2>', dest='/real.err'),
        Redirect(stream='>', dest='/ignored.out'),
        Redirect(stream='>', dest='/also_ignored.out'),
        Redirect(stream='2>', dest='/also_ignored.err'))
    assert _ignore_redir['stdout'] == '/real.out'
    assert _ignore_redir['stderr'] == '/real.err'
    # Ignore extra pipes: only the first Pipe is honored
    _ignore_pipe = software2.prep(
        Parameter('-d', 'honeysucklerose'),
        Pipe(software1.prep(Parameter('-a', 'real'))),
        Pipe(software4.prep(Parameter('-b', 'ignored'))))
    assert _ignore_pipe['cmd'].count('|') == 1
    assert 'ignored' not in _ignore_pipe['cmd']
    assert 'real' in _ignore_pipe['cmd']
    # Without explicit Redirects, stdout/stderr go to generated temp files
    _stdout_stderr_tmp = software6.prep(Parameter('-a', 'ella'))
    assert _stdout_stderr_tmp['stdout'] is not None
    assert _stdout_stderr_tmp['stdout'].endswith('.stdout')
    assert _stdout_stderr_tmp['stderr'] is not None
    assert _stdout_stderr_tmp['stderr'].endswith('.stderr')
def pipeline_components_for_tests():
    """Register a small app graph (apps a..i) used by the test suite.

    The start/end comments give each app's expected schedule in seconds,
    derived from the declared sleeps and the Data/wait_on dependencies.
    """
    # Instantiate software
    petrichor = Software(
        'petrichor',
        '/home/dfitzgerald/workspace/PycharmProjects/Operon/tests/petrichor')
    bash_sleep = Software('bash_sleep', '/bin/sleep')

    # Define python app
    def notos(sleep=None, outs=None):
        # Imports are local — presumably because the function is serialized
        # and executed remotely by the workflow engine; confirm against
        # CodeBlock's execution model
        import time
        import random
        id_ = random.randint(1, 1000)
        if sleep:
            time.sleep(sleep)
        if outs:
            # Write the same random id into every requested output file
            for out in outs:
                with open(out, 'w') as outfile:
                    outfile.write('{}\n'.format(id_))

    # App a, start=0, end=2
    petrichor.register(Parameter('--sleep', '2'),
                       Redirect(stream='>', dest=Data('a.out')))
    # App b, start=0, end=3
    petrichor.register(Parameter('--sleep', '3'),
                       Redirect(stream='>', dest=Data('b.out')))
    # App c, start=0, end=5
    petrichor.register(Parameter('--sleep', '5'),
                       Redirect(stream='>', dest=Data('c.out')))
    # App d, start=5, end=8 — waits on a/b/c via Data inputs
    CodeBlock.register(func=notos,
                       kwargs={
                           'sleep': 3,
                           'outs': ['d1.out', 'd2.out']
                       },
                       inputs=[Data('a.out'), Data('b.out'), Data('c.out')],
                       outputs=[Data('d1.out'), Data('d2.out')])
    # App e, start=0, end=10
    app_e = bash_sleep.register(Parameter('10'))
    # App g, start=10, end=12 — needs d2.out AND an explicit wait on e
    CodeBlock.register(func=notos,
                       kwargs={
                           'sleep': 2,
                           'outs': ['g1.out', 'g2.out']
                       },
                       inputs=[Data('d2.out')],
                       outputs=[Data('g1.out'), Data('g2.out')],
                       wait_on=[app_e])
    # App f, start=8, end=11 — produces a temporary output
    petrichor.register(Parameter('--sleep', '3'),
                       Parameter('--outfile', Data('f.out').as_output(tmp=True)),
                       extra_inputs=[Data('d1.out')])
    # App h, start=12, end=18
    app_h = bash_sleep.register(Parameter('6'), extra_inputs=[Data('g2.out')])
    # App i, start=18, end=20 — final sink of the graph
    petrichor.register(Parameter('--sleep', '2'),
                       Parameter('--outfile', Data('i.final').as_output()),
                       extra_inputs=[Data('g1.out'), Data('f.out')],
                       wait_on=[app_h])
def multipipeline_components_for_tests():
    """Register the same a..i app graph as pipeline_components_for_tests, but
    with each app pinned to one of three Meta-defined executors (small/med/large)
    via the meta={'executor': ...} argument."""
    # Instantiate software
    petrichor = Software(
        'petrichor',
        '/home/dfitzgerald/workspace/PycharmProjects/Operon/tests/petrichor')
    bash_sleep = Software('bash_sleep', '/bin/sleep')

    # Define python app
    def notos(sleep=None, outs=None):
        # Local imports — presumably required because the function body is
        # shipped to a remote executor; confirm against CodeBlock semantics
        import time
        import random
        id_ = random.randint(1, 1000)
        if sleep:
            time.sleep(sleep)
        if outs:
            for out in outs:
                with open(out, 'w') as outfile:
                    outfile.write('{}\n'.format(id_))

    # Declare three executors of increasing size for apps to target
    Meta.define_executor(label='small', resources={'cpu': '1', 'mem': '1G'})
    Meta.define_executor(label='med', resources={'cpu': '2', 'mem': '2G'})
    Meta.define_executor(label='large', resources={'cpu': '3', 'mem': '3G'})

    # App a, start=0, end=2
    # petrichor_1
    petrichor.register(Parameter('--sleep', '2'),
                       Redirect(stream='>', dest=Data('a.out')),
                       meta={'executor': 'small'})
    # App b, start=0, end=3
    # petrichor_2
    petrichor.register(Parameter('--sleep', '3'),
                       Redirect(stream='>', dest=Data('b.out')),
                       meta={'executor': 'med'})
    # App c, start=0, end=5
    # petrichor_3
    petrichor.register(Parameter('--sleep', '5'),
                       Redirect(stream='>', dest=Data('c.out')),
                       meta={'executor': 'large'})
    # App d, start=5, end=8
    # notos_4
    CodeBlock.register(func=notos,
                       kwargs={
                           'sleep': 3,
                           'outs': ['d1.out', 'd2.out']
                       },
                       inputs=[Data('a.out'), Data('b.out'), Data('c.out')],
                       outputs=[Data('d1.out'), Data('d2.out')],
                       meta={'executor': 'small'})
    # App e, start=0, end=10
    # sleep_5
    app_e = bash_sleep.register(Parameter('10'), meta={'executor': 'med'})
    # App g, start=10, end=12
    # notos_6
    CodeBlock.register(func=notos,
                       kwargs={
                           'sleep': 2,
                           'outs': ['g1.out', 'g2.out']
                       },
                       inputs=[Data('d2.out')],
                       outputs=[Data('g1.out'), Data('g2.out')],
                       wait_on=[app_e],
                       meta={'executor': 'small'})
    # App f, start=8, end=11
    # petrichor_7
    petrichor.register(Parameter('--sleep', '3'),
                       Parameter('--outfile', Data('f.out').as_output(tmp=True)),
                       extra_inputs=[Data('d1.out')],
                       meta={'executor': 'med'})
    # App h, start=12, end=18
    # sleep_8
    app_h = bash_sleep.register(Parameter('6'),
                                extra_inputs=[Data('g2.out')],
                                meta={'executor': 'large'})
    # App i, start=18, end=20
    # petrichor_9
    petrichor.register(Parameter('--sleep', '2'),
                       Parameter('--outfile', Data('i.final').as_output()),
                       extra_inputs=[Data('g1.out'), Data('f.out')],
                       wait_on=[app_h],
                       meta={'executor': 'small'})
def pipeline(self, pipeline_args, pipeline_config):
    """Full RiboSeq pipeline from raw fastq (fastq:lib pairs): adapter trimming
    (cutadapt), contaminant filtering (bowtie2), alignment (STAR), extraction
    of uniquely-mapped reads, then QC and counting (bedtools codon periodicity,
    Picard metrics, featureCounts, FastQC, read_distribution)."""
    # chunky run RiboSeq_pipe.py --fastqs
    # /mnt/cinder/thomas/RiboSeq/Lane5/AWS-3_S3_L005_R1_001.fastq.gz
    # --output /mnt/cinder/thomas/RiboSeq/test --threads
    # create variables from parser if wanted
    fastqFiles = pipeline_args['fastq:lib']
    outputDir = pipeline_args['output']
    adapter = pipeline_args['adapter']
    numThreads = pipeline_args['threads']

    # Create output directory
    # NOTE(review): os.makedirs(outputDir, exist_ok=True) would avoid the
    # subprocess round-trip here
    subprocess.call(['mkdir', outputDir])

    # Software
    # Distinct samtools instances per role so each registration is a separate app
    cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
    star = Software('star', pipeline_config['star']['path'])
    bedtools = Software('bedtools', pipeline_config['bedtools']['path'])
    bowtie2 = Software('bowtie2', pipeline_config['bowtie2']['path'])
    samtools = Software('samtools', pipeline_config['samtools']['path'])
    samtools_header = Software('samtools', pipeline_config['samtools']['path'])
    samtools_uniq = Software('samtools', pipeline_config['samtools']['path'])
    samtools_sort = Software('samtools sort', pipeline_config['samtools']['path'])
    read_distribution = Software(
        'read_distribution', pipeline_config['read_distribution']['path'])
    featureCounts = Software('featureCounts',
                             pipeline_config['featureCounts']['path'])
    fastQC = Software('FastQC', pipeline_config['FastQC']['path'])
    picard = Software(
        'picard', "java -Xms8g -Xmx9g -jar {}".format(
            pipeline_config['picard']['path']))
    # Change these to just be done in python script?
    # Common software tools
    # NOTE(review): awk/sort/uniq/paste and samtools_sort are created but
    # never registered in this pipeline
    awk = Software('awk', 'awk')
    sort = Software('sort', 'sort')
    uniq = Software('uniq', 'uniq')
    paste = Software('paste', 'paste')
    cat = Software('cat', 'cat')
    grep = Software('grep', 'grep')

    # Directories and Files
    pathToGenomeDir = pipeline_config['star']['genomeDir']
    pathToGenome = pipeline_config['bowtie2']['genome_ref']
    pathToGtf = pipeline_config['star']['GTF_ref']
    pathTo_hg19_bed = pipeline_config['read_distribution']['hg19_bed']
    pathTo_hg19_bed_start100 = pipeline_config['bedtools']['hg19_start100']
    pathTo_grch37_bed = pipeline_config['bedtools']['grch37_bed']
    pathTo_genomeFasta = pipeline_config['picard']['genomeFasta']
    pathTo_ref_flat = pipeline_config['picard']['refFlat']

    # bid_list = []
    # for fastqlib in fastqFiles:
    #     bid_list.append(fastqlib.split(':')[-1])

    for fastqlib in fastqFiles:
        # Each entry is '<fastq path>:<library id>'
        fastq, bid = fastqlib.split(':')
        # Make new directories to store data
        newDir = os.path.join(outputDir, bid)
        subprocess.call(['mkdir', newDir])

        # Adapter trimming; the trimmed fastq is marked temporary
        trimmed_read_filename = '{}/{}.trimmed.fastq.gz'.format(newDir, bid)
        cutadapt.register(
            Parameter('--quality-base=33'),
            Parameter('--minimum-length=25'),
            Parameter('--discard-untrimmed'),
            Parameter('--output={}'.format(trimmed_read_filename)),
            # Parameter('--cores', numThreads),
            Parameter('-a', adapter),
            Parameter(Data(fastq).as_input()),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir,
                                       '{}.cutadapt.summary.log'.format(bid))),
            extra_outputs=[Data(trimmed_read_filename).as_output(tmp=True)])

        # Filter contaminant reads; unaligned reads continue to STAR
        bowtie2.register(
            Parameter('--seedlen=23'),
            Parameter('--threads', numThreads),
            Parameter('--un-gz',
                      Data('{}/{}.filtered.fastq.gz'.format(newDir, bid)).as_output()),
            Parameter('-x', Data(pathToGenome).as_input()),  # Path to rtsRNA_seqs files
            Parameter('-U', Data(trimmed_read_filename).as_input()),
            Parameter('-S'),
            Parameter(Data('{}/{}.rts.sam'.format(newDir, bid)).as_output(tmp=True)),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir, '{}.bowtie2.log'.format(bid))),
            Redirect(stream=Redirect.STDERR,
                     dest=os.path.join(newDir, '{}.bowtie2.log2'.format(bid))))

        # Compress the contaminant SAM to BAM for record-keeping
        samtools.register(
            Parameter('view'),
            Parameter('-Sb'),
            Parameter(Data('{}/{}.rts.sam'.format(newDir, bid)).as_input()),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir, '{}.rts.bam'.format(bid))),
        )

        star.register(
            Parameter('--runThreadN', numThreads),  # Change to command line parameter --threads
            Parameter('--sjdbGTFfile', pathToGtf),
            Parameter('--outSAMtype', 'BAM', 'Unsorted'),
            Parameter('--outFileNamePrefix', '{}/{}_'.format(newDir, bid)),
            Parameter('--genomeDir', pathToGenomeDir),
            # Parameter('--genomeLoad', genomeLoad), broken
            Parameter('--readFilesIn',
                      Data('{}/{}.filtered.fastq.gz'.format(newDir, bid)).as_input()),
            Parameter('--readFilesCommand zcat'),  # reads gzipped files
            extra_outputs=[
                Data('{}/{}.Aligned.bam'.format(newDir, bid)).as_output()
            ])

        # Capture the BAM header so it can be re-attached after grep filtering
        samtools_header.register(
            Parameter('view'),
            Parameter('-H'),
            Parameter(Data('{}/{}.Aligned.bam'.format(newDir, bid)).as_input()),  # star outfile name
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir, '{}.header.sam'.format(bid))),
            extra_outputs=[
                Data('{}/{}.header.sam'.format(newDir, bid)).as_output()
            ])

        # Keep only uniquely-mapped reads (NH:i:1), re-attach the header,
        # convert to BAM, and sort — all through a chain of Pipes
        uniq_bam = '{}/{}.uniq_sorted.bam'.format(newDir, bid)
        samtools_uniq.register(
            Parameter('view'),
            Parameter(Data('{}/{}.Aligned.bam'.format(newDir, bid)).as_input()),  # star outfile name
            Pipe(
                grep.prep(
                    Parameter('-w'),
                    Parameter('NH:i:1'),
                    Pipe(
                        cat.prep(
                            Parameter(os.path.join(newDir, '{}.header.sam'.format(bid)), '-'),
                            Pipe(
                                samtools.prep(
                                    Parameter('view'),
                                    Parameter('-bS', '-'),
                                    Pipe(
                                        samtools.prep(
                                            Parameter('sort'),
                                            Parameter('-', '-o', Data(uniq_bam).as_output())
                                        )))))))))

        # QC
        # Codon_periodicity
        intersectBed_filepath = '{}/{}.intersect_start100.bed'.format(newDir, bid)
        relative_pos_table_filepath = '{}/{}_relative_pos_aggregate.table'.format(newDir, bid)
        bedtools.register(
            Parameter('intersect'),
            Parameter('-a {}'.format(pathTo_hg19_bed_start100)),
            Parameter('-b', Data(uniq_bam).as_input()),
            Parameter('-s'),
            Parameter('-bed'),
            Parameter('-wa'),
            Parameter('-wb'),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir, '{}.intersect_start100.bed'.format(bid))),
            extra_outputs=[Data(intersectBed_filepath).as_output()])
        # Aggregate relative positions from the intersect output (python app)
        CodeBlock.register(
            func=relative_pos_table,
            args=[],
            kwargs={
                'intersect_bedtools_filepath': intersectBed_filepath,
                'relativePos_filepath': relative_pos_table_filepath
            },
            inputs=[Data(intersectBed_filepath).as_input()],
            outputs=[Data(relative_pos_table_filepath).as_output()])
        # Plot codon periodicity from the aggregate table (python app)
        codon_periodicity_filepath = '{}/{}_codon_periodicity_plot.png'.format(newDir, bid)
        CodeBlock.register(
            func=create_codon_periodicity,
            args=[],
            kwargs={
                'relativePos_filepath': relative_pos_table_filepath,
                'codon_periodicity_filepath': codon_periodicity_filepath
            },
            inputs=[Data(relative_pos_table_filepath).as_input()],
            outputs=[Data(codon_periodicity_filepath).as_output()])

        # Picard
        picard.register(
            Parameter('CollectMultipleMetrics'),
            Parameter('I={}'.format(uniq_bam)),  # input
            Parameter('O={}/{}.multiple_metrics'.format(newDir, bid)),  # output
            Parameter('R={}'.format(pathTo_genomeFasta)),  # genomeReference
            extra_inputs=[Data(uniq_bam)])
        picard.register(
            Parameter('CollectGcBiasMetrics'),
            Parameter('I={}'.format(uniq_bam)),
            Parameter('O={}/{}.gc_bias_metrics'.format(newDir, bid)),  # output
            Parameter('CHART={}/{}.gc_bias_metrics.pdf'.format(newDir, bid)),  # chart
            Parameter('S={}/{}.summary_metrics'.format(newDir, bid)),  # summary metrics
            Parameter('R={}'.format(pathTo_genomeFasta)),  # genome reference
            extra_inputs=[Data(uniq_bam)])
        picard.register(
            Parameter('CollectRnaSeqMetrics'),
            Parameter('I={}'.format(uniq_bam)),
            Parameter('O={}/{}.RNA_Metrics'.format(newDir, bid)),  # output
            Parameter('REF_FLAT={}'.format('{}'.format(pathTo_ref_flat))),  # ref_flat
            Parameter('STRAND=FIRST_READ_TRANSCRIPTION_STRAND'),  # strandedness
            extra_inputs=[Data(uniq_bam)])
        picard.register(
            Parameter('MarkDuplicates'),
            Parameter('I={}'.format(uniq_bam)),
            Parameter('O={}/{}.marked_duplicates.bam'.format(newDir, bid)),  # output
            Parameter('M={}/{}.marked_dup_metrics.txt'.format(newDir, bid)),  # marked dup metrics
            Parameter('ASSUME_SORTED=true'),  # It is sorted
            extra_inputs=[Data(uniq_bam)])

        # featureCounts
        featureCounts.register(
            Parameter('-a', Data('{}'.format(pathToGtf)).as_input()),  # gtf
            Parameter('-s', '1'),  # strand-specific read counting
            Parameter('-o', '{}/{}.featureCounts'.format(newDir, bid)),  # output
            Parameter(Data(uniq_bam).as_input())  # input
        )

        # fastQC
        fastQC.register(
            Parameter('--outdir={}'.format(newDir)),  # output
            Parameter('--t', numThreads),
            Parameter(Data(fastq).as_input()))

        # read_distribution
        read_distribution.register(
            Parameter('-r'),
            Parameter(Data(pathTo_hg19_bed).as_input()),
            Parameter('-i'),
            Parameter(Data(uniq_bam).as_input()),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(newDir, '{}.read_distribution.log'.format(bid))))