    def run_pipeline(self, pipeline_args, pipeline_config):
        # Instantiate options
        bam = pipeline_args['bam']
        output_dir = pipeline_args['output']
        logs_dir = os.path.join(output_dir, 'logs')

        cufflinks_lib_type = pipeline_args['cufflinks_lib_type']
        htseq_stranded = pipeline_args['htseq_stranded']

        # Create output, tmp, and logs directories
        tmp_dir = os.path.join(output_dir, 'tmp')
        subprocess.call(['mkdir', '-p', output_dir, logs_dir, tmp_dir])

        # Keep list of items to delete
        staging_delete = [os.path.join(output_dir, 'tmp')]

        # Establish Software instances
        cufflinks = Software('Cufflinks', pipeline_config['cufflinks']['path'])
        htseq = Software('HTSeq', pipeline_config['htseq']['path'])

        cufflinks_output_dir = os.path.join(output_dir, 'cufflinks')
        subprocess.call(['mkdir', '-p', cufflinks_output_dir])
        cufflinks.run(
            Parameter('--GTF', pipeline_config['cufflinks']['transcriptome-gtf']),
            Parameter('-p', pipeline_config['cufflinks']['threads']),
            Parameter('--library-type', cufflinks_lib_type),
            Parameter('--upper-quartile-norm'),
            Parameter('-o', cufflinks_output_dir),
            Parameter('--max-bundle-frags', '1000000000'),
            Parameter(bam)
        )

        htseq_output_dir = os.path.join(output_dir, 'htseq')
        subprocess.call(['mkdir', '-p', htseq_output_dir])
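        # Generate one count file per combination of feature type and ID attribute (6 files total)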
        for id_attr in ['gene_id', 'gene_name']:
            for feature_type in ['gene', 'transcript', 'exon']:
                htseq.run(
                    Parameter('-f', 'bam'),
                    Parameter('-r', 'name'),
                    Parameter('-s', htseq_stranded),
                    Parameter('-t', feature_type),
                    Parameter('-i', id_attr),
                    Parameter(bam),
                    Parameter(pipeline_config['htseq']['transcriptome-gtf']),
                    Redirect(stream=Redirect.STDOUT, dest=os.path.join(htseq_output_dir,
                                                                       '{}.{}.counts'.format(feature_type,
                                                                                             id_attr)))
                )

        # Delete temporary files
        for delete_file in staging_delete:
            subprocess.call(['rm', '-rf', delete_file])
    def run_pipeline(self, pipeline_args, pipeline_config):
        # Instantiate options
        reads = pipeline_args['reads']
        output_dir = pipeline_args['output']
        logs_dir = os.path.join(output_dir, 'logs')
        lib_prefix = pipeline_args['lib']
        step = pipeline_args['step']
        forward_adapter = pipeline_args['forward_adapter']
        reverse_adapter = pipeline_args['reverse_adapter']
        run_is_stranded = pipeline_args['is_stranded']

        # Determine if run is paired-end from input
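        # Assumption based on usage below: each element of 'reads' is 'read1.fastq.gz:read2.fastq.gz'
        # for paired-end data, or a single FASTQ path for single-end data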
        run_is_paired_end = len(reads[FIRST_READS_PAIR].split(':')) > 1

        # Create output, tmp, and logs directories
        subprocess.call([
            'mkdir', '-p', output_dir, logs_dir,
            os.path.join(output_dir, 'tmp')
        ])

        # Timing functions for getting running time
        start_time = datetime.now()

        # Gather QC data
        qc_data = {
            'total_raw_reads_counts': [],
            'trimmed_reads_counts': [],
            'num_reads_mapped': '0',
            'running_time_seconds': '',
            'running_time_readable': ''
        }

        # Keep list of items to delete
        staging_delete = [os.path.join(output_dir, 'tmp')]

        # Establish software instances
        cat = Software('cat', '/bin/cat')
        cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
        star = Software('STAR', pipeline_config['STAR']['path'])
        rsem_calculate_expression = Software(
            'RSEM', pipeline_config['RSEM']['path-calculate-expression'])
        rsem_plot_model = Software('RSEM',
                                   pipeline_config['RSEM']['path-plot-model'])
        bedGraph_to_bw = Software('bedGraphToBigWig',
                                  pipeline_config['bedgraph_to_bw']['path'])
        bed_sort = Software('bedSort', pipeline_config['bedSort']['path'])
        samtools_flagstat = Software(
            'samtools flagstat',
            pipeline_config['samtools']['path'] + ' flagstat')

        # Step 1: If more than one reads pairs are provided, combine them
        if step <= 1 and len(reads) >= 2:
            if run_is_paired_end:
                # Aggregate read1s and read2s
                read1s, read2s = [], []
                for reads_set in reads:
                    read1, read2 = reads_set.split(':')
                    read1s.append(read1)
                    read2s.append(read2)

                # Combine reads groups
                combined_reads = []
                for name, reads_group in [('read1', read1s),
                                          ('read2', read2s)]:
                    combined_read_filename = os.path.join(
                        output_dir,
                        '{}.combined.{}.fastq.gz'.format(lib_prefix, name))
                    combined_reads.append(combined_read_filename)
                    staging_delete.append(combined_read_filename)
                    cat.run(
                        Parameter(*[read for read in reads_group]),
                        Redirect(stream=Redirect.STDOUT,
                                 dest=combined_read_filename))

                # Update reads list
                reads = [':'.join(combined_reads)]
            else:
                # Combine reads
                combined_read_filename = os.path.join(
                    output_dir, '{}.combined.fastq.gz'.format(lib_prefix))
                staging_delete.append(combined_read_filename)
                cat.run(
                    Parameter(*[read for read in reads]),
                    Redirect(stream=Redirect.STDOUT,
                             dest=combined_read_filename))

                # Update reads list
                reads = [combined_read_filename]

        # Step 2: Trim adapters with cutadapt
        if step <= 2:
            reads_set = reads[FIRST_READS_PAIR]
            if run_is_paired_end:
                # Get paired-end reads, construct new filenames
                read1, read2 = reads_set.split(':')

                # QC: Get raw fastq read counts
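                # count_gzipped_lines (not shown in this snippet) presumably returns the number of
                # lines in a gzipped FASTQ; each record is 4 lines, hence the division by 4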
                qc_data['total_raw_reads_counts'].extend([
                    str(int(self.count_gzipped_lines(read1)) / 4),
                    str(int(self.count_gzipped_lines(read2)) / 4)
                ])

                trimmed_read1_filename = os.path.join(
                    output_dir, lib_prefix + '_read1.trimmed.fastq.gz')
                trimmed_read2_filename = os.path.join(
                    output_dir, lib_prefix + '_read2.trimmed.fastq.gz')

                staging_delete.append(trimmed_read1_filename)
                staging_delete.append(trimmed_read2_filename)

                # Run cutadapt
                cutadapt.run(
                    Parameter('--quality-base={}'.format(
                        pipeline_config['cutadapt']['quality-base'])),
                    Parameter('--minimum-length=5'),
                    Parameter('--output={}'.format(trimmed_read1_filename)),
                    Parameter(
                        '--paired-output={}'.format(trimmed_read2_filename)),
                    Parameter('-a', forward_adapter),
                    Parameter('-A', reverse_adapter), Parameter('-q', '30'),
                    Parameter(read1), Parameter(read2),
                    Redirect(stream=Redirect.STDOUT,
                             dest=os.path.join(logs_dir,
                                               'cutadapt.summary.log')))

                # QC: Get trimmed fastq read counts
                qc_data['trimmed_reads_counts'].extend([
                    str(
                        int(self.count_gzipped_lines(trimmed_read1_filename)) /
                        4),
                    str(
                        int(self.count_gzipped_lines(trimmed_read2_filename)) /
                        4)
                ])

                # Update reads list
                reads = [':'.join(
                    [trimmed_read1_filename, trimmed_read2_filename])]

            else:
                # QC: Get raw fastq read count
                qc_data['total_raw_reads_counts'].append(
                    str(
                        int(self.count_gzipped_lines(
                            reads[FIRST_READS_PAIR])) / 4))

                # Construct new filename
                trimmed_read_filename = os.path.join(
                    output_dir, lib_prefix + '.trimmed.fastq.gz')

                staging_delete.append(trimmed_read_filename)

                # Run cutadapt
                cutadapt.run(
                    Parameter('--quality-base={}'.format(
                        pipeline_config['cutadapt']['quality-base'])),
                    Parameter('--minimum-length=5'),
                    Parameter('--output={}'.format(trimmed_read_filename)),
                    Parameter('-a', forward_adapter), Parameter('-q', '30'),
                    Parameter(reads[FIRST_READS_PAIR]),
                    Redirect(stream=Redirect.STDOUT,
                             dest=os.path.join(logs_dir, 'cutadapt.summary')))

                # QC: Get trimmed fastq read count
                qc_data['trimmed_reads_counts'].append(
                    str(
                        int(self.count_gzipped_lines(trimmed_read_filename)) /
                        4))

                # Update reads list
                reads = [trimmed_read_filename]

        # Step 3: Alignment
        if step <= 3:
            # Gets reads for paired-end and single-end
            if run_is_paired_end:
                read1, read2 = reads[FIRST_READS_PAIR].split(':')
            else:
                read1 = reads[FIRST_READS_PAIR]
                read2 = ''

            # Set up STAR parameters
            star_outfile_prefix = os.path.join(
                output_dir,
                lib_prefix + ('.' if lib_prefix[-1] != '.' else ''))
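            # The filter and junction settings below largely mirror the ENCODE long-RNA-seq
            # recommendations from the STAR manual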
            star_common = [
                Parameter('--outFileNamePrefix', star_outfile_prefix),
                Parameter('--genomeDir',
                          pipeline_config['STAR']['genome-dir']),
                Parameter('--readFilesIn', read1, read2),
                Parameter('--readFilesCommand', 'zcat'),
                Parameter('--outFilterType', 'BySJout'),
                Parameter('--outFilterMultimapNmax', '20'),
                Parameter('--alignSJoverhangMin', '8'),
                Parameter('--alignSJDBoverhangMin', '1'),
                Parameter('--outFilterMismatchNmax', '999'),
                Parameter('--alignIntronMin', '20'),
                Parameter('--alignIntronMax', '1000000'),
                Parameter('--alignMatesGapMax', '1000000'),
                Parameter('--outSAMunmapped', 'Within'),
                Parameter('--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD'),
                Parameter('--outFilterMismatchNoverReadLmax', '0.04'),
                Parameter('--sjdbScore', '1')
            ]

            star_run = [
                Parameter('--runThreadN', pipeline_config['STAR']['threads']),
                #Parameter('--genomeLoad', 'LoadAndKeep'),
                #Parameter('--limitBAMsortRAM', '10000000000')
            ]

            star_bam = [
                Parameter('--outSAMtype', 'BAM', 'SortedByCoordinate'),
                Parameter('--quantMode', 'TranscriptomeSAM')
            ]

            star_strand, star_wig = [], []

            # STAR strandedness parameters
            if run_is_stranded:
                star_wig.append(Parameter('--outWigStrand', 'Stranded'))
            else:
                star_strand.append(
                    Parameter('--outSAMstrandField', 'intronMotif'))
                star_wig.append(Parameter('--outWigStrand', 'Unstranded'))

            star_meta = []

            # Run STAR alignment step
            star.run(*(star_common + star_run + star_bam + star_strand +
                       star_meta))

            # Store STAR output files
            star_output_bam = star_outfile_prefix + 'Aligned.sortedByCoord.out.bam'

            # QC: Get samtools flagstat
            samtools_flagstat.run(
                Parameter(star_output_bam),
                Redirect(stream=Redirect.STDOUT,
                         dest=star_output_bam + '.flagstat'))

            # QC: Get number of mapped reads from this BAM
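            # flagstat's 'N + M mapped' line counts individual reads (both mates of a pair),
            # so halving it approximates the number of mapped read pairs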
            with open(star_output_bam + '.flagstat') as flagstats:
                flagstats_contents = flagstats.read()
                target_line = re.search(r'(\d+) \+ \d+ mapped',
                                        flagstats_contents)
                if target_line is not None:
                    qc_data['num_reads_mapped'] = str(
                        int(target_line.group(1)) / 2)

            # Generate bedGraph
            signal_output_dir = os.path.join(output_dir, 'signal')
            subprocess.call(['mkdir', '-p', signal_output_dir])
            signal_output_prefix = os.path.join(
                signal_output_dir,
                lib_prefix + ('.' if lib_prefix[-1] != '.' else ''))

            # Run STAR for signal generation
            star.run(Parameter('--runMode', 'inputAlignmentsFromBAM'),
                     Parameter('--inputBAMfile', star_output_bam),
                     Parameter('--outWigType', 'bedGraph'),
                     Parameter('--outFileNamePrefix', signal_output_prefix),
                     Parameter('--outWigReferencesPrefix', 'chr'), *star_wig)

            # Convert bedGraph to bigWig
            chrNL_txt = os.path.join(output_dir, 'chrNL.txt')
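            # chrNameLength.txt in the STAR genome directory lists chromosome names and sizes,
            # which bedGraphToBigWig needs; grep '^chr' keeps only the chr-prefixed references
            # to match --outWigReferencesPrefix above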
            with open(chrNL_txt, 'w') as chrNL_filehandle:
                subprocess.call([
                    'grep', '^chr',
                    os.path.join(pipeline_config['STAR']['genome-dir'],
                                 'chrNameLength.txt')
                ],
                                stdout=chrNL_filehandle)

            # Generate temporary signal file path
            sig_tmp = os.path.join(output_dir, 'sig.tmp')
            staging_delete.append(sig_tmp)
            if run_is_stranded:
                strand = [None, '-', '+']
                for i_strand in [1, 2]:
                    for i_mult in ['Unique', 'UniqueMultiple']:
                        # Get signal file for this iteration
                        signal_file = '{}Signal.{}.str{}.out.bg'.format(
                            signal_output_prefix, i_mult, str(i_strand))
                        # Write to temporary signal file
                        with open(sig_tmp, 'w') as sig_tmp_filehandle:
                            subprocess.call(['grep', '^chr', signal_file],
                                            stdout=sig_tmp_filehandle)
                        # Sort sig.tmp with bedSort
                        bed_sort.run(Parameter(sig_tmp), Parameter(sig_tmp))
                        # Run bedGraph to bigWig conversion
                        bedGraph_to_bw.run(
                            Parameter(sig_tmp), Parameter(chrNL_txt),
                            Parameter('{}Signal.{}.strand{}.bw'.format(
                                signal_output_prefix, i_mult,
                                strand[i_strand])))
            else:
                for i_mult in ['Unique', 'UniqueMultiple']:
                    # Get signal file for this iteration
                    signal_file = '{}Signal.{}.str1.out.bg'.format(
                        signal_output_prefix, i_mult)
                    # Write to temporary signal file
                    with open(sig_tmp, 'w') as sig_tmp_filehandle:
                        subprocess.call(['grep', '^chr', signal_file],
                                        stdout=sig_tmp_filehandle)
                    # Sort sig.tmp with bedSort
                    bed_sort.run(Parameter(sig_tmp), Parameter(sig_tmp))
                    # Run bedGraph to bigWig conversion
                    bedGraph_to_bw.run(
                        Parameter(sig_tmp), Parameter(chrNL_txt),
                        Parameter('{}Signal.{}.unstranded.bw'.format(
                            signal_output_prefix, i_mult)))

        # Step 4: Sort transcriptome BAM to ensure order of reads to make RSEM output deterministic
        if step <= 4:
            # Set BAM file paths, mv transcriptome BAM to temporary name
            star_outfile_prefix = os.path.join(
                output_dir,
                lib_prefix + ('.' if lib_prefix[-1] != '.' else ''))
            transcriptome_bam = star_outfile_prefix + 'Aligned.toTranscriptome.out.bam'
            tr_bam = star_outfile_prefix + 'Tr.bam'
            staging_delete.append(tr_bam)
            subprocess.call(['mv', transcriptome_bam, tr_bam])

            # Template command
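            # The merge concatenates the BAM header (input1) with the read records sorted by
            # name (input2), then re-compresses to BAM (compress). For paired-end data, awk
            # joins each mate pair onto one line so the sort keeps mates adjacent, and tr
            # splits them back onto separate lines. The <(...) process substitution requires
            # bash, hence executable='/bin/bash' below.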
            merge_cmd = 'cat <({input1}) <({input2}) | {compress} > {output}'
            input1_cmd = '{samtools} view -H {bam}'
            compress_cmd = 'samtools view -@ {threads} -bS -'

            if run_is_paired_end:
                input2_cmd = (
                    '{samtools} view -@ {threads} {bam} | ' +
                    'awk \'{{printf "%s", $0 " "; getline; print}}\' | ' +
                    'sort -S {ram} -T {tmpdir} | ' + 'tr \' \' \'\\n\'')
            else:
                input2_cmd = ('{samtools} view -@ {threads} {bam} | ' +
                              'sort -S {ram} -T {tmpdir}')

            # Build the merge command once so the same string is both logged and executed
            formatted_merge_cmd = merge_cmd.format(
                input1=input1_cmd.format(
                    samtools=pipeline_config['samtools']['path'], bam=tr_bam),
                input2=input2_cmd.format(
                    samtools=pipeline_config['samtools']['path'],
                    threads=pipeline_config['RSEM']['threads'],
                    bam=tr_bam,
                    ram=pipeline_config['sort']['memory'],
                    tmpdir=os.path.join(output_dir, 'tmp')),
                compress=compress_cmd.format(
                    threads=pipeline_config['RSEM']['threads']),
                output=transcriptome_bam)

            print formatted_merge_cmd

            subprocess.call(formatted_merge_cmd,
                            shell=True,
                            executable='/bin/bash')

            subprocess.call(['rm', tr_bam])

        # Step 5: Run RSEM to get quantification
        if step <= 5:
            star_outfile_prefix = os.path.join(
                output_dir,
                lib_prefix + ('.' if lib_prefix[-1] != '.' else ''))
            transcriptome_bam = star_outfile_prefix + 'Aligned.toTranscriptome.out.bam'

            # Set up RSEM parameters
            rsem_common = [
                Parameter('--bam'),
                Parameter('--estimate-rspd'),
                Parameter('--calc-ci'),
                Parameter('--no-bam-output'),
                Parameter('--seed', '12345')
            ]

            rsem_run = [
                Parameter('-p', pipeline_config['RSEM']['threads']),
                Parameter('--ci-memory', pipeline_config['RSEM']['memory'])
            ]

            rsem_type = []
            if run_is_paired_end:
                rsem_type.append(Parameter('--paired-end'))
            if run_is_stranded:
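                # --forward-prob 0 tells RSEM that reads originate from the reverse strand,
                # as in dUTP-based stranded library preparations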
                rsem_type.append(Parameter('--forward-prob', '0'))

            # Run RSEM quantification step
            rsem_calculate_expression.run(
                *(rsem_common + rsem_run + rsem_type + [
                    Parameter(transcriptome_bam),
                    Parameter(pipeline_config['RSEM']['reference-dir']),
                    Parameter(os.path.join(output_dir, 'RSEM_Quant')),
                    Redirect(Redirect.BOTH,
                             dest=os.path.join(logs_dir, 'Log.rsem'))
                ]))

            # Generate RSEM plot model
            rsem_plot_model.run(
                Parameter(os.path.join(output_dir, 'RSEM_Quant'),
                          os.path.join(output_dir, 'Quant.pdf')))

        # QC: Get time delta
        elapsed_time = datetime.now() - start_time
        qc_data['running_time_seconds'] = str(elapsed_time.seconds)
        qc_data['running_time_readable'] = str(elapsed_time)

        # QC: Output QC data to file
        with open(os.path.join(logs_dir, 'qc_metrics.txt'),
                  'w') as qc_data_file:
            qc_data_file.write(json.dumps(qc_data, indent=4) + '\n')

        # Delete temporary files
        for delete_file in staging_delete:
            subprocess.call(['rm', '-rf', delete_file])

        print 'Complete'
        print 'Elapsed time: {}'.format(str(elapsed_time))
        print 'Elapsed time seconds: {}'.format(str(elapsed_time.seconds))
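
    # NOTE: count_gzipped_lines is referenced throughout but its definition is not part of
    # this snippet. A minimal sketch of such a helper, assuming zcat and wc are available
    # on PATH, might look like the following (illustrative only, not the original code):
    def count_gzipped_lines(self, filepath):
        # Stream the decompressed file through wc -l and return the line count as a string
        zcat = subprocess.Popen(['zcat', filepath], stdout=subprocess.PIPE)
        num_lines = subprocess.check_output(['wc', '-l'], stdin=zcat.stdout)
        zcat.stdout.close()
        zcat.wait()
        return num_lines.strip()
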
    def run_pipeline(self, pipeline_args, pipeline_config):
        # Instantiate options
        reads = pipeline_args['reads']
        output_dir = pipeline_args['output']
        logs_dir = os.path.join(output_dir, 'logs')
        lib_prefix = pipeline_args['lib']
        step = pipeline_args['step']
        forward_adapter = pipeline_args['forward_adapter']
        reverse_adapter = pipeline_args['reverse_adapter']
        run_is_stranded = pipeline_args['is_stranded']

        # Determine if run is paired-end from input
        run_is_paired_end = len(reads[FIRST_READS_PAIR].split(':')) > 1

        # Create output, tmp, and logs directories
        tmp_dir = os.path.join(output_dir, 'tmp')
        subprocess.call(['mkdir', '-p', output_dir, logs_dir, tmp_dir])

        # Keep list of items to delete
        staging_delete = [os.path.join(output_dir, 'tmp')]

        qc_metrics = {
            'total_raw_reads': [],
            'total_trimmed_reads': [],
            'percent_num_reads_mapped_genome': [],
            'percent_num_reads_mapped_transcriptome': [],
            'percent_duplicate_reads': '0',
            'num_reads_multimapped': [],
            'percent_num_reads_rrna': '',
            'viral_rna': []
        }

        synapse_metadata = {
            'Assay': 'RNAseq',
            'Individual_ID': '',
            'Sample_ID': '',
            'File_Name': '',
            'BrodmannArea': '',
            'BrainRegion': '',
            'Hemisphere': '',
            'CellType': 'NA',
            'TissueState': '',
            'RNAIsolationBatch': '',
            'RIN': '',
            'LibraryBatch': '',
            'LibraryPrep': 'stranded, rRNA depletion',
            'LibraryKit': 'Illumina RS-122-2301',
            'ERCC_Added': '',
            'RunType': 'paired-end',
            'ReadLength': '100bp',
            'FlowcellBatch': '',
            'SequencingPlatform': '',
            'TotalReads': '',
            'MappedReads_Primary': '0',
            'MappedReads_Multimapped': '0',
            'rRNARate': '0',
            'Notes': ''
        }

        # Establish Software instances
        cutadapt = Software('Cutadapt', pipeline_config['cutadapt']['path'])
        fastqc = Software('FastQC', pipeline_config['fastqc']['path'])
        star = Software('STAR Two-Pass', pipeline_config['STAR']['path'])
        novosort = Software('Novosort', pipeline_config['novosort']['path'])
        samtools_flagstat = Software(
            'Samtools Flagstat',
            pipeline_config['samtools']['path'] + ' flagstat')
        samtools_index = Software(
            'Samtools Index', pipeline_config['samtools']['path'] + ' index')
        samtools_faidx = Software(
            'Samtools Faidx', pipeline_config['samtools']['path'] + ' faidx')
        picard_markduplicates = Software(
            'Picard MarkDuplicates',
            'java -Xmx{heap_size}g -jar {path} MarkDuplicates'.format(
                heap_size=pipeline_config['picard'].get(
                    'heap_size', JAVA_DEFAULT_HEAP_SIZE),
                path=pipeline_config['picard']['path']))
        picard_create_seq_dict = Software(
            'Picard CreateSequenceDictionary',
            'java -Xmx{heap_size}g -jar {path} CreateSequenceDictionary'.
            format(heap_size=pipeline_config['picard'].get(
                'heap_size', JAVA_DEFAULT_HEAP_SIZE),
                   path=pipeline_config['picard']['path']))
        rnaseqc = Software(
            'RNAseQC', 'java -Xmx{heap_size}g -jar {path}'.format(
                heap_size=pipeline_config['picard'].get(
                    'heap_size', JAVA_DEFAULT_HEAP_SIZE),
                path=pipeline_config['RNAseQC']['path']))
        picard_add_read_groups = Software(
            'Picard AddOrReplaceReadGroups',
            'java -Xmx{heap_size}g -jar {path} AddOrReplaceReadGroups'.format(
                heap_size=pipeline_config['picard'].get(
                    'heap_size', JAVA_DEFAULT_HEAP_SIZE),
                path=pipeline_config['picard']['path']))
        bedtools_coverage = Software(
            'Bedtools Coverage',
            pipeline_config['bedtools']['path'] + ' coverage')
        bedtools_bamtobed = Software(
            'Bedtools Bamtobed',
            pipeline_config['bedtools']['path'] + ' bamtobed')

        # Housekeeping
        star_output = []
        novosort_outfile = ''

        # Step 1: Trimming | Cutadapt
        if step <= 1:
            for i, read in enumerate(reads):
                if run_is_paired_end:
                    # Get paired-end reads, construct new filenames
                    read1, read2 = read.split(':')

                    # QC: Get raw fastq read counts
                    qc_metrics['total_raw_reads'].append([
                        str(int(self.count_gzipped_lines(read1)) / 4),
                        str(int(self.count_gzipped_lines(read2)) / 4)
                    ])

                    trimmed_read1_filename = os.path.join(
                        output_dir,
                        lib_prefix + '_{}_read1.trimmed.fastq.gz'.format(i))
                    trimmed_read2_filename = os.path.join(
                        output_dir,
                        lib_prefix + '_{}_read2.trimmed.fastq.gz'.format(i))
                    staging_delete.extend(
                        [trimmed_read1_filename, trimmed_read2_filename])

                    # Run cutadapt
                    cutadapt.run(
                        Parameter('--quality-base={}'.format(
                            pipeline_config['cutadapt']['quality-base'])),
                        Parameter('--minimum-length=5'),
                        Parameter(
                            '--output={}'.format(trimmed_read1_filename)),
                        Parameter('--paired-output={}'.format(
                            trimmed_read2_filename)),
                        Parameter('-a', forward_adapter),
                        Parameter('-A', reverse_adapter),
                        Parameter('-q', '30'), Parameter(read1),
                        Parameter(read2),
                        Redirect(stream=Redirect.STDOUT,
                                 dest=os.path.join(logs_dir,
                                                   'cutadapt.summary')))

                    # QC: Get trimmed fastq read counts
                    qc_metrics['total_trimmed_reads'].append([
                        str(
                            int(
                                self.count_gzipped_lines(
                                    trimmed_read1_filename)) / 4),
                        str(
                            int(
                                self.count_gzipped_lines(
                                    trimmed_read2_filename)) / 4)
                    ])

                    # Update reads list
                    reads[i] = ':'.join(
                        [trimmed_read1_filename, trimmed_read2_filename])
                else:
                    # QC: Get raw fastq read counts
                    qc_metrics['total_raw_reads'].append(
                        [str(int(self.count_gzipped_lines(read)) / 4)])

                    # Construct new filename
                    trimmed_read_filename = os.path.join(
                        output_dir,
                        lib_prefix + '_{}.trimmed.fastq.gz'.format(i))
                    staging_delete.append(trimmed_read_filename)

                    # Run cutadapt
                    cutadapt.run(
                        Parameter('--quality-base={}'.format(
                            pipeline_config['cutadapt']['quality-base'])),
                        Parameter('--minimum-length=5'),
                        Parameter('--output={}'.format(trimmed_read_filename)),
                        Parameter('-a', forward_adapter),
                        Parameter('-q', '30'), Parameter(read),
                        Redirect(stream=Redirect.STDOUT,
                                 dest=os.path.join(
                                     logs_dir, 'cutadapt.chicago.summary')))

                    # QC: Get trimmed fastq read counts
                    qc_metrics['total_trimmed_reads'].append([
                        str(
                            int(self.count_gzipped_lines(
                                trimmed_read_filename)) / 4)
                    ])

                    # Update reads list
                    reads[i] = trimmed_read_filename

        # Step 2: FastQC
        if step <= 2:
            # Make FastQC directory
            fastqc_output_dir = os.path.join(output_dir, 'fastqc')
            subprocess.call(['mkdir', '-p', fastqc_output_dir])

            all_fastqs = []

            if run_is_paired_end:
                for read in reads:
                    all_fastqs.extend(read.split(':'))
            else:
                all_fastqs.extend(reads)

            for fastq in all_fastqs:
                fastqc.run(Parameter('--outdir={}'.format(fastqc_output_dir)),
                           Parameter(fastq))

        # Step 3: Alignment | STAR 2-pass, Alignment Stats | samtools flagstat
        if step <= 3:
            # Set up common STAR parameters
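            # --twopassMode Basic has STAR discover splice junctions in a first pass and
            # re-align all reads using them in a second pass within a single run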
            star_common = [
                Parameter('--runMode', 'alignReads'),
                Parameter('--twopassMode', 'Basic'),
                Parameter('--runThreadN', pipeline_config['STAR']['threads']),
                Parameter('--genomeDir',
                          pipeline_config['STAR']['genome-dir']),
                Parameter('--readFilesCommand', 'zcat'),
                Parameter('--quantMode', 'TranscriptomeSAM', 'GeneCounts'),
                Parameter('--outSAMtype', 'BAM', 'Unsorted'),
                Parameter('--outFilterType', 'BySJout'),
                Parameter('--outFilterMultimapNmax', '20'),
                Parameter('--alignSJoverhangMin', '8'),
                Parameter('--alignSJDBoverhangMin', '1'),
                Parameter('--outFilterMismatchNmax', '2'),
                Parameter('--alignIntronMin', '20'),
                Parameter('--alignIntronMax', '1000000'),
                Parameter('--alignMatesGapMax', '1000000'),
                (Parameter('--outFilterIntronMotifs', 'RemoveNoncanonical')
                 if run_is_stranded else Parameter('--outSAMstrandField',
                                                   'intronMotif'))
            ]

            # Get STAR output file prefix
            star_outfile_prefix = os.path.join(
                output_dir,
                lib_prefix + ('_' if lib_prefix[-1] != '.' else '') + '{}.')

            # Align each read or read pair
            for i, read in enumerate(reads):
                star_output_bam = star_outfile_prefix.format(
                    i) + 'Aligned.out.bam'
                star_output_transcriptome_bam = star_outfile_prefix.format(
                    i) + 'Aligned.toTranscriptome.out.bam'
                star_output.append(star_output_bam)

                if run_is_paired_end:
                    read1, read2 = read.split(':')

                    star_paired_end = [
                        Parameter('--readFilesIn', read1, read2),
                        Parameter('--outFileNamePrefix',
                                  star_outfile_prefix.format(i))
                    ]

                    star.run(*(star_common + star_paired_end))
                else:
                    star_single_end = [
                        Parameter('--readFilesIn', read),
                        Parameter('--outFileNamePrefix',
                                  star_outfile_prefix.format(i))
                    ]

                    star.run(*(star_common + star_single_end))

                # Get flagstats for both alignments
                samtools_flagstat.run(
                    Parameter(star_output_bam),
                    Redirect(stream=Redirect.STDOUT,
                             dest=star_output_bam + '.flagstat'))
                samtools_flagstat.run(
                    Parameter(star_output_transcriptome_bam),
                    Redirect(stream=Redirect.STDOUT,
                             dest=star_output_transcriptome_bam + '.flagstat'))

                # QC: Get number of mapped reads to the genome from this BAM
                try:
                    with open(star_output_bam + '.flagstat') as flagstats:
                        flagstats_contents = flagstats.read()

                        # Pull out mapped reads
                        target_line = re.search(
                            r'(\d+) \+ \d+ mapped \(([0-9\.]+)%',
                            flagstats_contents)
                        if target_line is not None:
                            num_mapped = int(target_line.group(1))
                            qc_metrics[
                                'percent_num_reads_mapped_genome'].append([
                                    str(num_mapped / 2),
                                    '{}%'.format(target_line.group(2))
                                ])
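                            # flagstat's 'mapped' total includes secondary and supplementary
                            # alignments; subtracting them below leaves primary mapped reads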

                            num_secondary = int(
                                re.search(r'(\d+) \+ \d+ secondary',
                                          flagstats_contents).group(1))
                            num_supplementary = int(
                                re.search(r'(\d+) \+ \d+ supplementary',
                                          flagstats_contents).group(1))

                            synapse_metadata['MappedReads_Primary'] = str(
                                int(synapse_metadata['MappedReads_Primary']) +
                                num_mapped - num_secondary - num_supplementary)
                            synapse_metadata['MappedReads_Multimapped'] = str(
                                int(synapse_metadata['MappedReads_Multimapped']
                                    ) + num_secondary)
                        else:
                            qc_metrics[
                                'percent_num_reads_mapped_genome'].append('0')

                        # Pull out multimapped reads
                        target_line = re.search(r'(\d+) \+ \d+ secondary',
                                                flagstats_contents)
                        if target_line is not None:
                            qc_metrics['num_reads_multimapped'].append(
                                str(int(target_line.group(1)) / 2))
                        else:
                            qc_metrics['num_reads_multimapped'].append('0')
                except Exception:
                    qc_metrics['percent_num_reads_mapped_genome'].append(
                        'Could not open flagstats for {}'.format(
                            star_output_bam))
                    qc_metrics['num_reads_multimapped'].append(
                        'Could not open flagstats for {}'.format(
                            star_output_bam))

                # QC: Get number of mapped reads to the transcriptome from this BAM
                try:
                    with open(star_output_transcriptome_bam +
                              '.flagstat') as flagstats:
                        flagstats_contents = flagstats.read()
                        target_line = re.search(
                            r'(\d+) \+ \d+ mapped \(([0-9\.]+)%',
                            flagstats_contents)
                        if target_line is not None:
                            qc_metrics[
                                'percent_num_reads_mapped_transcriptome'].append(
                                    [
                                        str(int(target_line.group(1)) / 2),
                                        '{}%'.format(target_line.group(2))
                                    ])
                        else:
                            qc_metrics[
                                'percent_num_reads_mapped_transcriptome'].append(
                                    '0')
                except Exception:
                    qc_metrics[
                        'percent_num_reads_mapped_transcriptome'].append(
                            'Could not open flagstats for {}'.format(
                                star_output_bam))

        # Step 4: BAM Merge | Novosort
        if step <= 4:
            # Novosort to sort and merge BAM files
            novosort_outfile = os.path.join(
                output_dir,
                lib_prefix + ('.' if lib_prefix[-1] != '.' else '') +
                'merged.Aligned.out.bam')
            novosort.run(
                Parameter('--tmpdir', os.path.join(output_dir, 'tmp')),
                Parameter(*[bam for bam in star_output]),
                Redirect(stream=Redirect.STDOUT, dest=novosort_outfile))
            """
            The step below was commented out on 27 June 2016. It was taking up large amounts of memory, more than
            Beagle could handle, and some samples were consistently failing as a result. I think RNAseQC does this
            step anyway, I only left it in because I figured it wasn't doing any harm. Well now it is, so it's gone.
            """
            # QC: Get number of reads mapped to rRNA regions
            # aligned_bed_file = os.path.join(output_dir, str(uuid.uuid4()) + '.bed')
            # coverage_file = os.path.join(output_dir, str(uuid.uuid4()) + '.coverage.bed')
            # staging_delete.extend([aligned_bed_file, coverage_file])
            #
            # bedtools_bamtobed.run(
            #     Parameter('-i', novosort_outfile),
            #     Redirect(stream=Redirect.STDOUT, dest=aligned_bed_file)
            # )
            # bedtools_coverage.run(
            #     Parameter('-s'),
            #     Parameter('-counts'),
            #     Parameter('-a', pipeline_config['qc']['rRNA-bed']),
            #     Parameter('-b', aligned_bed_file),
            #     Redirect(stream=Redirect.STDOUT, dest=coverage_file)
            # )
            # try:
            #     rRNA_count = 0
            #     with open(coverage_file) as coverage:
            #         for line in coverage:
            #             rRNA_count += int(line.strip().split('\t')[6])
            #     percent_rRNA = (rRNA_count /
            #                     float(sum([int(aln[MAPPED_READS_COUNT])
            #                                for aln
            #                                in qc_metrics['percent_num_reads_mapped_transcriptome']]))
            #                     )
            #     qc_metrics['percent_num_reads_rrna'] = [str(rRNA_count), str(percent_rRNA)]
            #     synapse_metadata['rRNARate'] = str(percent_rRNA)
            # except Exception as e:
            #     qc_metrics['percent_num_reads_rrna'] = ['error', 'error', e.message]

            # Prepare genome fasta for RNAseQC
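            # RNA-SeQC and the Picard tools expect a samtools .fai index and a sequence
            # dictionary (.dict) alongside the reference FASTA, so create them if missing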
            genome_fa = pipeline_config['qc']['genome-fa']
            genome_fai = genome_fa + '.fai'
            genome_dict = os.path.splitext(genome_fa)[0] + '.dict'

            if not os.path.isfile(genome_fai):
                samtools_faidx.run(Parameter(genome_fa))
            if not os.path.isfile(genome_dict):
                picard_create_seq_dict.run(
                    Parameter('REFERENCE={}'.format(genome_fa)),
                    Parameter('OUTPUT={}'.format(genome_dict)))

            # Add read group to alignment file
            read_group_bam = os.path.join(output_dir, 'readgroup.bam')
            staging_delete.append(read_group_bam)
            picard_add_read_groups.run(
                Parameter('INPUT={}'.format(novosort_outfile)),
                Parameter('OUTPUT={}'.format(read_group_bam)),
                Parameter('RGLB={}'.format(lib_prefix)),
                Parameter('RGPL=Illumina'), Parameter('RGPU=1'),
                Parameter('RGSM=Sample'))

            # Generate BAM index for RNAseQC
            samtools_index.run(Parameter(read_group_bam))
            staging_delete.append(read_group_bam + '.bai')

            # QC: Get RNAseQC output
            rnaseqc_output_dir = os.path.join(output_dir, 'RNAseQC')
            subprocess.call(['mkdir', '-p', rnaseqc_output_dir])
            rnaseqc.run(
                Parameter('-o', rnaseqc_output_dir),
                Parameter('-r', genome_fa),
                Parameter('-t',
                          pipeline_config['cufflinks']['transcriptome-gtf']),
                Parameter(
                    '-s', '"{sample_id}|{bam_file}|{notes}"'.format(
                        sample_id=lib_prefix,
                        bam_file=read_group_bam,
                        notes='None')),
                Parameter('-singleEnd')
                if not run_is_paired_end else Parameter())

            # Picard MarkDuplicates to get duplicates metrics
            markduplicates_outfile = os.path.join(
                output_dir, '{}.processed.bam'.format(lib_prefix))
            markduplicates_metrics_filepath = os.path.join(
                logs_dir, 'mark_dup.metrics')
            picard_markduplicates.run(
                Parameter('INPUT={}'.format(novosort_outfile)),
                Parameter('OUTPUT={}'.format(markduplicates_outfile)),
                Parameter('TMP_DIR={}'.format(tmp_dir)),
                Parameter(
                    'METRICS_FILE={}'.format(markduplicates_metrics_filepath)),
                Redirect(stream=Redirect.BOTH,
                         dest=os.path.join(logs_dir, 'mark_dup.log')))

            # QC: Get percent duplicates
            try:
                with open(markduplicates_metrics_filepath) as markdup_metrics:
                    for line in markdup_metrics:
                        if line[FIRST_CHAR] == '#':
                            continue
                        record = line.strip().split('\t')
                        if len(record) == 9:
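                            # Assumption: record[7] is the PERCENT_DUPLICATION column of the
                            # 9-column DuplicationMetrics row emitted by this Picard version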
                            if re.match(r'\d\.\d+', record[7]) is not None:
                                qc_metrics['percent_duplicate_reads'] = record[
                                    7]
            except Exception as e:
                qc_metrics['percent_duplicate_reads'] = [
                    'Could not open MarkDuplicates metrics', e.message
                ]

        # Write out QC metrics to file
        with open(os.path.join(logs_dir, 'qc_metrics.txt'),
                  'w') as qc_data_file:
            qc_data_file.write(json.dumps(qc_metrics, indent=4) + '\n')

        # Populate Synapse QC matrix
        if re.match(r'\d{4}-\d{4}', lib_prefix.strip()) is not None:
            synapse_metadata['Individual_ID'] = lib_prefix
            synapse_metadata[
                'File_Name'] = 'PEC_BrainGVEX_UIC-UChicago_FC_mRNA_HiSeq2000_{}'.format(
                    lib_prefix)

        re_raw_filename = re.match(
            r'\d{4}-\d{4}_.+_(.+)_.+_(.+_\d)_\d_sequence\.txt\.gz',
            os.path.basename(pipeline_args['reads'][0].split(':')[0]))
        if re_raw_filename is not None:
            sequencing_inst_name = re_raw_filename.group(1)
            if '673' in sequencing_inst_name or '484' in sequencing_inst_name:
                synapse_metadata['SequencingPlatform'] = 'HiSeq2000'
            elif '1070' in sequencing_inst_name:
                synapse_metadata['SequencingPlatform'] = 'HiSeq2500'
            flowcell_batch = re_raw_filename.group(2)
            synapse_metadata['FlowcellBatch'] = flowcell_batch

        # Entries in total_raw_reads are already read counts (gzipped line counts / 4)
        total_raw_reads_end1 = sum(
            [int(count[0]) for count in qc_metrics['total_raw_reads']])
        synapse_metadata['TotalReads'] = str(total_raw_reads_end1)

        # Write out Synapse metadata
        with open(os.path.join(logs_dir, 'synapse_metadata.txt'),
                  'w') as synapse_metadata_file:
            synapse_metadata_file.write(
                json.dumps(synapse_metadata, indent=4) + '\n')

        # Delete temporary files
        for delete_file in staging_delete:
            subprocess.call(['rm', '-rf', delete_file])
  def run_pipeline(self, pipeline_args, pipeline_config):
    # create variables from parser if wanted
    bamFiles = pipeline_args['bam:lib']
    outputDir = pipeline_args['output']
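    # Based on the split below, each 'bam:lib' element is expected to look like 'path/to/sample.bam:library_id'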

    # Create output directory
    subprocess.call(['mkdir', outputDir])

    # Software
    picard = Software('picard', pipeline_config['picard']['path'])

    # Change these to just be done in python script?

    # Common software tools 
    awk = Software('awk', 'awk')
    sort = Software('sort', 'sort')
    uniq = Software('uniq', 'uniq')
    paste = Software('paste', 'paste')
    cat = Software('cat', 'cat')
    grep = Software('grep', 'grep')

    # Directories and Files
    pathTo_genomeFasta = pipeline_config['picard']['genomeFasta']


    # Keep track of Bids in pipeline

    # bid_list = []
    # bam_list = []
    # for bamLib in bamFiles:
    #   bid_list.append(bamLib.split(':')[1])
    #   bam_list.append(bamLib.split(':')[0])


    '''
    Picard tools

    java -jar picard.jar CollectMultipleMetrics 
    I=2017-221.uniq_sorted.bam 
    O= multiple_metrics 
    R=GRCh37.p13.genome.fa

    java -jar picard.jar CollectGcBiasMetrics
    I= .uniq
    O=gc_bias_metrics.txt 
    CHART=gc_bias_metrics.pdf 
    S=summary_metrics.txt 
    R=reference_sequence.fasta

    java -jar picard.jar CollectRnaSeqMetrics
    I=input.bam 
    O=output.RNA_Metrics 
    REF_FLAT=ref_flat.txt 
    STRAND=FIRST_READ_TRANSCRIPTION_STRAND

    java -jar picard.jar MarkDuplicates
    I=input.bam 
    O=marked_duplicates.bam 
    M=marked_dup_metrics.txt
    ASSUME_SORTED=true
    '''

    for bamLib in bamFiles:
      bam, bid = bamLib.split(':')
      newDir = new_dir(outputDir, bid)
      subprocess.call(['mkdir', newDir])

      # consider multithreading?

      picard.run(
        Parameter('CollectMultipleMetrics'),
        Parameter('I={}'.format(bam)),     # input
        Parameter('O={}/{}.multiple_metrics'.format(newDir, bid)),    # output
        Parameter('R={}'.format(pathTo_genomeFasta))                  # genomeReference
      )

      picard.run(
        Parameter('CollectGcBiasMetrics'),
        Parameter('I={}'.format(bam)),          # input
        Parameter('O={}/{}.gc_bias_metrics'.format(newDir, bid)),           # output
        Parameter('CHART={}/{}.gc_bias_metrics.pdf'.format(newDir, bid)),   # chart
        Parameter('S={}/{}.summary_metrics'.format(newDir, bid)),           # summary metrics
        Parameter('R={}'.format(pathTo_genomeFasta))                        # genome reference
      )

      picard.run(
        Parameter('CollectRnaSeqMetrics'),
        Parameter('I={}'.format(bam)),     # input
        Parameter('O={}/{}.RNA_Metrics'.format(newDir, bid)),         # output
        Parameter('REF_FLAT={}/{}'.format(newDir, bid)),              # ref_flat
        Parameter('STRAND=FIRST_READ_TRANSCRIPTION_STRAND')           # strandedness
      )

      picard.run(
        Parameter('MarkDuplicates'),
        Parameter('I={}'.format(bam)),       # input
        Parameter('O={}/{}.marked_duplicates.bam'.format(newDir, bid)), # output
        Parameter('M={}/{}.marked_dup_metrics.txt'.format(newDir, bid)),  # marked dup metrics
        Parameter('TMP_DIR={}'.format(newDir)),
        Parameter('ASSUME_SORTED=true'),                                 # sorted
        Parameter('VALIDATION_STRINGENCY=LENIENT'),
        Redirect(stream=Redirect.BOTH, dest=os.path.join(newDir, 'mark_dup.log'))
      )
    def run_pipeline(self, pipeline_args, pipeline_config):
        # Instantiate variables from argparse
        read_pairs = pipeline_args['reads']
        output_dir = os.path.abspath(pipeline_args['output'])
        logs_dir = os.path.join(output_dir, 'logs')
        lib_prefix = pipeline_args['lib']
        step = int(pipeline_args['step'])
        forward_adapter = pipeline_args['forward_adapter']
        reverse_adapter = pipeline_args['reverse_adapter']

        # Create output, tmp, and logs directories
        tmp_dir = os.path.join(output_dir, 'tmp')
        subprocess.call(['mkdir', '-p', output_dir, tmp_dir, logs_dir])

        # Keep list of items to delete
        staging_delete = [tmp_dir]
        bwa_bam_outs = []
        qc_data = {
            'total_raw_reads_counts': [],
            'trimmed_reads_counts': [],
            # TODO Find a better way to store FastQC results
            'num_reads_mapped': [],
            'percent_duplicate_reads': '0',
            'num_unique_reads_mapped': [],  # TODO This isn't implemented
            'num_mtDNA_reads_mapped': [],  # TODO This isn't implemented
            'num_reads_mapped_after_filtering': '-1',  # TODO This isn't implemented
            'num_peaks_called': '-1',
            # TODO Get number of peaks in annotation sites
        }

        # Instantiate software instances
        cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
        fastqc = Software('FastQC', pipeline_config['fastqc']['path'])
        bwa_aln = Software('BWA aln', pipeline_config['bwa']['path'] + ' aln')
        bwa_sampe = Software('BWA sampe', pipeline_config['bwa']['path'] + ' sampe')
        samtools_view = Software('samtools view',
                                 pipeline_config['samtools']['path'] + ' view')
        samtools_flagstat = Software('samtools flagstat',
                                     pipeline_config['samtools']['path'] + ' flagstat')
        samtools_index = Software('samtools index',
                                  pipeline_config['samtools']['path'] + ' index')
        novosort = Software('novosort', pipeline_config['novosort']['path'])
        picard_mark_dup = Software('Picard MarkDuplicates',
                                   pipeline_config['picard']['path'] + ' MarkDuplicates')
        picard_insert_metrics = Software('Picard CollectInsertSizeMetrics',
                                         pipeline_config['picard']['path'] + ' CollectInsertSizeMetrics')
        bedtools_bamtobed = Software('bedtools bamtobed',
                            pipeline_config['bedtools']['path'] + ' bamtobed')
        bedtools_sort = Software('bedtools sort', pipeline_config['bedtools']['path'] + ' sort')
        bedtools_merge = Software('bedtools merge', pipeline_config['bedtools']['path'] + ' merge')
        bedtools_intersect = Software('bedtools intersect',
                                      pipeline_config['bedtools']['path'] + ' intersect')
        homer_maketagdir = Software('HOMER makeTagDirectory',
                                    pipeline_config['makeTagDirectory']['path'])
        homer_findpeaks = Software('HOMER findPeaks', pipeline_config['findPeaks']['path'])
        homer_pos2bed = Software('HOMER pos2bed', pipeline_config['pos2bed']['path'])

        if step <= 1:
            for i, read_pair in enumerate(read_pairs):
                read1, read2 = read_pair.split(':')

                # QC: Get raw fastq read counts
                qc_data['total_raw_reads_counts'].append([
                    str(int(self.count_gzipped_lines(read1))/4),
                    str(int(self.count_gzipped_lines(read2))/4)
                ])

                trimmed_read1_filename = os.path.join(output_dir,
                                                      lib_prefix + '_{}_read1.trimmed.fastq.gz'.format(i))
                trimmed_read2_filename = os.path.join(output_dir,
                                                      lib_prefix + '_{}_read2.trimmed.fastq.gz'.format(i))

                cutadapt.run(
                    Parameter('--quality-base=33'),
                    Parameter('--minimum-length=5'),
                    Parameter('-q', '30'),  # Minimum quality score
                    Parameter('--output={}'.format(trimmed_read1_filename)),
                    Parameter('--paired-output={}'.format(trimmed_read2_filename)),
                    Parameter('-a', forward_adapter if forward_adapter else 'ZZZ'),
                    Parameter('-A', reverse_adapter if reverse_adapter else 'ZZZ'),
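                    # 'ZZZ' serves as a dummy adapter that can never match a nucleotide read,
                    # effectively disabling trimming when no adapter sequence was provided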
                    Parameter(read1),
                    Parameter(read2),
                    Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary.log'))
                )

                # QC: Get trimmed fastq read counts
                qc_data['trimmed_reads_counts'].append([
                    str(int(self.count_gzipped_lines(trimmed_read1_filename))/4),
                    str(int(self.count_gzipped_lines(trimmed_read2_filename))/4)
                ])

                staging_delete.extend([trimmed_read1_filename, trimmed_read2_filename])
                read_pairs[i] = ':'.join([trimmed_read1_filename, trimmed_read2_filename])

        if step <= 2:
            # Make FastQC directory
            fastqc_output_dir = os.path.join(output_dir, 'fastqc')
            subprocess.call(['mkdir', '-p', fastqc_output_dir])
            for i, read_pair in enumerate(read_pairs):
                for read in read_pair.split(':'):
                    fastqc.run(
                        Parameter('--outdir={}'.format(fastqc_output_dir)),
                        Parameter(read)
                    )

                    bwa_aln.run(
                        Parameter('-t', pipeline_config['bwa']['threads']),
                        Parameter(pipeline_config['bwa']['index-dir']),
                        Parameter(read),
                        Redirect(stream=Redirect.STDOUT, dest='{}.sai'.format(read))
                    )

                    staging_delete.append('{}.sai'.format(read))

        if step <= 3:
            for i, read_pair in enumerate(read_pairs):
                read1, read2 = read_pair.split(':')
                bwa_bam_output = os.path.join(output_dir, '{}.{}.bam'.format(lib_prefix, i))

                bwa_sampe.run(
                    Parameter('-a', '2000'),  # Maximum insert size
                    Parameter('-n', '1'),
                    Parameter(pipeline_config['bwa']['index-dir']),
                    Parameter('{}.sai'.format(read1)),
                    Parameter('{}.sai'.format(read2)),
                    Parameter(read1),
                    Parameter(read2),
                    Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'bwa_sampe.log')),
                    Pipe(
                        samtools_view.pipe(
                            Parameter('-hSb'),
                            Parameter('-o', bwa_bam_output),
                            Parameter('-')  # Get input from stdin
                        )
                    )
                )

                bwa_bam_outs.append(bwa_bam_output)

        if step <= 4:
            for i, bwa_bam in enumerate(bwa_bam_outs):
                samtools_flagstat.run(
                    Parameter(bwa_bam),
                    Redirect(stream=Redirect.STDOUT, dest=bwa_bam + '.flagstat')
                )

                # QC: Get number of mapped reads from this BAM
                try:
                    with open(bwa_bam + '.flagstat') as flagstats:
                        flagstats_contents = flagstats.read()
                        target_line = re.search(r'(\d+) \+ \d+ mapped', flagstats_contents)
                        if target_line is not None:
                            qc_data['num_reads_mapped'].append(str(int(target_line.group(1))/2))
                        else:
                            qc_data['num_reads_mapped'].append('0')
                except:
                    qc_data['num_reads_mapped'].append('Could not open flagstats {}'.format(
                        bwa_bam + '.flagstat'
                    ))

            sortmerged_bam = os.path.join(output_dir, '{}.sortmerged.bam'.format(lib_prefix))
            steric_filter_bam = os.path.join(output_dir, '{}.steric.bam'.format(lib_prefix))
            duprm_bam = os.path.join(output_dir, '{}.duprm.bam'.format(lib_prefix))
            unique_bam = os.path.join(output_dir, '{}.unique.bam'.format(lib_prefix))
            unmappedrm_bam = os.path.join(output_dir, '{}.unmappedrm.bam'.format(lib_prefix))
            chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix))

            novosort.run(
                Parameter('--threads', pipeline_config['novosort']['threads']),
                Parameter('--tmpcompression', '6'),
                Parameter('--tmpdir', tmp_dir),
                Parameter(*[bam for bam in bwa_bam_outs]),
                Redirect(stream=Redirect.STDOUT, dest=sortmerged_bam),
                Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'novosort.log'))
            )

            # This creates a dependency on PySam
            # Removes reads with template length < 38 due to steric hindrance
            samtools_index.run(Parameter(sortmerged_bam))
            sortmerged_bam_alignmentfile = pysam.AlignmentFile(sortmerged_bam, 'rb')
            steric_filter_bam_alignmentfile = pysam.AlignmentFile(steric_filter_bam, 'wb',
                                                                  template=sortmerged_bam_alignmentfile)
            for read in sortmerged_bam_alignmentfile.fetch():
                if abs(int(read.template_length)) >= STERIC_HINDRANCE_CUTOFF:
                    steric_filter_bam_alignmentfile.write(read)

            sortmerged_bam_alignmentfile.close()
            steric_filter_bam_alignmentfile.close()

            # Mark and remove duplicates
            markduplicates_metrics_filepath = os.path.join(logs_dir,
                                                           'mark_dup.metrics')
            picard_mark_dup.run(
                Parameter('INPUT={}'.format(steric_filter_bam)),
                Parameter('OUTPUT={}'.format(duprm_bam)),
                Parameter('TMP_DIR={}'.format(tmp_dir)),
                Parameter('METRICS_FILE={}'.format(markduplicates_metrics_filepath)),
                Parameter('REMOVE_DUPLICATES=true'),
                Parameter('VALIDATION_STRINGENCY=LENIENT'),
                Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'mark_dup.log'))
            )

            # QC: Get percent duplicates
            try:
                with open(markduplicates_metrics_filepath) as markdup_metrics:
                    for line in markdup_metrics:
                        if line[FIRST_CHAR] == '#':
                            continue
                        record = line.strip().split('\t')
                        if len(record) == 9:
                            if re.match(r'\d+', record[7]) is not None:
                                qc_data['percent_duplicate_reads'] = record[7]
            except:
                qc_data['percent_duplicate_reads'] = 'Could not open MarkDuplicates metrics'

            # Filter down to uniquely mapped reads
            samtools_view.run(
                Parameter('-b'),
                Parameter('-F', '256'),
                Parameter('-q', '10'),
                Parameter('-o', unique_bam),
                Parameter(duprm_bam)
            )

            # Remove unmapped reads
            samtools_view.run(
                Parameter('-b'),
                Parameter('-F', '12'),
                Parameter('-o', unmappedrm_bam),
                Parameter(unique_bam)
            )

            # Create BAM index, then remove chrM
            samtools_index.run(
                Parameter(unmappedrm_bam)
            )

            # Remove chrM
            all_chr = [Parameter('chr{}'.format(chromosome)) for chromosome in map(str, range(1, 23)) + ['X', 'Y']]
            samtools_view.run(
                Parameter('-b'),
                Parameter('-o', chrmrm_bam),
                Parameter(unmappedrm_bam),
                *all_chr
            )

            # Stage delete for temporary files
            staging_delete.extend([
                sortmerged_bam,
                sortmerged_bam + '.bai',  # BAM index file
                steric_filter_bam,
                unique_bam,
                duprm_bam,
                unmappedrm_bam,
                unmappedrm_bam + '.bai',  # BAM index file
                chrmrm_bam
            ])

        if step <= 5:
            # Generate filename for final processed BAM and BED
            processed_bam = os.path.join(output_dir, '{}.processed.bam'.format(lib_prefix))
            unshifted_bed = os.path.join(output_dir, '{}.unshifted.bed'.format(lib_prefix))
            processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix))

            # staging_delete.append(unshifted_bed)

            # Generate filename for chrM removed BAM
            chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix))

            # Remove blacklisted genomic regions
            bedtools_intersect.run(
                Parameter('-v'),
                Parameter('-abam', chrmrm_bam),
                Parameter('-b', pipeline_config['bedtools']['blacklist-bed']),
                Parameter('-f', '0.5'),
                Redirect(stream=Redirect.STDOUT, dest=processed_bam)
            )

            # QC: Generate insert size metrics PDF
            picard_insert_metrics.run(
                Parameter('INPUT={}'.format(processed_bam)),
                Parameter('OUTPUT={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.metrics'))),
                Parameter('HISTOGRAM_FILE={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.pdf')))
            )

            # Generate index for processed BAM
            samtools_index.run(
                Parameter(processed_bam)
            )

            # Convert BAM to BED
            bedtools_bamtobed.run(
                Parameter('-i', processed_bam),
                Redirect(stream=Redirect.STDOUT, dest=unshifted_bed)
            )

            staging_delete.append(unshifted_bed)

            # Shifting + strand by 4 and - strand by -5, according to
            # the ATACseq paper

            # This step previously used bedtools shift, but it has been replaced by this in-house method
            self.shift_reads(
                input_bed_filepath=unshifted_bed,
                output_bed_filepath=processed_bed,
                log_filepath=os.path.join(logs_dir, 'shift_reads.logs'),
                genome_sizes_filepath=pipeline_config['bedtools']['genome-sizes'],
                minus_strand_shift=MINUS_STRAND_SHIFT,
                plus_strand_shift=PLUS_STRAND_SHIFT
            )
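            # self.shift_reads() is not shown in this listing; below is a minimal
            # sketch (an assumption, not the pipeline's actual helper) of the same
            # idea: move + strand intervals by plus_strand_shift and - strand
            # intervals by minus_strand_shift, clamping to the chromosome bounds
            # taken from a UCSC-style genome sizes file.
            def _shift_reads_sketch(input_bed_filepath, output_bed_filepath,
                                    genome_sizes_filepath,
                                    plus_strand_shift=4, minus_strand_shift=-5):
                chrom_sizes = {}
                with open(genome_sizes_filepath) as sizes_file:
                    for sizes_line in sizes_file:
                        chrom, size = sizes_line.strip().split('\t')[:2]
                        chrom_sizes[chrom] = int(size)
                with open(input_bed_filepath) as bed_in, \
                        open(output_bed_filepath, 'w') as bed_out:
                    for bed_line in bed_in:
                        # bedtools bamtobed emits 6-column BED: strand is column 6
                        fields = bed_line.strip().split('\t')
                        shift = plus_strand_shift if fields[5] == '+' else minus_strand_shift
                        start = max(0, int(fields[1]) + shift)
                        end = min(chrom_sizes.get(fields[0], int(fields[2]) + shift),
                                  int(fields[2]) + shift)
                        fields[1], fields[2] = str(start), str(end)
                        bed_out.write('\t'.join(fields) + '\n')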

        if step <= 6:
            processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix))
            homer_tagdir = os.path.join(output_dir, '{}_tagdir'.format(lib_prefix))
            unsorted_peaks = os.path.join(output_dir, '{}.unsorted.peaks.bed'.format(lib_prefix))
            sorted_peaks = os.path.join(output_dir, '{}.sorted.peaks.bed'.format(lib_prefix))
            merged_peaks = os.path.join(output_dir, '{}.peaks.bed'.format(lib_prefix))

            # Populate HOMER tag directory
            homer_maketagdir.run(
                Parameter(homer_tagdir),
                Parameter('-format', 'bed'),
                Parameter(processed_bed),
                Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'maketagdir.log'))
            )

            # Run HOMER peak calling program
            homer_findpeaks.run(
                Parameter(homer_tagdir),
                Parameter('-fragLength', '0'),
                Parameter('-fdr', '0.01'),
                Parameter('-localSize', '50000'),
                Parameter('-o', 'auto'),
                Parameter('-style', 'dnase'),
                Parameter('-size', '150'),
                Parameter('-minDist', '50'),
                Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'findpeaks.log'))
            )

            # Convert HOMER peaks file to bed format
            homer_pos2bed.run(
                Parameter(os.path.join(homer_tagdir, 'peaks.txt')),
                Redirect(stream=Redirect.STDOUT, dest=unsorted_peaks),
                Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'pos2bed.log'))
            )

            # Sort called peaks bed file
            bedtools_sort.run(
                Parameter('-i', unsorted_peaks),
                Redirect(stream=Redirect.STDOUT, dest=sorted_peaks)
            )

            # Merge peaks to create final peaks file
            bedtools_merge.run(
                Parameter('-i', sorted_peaks),
                Redirect(stream=Redirect.STDOUT, dest=merged_peaks)
            )

            # Stage delete for temporary files
            staging_delete.extend([
                unsorted_peaks,
                sorted_peaks
            ])

        # QC: Output QC data to file
        with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file:
            qc_data_file.write(str(qc_data) + '\n')

        # Delete temporary files
        for delete_file in staging_delete:
            subprocess.call(['rm', '-rf', delete_file])
    def run_pipeline(self, pipeline_args, pipeline_config):
        # create variables from parser if wanted
        bamFiles = pipeline_args['bam:lib']
        outputDir = pipeline_args['output']
        adapter = pipeline_args['adapter']
        numThreads = pipeline_args['threads']

        # Create output directory
        subprocess.call(['mkdir', outputDir])

        # Software
        cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
        star = Software('STAR', pipeline_config['STAR']['path'])
        bedtools = Software('bedtools', pipeline_config['bedtools']['path'])
        bowtie2 = Software('bowtie2', pipeline_config['bowtie2']['path'])
        samtools = Software('samtools', pipeline_config['samtools']['path'])
        samtools_sort = Software('samtools sort',
                                 pipeline_config['samtools']['path'])
        read_distribution = Software(
            'read_distribution.py',
            pipeline_config['read_distribution']['path'])
        featureCounts = Software('featureCounts',
                                 pipeline_config['featureCounts']['path'])
        fastQC = Software('FastQC', pipeline_config['FastQC']['path'])
        picard = Software('picard', pipeline_config['picard']['path'])

        # TODO: Consider doing these common tools directly in Python instead of shelling out (see the sketch below)

        # Common software tools
        awk = Software('awk', 'awk')
        sort = Software('sort', 'sort')
        uniq = Software('uniq', 'uniq')
        paste = Software('paste', 'paste')
        cat = Software('cat', 'cat')
        grep = Software('grep', 'grep')
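        # As an illustration of the TODO above, the awk | sort | uniq -c aggregation
        # used further down for codon periodicity could be done directly in Python
        # instead of shelling out. This is only a sketch; the function name and its
        # intersect BED file argument are assumptions, not part of the original pipeline.
        def _relative_position_counts_sketch(intersect_bed_filepath):
            from collections import Counter
            position_counts = Counter()
            with open(intersect_bed_filepath) as intersect_bed:
                for bed_line in intersect_bed:
                    fields = bed_line.rstrip('\n').split('\t')
                    # Mirrors awk '{print ($8-($2+100))}': read start (column 8)
                    # relative to the annotated start + 100 (column 2)
                    position_counts[int(fields[7]) - (int(fields[1]) + 100)] += 1
            return position_counts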

        # Directories and Files
        pathToGenomeDir = pipeline_config['STAR']['genomeDir']
        pathToGenome = pipeline_config['bowtie2']['genome_ref']
        pathToGtf = pipeline_config['STAR']['GTF_ref']
        pathTo_hg19_bed = pipeline_config['read_distribution']['hg19_bed']
        pathTo_hg19_bed_start100 = pipeline_config['bedtools']['hg19_start100']
        pathTo_grch37_bed = pipeline_config['bedtools']['grch37_bed']
        pathTo_genomeFasta = pipeline_config['picard']['genomeFasta']
        '''
        Remove adapters and trim
          adapter sequence: AGATCGGAAGAGCACACGTCT
          -m 25: discard any reads shorter than 25 nucleotides
          --discard-untrimmed: keep only reads that had the adapter sequence
            (reads without the adapter are considered artifacts)

        cutadapt -a AGATCGGAAGAGCACACGTCT -m 25 --discard-untrimmed {filename}.fastq.gz
          > {filename}_trimmed.fastq.gz 2> {filename}_report.txt
        '''
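        # The trimming command described in the docstring above is not actually
        # invoked in this example. Purely as a sketch, it could be expressed with
        # this pipeline's Software/Parameter/Redirect wrappers as below; the helper
        # name and its filename arguments are hypothetical and it is not called anywhere.
        def _trim_adapters_sketch(raw_fastq, trimmed_fastq, report_txt):
            cutadapt.run(
                Parameter('-a', adapter),          # adapter sequence from pipeline args
                Parameter('-m', '25'),             # discard reads shorter than 25 nt
                Parameter('--discard-untrimmed'),  # keep only reads that contained the adapter
                Parameter(raw_fastq),
                Redirect(stream=Redirect.STDOUT, dest=trimmed_fastq),
                Redirect(stream=Redirect.STDERR, dest=report_txt)
            )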

        # Keep track of Bids in pipeline

        bid_list = []
        for bamLib in bamFiles:
            bid_list.append(bamLib.split(':')[-1])
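        # new_dir() is used below to give each BID its own output directory but is
        # not shown in this listing. A plausible sketch (an assumption, not the
        # original helper) creates the per-sample directory if needed and returns its path:
        def _new_dir_sketch(base_output_dir, bid):
            sample_dir = os.path.join(base_output_dir, bid)
            subprocess.call(['mkdir', '-p', sample_dir])
            return sample_dir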
        '''
      Sort and extract uniquely mapped reads for QC and further analyses
        samtools view -H $file > header.sam
        samtools view $file | grep -w NH:i:1 | cat header.sam - | samtools view -bS - | samtools sort - ${filename}_uniq_sorted
        rm header.sam

      Using this file for the rest of the analysis
    '''

        for bamLib in bamFiles:
            bam, bid = bamLib.split(':')
            newDir = new_dir(outputDir, bid)
            samtools.run(
                Parameter('view'),
                Parameter('-H'),
                Parameter(bam),  # star outfile name
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(newDir,
                                           '{}.header.sam'.format(bid))))
            samtools.run(
                Parameter('view'),
                Parameter(bam),  # star outfile name
                Pipe(
                    grep.pipe(
                        Parameter('-w'), Parameter('NH:i:1'),
                        Pipe(
                            cat.pipe(
                                Parameter(
                                    os.path.join(newDir,
                                                 '{}.header.sam'.format(bid)),
                                    '-'),
                                Pipe(
                                    samtools.pipe(
                                        Parameter('view'),
                                        Parameter('-bS', '-'),
                                        Pipe(
                                            samtools.pipe(
                                                Parameter('sort'),
                                                Parameter(
                                                    '-', '-o',
                                                    '{}/{}.uniq_sorted.bam'.
                                                    format(newDir,
                                                           bid)))))))))))
            # subprocess.call(['rm', '{}/{}.header.sam'.format(newDir, bid)])
        '''
        RSeQC to evaluate the percent of reads mapped to each genomic feature
          read_distribution.py -r hg19_RefSeq.bed12 -i $file
        '''

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            read_distribution.run(
                Parameter('-r'),
                Parameter(pathTo_hg19_bed),
                Parameter('-i'),
                Parameter('{}/{}.uniq_sorted.bam'.format(newDir, bid)),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(
                             newDir, '{}.read_distribution.log'.format(bid))),
                shell=True)
        '''
      codon periodicity
        annotation=/glusterfs/users/ashieh/annotations/hg19_ccds_exons_plus_start100.bed

        bedtools intersect -a {annotation} -b {uniq.bam} -s -bed -wa -wb > intersect_start100
        awk -v OFS='\t' '{print ($2-($14+100))}' ${filename}_intersect_start100.bed
         | sort | uniq -c > ${filename}_relative_pos_aggregate.table
    '''

        # bedtools intersect -a {annotation} -b {uniq.bam} -s -bed -wa -wb > intersect_start100

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            bedtools.run(
                Parameter('intersect'),
                Parameter('-a {}'.format(pathTo_hg19_bed_start100)),
                Parameter('-b {}/{}.uniq_sorted.bam'.format(newDir, bid)),
                Parameter('-s'),
                Parameter('-bed'),
                Parameter('-wa'),
                Parameter('-wb'),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(
                             newDir, '{}.intersect_start100.bed'.format(bid))),
                shell=True)
            awk.run(
                Parameter('-v'), Parameter("OFS='\\t'"),
                Parameter('{print ($8-($2+100))}'),
                Parameter('{}/{}.intersect_start100.bed'.format(newDir, bid)),
                Pipe(
                    sort.pipe(
                        Pipe(
                            uniq.pipe(
                                Parameter('-c'),
                                Redirect(stream=Redirect.STDOUT,
                                         dest=os.path.join(
                                             newDir,
                                             '{}_relative_pos_aggregate.table'.
                                             format(bid))))))))

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            rpaFile = open(
                '{dir}/{bid}_relative_pos_aggregate.table'.format(dir=newDir,
                                                                  bid=bid),
                'rb')
            myDict = {}

            for i in range(-30, 31):
                myDict[i] = 0

            for line in rpaFile:
                Frequency, start = line.strip().split(' ')
                if int(start) >= -30 and int(start) <= 30:
                    print start
                    myDict[int(start)] = int(Frequency)  # store count as int for plotting

            # print times

            freqs = []
            starts = []
            for i in range(-30, 31):
                starts.append(i)
                freqs.append(myDict[i])

            # print freqs

            fig, ax = plt.subplots()
            # plt.set_title('{} codon periodicity'.format(bid))
            plt.xlabel("-30 to 30 relative position")
            plt.ylabel("Frequency")
            plt.bar(starts, freqs)
            fig.savefig('{dir}/{bid}_codon_periodicity_plot.png'.format(
                dir=newDir, bid=bid))
        '''
    Picard tools

    java -jar picard.jar CollectMultipleMetrics 
    I=2017-221.uniq_sorted.bam 
    O= multiple_metrics 
    R=GRCh37.p13.genome.fa

    java -jar picard.jar CollectGcBiasMetrics
    I= .uniq
    O=gc_bias_metrics.txt 
    CHART=gc_bias_metrics.pdf 
    S=summary_metrics.txt 
    R=reference_sequence.fasta

    java -jar picard.jar CollectRnaSeqMetrics
    I=input.bam 
    O=output.RNA_Metrics 
    REF_FLAT=ref_flat.txt 
    STRAND=FIRST_READ_TRANSCRIPTION_STRAND

    java -jar picard.jar MarkDuplicates
    I=input.bam 
    O=marked_duplicates.bam 
    M=marked_dup_metrics.txt
    ASSUME_SORTED=true
    '''

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)

            picard.run(
                Parameter('CollectMultipleMetrics'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir,
                                                           bid)),  # input
                Parameter('O={}/{}.multiple_metrics'.format(newDir,
                                                            bid)),  # output
                Parameter('R={}'.format(pathTo_genomeFasta))  # genomeReference
            )

            picard.run(
                Parameter('CollectGcBiasMetrics'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir,
                                                           bid)),  # input
                Parameter('O={}/{}.gc_bias_metrics'.format(newDir,
                                                           bid)),  # output
                Parameter('CHART={}/{}.gc_bias_metrics.pdf'.format(
                    newDir, bid)),  # chart
                Parameter('S={}/{}.summary_metrics'.format(
                    newDir, bid)),  # summary metrics
                Parameter(
                    'R={}'.format(pathTo_genomeFasta))  # genome reference
            )

            picard.run(
                Parameter('CollectRnaSeqMetrics'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir,
                                                           bid)),  # input
                Parameter('O={}/{}.RNA_Metrics'.format(newDir, bid)),  # output
                Parameter('REF_FLAT={}/{}'.format(newDir, bid)),  # ref_flat
                Parameter(
                    'STRAND=FIRST_READ_TRANSCRIPTION_STRAND')  # strandedness
            )

            picard.run(
                Parameter('MarkDuplicates'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir,
                                                           bid)),  # input
                Parameter('O={}/{}.marked_duplicates.bam'.format(
                    newDir, bid)),  # output
                Parameter('M={}/{}.marked_dup_metrics.txt'.format(
                    newDir, bid)),  # marked dup metrics
                Parameter('ASSUME_SORTED=true')  # sorted
            )
        '''
    subread: featureCounts

      featureCounts -a /path_to_gtf/gencode.v19.annotation.gtf -o <bid>.featureCounts <bid>.uniq_sorted.bam
    '''

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            featureCounts.run(
                Parameter('-a', '{}'.format(pathToGtf)),  # gtf
                Parameter('-s', '1'),  # strand-specific read counting 
                Parameter('-o', '{}/{}.featureCounts'.format(newDir,
                                                             bid)),  # output
                Parameter('{}/{}.uniq_sorted.bam'.format(newDir, bid))  # input
            )
Exemplo n.º 7
    def run_pipeline(self, pipeline_args, pipeline_config):
        # Instantiate options
        bam = pipeline_args['bam']
        output_dir = pipeline_args['output']
        logs_dir = os.path.join(output_dir, 'logs')

        # Create output, tmp, and logs directories
        subprocess.call([
            'mkdir', '-p', output_dir, logs_dir,
            os.path.join(output_dir, 'tmp')
        ])

        # Timing functions for getting running time
        start_time = datetime.now()

        # Gather QC data
        qc_data = {
            'total_raw_reads_counts': [],
            'trimmed_reads_counts': [],
            'num_reads_mapped': '0',
            'running_time_seconds': '',
            'running_time_readable': ''
        }

        # Keep list of items to delete
        staging_delete = [os.path.join(output_dir, 'tmp')]

        # Establish software instances
        rsem_calculate_expression = Software(
            'RSEM', pipeline_config['RSEM']['path-calculate-expression'])
        rsem_plot_model = Software('RSEM',
                                   pipeline_config['RSEM']['path-plot-model'])

        # Set up RSEM parameters
        rsem_common = [
            Parameter('--bam'),
            Parameter('--estimate-rspd'),
            Parameter('--calc-ci'),
            Parameter('--no-bam-output'),
            Parameter('--seed', '12345')
        ]

        rsem_run = [
            Parameter('-p', pipeline_config['RSEM']['threads']),
            Parameter('--ci-memory', pipeline_config['RSEM']['memory'])
        ]

        rsem_type = []
        if pipeline_args['is_paired_end']:
            rsem_type.append(Parameter('--paired-end'))
        if pipeline_args['is_stranded']:
            rsem_type.append(Parameter('--forward-prob', '0'))

        # Run RSEM quantification step
        rsem_calculate_expression.run(*(rsem_common + rsem_run + rsem_type + [
            Parameter(bam),
            Parameter(pipeline_config['RSEM']['reference-dir']),
            Parameter(os.path.join(output_dir, 'RSEM_Quant')),
            Redirect(Redirect.BOTH, dest=os.path.join(logs_dir, 'Log.rsem'))
        ]))

        # Generate RSEM plot model
        rsem_plot_model.run(
            Parameter(os.path.join(output_dir, 'RSEM_Quant'),
                      os.path.join(output_dir, 'Quant.pdf')))

        # QC: Get time delta
        elapsed_time = datetime.now() - start_time
        qc_data['running_time_seconds'] = str(elapsed_time.seconds)
        qc_data['running_time_readable'] = str(elapsed_time)

        # QC: Output QC data to file
        with open(os.path.join(logs_dir, 'qc_metrics.txt'),
                  'w') as qc_data_file:
            qc_data_file.write(json.dumps(qc_data, indent=4) + '\n')

        # Delete temporary files
        for delete_file in staging_delete:
            subprocess.call(['rm', '-rf', delete_file])
Exemplo n.º 8
    def run_pipeline(self, pipeline_args, pipeline_config):
        # Instantiate Software instances
        fastqc = Software('FastQC', pipeline_config['fastqc']['path'])
        rnaseqc = Software('RNA-SeQC', pipeline_config['RNA-SeQC']['path'])

        picard = {
            subprogram_name: Software(
                'picard {}'.format(subprogram_name),
                pipeline_config['picard']['path'] +
                ' {}'.format(subprogram_name))
            for subprogram_name in {
                'CreateSequenceDictionary', 'MarkDuplicates',
                'CollectRnaSeqMetrics', 'CollectInsertSizeMetrics',
                'CollectAlignmentSummaryMetrics', 'CollectGcBiasMetrics',
                'EstimateLibraryComplexity', 'AddOrReplaceReadGroups'
            }
        }

        preseq = {
            subprogram_name: Software(
                'preseq {}'.format(subprogram_name),
                pipeline_config['preseq']['path'] +
                ' {}'.format(subprogram_name))
            for subprogram_name in {'c_curve', 'lc_extrap', 'gc_extrap'}
        }
        bam2mr = Software('bam2mr', pipeline_config['preseq']['bam2mr'])

        featurecounts = Software('featureCounts',
                                 pipeline_config['featureCounts']['path'])

        samtools_faidx = Software(
            'samtools faidx', pipeline_config['samtools']['path'] + ' faidx')
        novosort = Software('novosort', pipeline_config['novosort']['path'])

        # Create output directory
        subprocess.call('mkdir -p {}'.format(pipeline_args['output_dir']),
                        shell=True)
        subprocess.call('mkdir -p /mnt/analysis/tmp', shell=True)

        # Sort bam file
        sorted_bam = os.path.join(pipeline_args['output_dir'],
                                  'sorted.tmp.bam')
        novosort.run(Parameter('--index'), Parameter('--output', sorted_bam),
                     Parameter(pipeline_args['bam']))

        # Run FastQC
        self.run_fastqc(fastqc=fastqc, pipeline_args=pipeline_args)

        # Run RNA-SeQC
        self.run_rnaseqc(rnaseqc=rnaseqc,
                         picard=picard,
                         samtools_faidx=samtools_faidx,
                         pipeline_config=pipeline_config,
                         pipeline_args=pipeline_args,
                         sorted_bam=sorted_bam)

        # Run Picard suite
        self.run_picard_suite(picard=picard,
                              sorted_bam=sorted_bam,
                              pipeline_config=pipeline_config,
                              pipeline_args=pipeline_args)

        # self.run_preseq(
        #     preseq=preseq,
        #     bam2mr=bam2mr,
        #     sorted_bam=sorted_bam,
        #     pipeline_args=pipeline_args
        # )

        self.run_featurecounts(featurecounts=featurecounts,
                               sorted_bam=sorted_bam,
                               pipeline_args=pipeline_args,
                               pipeline_config=pipeline_config)

        self.run_chrm_percentage(sorted_bam=sorted_bam,
                                 pipeline_args=pipeline_args)

        # Remove temporary sorted bam
        os.remove(sorted_bam)
        os.remove(sorted_bam + '.bai')
        subprocess.call('rm -rf /mnt/analysis/tmp', shell=True)
    def run_pipeline(self, pipeline_args, pipeline_config):
        # Instantiate options
        reads = pipeline_args['reads']
        output_dir = pipeline_args['output']
        logs_dir = os.path.join(output_dir, 'logs')
        lib_prefix = pipeline_args['lib']
        step = pipeline_args['step']
        forward_adapter = pipeline_args['forward_adapter']
        reverse_adapter = pipeline_args['reverse_adapter']
        run_is_stranded = pipeline_args['is_stranded']

        # Determine if run is paired-end from input
        run_is_paired_end = len(reads[FIRST_READS_PAIR].split(':')) > 1

        # Create output, tmp, and logs directories
        tmp_dir = os.path.join(output_dir, 'tmp')
        subprocess.call(['mkdir', '-p', output_dir, logs_dir, tmp_dir])

        # Keep list of items to delete
        staging_delete = [os.path.join(output_dir, 'tmp')]

        qc_metrics = {
            'total_raw_reads': [],
            'total_trimmed_reads': [],
            'percent_num_reads_mapped_genome': [],
            'percent_num_reads_mapped_transcriptome': [],
            'percent_duplicate_reads': '0',
            'num_reads_multimapped': [],
            'percent_num_reads_rrna': '',
            'viral_rna': []

        }

        synapse_metadata = {
            'Assay': 'RNAseq',
            'Individual_ID': '',
            'Sample_ID': '',
            'File_Name': '',
            'BrodmannArea': '',
            'BrainRegion': '',
            'Hemisphere': '',
            'CellType': 'NA',
            'TissueState': '',
            'RNAIsolationBatch': '',
            'RIN': '',
            'LibraryBatch': '',
            'LibraryPrep': 'stranded, rRNA depletion',
            'LibraryKit': 'Illumina RS-122-2301',
            'ERCC_Added': '',
            'RunType': 'paired-end',
            'ReadLength': '100bp',
            'FlowcellBatch': '',
            'SequencingPlatform': '',
            'TotalReads': '',
            'MappedReads_Primary': '0',
            'MappedReads_Multimapped': '0',
            'rRNARate': '0',
            'Notes': ''
        }

        # Establish Software instances
        cutadapt = Software('Cutadapt', pipeline_config['cutadapt']['path'])
        fastqc = Software('FastQC', pipeline_config['fastqc']['path'])
        star = Software('STAR Two-Pass', pipeline_config['STAR']['path'])
        novosort = Software('Novosort', pipeline_config['novosort']['path'])
        samtools_flagstat = Software('Samtools Flagstat', pipeline_config['samtools']['path'] + ' flagstat')
        samtools_index = Software('Samtools Index', pipeline_config['samtools']['path'] + ' index')
        samtools_faidx = Software('Samtools Faidx', pipeline_config['samtools']['path'] + ' faidx')
        picard_markduplicates = Software('Picard MarkDuplicates',
                                         'java -Xmx{heap_size}g -jar {path} MarkDuplicates'.format(
                                             heap_size=pipeline_config['picard'].get('heap_size',
                                                                                     JAVA_DEFAULT_HEAP_SIZE),
                                             path=pipeline_config['picard']['path']
                                         ))
        picard_create_seq_dict = Software('Picard CreateSequenceDictionary',
                                          'java -Xmx{heap_size}g -jar {path} CreateSequenceDictionary'.format(
                                             heap_size=pipeline_config['picard'].get('heap_size',
                                                                                     JAVA_DEFAULT_HEAP_SIZE),
                                             path=pipeline_config['picard']['path']
                                          ))
        rnaseqc = Software('RNAseQC', 'java -Xmx{heap_size}g -jar {path}'.format(
                                             heap_size=pipeline_config['picard'].get('heap_size',
                                                                                     JAVA_DEFAULT_HEAP_SIZE),
                                             path=pipeline_config['RNAseQC']['path']
                                      ))
        picard_add_read_groups = Software('Picard AddOrReplaceReadGroups',
                                          'java -Xmx{heap_size}g -jar {path} AddOrReplaceReadGroups'.format(
                                             heap_size=pipeline_config['picard'].get('heap_size',
                                                                                     JAVA_DEFAULT_HEAP_SIZE),
                                             path=pipeline_config['picard']['path']
                                          ))
        bedtools_coverage = Software('Bedtools Coverage', pipeline_config['bedtools']['path'] + ' coverage')
        bedtools_bamtobed = Software('Bedtools Bamtobed', pipeline_config['bedtools']['path'] + ' bamtobed')

        # Housekeeping
        star_output = []
        novosort_outfile = ''

        # Step 1: Trimming | Cutadapt
        if step <= 1:
            for i, read in enumerate(reads):
                if run_is_paired_end:
                    # Get paired-end reads, construct new filenames
                    read1, read2 = read.split(':')

                    # QC: Get raw fastq read counts
                    qc_metrics['total_raw_reads'].append([
                        str(int(self.count_gzipped_lines(read1))/4),
                        str(int(self.count_gzipped_lines(read2))/4)
                    ])

                    trimmed_read1_filename = os.path.join(output_dir,
                                                          lib_prefix + '_{}_read1.trimmed.fastq.gz'.format(i))
                    trimmed_read2_filename = os.path.join(output_dir,
                                                          lib_prefix + '_{}_read2.trimmed.fastq.gz'.format(i))
                    staging_delete.extend([
                        trimmed_read1_filename,
                        trimmed_read2_filename
                    ])

                    # Run cutadapt
                    cutadapt.run(
                        Parameter('--quality-base={}'.format(pipeline_config['cutadapt']['quality-base'])),
                        Parameter('--minimum-length=5'),
                        Parameter('--output={}'.format(trimmed_read1_filename)),
                        Parameter('--paired-output={}'.format(trimmed_read2_filename)),
                        Parameter('-a', forward_adapter),
                        Parameter('-A', reverse_adapter),
                        Parameter('-q', '30'),
                        Parameter(read1),
                        Parameter(read2),
                        Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary'))
                    )

                    # QC: Get trimmed fastq read counts
                    qc_metrics['total_trimmed_reads'].append([
                        str(int(self.count_gzipped_lines(trimmed_read1_filename))/4),
                        str(int(self.count_gzipped_lines(trimmed_read2_filename))/4)
                    ])

                    # Update reads list
                    reads[i] = ':'.join([trimmed_read1_filename, trimmed_read2_filename])
                else:
                    # QC: Get raw fastq read counts
                    qc_metrics['total_raw_reads'].append([
                        str(int(self.count_gzipped_lines(read))/4)
                    ])

                    # Construct new filename
                    trimmed_read_filename = os.path.join(output_dir,
                                                         lib_prefix + '_{}.trimmed.fastq.gz'.format(i))
                    staging_delete.append(trimmed_read_filename)

                    # Run cutadapt
                    cutadapt.run(
                        Parameter('--quality-base={}'.format(pipeline_config['cutadapt']['quality-base'])),
                        Parameter('--minimum-length=5'),
                        Parameter('--output={}'.format(trimmed_read_filename)),
                        Parameter('-a', forward_adapter),
                        Parameter('-q', '30'),
                        Parameter(read),
                        Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.chicago.summary'))
                    )

                    # QC: Get trimmed fastq read counts
                    qc_metrics['total_trimmed_reads'].append([
                        str(int(self.count_gzipped_lines(trimmed_read_filename))/4)
                    ])

                    # Update reads list
                    reads[i] = trimmed_read_filename

        # Step 2: FastQC
        if step <= 2:
            # Make FastQC directory
            fastqc_output_dir = os.path.join(output_dir, 'fastqc')
            subprocess.call(['mkdir', '-p', fastqc_output_dir])

            all_fastqs = []

            if run_is_paired_end:
                for read in reads:
                    all_fastqs.extend(read.split(':'))
            else:
                all_fastqs.extend(reads)

            for fastq in all_fastqs:
                fastqc.run(
                    Parameter('--outdir={}'.format(fastqc_output_dir)),
                    Parameter(fastq)
                )

        # Step 3: Alignment | STAR 2-pass, Alignment Stats | samtools flagstat
        if step <= 3:
            # Set up common STAR parameters
            star_common = [
                Parameter('--runMode', 'alignReads'),
                Parameter('--twopassMode', 'Basic'),
                Parameter('--runThreadN', pipeline_config['STAR']['threads']),
                Parameter('--genomeDir', pipeline_config['STAR']['genome-dir']),
                Parameter('--readFilesCommand', 'zcat'),
                Parameter('--quantMode', 'TranscriptomeSAM', 'GeneCounts'),
                Parameter('--outSAMtype', 'BAM', 'Unsorted'),
                Parameter('--outFilterType', 'BySJout'),
                Parameter('--outFilterMultimapNmax', '20'),
                Parameter('--alignSJoverhangMin', '8'),
                Parameter('--alignSJDBoverhangMin', '1'),
                Parameter('--outFilterMismatchNmax', '2'),
                Parameter('--alignIntronMin', '20'),
                Parameter('--alignIntronMax', '1000000'),
                Parameter('--alignMatesGapMax', '1000000'),
                (
                    Parameter('--outFilterIntronMotifs', 'RemoveNoncanonical') if run_is_stranded
                    else Parameter('--outSAMstrandField', 'intronMotif')
                )
            ]

            # Get STAR output file prefix
            star_outfile_prefix = os.path.join(output_dir,
                                               lib_prefix + ('_' if lib_prefix[-1] != '.' else '') + '{}.')

            # Align each read or read pair
            for i, read in enumerate(reads):
                star_output_bam = star_outfile_prefix.format(i) + 'Aligned.out.bam'
                star_output_transcriptome_bam = star_outfile_prefix.format(i) + 'Aligned.toTranscriptome.out.bam'
                star_output.append(star_output_bam)

                if run_is_paired_end:
                    read1, read2 = read.split(':')

                    star_paired_end = [
                        Parameter('--readFilesIn', read1, read2),
                        Parameter('--outFileNamePrefix', star_outfile_prefix.format(i))
                    ]

                    star.run(*(star_common + star_paired_end))
                else:
                    star_single_end = [
                        Parameter('--readFilesIn', read),
                        Parameter('--outFileNamePrefix', star_outfile_prefix.format(i))
                    ]

                    star.run(*(star_common + star_single_end))

                # Get flagstats for both alignments
                samtools_flagstat.run(
                    Parameter(star_output_bam),
                    Redirect(stream=Redirect.STDOUT, dest=star_output_bam + '.flagstat')
                )
                samtools_flagstat.run(
                    Parameter(star_output_transcriptome_bam),
                    Redirect(stream=Redirect.STDOUT, dest=star_output_transcriptome_bam + '.flagstat')
                )

                # QC: Get number of mapped reads to the genome from this BAM
                try:
                    with open(star_output_bam + '.flagstat') as flagstats:
                        flagstats_contents = flagstats.read()

                        # Pull out mapped reads
                        target_line = re.search(r'(\d+) \+ \d+ mapped \(([0-9\.]+)%', flagstats_contents)
                        if target_line is not None:
                            num_mapped = int(target_line.group(1))
                            qc_metrics['percent_num_reads_mapped_genome'].append(
                                [str(num_mapped/2), '{}%'.format(target_line.group(2))]
                            )

                            num_secondary = int(re.search(r'(\d+) \+ \d+ secondary', flagstats_contents)
                                                .group(1)
                                                )
                            num_supplementary = int(re.search(r'(\d+) \+ \d+ supplementary', flagstats_contents)
                                                    .group(1)
                                                    )

                            synapse_metadata['MappedReads_Primary'] = str(
                                int(synapse_metadata['MappedReads_Primary']) +
                                num_mapped - num_secondary - num_supplementary
                            )
                            synapse_metadata['MappedReads_Multimapped'] = str(
                                int(synapse_metadata['MappedReads_Multimapped']) + num_secondary
                            )
                        else:
                            qc_metrics['percent_num_reads_mapped_genome'].append('0')

                        # Pull out multimapped reads
                        target_line = re.search(r'(\d+) \+ \d+ secondary', flagstats_contents)
                        if target_line is not None:
                            qc_metrics['num_reads_multimapped'].append(
                                str(int(target_line.group(1))/2)
                            )
                        else:
                            qc_metrics['num_reads_multimapped'].append('0')
                except:
                    qc_metrics['percent_num_reads_mapped_genome'].append(
                        'Could not open flagstats for {}'.format(star_output_bam)
                    )
                    qc_metrics['num_reads_multimapped'].append(
                        'Could not open flagstats for {}'.format(star_output_bam)
                    )

                # QC: Get number of mapped reads to the transcriptome from this BAM
                try:
                    with open(star_output_transcriptome_bam + '.flagstat') as flagstats:
                        flagstats_contents = flagstats.read()
                        target_line = re.search(r'(\d+) \+ \d+ mapped \(([0-9\.]+)%', flagstats_contents)
                        if target_line is not None:
                            qc_metrics['percent_num_reads_mapped_transcriptome'].append(
                                [str(int(target_line.group(1))/2), '{}%'.format(target_line.group(2))]
                            )
                        else:
                            qc_metrics['percent_num_reads_mapped_transcriptome'].append('0')
                except:
                    qc_metrics['percent_num_reads_mapped_transcriptome'].append(
                        'Could not open flagstats for {}'.format(star_output_bam)
                    )

        # Step 4: BAM Merge | Novosort
        if step <= 4:
            # Novosort to sort and merge BAM files
            novosort_outfile = os.path.join(output_dir,
                                            lib_prefix + ('.' if lib_prefix[-1] != '.' else '') +
                                            'merged.Aligned.out.bam')
            novosort.run(
                Parameter('--tmpdir', os.path.join(output_dir, 'tmp')),
                Parameter(*[bam for bam in star_output]),
                Redirect(stream=Redirect.STDOUT, dest=novosort_outfile)
            )

            """
            The step below was commented out on 27 June 2016. It was taking up large amounts of memory, more than
            Beagle could handle, and some samples were consistently failing as a result. I think RNAseQC does this
            step anyway, I only left it in because I figured it wasn't doing any harm. Well now it is, so it's gone.
            """
            # QC: Get number of reads mapped to rRNA regions
            # aligned_bed_file = os.path.join(output_dir, str(uuid.uuid4()) + '.bed')
            # coverage_file = os.path.join(output_dir, str(uuid.uuid4()) + '.coverage.bed')
            # staging_delete.extend([aligned_bed_file, coverage_file])
            #
            # bedtools_bamtobed.run(
            #     Parameter('-i', novosort_outfile),
            #     Redirect(stream=Redirect.STDOUT, dest=aligned_bed_file)
            # )
            # bedtools_coverage.run(
            #     Parameter('-s'),
            #     Parameter('-counts'),
            #     Parameter('-a', pipeline_config['qc']['rRNA-bed']),
            #     Parameter('-b', aligned_bed_file),
            #     Redirect(stream=Redirect.STDOUT, dest=coverage_file)
            # )
            # try:
            #     rRNA_count = 0
            #     with open(coverage_file) as coverage:
            #         for line in coverage:
            #             rRNA_count += int(line.strip().split('\t')[6])
            #     percent_rRNA = (rRNA_count /
            #                     float(sum([int(aln[MAPPED_READS_COUNT])
            #                                for aln
            #                                in qc_metrics['percent_num_reads_mapped_transcriptome']]))
            #                     )
            #     qc_metrics['percent_num_reads_rrna'] = [str(rRNA_count), str(percent_rRNA)]
            #     synapse_metadata['rRNARate'] = str(percent_rRNA)
            # except Exception as e:
            #     qc_metrics['percent_num_reads_rrna'] = ['error', 'error', e.message]

            # Prepare genome fasta for RNAseQC
            genome_fa = pipeline_config['qc']['genome-fa']
            genome_fai = genome_fa + '.fai'
            genome_dict = os.path.splitext(genome_fa)[0] + '.dict'

            if not os.path.isfile(genome_fai):
                samtools_faidx.run(
                    Parameter(genome_fa)
                )
            if not os.path.isfile(genome_dict):
                picard_create_seq_dict.run(
                    Parameter('REFERENCE={}'.format(genome_fa)),
                    Parameter('OUTPUT={}'.format(genome_dict))
                )

            # Add read group to alignment file
            read_group_bam = os.path.join(output_dir, 'readgroup.bam')
            staging_delete.append(read_group_bam)
            picard_add_read_groups.run(
                Parameter('INPUT={}'.format(novosort_outfile)),
                Parameter('OUTPUT={}'.format(read_group_bam)),
                Parameter('RGLB={}'.format(lib_prefix)),
                Parameter('RGPL=Illumina'),
                Parameter('RGPU=1'),
                Parameter('RGSM=Sample')
            )

            # Generate BAM index for RNAseQC
            samtools_index.run(
                Parameter(read_group_bam)
            )
            staging_delete.append(read_group_bam + '.bai')

            # QC: Get RNAseQC output
            rnaseqc_output_dir = os.path.join(output_dir, 'RNAseQC')
            subprocess.call(['mkdir', '-p', rnaseqc_output_dir])
            rnaseqc.run(
                Parameter('-o', rnaseqc_output_dir),
                Parameter('-r', genome_fa),
                Parameter('-t', pipeline_config['cufflinks']['transcriptome-gtf']),
                Parameter('-s', '"{sample_id}|{bam_file}|{notes}"'.format(
                    sample_id=lib_prefix,
                    bam_file=read_group_bam,
                    notes='None'
                )),
                Parameter('-singleEnd') if not run_is_paired_end else Parameter()
            )

            # Picard MarkDuplicates to get duplicates metrics
            markduplicates_outfile = os.path.join(output_dir, '{}.processed.bam'.format(lib_prefix))
            markduplicates_metrics_filepath = os.path.join(logs_dir, 'mark_dup.metrics')
            picard_markduplicates.run(
                Parameter('INPUT={}'.format(novosort_outfile)),
                Parameter('OUTPUT={}'.format(markduplicates_outfile)),
                Parameter('TMP_DIR={}'.format(tmp_dir)),
                Parameter('METRICS_FILE={}'.format(markduplicates_metrics_filepath)),
                Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'mark_dup.log'))
            )

            # QC: Get percent duplicates
            try:
                with open(markduplicates_metrics_filepath) as markdup_metrics:
                    for line in markdup_metrics:
                        if line[FIRST_CHAR] == '#':
                            continue
                        record = line.strip().split('\t')
                        if len(record) == 9:
                            if re.match(r'\d\.\d+', record[7]) is not None:
                                qc_metrics['percent_duplicate_reads'] = record[7]
            except Exception as e:
                qc_metrics['percent_duplicate_reads'] = ['Could not open MarkDuplicates metrics', e.message]

        # Write out QC metrics to file
        with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file:
            qc_data_file.write(json.dumps(qc_metrics, indent=4) + '\n')

        # Populate Synapse QC matrix
        if re.match(r'\d{4}-\d{4}', lib_prefix.strip()) is not None:
            synapse_metadata['Individual_ID'] = lib_prefix
            synapse_metadata['File_Name'] = 'PEC_BrainGVEX_UIC-UChicago_FC_mRNA_HiSeq2000_{}'.format(lib_prefix)

        re_raw_filename = re.match(r'\d{4}-\d{4}_.+_(.+)_.+_(.+_\d)_\d_sequence\.txt\.gz',
                                   os.path.basename(pipeline_args['reads'][0].split(':')[0]))
        if re_raw_filename is not None:
            sequencing_inst_name = re_raw_filename.group(1)
            if '673' in sequencing_inst_name or '484' in sequencing_inst_name:
                synapse_metadata['SequencingPlatform'] = 'HiSeq2000'
            elif '1070' in sequencing_inst_name:
                synapse_metadata['SequencingPlatform'] = 'HiSeq2500'
            flowcell_batch = re_raw_filename.group(2)
            synapse_metadata['FlowcellBatch'] = flowcell_batch

        # Entries in total_raw_reads are already read counts (gzipped lines / 4)
        total_raw_reads_end1 = sum([int(count[0]) for count in qc_metrics['total_raw_reads']])
        synapse_metadata['TotalReads'] = str(total_raw_reads_end1)

        # Write out Synapse metadata
        with open(os.path.join(logs_dir, 'synapse_metadata.txt'), 'w') as synapse_metadata_file:
            synapse_metadata_file.write(json.dumps(synapse_metadata, indent=4) + '\n')

        # Delete temporary files
        for delete_file in staging_delete:
            subprocess.call(['rm', '-rf', delete_file])
	def run_pipeline(self, pipeline_args, pipeline_config):
		# Instantiate variable from argparse
		read_pairs = pipeline_args['reads']
		output_dir = os.path.abspath(pipeline_args['output'])
		logs_dir = os.path.join(output_dir, 'logs')
		lib_prefix = pipeline_args['lib']
		step = int(pipeline_args['step'])
		forward_adapter = pipeline_args['forward_adapter']
		reverse_adapter = pipeline_args['reverse_adapter']

		# Create output, tmp, and logs directories
		tmp_dir = os.path.join(output_dir, 'tmp')
		subprocess.call(['mkdir', '-p', output_dir, tmp_dir, logs_dir])

		#Keep list of items to delete
		staging_delete = [tmp_dir]
		bwa_bam_outs = []
		qc_data = {
			'total_raw_reads_counts': [],
			'trimmed_reads_counts': [],
			'num_reads_mapped': [],
			'num_read_removed_steric_hinderence': '0',
			'percent_duplicate_reads': '0',
			'num_unique_reads_mapped': [], #implemented
			'num_mtDNA_reads_mapped': [],
			'percent_mtDNA_reads_mapped': '0',
			'num_reads_mapped_after_filtering': '-1', #TODO This isn't implemented
			'num_peaks_called': '-1',
			#TODO Get number of peaks in annotation sites
		}

		# Instantiate software instances
		cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
		fastqc = Software('FastQC', pipeline_config['fastqc']['path'])
		bwa_aln = Software('BWA aln', pipeline_config['bwa']['path'] + ' aln')
		bwa_sampe = Software('BWA sampe', pipeline_config['bwa']['path'] + ' sampe')
		samtools_view = Software('samtools view', pipeline_config['samtools']['path'] + ' view')
		samtools_flagstat = Software('samtools flagstat', pipeline_config['samtools']['path'] + ' flagstat')
		samtools_index = Software('samtools index', pipeline_config['samtools']['path'] + ' index')
		novosort = Software('novosort', pipeline_config['novosort']['path'])
		picard_mark_dup = Software('Picard MarkDuplicates', pipeline_config['picard']['path'] + ' MarkDuplicates')
		picard_insert_metrics = Software('Picard CollectInsertSizeMetrics', pipeline_config['picard']['path'] + ' CollectInsertSizeMetrics')
		bedtools_bamtobed = Software('bedtools bamtobed', pipeline_config['bedtools']['path'] + ' bamtobed')
		bedtools_sort = Software('bedtools sort', pipeline_config['bedtools']['path'] + ' sort')
		bedtools_merge = Software('bedtools merge', pipeline_config['bedtools']['path'] + ' merge')
		bedtools_intersect = Software('bedtools intersect', pipeline_config['bedtools']['path'] + ' intersect')
		macs2_callpeak = Software('macs2 callpeak', pipeline_config['macs2']['path'] + ' callpeak')

		if step <= 1:
			for i, read_pair in enumerate(read_pairs):
				read1, read2 = read_pair.split(':')

				#QC: Get raw fastq read counts 
				qc_data['total_raw_reads_counts'].append([
					str(int(self.count_gzipped_lines(read1))/4),
					str(int(self.count_gzipped_lines(read2))/4)
				])

				trimmed_read1_filename = os.path.join(output_dir,
														lib_prefix + '_{}_read1.trimmed.fastq.gz'.format(i))
				trimmed_read2_filename = os.path.join(output_dir,
														lib_prefix + '_{}_read2.trimmed.fastq.gz'.format(i))

				cutadapt.run(
					Parameter('--quality-base=33'),
					Parameter('--minimum-length=5'),
					Parameter('-q',  '30'), # Minimum quality score
					Parameter('--output={}'.format(trimmed_read1_filename)),
					Parameter('--paired-output={}'.format(trimmed_read2_filename)),
					Parameter('-a', forward_adapter if forward_adapter else 'ZZZ'),
					Parameter('-A', reverse_adapter if reverse_adapter else 'ZZZ'),
					Parameter(read1),
					Parameter(read2),
					Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary.log'))
				)

				# QC: Get trimmed fastq read counts
				qc_data['trimmed_reads_counts'].append([
					str(int(self.count_gzipped_lines(trimmed_read1_filename))/4),
					str(int(self.count_gzipped_lines(trimmed_read2_filename))/4)
					])

				staging_delete.extend([trimmed_read1_filename, trimmed_read2_filename])
				read_pairs[i] = ':'.join([trimmed_read1_filename, trimmed_read2_filename])

		if step <= 2:
			#Make FastQC Directory
			fastqc_output_dir = os.path.join(output_dir, 'fastqc')
			subprocess.call(['mkdir', '-p', fastqc_output_dir])
			for i, read_pair in enumerate(read_pairs):
				for read in read_pair.split(':'):
					fastqc.run(
						Parameter('--outdir={}'.format(fastqc_output_dir)),
						Parameter(read)
					)

					bwa_aln.run(
						Parameter('-t', pipeline_config['bwa']['threads']),
						Parameter(pipeline_config['bwa']['index-dir']),
						Parameter(read),
						Redirect(stream=Redirect.STDOUT, dest='{}.sai'.format(read))
					)

					staging_delete.append('{}.sai'.format(read))

		if step <= 3:
			for i, read_pair in enumerate(read_pairs):
				read1, read2 = read_pair.split(':')
				bwa_bam_output = os.path.join(output_dir, '{}.{}.bam'.format(lib_prefix, i))

				bwa_sampe.run(
					Parameter('-a', '2000'), # Maximum insert size
					Parameter('-n', '1'),
					Parameter(pipeline_config['bwa']['index-dir']),
					Parameter('{}.sai'.format(read1)),
					Parameter('{}.sai'.format(read2)),
					Parameter(read1),
					Parameter(read2),
					Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'bwa_sampe.log')),
					Pipe(
						samtools_view.pipe(
							Parameter('-hSb'),
							Parameter('-o', bwa_bam_output),
							Parameter('-') # Get input from stdin
						)
					)
				)

				bwa_bam_outs.append(bwa_bam_output)

		if step <= 4:
			for i, bwa_bam in enumerate(bwa_bam_outs):
				samtools_flagstat.run(
					Parameter(bwa_bam),
					Redirect(stream=Redirect.STDOUT, dest=bwa_bam + '.flagstat')
				)

				#QC: Get number of mapped reads from this bam
				try:
					with open(bwa_bam + '.flagstat') as flagstats:
						flagstats_contents = flagstats.read()
						target_line = re.search(r'(\d+) \+ \d+ mapped', flagstats_contents)
						if target_line is not None:
							qc_data['num_reads_mapped'].append(str(int(target_line.group(1))/2))
						else:
							qc_data['num_reads_mapped'].append('0')
				except:
					qc_data['num_reads_mapped'].append('Could not open flagstats {}'.format(
						bwa_bam + '.flagstat'
					))

			sortmerged_bam = os.path.join(output_dir, '{}.sortmerged_bam'.format(lib_prefix))
			steric_filter_bam = os.path.join(output_dir, '{}.steric.bam'.format(lib_prefix))
			duprm_bam = os.path.join(output_dir, '{}.duprm.bam'.format(lib_prefix))
			unique_bam = os.path.join(output_dir, '{}.unique.bam'.format(lib_prefix))
			unmappedrm_bam = os.path.join(output_dir, '{}.unmappedrm.bam'.format(lib_prefix))
			chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix))
			# Binning reads based on template size
			nucleosome_free_reads = os.path.join(output_dir, '{}.nucleosome_free.bam'.format(lib_prefix))
			mononucleosome_reads = os.path.join(output_dir, '{}.mononucleosome.bam'.format(lib_prefix))
			dinucleosome_reads = os.path.join(output_dir, '{}.dinucleosome.bam'.format(lib_prefix))
			trinucleosome_reads = os.path.join(output_dir, '{}.trinucleosome.bam'.format(lib_prefix))
			chrM_bam = os.path.join(output_dir, '{}.chrM.bam'.format(lib_prefix))
			
			novosort.run(
				Parameter('--threads', pipeline_config['novosort']['threads']),
				Parameter('--tmpcompression', '6'),
				Parameter('--tmpdir', tmp_dir),
				Parameter(*[bam for bam in bwa_bam_outs]),
				Redirect(stream=Redirect.STDOUT, dest=sortmerged_bam),
				Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'novosort.log'))
			)
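			# novosort merges and sorts the per-pair BAMs in one pass; the merged, sorted BAM is
			# written to stdout and captured by the Redirect above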

			# This creates a dependency on pysam
			# Removes reads with template length < 38 due to steric hindrance
			samtools_index.run(Parameter(sortmerged_bam))
			sortmerged_bam_alignmentfile = pysam.AlignmentFile(sortmerged_bam, 'rb')
			steric_filter_bam_alignmentfile = pysam.AlignmentFile(steric_filter_bam, 'wb',
																	template=sortmerged_bam_alignmentfile)
			
			num_removed=0
			for read in sortmerged_bam_alignmentfile.fetch():
				if abs(int(read.template_length)) >= STERIC_HINDRANCE_CUTOFF:
					steric_filter_bam_alignmentfile.write(read)
				else:
					num_removed += 1
			qc_data['num_read_removed_steric_hinderence']=str(num_removed)
			
			
			sortmerged_bam_alignmentfile.close()
			steric_filter_bam_alignmentfile.close()

			# Mark and remove duplicates with Picard MarkDuplicates
			markduplicates_metrics_filepath = os.path.join(logs_dir, 'mark_dup.metrics')
			picard_mark_dup.run(
				Parameter('INPUT={}'.format(steric_filter_bam)),
				Parameter('OUTPUT={}'.format(duprm_bam)),
				Parameter('TMP_DIR={}'.format(tmp_dir)),
				Parameter('METRICS_FILE={}'.format(markduplicates_metrics_filepath)),
				Parameter('REMOVE_DUPLICATES=true'),
				Parameter('VALIDATION_STRINGENCY=LENIENT'),
				Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'mark_dup.log'))
			)

			# QC: Get percent duplicate reads from the MarkDuplicates metrics
			try:
				with open(markduplicates_metrics_filepath) as markdup_metrics:
					for line in markdup_metrics:
						if line[FIRST_CHAR] == '#':
							continue
						record = line.strip().split('\t')
						if len(record) == 9:
							if re.match(r'\d+', record[7]) is not None:
								qc_data['percent_duplicate_reads'] = record[7]
			except:
				qc_data['percent_duplicate_reads'] = 'Could not open MarkDuplicates metrics'

			# Filter down to uniquely mapped reads
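			# -F 256 drops secondary alignments; -q 10 keeps only reads with MAPQ >= 10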
			samtools_view.run(
				Parameter('-b'),
				Parameter('-F', '256'),
				Parameter('-q', '10'),
				Parameter('-o', unique_bam),
				Parameter(duprm_bam)
			)

			# Get statistics on the uniquely mapped reads (unique_bam is a single file path)
			samtools_flagstat.run(
				Parameter(unique_bam),
				Redirect(stream=Redirect.STDOUT, dest=unique_bam + '.flagstat')
			)

			# QC: Get number of mapped reads from the unique bam
			try:
				with open(unique_bam + '.flagstat') as flagstats:
					unique_flagstats_contents = flagstats.read()
					target_line = re.search(r'(\d+) \+ \d+ mapped', unique_flagstats_contents)
					if target_line is not None:
						qc_data['num_unique_reads_mapped'].append(str(int(target_line.group(1))/2))
					else:
						qc_data['num_unique_reads_mapped'].append('0')
			except:
				qc_data['num_unique_reads_mapped'].append('Could not open flagstats {}'.format(
					unique_bam + '.flagstat'
				))

			# make AlignmentFile object to extract binned reads and chrM reads from the unique bam
			samtools_index.run(Parameter(unique_bam))
			unique_bam_alignmentfile = pysam.AlignmentFile(unique_bam, 'rb')
			# Bins reads into 4 categories depending on template length read is derived from:
			# 50-115 (nucleosome-free), 180-247 (mononucleosome), 315-473 (dinucleosome), 558-615 (trinucleosome)
			nucleosome_free_reads_alignmentfile = pysam.AlignmentFile(nucleosome_free_reads, 'wb',
																	template=unique_bam_alignmentfile)
			mononucleosome_reads_alignmentfile = pysam.AlignmentFile(mononucleosome_reads, 'wb',
																	template=unique_bam_alignmentfile)
			dinucleosome_reads_alignmentfile = pysam.AlignmentFile(dinucleosome_reads, 'wb',
																	template=unique_bam_alignmentfile)
			trinucleosome_reads_alignmentfile = pysam.AlignmentFile(trinucleosome_reads, 'wb',
																	template=unique_bam_alignmentfile)
			
			# Extract chrM into new BAM
			chrM_reads_alignmentfile = pysam.AlignmentFile(chrM_bam, 'wb',
														template=unique_bam_alignmentfile)

			# Binning of nucleosome reads
			for read in unique_bam_alignmentfile.fetch():
				if abs(int(read.template_length)) >= 50 and abs(int(read.template_length)) <= 115:
					nucleosome_free_reads_alignmentfile.write(read)
				elif abs(int(read.template_length)) >= 180 and abs(int(read.template_length)) <= 247:
					mononucleosome_reads_alignmentfile.write(read)
				elif abs(int(read.template_length)) >= 315 and abs(int(read.template_length)) <= 473:
					dinucleosome_reads_alignmentfile.write(read)
				elif abs(int(read.template_length)) >= 558 and abs(int(read.template_length)) <= 615:
					trinucleosome_reads_alignmentfile.write(read)
				else:
					continue

			#stores chrM reads in separate file
			for read in unique_bam_alignmentfile.fetch():
				if read.reference_name == 'chrM':
					chrM_reads_alignmentfile.write(read)
	
			nucleosome_free_reads_alignmentfile.close()
			mononucleosome_reads_alignmentfile.close()
			dinucleosome_reads_alignmentfile.close()
			trinucleosome_reads_alignmentfile.close()
			chrM_reads_alignmentfile.close()
			
			# gets series of flagstats results for non-main files
			samtools_flagstat.run(
					Parameter(nucleosome_free_reads),
					Redirect(stream=Redirect.STDOUT, dest=nucleosome_free_reads + '.flagstat'))

			samtools_flagstat.run(
					Parameter(mononucleosome_reads),
					Redirect(stream=Redirect.STDOUT, dest=mononucleosome_reads + '.flagstat'))

			samtools_flagstat.run(
					Parameter(dinucleosome_reads),
					Redirect(stream=Redirect.STDOUT, dest=dinucleosome_reads + '.flagstat'))

			samtools_flagstat.run(
					Parameter(trinucleosome_reads),
					Redirect(stream=Redirect.STDOUT, dest=trinucleosome_reads + '.flagstat'))

			
			# Get statistics on chrM mapped reads (chrM_bam is a single file path)
			samtools_index.run(Parameter(chrM_bam))
			samtools_flagstat.run(
				Parameter(chrM_bam),
				Redirect(stream=Redirect.STDOUT, dest=chrM_bam + '.flagstat')
			)
			try:
				with open(chrM_bam + '.flagstat') as flagstats:
					chrM_flagstats_contents = flagstats.read()
					target_line = re.search(r'(\d+) \+ \d+ mapped', chrM_flagstats_contents)
					if target_line is not None:
						qc_data['num_mtDNA_reads_mapped'].append(str(int(target_line.group(1))/2))
					else:
						qc_data['num_mtDNA_reads_mapped'].append('0')
			except:
				qc_data['num_mtDNA_reads_mapped'].append('Could not open flagstats {}'.format(
					chrM_bam + '.flagstat'
				))



			# Remove unmapped reads
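			# -F 12 drops reads that are unmapped (0x4) or whose mate is unmapped (0x8)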
			samtools_view.run(
				Parameter('-b'),
				Parameter('-F', '12'),
				Parameter('-o', unmappedrm_bam),
				Parameter(unique_bam)
			)

			# Create BAM index, then remove chrM
			samtools_index.run(
				Parameter(unmappedrm_bam)
			)

			# Remove chrM
			all_chr = [Parameter('chr{}'.format(chromosome)) for chromosome in map(str, range(1, 23)) + ['X', 'Y']]
			samtools_view.run(
				Parameter('-b'),
				Parameter('-o', chrmrm_bam),
				Parameter(unmappedrm_bam),
				*all_chr
			)

			# Stage delete for temporary files
			staging_delete.extend([
				sortmerged_bam,
				sortmerged_bam + '.bai', # BAM index file
				steric_filter_bam,
				unique_bam,
				duprm_bam,
				unmappedrm_bam,
				unmappedrm_bam + '.bai', # BAM index file
				chrmrm_bam
			])

		if step <= 5:
			# Generate filename for final processed BAM and BED
			processed_bam = os.path.join(output_dir, '{}.processed.bam'.format(lib_prefix))
			unshifted_bed = os.path.join(output_dir, '{}.unshifted_bed'.format(lib_prefix))
			processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix))

			# staging_delete.append(unshifted_bed)

			# Generate filename for chrM removed BAM
			chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix))

			# Remove blacklisted genomic regions
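			# -v keeps only reads with no blacklist overlap; -f 0.5 requires at least half of a
			# read to overlap a blacklisted region before the read is considered overlapping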
			bedtools_intersect.run(
				Parameter('-v'),
				Parameter('-abam', chrmrm_bam),
				Parameter('-b', pipeline_config['bedtools']['blacklist-bed']),
				Parameter('-f', '0.5'),
				Redirect(stream=Redirect.STDOUT, dest=processed_bam)
			)

			# QC: Generate insert size metrics PDF
			picard_insert_metrics.run(
				Parameter('INPUT={}'.format(processed_bam)),
				Parameter('OUTPUT={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.metrics'))),
				Parameter('HISTOGRAM_FILE={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.pdf')))
			)

			# Generate index for processed BAM
			samtools_index.run(
				Parameter(processed_bam)
			)

			# Convert BAM to BED
			bedtools_bamtobed.run(
				Parameter('-i', processed_bam),
				Redirect(stream=Redirect.STDOUT, dest=unshifted_bed)
			)

			staging_delete.append(unshifted_bed)

			# Shift + strand reads by +4 and - strand reads by -5, per the ATAC-seq paper
			# This used to be done with bedtools shift; it is now handled by the in-house shift_reads helper
			self.shift_reads(
				input_bed_filepath=unshifted_bed,
				output_bed_filepath=processed_bed,
				log_filepath=os.path.join(logs_dir, 'shift_reads.logs'),
				genome_sizes_filepath=pipeline_config['bedtools']['genome-sizes'],
				minus_strand_shift=MINUS_STRAND_SHIFT,
				plus_strand_shift=PLUS_STRAND_SHIFT
			)
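			# A hypothetical example of the shift (assuming shift_reads applies the strand-specific
			# offsets to each BED interval and clamps coordinates using the genome sizes file):
			#   chr1  1000  1050  read1  0  +   ->   chr1  1004  1054  read1  0  +
			#   chr1  2000  2050  read2  0  -   ->   chr1  1995  2045  read2  0  -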

		# Peak calling with MACS2
		if step <= 6:
			# Regular (narrow) peak calling with the default q-value of 0.01
			processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix))
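			# With --nomodel, '--shift -100 --extsize 200' centers a 200 bp window on each read's
			# 5' cut site, a common MACS2 setting for ATAC-seq data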
			macs2_callpeak.run(
				Parameter('-t', processed_bed),
				Parameter('-f', 'BED'),
				Parameter('-g', 'hs'),
				Parameter('-n', str(processed_bed) + '_regular_peak_calls'),
				Parameter('--nomodel'),
				Parameter('--extsize', '200'),
				Parameter('--shift', '-100'),
				Parameter('-B', '--SPMR'), # Generates pileup tracks, bedgraph, fragment pileup per million reads
				Parameter('--call-summits'),
				Parameter('--keep-dup', 'all')
			)

			# Broad peak calling with q-value 0.05, as suggested by MACS2 for broad peaks
			macs2_callpeak.run(
				Parameter('-t', processed_bed),
				Parameter('-f', 'BED'),
				Parameter('-g', 'hs'),
				Parameter('-n', str(processed_bed) + '_broad_peak_calls'),
				Parameter('-q', '0.05'),
				Parameter('--nomodel'),
				Parameter('--extsize', '200'),
				Parameter('--shift', '-100'),
				Parameter('--broad'),
				Parameter('--keep-dup', 'all')
			)

		# QC: Output QC data to file
		with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file:
			qc_data_file.write(str(qc_data) + '\n')

		# Delete temporary files
		for delete_file in staging_delete:
			subprocess.call(['rm', '-rf', delete_file])
Exemplo n.º 11
    def run_pipeline(self, pipeline_args, pipeline_config):
        # Instantiate options
        reads = pipeline_args['reads']
        output_dir = pipeline_args['output']
        logs_dir = os.path.join(output_dir, 'logs')
        lib_prefix = pipeline_args['lib']
        step = pipeline_args['step']
        forward_adapter = pipeline_args['forward_adapter']
        reverse_adapter = pipeline_args['reverse_adapter']
        run_is_stranded = pipeline_args['is_stranded']

        # Determine if run is paired-end from input
        run_is_paired_end = len(reads[FIRST_READS_PAIR].split(':')) > 1

        # Create output, tmp, and logs directories
        subprocess.call(['mkdir', '-p', output_dir,
                         logs_dir, os.path.join(output_dir, 'tmp')])

        # Timing functions for getting running time
        start_time = datetime.now()

        # Gather QC data
        qc_data = {
            'total_raw_reads_counts': [],
            'trimmed_reads_counts': [],
            'num_reads_mapped': '0',
            'running_time_seconds': '',
            'running_time_readable': ''
        }

        # Keep list of items to delete
        staging_delete = [os.path.join(output_dir, 'tmp')]

        # Establish software instances
        cat = Software('cat', '/bin/cat')
        cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
        star = Software('STAR', pipeline_config['STAR']['path'])
        rsem_calculate_expression = Software('RSEM', pipeline_config['RSEM']['path-calculate-expression'])
        rsem_plot_model = Software('RSEM', pipeline_config['RSEM']['path-plot-model'])
        bedGraph_to_bw = Software('bedGraphToBigWig', pipeline_config['bedgraph_to_bw']['path'])
        bed_sort = Software('bedSort', pipeline_config['bedSort']['path'])
        samtools_flagstat = Software('samtools flagstat', pipeline_config['samtools']['path'] + ' flagstat')

        # Step 1: If more than one reads pairs are provided, combine them
        if step <= 1 and len(reads) >= 2:
            if run_is_paired_end:
                # Aggregate read1s and read2s
                read1s, read2s = [], []
                for reads_set in reads:
                    read1, read2 = reads_set.split(':')
                    read1s.append(read1)
                    read2s.append(read2)

                # Combine reads groups
                combined_reads = []
                for name, reads_group in [('read1', read1s), ('read2', read2s)]:
                    combined_read_filename = os.path.join(output_dir, '{}.combined.{}.fastq.gz'.format(lib_prefix, name))
                    combined_reads.append(combined_read_filename)
                    staging_delete.append(combined_read_filename)
                    cat.run(
                        Parameter(*[read for read in reads_group]),
                        Redirect(stream=Redirect.STDOUT, dest=combined_read_filename)
                    )

                # Update reads list
                reads = [':'.join(combined_reads)]
            else:
                # Combine reads
                combined_read_filename = os.path.join(output_dir, '{}.combined.fastq.gz'.format(lib_prefix))
                staging_delete.append(combined_read_filename)
                cat.run(
                    Parameter(*[read for read in reads]),
                    Redirect(stream=Redirect.STDOUT, dest=combined_read_filename)
                )

                # Update reads list
                reads = [combined_read_filename]

        # Step 2: Trim adapters with cutadapt
        if step <= 2:
            reads_set = reads[FIRST_READS_PAIR]
            if run_is_paired_end:
                # Get paired-end reads, construct new filenames
                read1, read2 = reads_set.split(':')

                # QC: Get raw fastq read counts
                qc_data['total_raw_reads_counts'].extend([
                    str(int(self.count_gzipped_lines(read1))/4),
                    str(int(self.count_gzipped_lines(read2))/4)
                ])

                trimmed_read1_filename = os.path.join(output_dir, lib_prefix + '_read1.trimmed.fastq.gz')
                trimmed_read2_filename = os.path.join(output_dir, lib_prefix + '_read2.trimmed.fastq.gz')

                staging_delete.append(trimmed_read1_filename)
                staging_delete.append(trimmed_read2_filename)

                # Run cutadapt
                cutadapt.run(
                    Parameter('--quality-base={}'.format(pipeline_config['cutadapt']['quality-base'])),
                    Parameter('--minimum-length=5'),
                    Parameter('--output={}'.format(trimmed_read1_filename)),
                    Parameter('--paired-output={}'.format(trimmed_read2_filename)),
                    Parameter('-a', forward_adapter),
                    Parameter('-A', reverse_adapter),
                    Parameter('-q', '30'),
                    Parameter(read1),
                    Parameter(read2),
                    Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary.log'))
                )

                # QC: Get trimmed fastq read counts
                qc_data['trimmed_reads_counts'].extend([
                    str(int(self.count_gzipped_lines(trimmed_read1_filename))/4),
                    str(int(self.count_gzipped_lines(trimmed_read2_filename))/4)
                ])

                # Update reads list
                reads = ':'.join([trimmed_read1_filename, trimmed_read2_filename])

            else:
                # QC: Get raw fastq read count
                qc_data['total_raw_reads_counts'].append(
                    str(int(self.count_gzipped_lines(
                        os.path.join(output_dir, '{}.combined.fastq.gz'.format(lib_prefix))
                    ))/4)
                )

                # Construct new filename
                trimmed_read_filename = os.path.join(output_dir, lib_prefix + '.trimmed.fastq.gz')

                staging_delete.append(trimmed_read_filename)

                # Run cutadapt
                cutadapt.run(
                    Parameter('--quality-base={}'.format(pipeline_config['cutadapt']['quality-base'])),
                    Parameter('--minimum-length=5'),
                    Parameter('--output={}'.format(trimmed_read_filename)),
                    Parameter('-a', forward_adapter),
                    Parameter('-q', '30'),
                    Parameter(reads[FIRST_READS_PAIR]),
                    Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary'))
                )

                # QC: Get trimmed fastq read count
                qc_data['trimmed_reads_counts'].append(
                    str(int(self.count_gzipped_lines(trimmed_read_filename))/4)
                )

                # Update reads list
                reads = [trimmed_read_filename]

        # Step 3: Alignment
        if step <= 3:
            # Gets reads for paired-end and single-end
            if run_is_paired_end:
                read1, read2 = reads.split(':')
            else:
                read1 = reads[FIRST_READS_PAIR]
                read2 = ''

            # Set up STAR parameters
            star_outfile_prefix = os.path.join(output_dir,
                                               lib_prefix + ('.' if lib_prefix[-1] != '.' else ''))
            star_common = [
                Parameter('--outFileNamePrefix', star_outfile_prefix),
                Parameter('--genomeDir', pipeline_config['STAR']['genome-dir']),
                Parameter('--readFilesIn', read1, read2),
                Parameter('--readFilesCommand', 'zcat'),
                Parameter('--outFilterType', 'BySJout'),
                Parameter('--outFilterMultimapNmax', '20'),
                Parameter('--alignSJoverhangMin', '8'),
                Parameter('--alignSJDBoverhangMin', '1'),
                Parameter('--outFilterMismatchNmax', '999'),
                Parameter('--alignIntronMin', '20'),
                Parameter('--alignIntronMax', '1000000'),
                Parameter('--alignMatesGapMax', '1000000'),
                Parameter('--outSAMunmapped', 'Within'),
                Parameter('--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD'),
                Parameter('--outFilterMismatchNoverReadLmax', '0.04'),
                Parameter('--sjdbScore', '1')
            ]
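            # The filtering and junction settings above follow the ENCODE long RNA-seq STAR options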

            star_run = [
                Parameter('--runThreadN', pipeline_config['STAR']['threads']),
                #Parameter('--genomeLoad', 'LoadAndKeep'),
                #Parameter('--limitBAMsortRAM', '10000000000')
            ]

            star_bam = [
                Parameter('--outSAMtype', 'BAM', 'SortedByCoordinate'),
                Parameter('--quantMode', 'TranscriptomeSAM')
            ]

            star_strand, star_wig = [], []

            # STAR strandedness parameters
            if run_is_stranded:
                star_wig.append(Parameter('--outWigStrand', 'Stranded'))
            else:
                star_strand.append(Parameter('--outSAMstrandField', 'intronMotif'))
                star_wig.append(Parameter('--outWigStrand', 'Unstranded'))

            # TODO Encode has SAM Header metadata here, but I'm going to skip it for now
            star_meta = []

            # Run STAR alignment step
            star.run(*(star_common + star_run + star_bam + star_strand + star_meta))

            # Store STAR output files
            star_output_bam = star_outfile_prefix + 'Aligned.sortedByCoord.out.bam'

            # QC: Get samtools flagstat
            samtools_flagstat.run(
                Parameter(star_output_bam),
                Redirect(stream=Redirect.STDOUT, dest=star_output_bam + '.flagstat')
            )

            # QC: Get number of mapped reads from this BAM
            with open(star_output_bam + '.flagstat') as flagstats:
                flagstats_contents = flagstats.read()
                target_line = re.search(r'(\d+) \+ \d+ mapped', flagstats_contents)
                if target_line is not None:
                    qc_data['num_reads_mapped'] = str(int(target_line.group(1))/2)

            # Generate bedGraph
            signal_output_dir = os.path.join(output_dir, 'signal')
            subprocess.call(['mkdir', '-p', signal_output_dir])
            signal_output_prefix = os.path.join(signal_output_dir,
                                                lib_prefix + ('.' if lib_prefix[-1] != '.' else ''))

            # Run STAR for signal generation
            star.run(
                Parameter('--runMode', 'inputAlignmentsFromBAM'),
                Parameter('--inputBAMfile', star_output_bam),
                Parameter('--outWigType', 'bedGraph'),
                Parameter('--outFileNamePrefix', signal_output_prefix),
                Parameter('--outWigReferencesPrefix', 'chr'),
                *star_wig
            )

            # Convert bedGraph to bigWig
            chrNL_txt = os.path.join(output_dir, 'chrNL.txt')
            with open(chrNL_txt, 'w') as chrNL_filehandle:
                subprocess.call(['grep', '^chr',
                                 os.path.join(pipeline_config['STAR']['genome-dir'], 'chrNameLength.txt')
                                 ],
                                stdout=chrNL_filehandle)
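            # chrNL.txt keeps only the 'chr*' name/length lines from STAR's chrNameLength.txt and
            # serves as the chromosome sizes file for bedGraphToBigWig below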

            # Generate temporary signal file path
            sig_tmp = os.path.join(output_dir, 'sig.tmp')
            staging_delete.append(sig_tmp)
            if run_is_stranded:
                strand = [None, '-', '+']
                for i_strand in [1, 2]:
                    for i_mult in ['Unique', 'UniqueMultiple']:
                        # Get signal file for this iteration
                        signal_file = '{}Signal.{}.str{}.out.bg'.format(signal_output_prefix, i_mult, str(i_strand))
                        # Write to temporary signal file
                        with open(sig_tmp, 'w') as sig_tmp_filehandle:
                            subprocess.call(['grep', '^chr', signal_file],
                                            stdout=sig_tmp_filehandle)
                        # Sort sig.tmp with bedSort
                        bed_sort.run(
                            Parameter(sig_tmp),
                            Parameter(sig_tmp)
                        )
                        # Run bedGraph to bigWig conversion
                        bedGraph_to_bw.run(
                            Parameter(sig_tmp),
                            Parameter(chrNL_txt),
                            Parameter('{}Signal.{}.strand{}.bw'.format(
                                signal_output_prefix, i_mult, strand[i_strand]
                            ))
                        )
            else:
                for i_mult in ['Unique', 'UniqueMultiple']:
                    # Get signal file for this iteration
                    signal_file = '{}Signal.{}.str1.out.bg'.format(signal_output_prefix, i_mult)
                    # Write to temporary signal file
                    with open(sig_tmp, 'w') as sig_tmp_filehandle:
                        subprocess.call(['grep', '^chr', signal_file],
                                        stdout=sig_tmp_filehandle)
                    # Sort sig.tmp with bedSort
                    bed_sort.run(
                        Parameter(sig_tmp),
                        Parameter(sig_tmp)
                    )
                    # Run bedGraph to bigWig conversion
                    bedGraph_to_bw.run(
                        Parameter(sig_tmp),
                        Parameter(chrNL_txt),
                        Parameter('{}Signal.{}.unstranded.bw'.format(signal_output_prefix, i_mult))
                    )

        # Step 4: Sort transcriptome BAM to ensure order of reads to make RSEM output deterministic
        if step <= 4:
            # Set BAM file paths, mv transcriptome BAM to temporary name
            star_outfile_prefix = os.path.join(output_dir,
                                               lib_prefix + ('.' if lib_prefix[-1] != '.' else ''))
            transcriptome_bam = star_outfile_prefix + 'Aligned.toTranscriptome.out.bam'
            tr_bam = star_outfile_prefix + 'Tr.bam'
            staging_delete.append(tr_bam)
            subprocess.call(['mv', transcriptome_bam, tr_bam])

            # Template command
            merge_cmd = 'cat <({input1}) <({input2}) | {compress} > {output}'
            input1_cmd = '{samtools} view -H {bam}'
            compress_cmd = 'samtools view -@ {threads} -bS -'

            if run_is_paired_end:
                input2_cmd = ('{samtools} view -@ {threads} {bam} | ' +
                              'awk \'{{printf "%s", $0 " "; getline; print}}\' | ' +
                              'sort -S {ram} -T {tmpdir} | ' +
                              'tr \' \' \'\\n\'')
            else:
                input2_cmd = ('{samtools} view -@ {threads} {bam} | ' +
                              'sort -S {ram} -T {tmpdir}')
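            # For paired-end data, the awk step joins each mate pair onto a single line before
            # sorting so mates stay adjacent, then tr splits them back onto separate lines; this
            # makes the read order (and thus RSEM's output) deterministic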

            print merge_cmd.format(
                input1=input1_cmd.format(
                    samtools=pipeline_config['samtools']['path'],
                    bam=tr_bam
                ),
                input2=input2_cmd.format(
                    samtools=pipeline_config['samtools']['path'],
                    threads=pipeline_config['RSEM']['threads'],
                    bam=tr_bam,
                    ram=pipeline_config['sort']['memory'],
                    tmpdir=os.path.join(output_dir, 'tmp')
                ),
                compress=compress_cmd.format(
                    threads=pipeline_config['RSEM']['threads']
                ),
                output=transcriptome_bam
            )

            subprocess.call(merge_cmd.format(
                input1=input1_cmd.format(
                    samtools=pipeline_config['samtools']['path'],
                    bam=tr_bam
                ),
                input2=input2_cmd.format(
                    samtools=pipeline_config['samtools']['path'],
                    threads=pipeline_config['RSEM']['threads'],
                    bam=tr_bam,
                    ram=pipeline_config['sort']['memory'],
                    tmpdir=os.path.join(output_dir, 'tmp')
                ),
                compress=compress_cmd.format(
                    threads=pipeline_config['RSEM']['threads']
                ),
                output=transcriptome_bam
            ), shell=True, executable='/bin/bash')

            subprocess.call(['rm', tr_bam])

        # Step 5: Run RSEM to get quantification
        if step <= 5:
            star_outfile_prefix = os.path.join(output_dir,
                                               lib_prefix + ('.' if lib_prefix[-1] != '.' else ''))
            transcriptome_bam = star_outfile_prefix + 'Aligned.toTranscriptome.out.bam'

            # Set up RSEM parameters
            rsem_common = [
                Parameter('--bam'),
                Parameter('--estimate-rspd'),
                Parameter('--calc-ci'),
                Parameter('--no-bam-output'),
                Parameter('--seed', '12345')
            ]

            rsem_run = [
                Parameter('-p', pipeline_config['RSEM']['threads']),
                Parameter('--ci-memory', pipeline_config['RSEM']['memory'])
            ]

            rsem_type = []
            if run_is_paired_end:
                rsem_type.append(Parameter('--paired-end'))
            if run_is_stranded:
                rsem_type.append(Parameter('--forward-prob', '0'))
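                # --forward-prob 0 tells RSEM that all reads come from the reverse strand,
                # as expected for dUTP-based stranded libraries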

            # Run RSEM quantification step
            rsem_calculate_expression.run(*(rsem_common + rsem_run + rsem_type + [
                Parameter(transcriptome_bam),
                Parameter(pipeline_config['RSEM']['reference-dir']),
                Parameter(os.path.join(output_dir, 'RSEM_Quant')),
                Redirect(Redirect.BOTH, dest=os.path.join(logs_dir, 'Log.rsem'))
            ]))

            # Generate RSEM plot model
            rsem_plot_model.run(
                Parameter(os.path.join(output_dir, 'RSEM_Quant'), os.path.join(output_dir, 'Quant.pdf'))
            )

        # QC: Get time delta
        elapsed_time = datetime.now() - start_time
        qc_data['running_time_seconds'] = str(elapsed_time.seconds)
        qc_data['running_time_readable'] = str(elapsed_time)

        # QC: Output QC data to file
        with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file:
            qc_data_file.write(json.dumps(qc_data, indent=4) + '\n')

        # Delete temporary files
        for delete_file in staging_delete:
            subprocess.call(['rm', '-rf', delete_file])
Exemplo n.º 12
    def add_pipeline_args(self, parser):
        parser.add_argument(
            '--fastq:lib',
            required=True,
            nargs='*',
            help='Fastq input for pipeline:library name(prefix for files)')
        parser.add_argument('--output',
                            required=True,
                            help='Where pipeline output should go')
        parser.add_argument('--adapter',
                            default='AGATCGGAAGAGCACACGTCT',
                            help='Adapter sequence for trimming')
        parser.add_argument(
            '--threads',
            default=defaultThreads,
            help='Threads to be used for multi-threaded programs. Default is 8'
        )

        # chunky run RiboSeq_pipe.py --fastqs
        #  /mnt/cinder/thomas/RiboSeq/Lane5/AWS-3_S3_L005_R1_001.fastq.gz
        #  --output /mnt/cinder/thomas/RiboSeq/test --threads

    def run_pipeline(self, pipeline_args, pipeline_config):
        # Create variables from parser arguments
        fastqFiles = pipeline_args['fastq:lib']
        outputDir = pipeline_args['output']
        adapter = pipeline_args['adapter']
        numThreads = pipeline_args['threads']

        # Create output directory
        subprocess.call(['mkdir', outputDir])

        # Software
        cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
        star = Software('STAR', pipeline_config['STAR']['path'])
        bedtools = Software('bedtools', pipeline_config['bedtools']['path'])
        bowtie2 = Software('bowtie2', pipeline_config['bowtie2']['path'])
        samtools = Software('samtools', pipeline_config['samtools']['path'])
        samtools_sort = Software('samtools sort',
                                 pipeline_config['samtools']['path'] + ' sort')
        read_distribution = Software(
            'read_distribution.py',
            pipeline_config['read_distribution']['path'])
        featureCounts = Software('featureCounts',
                                 pipeline_config['featureCounts']['path'])
        fastQC = Software('FastQC', pipeline_config['FastQC']['path'])
        picard = Software('picard', pipeline_config['picard']['path'])

        # Change these to just be done in python script?

        # Common software tools
        awk = Software('awk', 'awk')
        sort = Software('sort', 'sort')
        uniq = Software('uniq', 'uniq')
        paste = Software('paste', 'paste')
        cat = Software('cat', 'cat')
        grep = Software('grep', 'grep')

        # Directories and Files
        pathToGenomeDir = pipeline_config['STAR']['genomeDir']
        pathToGenome = pipeline_config['bowtie2']['genome_ref']
        pathToGtf = pipeline_config['STAR']['GTF_ref']
        pathTo_hg19_bed = pipeline_config['read_distribution']['hg19_bed']
        pathTo_hg19_bed_start100 = pipeline_config['bedtools']['hg19_start100']
        pathTo_grch37_bed = pipeline_config['bedtools']['grch37_bed']
        pathTo_genomeFasta = pipeline_config['picard']['genomeFasta']
        pathTo_ref_flat = pipeline_config['picard']['refFlat']
        '''

      remove adaptor and trim
      adaptor sequence: AGATCGGAAGAGCACACGTCT
      -m 25 discard any reads shorter than 25 nucleotides
      keep only reads that had the adaptor sequence --discard-untrimmed

      cutadapt -a AGATCGGAAGAGCACACGTCT -m 25 --discard-untrimmed {filename}.fastq.gz
       > {filename}_trimmed.fastq.gz 2> {filename}_report.txt
      
      Remove adapters
      Only keep reads with adapters, otherwise artifact
      Discard reads shorter than 25 bp
      
    '''

        # Keep track of Bids in pipeline

        bid_list = []
        for fastqlib in fastqFiles:
            bid_list.append(fastqlib.split(':')[-1])

        # Cutadapt

        for fastqlib in fastqFiles:
            fastq, bid = fastqlib.split(':')
            newDir = new_dir(outputDir, bid)
            # Make new directories to store data
            subprocess.call(['mkdir', newDir])

            # consider multi-threading by splitting in multiple files and then combining

            cutadapt.run(
                Parameter('--quality-base=33'),
                Parameter('--minimum-length=25'),
                Parameter('--discard-untrimmed'),
                Parameter('--output={}/{}.trimmed.fastq.gz'.format(
                    newDir, bid)),
                # Parameter('-a', forward_adapter if forward_adapter else 'AGATCGGAAGAGCACACGTCT'),
                Parameter('-a', adapter),
                Parameter(fastq),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(
                             newDir, '{}.cutadapt.summary.log'.format(bid))))
        ''' 
    Bowtie2
    
    bowtie2 --seedlen=23 --un-fq=${filename}_filtered.fq -x $genome -U $file
     -S | samtools view -Sb - > ${filename}.rts.bam

    Remove snoRNA, rRNA, tRNA, keep only mRna for alignment

    '''

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            bowtie2.run(
                Parameter('--seedlen=23'),
                Parameter('--threads', numThreads),
                Parameter('--un-gz {}/{}_filtered.fq.gz'.format(newDir, bid)),
                Parameter('-x', pathToGenome),  # Path to rtsRNA_seqs files
                Parameter('-U', '{}/{}.trimmed.fastq.gz'.format(newDir, bid)),
                Parameter('-S'),
                Parameter('{}/{}.rts.sam'.format(newDir, bid)),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(newDir,
                                           '{}.bowtie2.log'.format(bid))),
                Redirect(stream=Redirect.STDERR,
                         dest=os.path.join(newDir,
                                           '{}.bowtie2.log2'.format(bid))),
                shell=True  # Look into changing     
            )

            # This doesn't work

            samtools.run(
                Parameter('view'),
                Parameter('-Sb'),
                Parameter('{}/{}.rts.sam'.format(newDir, bid)),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(newDir, '{}.rts.bam'.format(bid))),
            )
        '''
    Star 
      STAR --runThreadN 6 --sjdbGTFfile gtfFile --outSAMtype  BAM Unsorted 
        --outFileNamePrefix {filename}_ --genomeDir /path/to/genome/index 
        --genomeFastaFiles --readFilesIn 
        {filename}_filtered.fq.gz --readFilesCommand zcat

    Basically RNAseq at this point

    Align the kept reads from bowtie to the genome
    '''

        # Only load the genome one time: genomeLoad = 'LoadAndKeep'.....Doesn't really work

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            # remove genome from memory on last run
            # genomeLoad = 'LoadAndRemove'
            star.run(
                Parameter(
                    '--runThreadN',
                    numThreads),  # Change to command line parameter --threads
                Parameter('--sjdbGTFfile', pathToGtf),
                Parameter('--outSAMtype', 'BAM', 'Unsorted'),
                Parameter('--outFileNamePrefix', '{}/{}_'.format(newDir, bid)),
                Parameter('--genomeDir', pathToGenomeDir),
                # Parameter('--genomeLoad', genomeLoad), broken
                Parameter('--readFilesIn',
                          '{}/{}_filtered.fq.gz'.format(newDir, bid)),
                Parameter('--readFilesCommand zcat')  # reads gzipped files
            )
        '''
      Sort and extract uniquely mapped reads for QC and further analyses
        samtools view -H $file > header.sam
        samtools view $file | grep -w NH:i:1 | cat header.sam - | samtools view -bS - | samtools sort - ${filename}_uniq_sorted
        rm header.sam

      Using this file for the rest of the analysis
    '''

        for bid in bid_list:

            newDir = new_dir(outputDir, bid)
            samtools.run(
                Parameter('view'),
                Parameter('-H'),
                Parameter('{}/{}_Aligned.out.bam'.format(
                    newDir, bid)),  # star outfile name
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(newDir,
                                           '{}.header.sam'.format(bid))))
            samtools.run(
                Parameter('view'),
                Parameter('{}/{}_Aligned.out.bam'.format(
                    newDir, bid)),  # star outfile name
                Pipe(
                    grep.pipe(
                        Parameter('-w'), Parameter('NH:i:1'),
                        Pipe(
                            cat.pipe(
                                Parameter(
                                    os.path.join(newDir,
                                                 '{}.header.sam'.format(bid)),
                                    '-'),
                                Pipe(
                                    samtools.pipe(
                                        Parameter('view'),
                                        Parameter('-bS', '-'),
                                        Pipe(
                                            samtools.pipe(
                                                Parameter('sort'),
                                                Parameter(
                                                    '-', '-o',
                                                    '{}/{}.uniq_sorted.bam'.
                                                    format(newDir,
                                                           bid)))))))))))
            # subprocess.call(['rm', '{}/{}.header.sam'.format(newDir, bid)])
        '''
      rSeQC to evaluate percent reads mapped to each genomic features
        read_distribution.py -r hg19_RefSeq.bed12 -i $file
    '''

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            read_distribution.run(
                Parameter('-r'),
                Parameter(pathTo_hg19_bed),
                Parameter('-i'),
                Parameter('{}/{}.uniq_sorted.bam'.format(newDir, bid)),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(
                             newDir, '{}.read_distribution.log'.format(bid))),
                shell=True)
        '''
      codon periodicity
        annotation=/glusterfs/users/ashieh/annotations/hg19_ccds_exons_plus_start100.bed

        bedtools intersect -a {annotation} -b {uniq.bam} -s -bed -wa -wb > intersect_start100
        awk -v OFS='\t' '{print ($2-($14+100))}' ${filename}_intersect_start100.bed
         | sort | uniq -c > ${filename}_relative_pos_aggregate.table
    '''

        # bedtools intersect -a {annotation} -b {uniq.bam} -s -bed -wa -wb > intersect_start100

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            bedtools.run(
                Parameter('intersect'),
                Parameter('-a {}'.format(pathTo_hg19_bed_start100)),
                Parameter('-b {}/{}.uniq_sorted.bam'.format(newDir, bid)),
                Parameter('-s'),
                Parameter('-bed'),
                Parameter('-wa'),
                Parameter('-wb'),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(
                             newDir, '{}.intersect_start100.bed'.format(bid))),
                shell=True)
            start100_file = open(
                '{}/{}.intersect_start100.bed'.format(newDir, bid), 'rb')
            relativePos_file = open(
                '{}/{}_relative_pos_aggregate.table'.format(newDir, bid), 'wb')
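            # Assuming the annotation BED (-a) has 6 columns, fields 0-5 are the annotation entry
            # and field 7 is the start coordinate of the intersecting read (-wb output)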
            distanceList = []
            for line in start100_file:
                splitLine = line.split('\t')
                # Really is relative start
                if len(splitLine) >= 8:
                    distance = int(splitLine[7]) - (int(splitLine[1]) + 100)
                    distanceList.append(distance)
            distanceList.sort()
            distanceCounting = Counter(distanceList)
            for key, value in distanceCounting.iteritems():
                relativePos_file.write("{}\t{}\n".format(value, key))

        # Create chart of relative_positions_aggregate to see codon periodicity
        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            rpaFile = open(
                '{dir}/{bid}_relative_pos_aggregate.table'.format(dir=newDir,
                                                                  bid=bid),
                'rb')
            myDict = {}

            for i in range(-30, 31):
                myDict[i] = 0

            for line in rpaFile:
                frequency, start = line.strip().split('\t')
                if -30 <= int(start) <= 30:
                    myDict[int(start)] = int(frequency)

            # Change to log scaling?

            freqs = []
            starts = []
            for i in range(-30, 31):
                starts.append(i)
                freqs.append(myDict[i])

            # print freqs

            fig, ax = plt.subplots()
            # plt.set_title('{} codon periodicity'.format(bid))
            plt.xlabel("-30 to 30 relative position")
            plt.ylabel("Frequency")
            plt.bar(starts, freqs)
            fig.savefig('{dir}/{bid}_codon_periodicity_plot.png'.format(
                dir=newDir, bid=bid))
        '''
    Picard tools

    java -jar picard.jar CollectMultipleMetrics 
    I=2017-221.uniq_sorted.bam 
    O= multiple_metrics 
    R=GRCh37.p13.genome.fa

    java -jar picard.jar CollectGcBiasMetrics
    I= .uniq
    O=gc_bias_metrics.txt 
    CHART=gc_bias_metrics.pdf 
    S=summary_metrics.txt 
    R=reference_sequence.fasta

    java -jar picard.jar CollectRnaSeqMetrics
    I=input.bam 
    O=output.RNA_Metrics 
    REF_FLAT=ref_flat.txt 
    STRAND=FIRST_READ_TRANSCRIPTION_STRAND

    java -jar picard.jar MarkDuplicates
    I=input.bam 
    O=marked_duplicates.bam 
    M=marked_dup_metrics.txt
    ASSUME_SORTED=true
    '''

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            # Uniquely mapped, sorted BAM from the earlier sort/extract step; used as Picard input
            bam = '{}/{}.uniq_sorted.bam'.format(newDir, bid)

            picard.run(
                Parameter('CollectMultipleMetrics'),
                Parameter('I={}'.format(bam)),  # input
                Parameter('O={}/{}.multiple_metrics'.format(newDir,
                                                            bid)),  # output
                Parameter('R={}'.format(pathTo_genomeFasta))  # genomeReference
            )

            picard.run(
                Parameter('CollectGcBiasMetrics'),
                Parameter('I={}'.format(bam)),  # input
                Parameter('O={}/{}.gc_bias_metrics'.format(newDir,
                                                           bid)),  # output
                Parameter('CHART={}/{}.gc_bias_metrics.pdf'.format(
                    newDir, bid)),  # chart
                Parameter('S={}/{}.summary_metrics'.format(
                    newDir, bid)),  # summary metrics
                Parameter(
                    'R={}'.format(pathTo_genomeFasta))  # genome reference
            )

            picard.run(
                Parameter('CollectRnaSeqMetrics'),
                Parameter('I={}'.format(bam)),  # input
                Parameter('O={}/{}.RNA_Metrics'.format(newDir, bid)),  # output
                Parameter('REF_FLAT={}'.format(
                    '{}'.format(pathTo_ref_flat))),  # ref_flat
                Parameter(
                    'STRAND=FIRST_READ_TRANSCRIPTION_STRAND')  # strandedness
            )

            picard.run(
                Parameter('MarkDuplicates'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir,
                                                           bid)),  # input
                Parameter('O={}/{}.marked_duplicates.bam'.format(
                    newDir, bid)),  # output
                Parameter('M={}/{}.marked_dup_metrics.txt'.format(
                    newDir, bid)),  # marked dup metrics
                Parameter('ASSUME_SORTED=true')  # It is sorted
            )
        '''
    subread: featureCounts

      featureCounts -a /path_to_gtf/gencode.v19.annotation.gtf -o <bid>.featureCounts <bid>.uniq_sorted.bam
    '''

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            featureCounts.run(
                Parameter('-a', '{}'.format(pathToGtf)),  # gtf
                Parameter('-s', '1'),  # strand-specific read counting 
                Parameter('-o', '{}/{}.featureCounts'.format(newDir,
                                                             bid)),  # output
                Parameter('{}/{}.uniq_sorted.bam'.format(newDir, bid))  # input
            )
        '''
    FastQC

      fastqc --outdir=/path_to/<bid>/ /path_to_fastq/<bid>.fastq.gz
    '''

        for fastqlib in fastqFiles:
            fastq, bid = fastqlib.split(':')
            newDir = new_dir(outputDir, bid)
            fastQC.run(
                Parameter('--outdir={}'.format(newDir)),  # output
                Parameter('--threads', numThreads),  # FastQC takes -t/--threads
                Parameter(fastq)  # input
            )
	def run_pipeline(self, pipeline_args, pipeline_config):
		# Instantiate variable from argparse
		read_pairs = pipeline_args['reads']
		output_dir = os.path.abspath(pipeline_args['output'])
		logs_dir = os.path.join(output_dir, 'logs')
		lib_prefix = pipeline_args['lib']
		step = int(pipeline_args['step'])
		forward_adapter = pipeline_args['forward_adapter']
		reverse_adapter = pipeline_args['reverse_adapter']

		# Create output, tmp, and logs directories
		tmp_dir = os.path.join(output_dir, 'tmp')
		subprocess.call(['mkdir', '-p', output_dir, tmp_dir, logs_dir])

		#Keep list of items to delete
		staging_delete = [tmp_dir]
		bwa_bam_outs = []
		qc_data = {
			'total_raw_reads_counts': [],
			'trimmed_reads_counts': [],
			'num_reads_mapped': [],
			'num_read_removed_steric_hinderence': '0',
			'percent_duplicate_reads': '0',
			'num_unique_reads_mapped': [], #implemented
			'num_mtDNA_reads_mapped': [],
			'percent_mtDNA_reads_mapped': '0' ,
			'num_reads_mapped_after_filtering': '-1', #TODO This isn't implemented
			'num_peaks_called': '-1',
			#TODO Get number of peaks in annotation sites
		}

		# Instantiate software instances
		cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
		fastqc = Software('FastQC', pipeline_config['fastqc']['path'])
		bwa_aln = Software('BWA aln', pipeline_config['bwa']['path'] + ' aln')
		bwa_sampe = Software('BWA sampe', pipeline_config['bwa']['path'] + ' sampe')
		samtools_view = Software('samtools view', pipeline_config['samtools']['path'] + ' view')
		samtools_flagstat = Software('samtools flagstat', pipeline_config['samtools']['path'] + ' flagstat')
		samtools_index = Software('samtools index', pipeline_config['samtools']['path'] + ' index')
		samtools_sort = Software('samtools sort', pipeline_config['samtools']['path'] + ' sort')
		novosort = Software('novosort', pipeline_config['novosort']['path'])
		picard_mark_dup = Software('Picard MarkDuplicates', pipeline_config['picard']['path'] + ' MarkDuplicates')
		picard_insert_metrics = Software('Picard CollectInsertSizeMetrics', pipeline_config['picard']['path'] + ' CollectInsertSizeMetrics')
		bedtools_bamtobed = Software('bedtools bamtobed', pipeline_config['bedtools']['path'] + ' bamtobed')
		bedtools_sort = Software('bedtools sort', pipeline_config['bedtools']['path'] + ' sort')
		bedtools_merge = Software('bedtools merge', pipeline_config['bedtools']['path'] + ' merge')
		bedtools_intersect = Software('bedtools intersect', pipeline_config['bedtools']['path'] + ' intersect')
		macs2_callpeak = Software('macs2 callpeak', pipeline_config['macs2']['path'] + ' callpeak')

		if step <= 1:
			for i, read_pair in enumerate(read_pairs):
				read1, read2 = read_pair.split(':')

				#QC: Get raw fastq read counts 
				qc_data['total_raw_reads_counts'].append([
					str(int(self.count_gzipped_lines(read1))/4),
					str(int(self.count_gzipped_lines(read2))/4)
				])

				trimmed_read1_filename = os.path.join(output_dir,
														lib_prefix + '_{}_read1.trimmed.fastq.gz'.format(i))
				trimmed_read2_filename = os.path.join(output_dir,
														lib_prefix + '_{}_read2.trimmed.fastq.gz'.format(i))

				cutadapt.run(
					Parameter('--quality-base=33'),
					Parameter('--minimum-length=5'),
					Parameter('-q',  '30'), # Minimum quality score
					Parameter('--output={}'.format(trimmed_read1_filename)),
					Parameter('--paired-output={}'.format(trimmed_read2_filename)),
					Parameter('-a', forward_adapter if forward_adapter else 'ZZZ'),
					Parameter('-A', reverse_adapter if reverse_adapter else 'ZZZ'),
					Parameter(read1),
					Parameter(read2),
					Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary.log'))
				)

				# QC: Get trimmed fastq read counts
				qc_data['trimmed_reads_counts'].append([
					str(int(self.count_gzipped_lines(trimmed_read1_filename))/4),
					str(int(self.count_gzipped_lines(trimmed_read2_filename))/4)
					])

				staging_delete.extend([trimmed_read1_filename, trimmed_read2_filename])
				read_pairs[i] = ':'.join([trimmed_read1_filename, trimmed_read2_filename])

		if step <= 2:
			#Make FastQC Directory
			fastqc_output_dir = os.path.join(output_dir, 'fastqc')
			subprocess.call(['mkdir', '-p', fastqc_output_dir])
			for i, read_pair in enumerate(read_pairs):
				for read in read_pair.split(':'):
					fastqc.run(
						Parameter('--outdir={}'.format(fastqc_output_dir)),
						Parameter(read)
					)

					bwa_aln.run(
						Parameter('-t', pipeline_config['bwa']['threads']),
						Parameter(pipeline_config['bwa']['index-dir']),
						Parameter(read),
						Redirect(stream=Redirect.STDOUT, dest='{}.sai'.format(read))
					)

					staging_delete.append('{}.sai'.format(read))

		if step <= 3:
			for i, read_pair in enumerate(read_pairs):
				read1, read2 = read_pair.split(':')
				bwa_bam_output = os.path.join(output_dir, '{}.{}.bam'.format(lib_prefix, i))

				bwa_sampe.run(
					Parameter('-a', '2000'), # Maximum insert size
					Parameter('-n', '1'),
					Parameter(pipeline_config['bwa']['index-dir']),
					Parameter('{}.sai'.format(read1)),
					Parameter('{}.sai'.format(read2)),
					Parameter(read1),
					Parameter(read2),
					Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'bwa_sampe.log')),
					Pipe(
						samtools_view.pipe(
							Parameter('-hSb'),
							Parameter('-o', bwa_bam_output),
							Parameter('-') # Get input from stdin
						)
					)
				)

				bwa_bam_outs.append(bwa_bam_output)

		if step <= 4:
			for i, bwa_bam in enumerate(bwa_bam_outs):
				samtools_flagstat.run(
					Parameter(bwa_bam),
					Redirect(stream=Redirect.STDOUT, dest=bwa_bam + '.flagstat')
				)

				#QC: Get number of mapped reads from this bam
				try:
					with open(bwa_bam + '.flagstat') as flagstats:
						flagstats_contents = flagstats.read()
						target_line = re.search(r'(\d+) \+ \d+ mapped', flagstats_contents)
						if target_line is not None:
							qc_data['num_reads_mapped'].append(str(int(target_line.group(1))/2))
						else:
							qc_data['num_reads_mapped'].append('0')
				except:
					qc_data['num_reads_mapped'].append('Could not open flagstats {}'.format(
						bwa_bam + '.flagstat'
					))

			sortmerged_bam = os.path.join(output_dir, '{}.sortmerged_bam'.format(lib_prefix))
			steric_filter_bam = os.path.join(output_dir, '{}.steric.bam'.format(lib_prefix))
			duprm_bam = os.path.join(output_dir, '{}.duprm.bam'.format(lib_prefix))
			unique_bam = os.path.join(output_dir, '{}.unique.bam'.format(lib_prefix))
			unmappedrm_bam = os.path.join(output_dir, '{}.unmappedrm.bam'.format(lib_prefix))
			chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix))
			# Binning reads based on template size
			nucleosome_free_reads = os.path.join(output_dir, '{}.nucleosome_free.bam'.format(lib_prefix))
			mononucleosome_reads = os.path.join(output_dir, '{}.mononucleosome.bam'.format(lib_prefix))
			dinucleosome_reads = os.path.join(output_dir, '{}.dinucleosome.bam'.format(lib_prefix))
			trinucleosome_reads = os.path.join(output_dir, '{}.trinucleosome.bam'.format(lib_prefix))
			chrM_bam = os.path.join(output_dir, '{}.chrM.bam'.format(lib_prefix))
			sorted_for_PE_bam = os.path.join(output_dir, '{}.sorted_for_PE'.format(lib_prefix))

			novosort.run(
				Parameter('--threads', pipeline_config['novosort']['threads']),
				Parameter('--tmpcompression', '6'),
				Parameter('--tmpdir', tmp_dir),
				Parameter(*[bam for bam in bwa_bam_outs]),
				Redirect(stream=Redirect.STDOUT, dest=sortmerged_bam),
				Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'novosort.log'))
			)

			# This creates a dependency on pysam
			# Removes reads with template length < 38 due to steric hindrance
			samtools_index.run(Parameter(sortmerged_bam))
			sortmerged_bam_alignmentfile = pysam.AlignmentFile(sortmerged_bam, 'rb')
			steric_filter_bam_alignmentfile = pysam.AlignmentFile(steric_filter_bam, 'wb',
																	template=sortmerged_bam_alignmentfile)
			
			num_removed=0
			for read in sortmerged_bam_alignmentfile.fetch():
				if abs(int(read.template_length)) >= STERIC_HINDRANCE_CUTOFF:
					steric_filter_bam_alignmentfile.write(read)
				else:
					num_removed += 1
			qc_data['num_read_removed_steric_hinderence']=str(num_removed)
			
			
			sortmerged_bam_alignmentfile.close()
			steric_filter_bam_alignmentfile.close()

			# Mark and remove MarkDuplicates
			markduplicates_metrics_filepath = os.path.join(logs_dir, 'mark_dup.metrics')
			picard_mark_dup.run(
				Parameter('INPUT={}'.format(steric_filter_bam)),
				Parameter('OUTPUT={}'.format(duprm_bam)),
				Parameter('TMP_DIR={}'.format(tmp_dir)),
				Parameter('METRICS_FILE={}'.format(markduplicates_metrics_filepath)),
				Parameter('REMOVE_DUPLICATES=true'),
				Parameter('VALIDATION_STRINGENCY=LENIENT'),
				Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'mark_dup.log'))
			)
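
			# For reference (illustrative assumption; the actual jar path and arguments come from
			# pipeline_config): the wrapper call above corresponds roughly to this Picard command.
			example_markdup_cmd = (
				'java -jar picard.jar MarkDuplicates INPUT={} OUTPUT={} METRICS_FILE={} '
				'TMP_DIR={} REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=LENIENT'
			).format(steric_filter_bam, duprm_bam, markduplicates_metrics_filepath, tmp_dir)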

			# QC: Get percent duplicate reads from the MarkDuplicates metrics
			try:
				with open(markduplicates_metrics_filepath) as markdup_metrics:
					for line in markdup_metrics:
						if line[FIRST_CHAR] == '#':
							continue
						record = line.strip().split('\t')
						if len(record) == 9:
							if re.match(r'\d+', record[7]) is not None:
								qc_data['percent_duplicate_reads'] = record[7]
			except:
				qc_data['percent_duplicate_reads'] = 'Could not open MarkDuplicates metrics'

			# Filter down to uniquely mapped reads
			samtools_view.run(
				Parameter('-b'),
				Parameter('-F', '256'),
				Parameter('-q', '10'),
				Parameter('-o', unique_bam),
				Parameter(duprm_bam)
			)

			# Get statistics on uniquely mapped reads
			samtools_flagstat.run(
				Parameter(unique_bam),
				Redirect(stream=Redirect.STDOUT, dest=unique_bam + '.flagstat')
			)

			# QC: Get number of mapped reads from the unique BAM
			try:
				with open(unique_bam + '.flagstat') as flagstats:
					unique_flagstats_contents = flagstats.read()
					target_line = re.search(r'(\d+) \+ \d+ mapped', unique_flagstats_contents)
					if target_line is not None:
						# Halve the mapped-read count to report read pairs
						qc_data['num_unique_reads_mapped'].append(str(int(target_line.group(1)) // 2))
					else:
						qc_data['num_unique_reads_mapped'].append('0')
			except:
				qc_data['num_unique_reads_mapped'].append('Could not open flagstats {}'.format(
					unique_bam + '.flagstat'
				))

			# Make AlignmentFile objects to extract binned reads and chrM reads from the unique BAM
			samtools_index.run(Parameter(unique_bam))
			unique_bam_alignmentfile = pysam.AlignmentFile(unique_bam, 'rb')
			# Bin reads into 4 categories based on the template length they derive from:
			# 50-115 (nucleosome-free), 180-247 (mononucleosome), 315-473 (dinucleosome), 558-615 (trinucleosome)
			nucleosome_free_reads_alignmentfile = pysam.AlignmentFile(nucleosome_free_reads, 'wb',
																	template=unique_bam_alignmentfile)
			mononucleosome_reads_alignmentfile = pysam.AlignmentFile(mononucleosome_reads, 'wb',
																	template=unique_bam_alignmentfile)
			dinucleosome_reads_alignmentfile = pysam.AlignmentFile(dinucleosome_reads, 'wb',
																	template=unique_bam_alignmentfile)
			trinucleosome_reads_alignmentfile = pysam.AlignmentFile(trinucleosome_reads, 'wb',
																	template=unique_bam_alignmentfile)
			
			# Extract chrM into new BAM
			chrM_reads_alignmentfile = pysam.AlignmentFile(chrM_bam, 'wb',
														template=unique_bam_alignmentfile)

			# Bin reads by template length into the nucleosome categories above
			for read in unique_bam_alignmentfile.fetch():
				if abs(int(read.template_length)) >= 50 and abs(int(read.template_length)) <= 115:
					nucleosome_free_reads_alignmentfile.write(read)
				elif abs(int(read.template_length)) >= 180 and abs(int(read.template_length)) <= 247:
					mononucleosome_reads_alignmentfile.write(read)
				elif abs(int(read.template_length)) >= 315 and abs(int(read.template_length)) <= 473:
					dinucleosome_reads_alignmentfile.write(read)
				elif abs(int(read.template_length)) >= 558 and abs(int(read.template_length)) <= 615:
					trinucleosome_reads_alignmentfile.write(read)
				else:
					continue

			# Store chrM reads in a separate file
			for read in unique_bam_alignmentfile.fetch():
				if read.reference_name == 'chrM':
					chrM_reads_alignmentfile.write(read)
	
			nucleosome_free_reads_alignmentfile.close()
			mononucleosome_reads_alignmentfile.close()
			dinucleosome_reads_alignmentfile.close()
			trinucleosome_reads_alignmentfile.close()
			chrM_reads_alignmentfile.close()
			
			# Get flagstat results for each of the binned BAMs
			samtools_flagstat.run(
					Parameter(nucleosome_free_reads),
					Redirect(stream=Redirect.STDOUT, dest=nucleosome_free_reads + '.flagstat'))

			samtools_flagstat.run(
					Parameter(mononucleosome_reads),
					Redirect(stream=Redirect.STDOUT, dest=mononucleosome_reads + '.flagstat'))

			samtools_flagstat.run(
					Parameter(dinucleosome_reads),
					Redirect(stream=Redirect.STDOUT, dest=dinucleosome_reads + '.flagstat'))

			samtools_flagstat.run(
					Parameter(trinucleosome_reads),
					Redirect(stream=Redirect.STDOUT, dest=trinucleosome_reads + '.flagstat'))

			
			# Get statistics on chrM-mapped reads
			samtools_index.run(Parameter(chrM_bam))
			samtools_flagstat.run(
				Parameter(chrM_bam),
				Redirect(stream=Redirect.STDOUT, dest=chrM_bam + '.flagstat')
			)
			try:
				with open(chrM_bam + '.flagstat') as flagstats:
					chrM_flagstats_contents = flagstats.read()
					target_line = re.search(r'(\d+) \+ \d+ mapped', chrM_flagstats_contents)
					if target_line is not None:
						# Halve the mapped-read count to report read pairs
						qc_data['num_mtDNA_reads_mapped'].append(str(int(target_line.group(1)) // 2))
					else:
						qc_data['num_mtDNA_reads_mapped'].append('0')
			except:
				qc_data['num_mtDNA_reads_mapped'].append('Could not open flagstats {}'.format(
					chrM_bam + '.flagstat'
				))



			# Remove unmapped reads
			samtools_view.run(
				Parameter('-b'),
				Parameter('-F', '12'),
				Parameter('-o', unmappedrm_bam),
				Parameter(unique_bam)
			)

			# Create BAM index, then remove chrM
			samtools_index.run(
				Parameter(unmappedrm_bam)
			)

			# Remove chrM by keeping only the autosomes and sex chromosomes
			all_chr = [Parameter('chr{}'.format(chromosome)) for chromosome in list(range(1, 23)) + ['X', 'Y']]
			samtools_view.run(
				Parameter('-b'),
				Parameter('-o', chrmrm_bam),
				Parameter(unmappedrm_bam),
				*all_chr
			)

			# Stage delete for temporary files
			staging_delete.extend([
				sortmerged_bam,
				sortmerged_bam + '.bai', # BAM index file
				steric_filter_bam,
				unique_bam,
				duprm_bam,
				unmappedrm_bam,
				unmappedrm_bam + '.bai', # BAM index file
				chrmrm_bam
			])

		if step <= 5:
			# Generate filename for final processed BAM and BED
			processed_bam = os.path.join(output_dir, '{}.processed.bam'.format(lib_prefix))
			unshifted_bed = os.path.join(output_dir, '{}.unshifted_bed'.format(lib_prefix))
			processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix))
			unshifted_bedpe = os.path.join(output_dir, '{}.unshifted_bedpe'.format(lib_prefix))
			processed_bedpe_to_bed = os.path.join(output_dir,'{}.processed_bedpe_to_bed'.format(lib_prefix))
			# staging_delete.append(unshifted_bed)

			# Generate filename for chrM removed BAM
			chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix))

			# Remove blacklisted genomic regions
			bedtools_intersect.run(
				Parameter('-v'),
				Parameter('-abam', chrmrm_bam),
				Parameter('-b', pipeline_config['bedtools']['blacklist-bed']),
				Parameter('-f', '0.5'),
				Redirect(stream=Redirect.STDOUT, dest=processed_bam)
			)
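
			# For reference (illustrative assumption): the blacklist filtering above is roughly
			# equivalent to this bedtools command, which drops reads overlapping a blacklisted
			# region by at least 50% of their length.
			example_blacklist_cmd = 'bedtools intersect -v -abam {} -b {} -f 0.5 > {}'.format(
				chrmrm_bam, pipeline_config['bedtools']['blacklist-bed'], processed_bam)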

			# QC: Generate insert size metrics PDF
			picard_insert_metrics.run(
				Parameter('INPUT={}'.format(processed_bam)),
				Parameter('OUTPUT={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.metrics'))),
				Parameter('HISTOGRAM_FILE={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.pdf')))
			)

			# Generate index for processed BAM
			samtools_index.run(
				Parameter(processed_bam)
			)

			# Convert BAM to BED
			bedtools_bamtobed.run(
				Parameter('-i', processed_bam),
				Redirect(stream=Redirect.STDOUT, dest=unshifted_bed)
			)

			# Convert BAM to BEDPE, with specific quality and only properly paired reads, sorted by name
			samtools_view.run(
				Parameter('-uf', '0x2'),
				Parameter('-F', '1548'),
				Parameter('-q', '30'),
				Parameter(processed_bam),
				Pipe(
					samtools_sort.pipe(
						Parameter('-n'),
						Parameter('-'),
						Parameter(sorted_for_PE_bam)
					)
				)
			)

			# Convert the name-sorted BAM to BEDPE
			bedtools_bamtobed.run(
				Parameter('-i', str(sorted_for_PE_bam)+'.bam'),
				Parameter('-bedpe'),
				Redirect(stream=Redirect.STDOUT, dest=unshifted_bedpe)
			)
			
			# Collapse each BEDPE record to a single fragment-spanning BED-like line
			# (chrom, start of mate 1, end of mate 2, name, score, both strands)
			unshifted_bedpe_to_bed = open(output_dir + '/' + '{}.unshifted_bedpe_to_bed'.format(lib_prefix), 'w')

			with open(unshifted_bedpe) as convertToBed:
				for line in convertToBed:
					chrpos1, start1, end1, chrpos2, start2, end2, name, score, strand1, strand2 = line.split('\t')
					bedformat = [chrpos1, start1, end2, name, score, strand1, strand2.rstrip('\n')]
					unshifted_bedpe_to_bed.write('\t'.join(bedformat) + '\n')
			unshifted_bedpe_to_bed.close()

			staging_delete.append(unshifted_bed)
			staging_delete.append(output_dir+'/'+'{}.unshifted_bedpe_to_bed'.format(lib_prefix))

			# Shift + strand reads by +4 bp and - strand reads by -5 bp, per the ATAC-seq paper
			# (this adjustment used to be done with bedtools shift)
			self.shift_reads(
				input_bed_filepath=unshifted_bed,
				output_bed_filepath=processed_bed,
				log_filepath=os.path.join(logs_dir, 'shift_reads.logs'),
				genome_sizes_filepath=pipeline_config['bedtools']['genome-sizes'],
				minus_strand_shift=MINUS_STRAND_SHIFT,
				plus_strand_shift=PLUS_STRAND_SHIFT
			)
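
			# Illustrative sketch (assumption; the real shift_reads method is defined elsewhere in
			# this class): the per-record adjustment is the usual Tn5 correction, moving + strand
			# coordinates by +4 bp and - strand coordinates by -5 bp.
			def _shift_bed_fields(fields, plus_shift=PLUS_STRAND_SHIFT, minus_shift=MINUS_STRAND_SHIFT):
				# fields: one BED record split on tabs; column 6 (index 5) holds the strand
				chrom, start, end, name, score, strand = fields[:6]
				shift = plus_shift if strand == '+' else minus_shift
				return [chrom, str(int(start) + shift), str(int(end) + shift), name, score, strand]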

			# TODO: needs modification for BEDPE format
			self.shift_reads_bedpe(
				input_bed_filepath=output_dir+'/'+'{}.unshifted_bedpe_to_bed'.format(lib_prefix),
				output_bed_filepath=processed_bedpe_to_bed,
				log_filepath=os.path.join(logs_dir, 'shift_reads_bedpe_to_bed.logs'),
				genome_sizes_filepath=pipeline_config['bedtools']['genome-sizes'],
				minus_strand_shift=MINUS_STRAND_SHIFT,
				plus_strand_shift=PLUS_STRAND_SHIFT
			)

		# Peak-calling; MACS2
		if step <= 6:
			# Regular (narrow) peak calling, default q-value=0.01
			macs2_callpeak.run(
				Parameter('-t', processed_bed),
				Parameter('-f', 'BED'),
				Parameter('-g', 'hs'),
				Parameter('-n', str(processed_bed) + '_regular_peak_calls'),
				Parameter('--nomodel'),
				Parameter('--extsize', '200'),
				Parameter('--shift', '-100'),
				Parameter('-B', '--SPMR'), # Generates pileup tracks, bedgraph, fragment pileup per million reads
				Parameter('--call-summits'),
				Parameter('--keep-dup', 'all')
			)

			# Broad peak calling, q-value=0.05 per the MACS2 recommendation for broad peaks
			macs2_callpeak.run(
				Parameter('-t', processed_bed),
				Parameter('-f', 'BED'),
				Parameter('-g', 'hs'),
				Parameter('-n', str(processed_bed) + '_broad_peak_calls'),
				Parameter('-q', '0.05'),
				Parameter('--nomodel'),
				Parameter('--extsize', '200'),
				Parameter('--shift', '-100'),
				Parameter('--broad'),
				Parameter('--keep-dup', 'all')
			)

			# Regular (narrow) peak calling, default q-value=0.01, for the processed BEDPE-to-BED file
			# NOTE: BEDPE for MACS2 is not the same format as BEDPE accepted by NGS/UCSC standards
			macs2_callpeak.run(
				Parameter('-t', processed_bedpe_to_bed),
				Parameter('-f', 'BEDPE'),
				Parameter('-g', 'hs'),
				Parameter('-n', str(processed_bedpe_to_bed) + '_regular_peak_calls'),
				Parameter('--nomodel'),
				Parameter('--extsize', '200'),
				Parameter('--shift', '-100'),
				Parameter('-B', '--SPMR'), # Generates pileup tracks, bedgraph, fragment pileup per million reads
				Parameter('--call-summits'),
				Parameter('--keep-dup', 'all')
			)

			# Broad peak calling, q-value=0.05 per the MACS2 recommendation, for the processed BEDPE-to-BED file
			# NOTE: BEDPE for MACS2 is not the same format as BEDPE accepted by NGS/UCSC standards
			macs2_callpeak.run(
				Parameter('-t', processed_bedpe_to_bed),
				Parameter('-f', 'BEDPE'),
				Parameter('-g', 'hs'),
				Parameter('-n', str(processed_bedpe_to_bed) + '_broad_peak_calls'),
				Parameter('-q', '0.05'),
				Parameter('--nomodel'),
				Parameter('--extsize', '200'),
				Parameter('--shift', '-100'),
				Parameter('--broad'),
				Parameter('--keep-dup', 'all')
			)
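
			# For reference (illustrative assumption): the narrow-peak call on the BED file above
			# corresponds roughly to this macs2 command line; the broad-peak variants add --broad
			# and -q 0.05 and drop -B/--SPMR/--call-summits.
			example_macs2_cmd = (
				'macs2 callpeak -t {bed} -f BED -g hs -n {bed}_regular_peak_calls --nomodel '
				'--extsize 200 --shift -100 -B --SPMR --call-summits --keep-dup all'
			).format(bed=processed_bed)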


		# QC: Output QC data to file
		with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file:
			qc_data_file.write(str(qc_data) + '\n')

		# Delete temporary files
		for delete_file in staging_delete:
			subprocess.call(['rm', '-rf', delete_file])
Exemplo n.º 14
    def run_pipeline(self, pipeline_args, pipeline_config):
        reads = pipeline_args['reads']
        output_dir = pipeline_args['output']
        logs_dir = os.path.join(output_dir, 'logs')
        lib_prefix = pipeline_args['lib']
        forward_adapter = pipeline_args['forward_adapter']
        reverse_adapter = pipeline_args['reverse_adapter']
        sailfish_libtype = pipeline_args['sailfish_libtype']

        # Determine if run is paired-end from input
        run_is_paired_end = len(reads[FIRST_READS_PAIR].split(':')) > 1

        # Create output, tmp, and logs directories
        tmp_dir = os.path.join(output_dir, 'tmp')
        subprocess.call(['mkdir', '-p', output_dir, logs_dir, tmp_dir])

        # Keep list of items to delete
        staging_delete = [os.path.join(output_dir, 'tmp')]

        cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
        kallisto = Software('kallisto', pipeline_config['kallisto']['path'])
        sailfish = Software('sailfish', pipeline_config['sailfish']['path'])

        # Combine read files (e.g. from extra sequencing runs for additional depth)
        if run_is_paired_end:
            # Aggregate read1s and read2s
            read1s, read2s = [], []
            for read in reads:
                read1, read2 = read.split(':')
                read1s.append(read1)
                read2s.append(read2)

            # Concatenate each read group
            combined_reads = []
            for name, reads_group in [('read1', read1s), ('read2', read2s)]:
                combined_read_filename = os.path.join(
                    output_dir,
                    '{}.combined.{}.fastq.gz'.format(lib_prefix, name))
                combined_reads.append(combined_read_filename)
                staging_delete.append(combined_read_filename)
                with open(combined_read_filename, 'w') as combined_reads_fastq:
                    subprocess.call(['cat'] + [read for read in reads_group],
                                    stdout=combined_reads_fastq)

            # Update reads list
            reads = ':'.join(combined_reads)
        else:
            # Combine reads
            combined_read_filename = os.path.join(
                output_dir, '{}.combined.fastq.gz'.format(lib_prefix))
            staging_delete.append(combined_read_filename)
            with open(combined_read_filename, 'w') as combined_reads_fastq:
                subprocess.call(['cat'] + [read for read in reads],
                                stdout=combined_reads_fastq)

            # Update reads list
            reads = combined_read_filename

        cutadapt_common = [
            Parameter('--quality-base={}'.format(
                pipeline_config['cutadapt']['quality-base'])),
            Parameter('--minimum-length={}'.format(
                pipeline_config['cutadapt']['minimum-length'])),
            Parameter('-q', '30'),
            Redirect(stream=Redirect.STDOUT,
                     dest=os.path.join(logs_dir, 'cutadapt.summary'))
        ]

        if run_is_paired_end:
            read1, read2 = reads.split(':')
            trimmed_read1_filename = os.path.join(
                output_dir, lib_prefix + '_read1.trimmed.fastq.gz')
            trimmed_read2_filename = os.path.join(
                output_dir, lib_prefix + '_read2.trimmed.fastq.gz')

            staging_delete.append(trimmed_read1_filename)
            staging_delete.append(trimmed_read2_filename)

            cutadapt_specific = [
                Parameter('--output={}'.format(trimmed_read1_filename)),
                Parameter('--paired-output={}'.format(trimmed_read2_filename)),
                Parameter('-a', forward_adapter),
                Parameter('-A', reverse_adapter),
                Parameter(read1),
                Parameter(read2)
            ]

            # Update reads list
            reads = ':'.join([trimmed_read1_filename, trimmed_read2_filename])
        else:
            # Construct new filename
            trimmed_read_filename = os.path.join(
                output_dir, lib_prefix + '.trimmed.fastq.gz')

            staging_delete.append(trimmed_read_filename)

            cutadapt_specific = [
                Parameter('--output={}'.format(trimmed_read_filename)),
                Parameter('-a', forward_adapter),
                Parameter(reads)
            ]

            # Update reads (single trimmed file, kept as a plain path for downstream steps)
            reads = trimmed_read_filename

        # Run cutadapt
        cutadapt.run(*(cutadapt_common + cutadapt_specific))
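
        # For reference (illustrative assumption): for a paired-end run the call above assembles
        # a cutadapt command along these lines (angle-bracket fields are placeholders).
        example_cutadapt_cmd = (
            'cutadapt --quality-base=<qb> --minimum-length=<min_len> -q 30 '
            '--output=<read1.trimmed.fastq.gz> --paired-output=<read2.trimmed.fastq.gz> '
            '-a <forward_adapter> -A <reverse_adapter> <read1.fastq.gz> <read2.fastq.gz> '
            '> cutadapt.summary'
        )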

        # Step 3: Kallisto Quantification
        kallisto_common = [
            Parameter('--index={}'.format(
                pipeline_config['kallisto']['index-path'])),
            Parameter('--output-dir={}'.format(
                os.path.join(output_dir, 'kallisto_quant')))
        ]

        if run_is_paired_end:
            read1, read2 = reads.split(':')
            kallisto_ended = [Parameter(read1), Parameter(read2)]
        else:
            kallisto_ended = [Parameter(reads)]

        # Run kallisto
        kallisto.run(*(kallisto_common + kallisto_ended))
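
        # Note / assumption: kallisto quant in single-end mode also requires --single together
        # with a fragment-length mean (-l) and standard deviation (-s); an illustrative
        # single-end parameter set (placeholder values) would look like this.
        kallisto_single_end_example = [
            Parameter('--single'),
            Parameter('-l', '200'),  # assumed mean fragment length
            Parameter('-s', '20'),   # assumed fragment-length standard deviation
            Parameter('<trimmed.fastq.gz>')
        ]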

        # Step 4: Sailfish Quantification
        sailfish_common = [
            Parameter('--index', pipeline_config['sailfish']['index-path']),
            Parameter('--libType', '"{}"'.format(sailfish_libtype)),
            Parameter('--output', os.path.join(output_dir, 'sailfish_quant'))
        ]

        if run_is_paired_end:
            read1, read2 = reads.split(':')
            sailfish_ended = [
                Parameter('-1', '<(zcat {})'.format(read1)),
                Parameter('-2', '<(zcat {})'.format(read2)),
            ]
        else:
            sailfish_ended = [Parameter('-r', '<(zcat {})'.format(reads))]

        sailfish.run(*(sailfish_common + sailfish_ended), shell=True)
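
        # Note: '<(zcat ...)' is a bash process substitution, which is why sailfish is run with
        # shell=True. Illustrative equivalent command (assumption about the assembled form):
        example_sailfish_cmd = (
            'sailfish quant --index <index-path> --libType "<libtype>" '
            '-1 <(zcat <read1.fastq.gz>) -2 <(zcat <read2.fastq.gz>) --output <sailfish_quant>'
        )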

        # Delete staged items
        for item in staging_delete:
            subprocess.call(['rm', '-rf', item])
    def run_pipeline(self, pipeline_args, pipeline_config):

        # Instantiate Software instances
        fastqc = Software('FastQC', pipeline_config['fastqc']['path'])
        rnaseqc = Software('RNA-SeQC', pipeline_config['RNA-SeQC']['path'])

        picard = {
            subprogram_name: Software('picard {}'.format(subprogram_name),
                                      pipeline_config['picard']['path'] + ' {}'.format(subprogram_name))
            for subprogram_name
            in {'CreateSequenceDictionary', 'MarkDuplicates', 'CollectRnaSeqMetrics',
                'CollectInsertSizeMetrics', 'CollectAlignmentSummaryMetrics', 'CollectGcBiasMetrics',
                'EstimateLibraryComplexity', 'AddOrReplaceReadGroups'}
        }

        preseq = {
            subprogram_name: Software('preseq {}'.format(subprogram_name),
                                      pipeline_config['preseq']['path'] + ' {}'.format(subprogram_name))
            for subprogram_name
            in {'c_curve', 'lc_extrap', 'gc_extrap'}
        }
        bam2mr = Software('bam2mr', pipeline_config['preseq']['bam2mr'])

        featurecounts = Software('featureCounts', pipeline_config['featureCounts']['path'])

        samtools_faidx = Software('samtools faidx', pipeline_config['samtools']['path'] + ' faidx')
        novosort = Software('novosort', pipeline_config['novosort']['path'])

        # Create output directory
        subprocess.call('mkdir -p {}'.format(pipeline_args['output_dir']), shell=True)
        subprocess.call('mkdir -p {}'.format(pipeline_config['tmp-dir']), shell=True)

        # Sort bam file
        # sorted_bam = os.path.join(pipeline_args['output_dir'], 'sorted.tmp.bam')
        sorted_bam = os.path.join(pipeline_args['output_dir'], '{}.sorted.tmp.bam'.format(pipeline_args['lib']))
        novosort.run(
            Parameter('--index'),
            Parameter('--output', sorted_bam),
            Parameter(pipeline_args['bam'])
        )

        # Run FastQC
        self.run_fastqc(
            fastqc=fastqc,
            pipeline_args=pipeline_args
        )

        # Run RNA-SeQC
        self.run_rnaseqc(
            rnaseqc=rnaseqc,
            picard=picard,
            samtools_faidx=samtools_faidx,
            pipeline_config=pipeline_config,
            pipeline_args=pipeline_args,
            sorted_bam=sorted_bam
        )

        # Run Picard suite
        self.run_picard_suite(
            picard=picard,
            sorted_bam=sorted_bam,
            pipeline_config=pipeline_config,
            pipeline_args=pipeline_args
        )

        # self.run_preseq(
        #     preseq=preseq,
        #     bam2mr=bam2mr,
        #     sorted_bam=sorted_bam,
        #     pipeline_args=pipeline_args
        # )

        self.run_featurecounts(
            featurecounts=featurecounts,
            sorted_bam=sorted_bam,
            pipeline_args=pipeline_args,
            pipeline_config=pipeline_config
        )

        self.run_chrm_percentage(
            sorted_bam=sorted_bam,
            pipeline_args=pipeline_args
        )

        # Remove temporary sorted bam
        os.remove(sorted_bam)
        os.remove(sorted_bam + '.bai')
        # subprocess.call('rm -rf /mnt/analysis/tmp', shell=True)