Example #1
def run_ATACseq_pipeline(software_config_path, fastq_files, fastq_dir, parent_syn_id, step=0):
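    # NOTE: relies on module-level names not shown in this listing:
    # `log` (a logger), `subprocess`, `get_offset_line` (sketched after this
    # example), and the PipelineSoftwareBase/Software wrapper classes.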
    log.info('Running ATACseq pipeline')
    
    # Preprocess fastq filenames
    fastq_files_prefix = [fastq_files[0].split('.')[0], fastq_files[1].split('.')[0]]
    # If the first element in the list is not the first read group, swap the elements
    if fastq_files_prefix[0].split('_')[6] != '1':
        fastq_files_prefix[0], fastq_files_prefix[1] = fastq_files_prefix[1], fastq_files_prefix[0]
    # Get the library prefix
    lib_prefix = '_'.join(fastq_files_prefix[0].split('_')[:6])
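    # e.g. a hypothetical 'sample_cond_rep_lane_lib_run_1.txt.gz' yields
    # prefix 'sample_cond_rep_lane_lib_run_1' and lib_prefix 'sample_cond_rep_lane_lib_run'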
    
    # Inject SoftwareConfigService
    PipelineSoftwareBase.set_software_config_service(SoftwareConfigService(software_config_path))
    
    # Create software instances
    fastqc = FastQC()
    bwa_aln = BwaAln()
    bwa_sampe = BwaSampe()
    samtools_view = SamtoolsView()
    samtools_flagstat = SamtoolsFlagstat()
    fseq = FSeq()
    script_recover_fragments = ScriptRecoverFragments()
    cutadapt = CutAdapt()
    
    novosort = Software('novosort')
    bedtools_intersect = Software('bedtools_intersect')
    bedtools_bamtobed = Software('bedtools_bamtobed')
    bedtools_merge = Software('bedtools_merge')
    igvtools_sort = Software('igvtools_sort')
    igvtools_count = Software('igvtools_count')
    script_offset_ATACseq = Software('script_offset_ATACseq')
    macs2_callpeak = Software('MACS2_callpeak')
    picard_mark_duplicates = Software('picard_MarkDuplicates')
    homer_findpeaks = Software('HOMER_findPeaks')
    homer_maketagdirectory = Software('HOMER_makeTagDirectory')
    homer_pos2bed = Software('HOMER_pos2bed')
    sicer_rb = Software('SICER_rb')
    
    # Make temporary directory
    tmp_dir = fastq_dir + 'tmp/'
    subprocess.call('mkdir -p ' + tmp_dir, shell=True)
    
#    syn_file_raw_fastqs = []
#    for i, fastq in enumerate(fastq_files_prefix):
#        syn_fastq = File(
#            path='',
#            name='Raw ATACseq fastq ' + str(i),
#            parent=parent_syn_id
#        )
        
    # Pipeline Step 1
    if step <= 1:
        ###########################
        # Pipeline step: cutadapt
        # Input files start out as .txt, but I like to change the
        # extension to .fastq so it's more obvious what they are
        
        # This is the paired-end run of cutadapt
        log.info('Running paired-end cutadapt')
        cutadapt.generate_cmd({
            'output_file1': fastq_files_prefix[0] + '.clipped.fastq.gz',
            'output_file2': fastq_files_prefix[1] + '.clipped.fastq.gz',
            'min_quality_score': '30',
            'quality_base': '33',
            'input_file1': fastq_files_prefix[0] + '.txt.gz',
            'input_file2': fastq_files_prefix[1] + '.txt.gz',
            'summary_file': lib_prefix + '.cutadapt.summary.log'
        }).run()
    
#    syn_act_cutadapt = Activity(
#        name='Cutadapt Paired-end',
#        description='Cutadapt Paired-end, trimming to adapters and quality >= 30',
#        used=[syn_file_raw_fastqs[0], syn_file_raw_fastqs[1]],
#        executed=cutadapt.get_path()
#    )
#    
#    syn_file_clipped_fastqs = []
    
    # Pipeline Step 2
    if step <= 2:
        # Make directory for FastQC output
        fastqc_output_dir = fastq_dir + 'fastqc_output/'
        subprocess.call('mkdir -p ' + fastqc_output_dir, shell=True)
        
        log.info('Running FastQC')
        # Run the steps in this for loop separately for each fastq file
        for fastq in fastq_files_prefix:
            #########################
            # Pipeline step: FastQC 
            # Get QC stats for fastq files
            # TODO Are we interested in parsing output?
            fastqc.generate_cmd({
                'out_dir': fastqc_output_dir,
                'input_file': fastq + '.clipped.fastq.gz'
            }).run()
            
            ##########################
            # Pipeline step: bwa aln
            # Generate suffix array for bwa aligner
            # Done for each fastq file
            bwa_aln.generate_cmd({
                'input_file': fastq + '.clipped.fastq.gz',
                'output_file': fastq + '.sai',
                'output_log': 'bwa_aln.summary.log'
            }).run()
      
    # Pipeline Step 3
    if step <= 3:  
        ############################
        # Pipeline step: bwa sampe
        # Align reads in fastq files to reference
        # Output in sorted BAM format
        bwa_sampe.generate_cmd({
            'sai_1': fastq_files_prefix[0] + '.sai',
            'sai_2': fastq_files_prefix[1] + '.sai',
            'fastq_1': fastq_files_prefix[0] + '.clipped.fastq.gz',
            'fastq_2': fastq_files_prefix[1] + '.clipped.fastq.gz',
            'output_log': 'bwa_sampe.summary.log'
        }, pipe=(
            samtools_view.generate_cmd({
                'input_file': '-',
                'output_file': lib_prefix + '.bam'
            })
        )).run()
    
    
    # If this actually works, it will only output unique reads
#    bwa_sampe.generate_cmd({
#        'sai_1':fastq_files_prefix[0] + '.sai',
#        'sai_2':fastq_files_prefix[1] + '.sai',
#        'fastq_1':fastq_files_prefix[0] + '.clipped.fastq.gz',
#        'fastq_2':fastq_files_prefix[1] + '.clipped.fastq.gz'
#    }, pipe=(
#        grep_unique_alignments.generate_cmd({}, pipe=(
#            samtools_view.generate_cmd({
#                'input_file': '-',
#                'output_file': lib_prefix + '.bam'
#            })
#        ))
#    )).run()
    
    ################################
    # Pipeline step: samtools flagstat
    # Generate alignment statistics
    
    # Pipeline Step 4
    if step <= 4:
        samtools_flagstat.generate_cmd({
            'input_file': lib_prefix + '.bam',
            'output_file': lib_prefix + '.bam.flagstat'
        }).run()
        
        # Proceed with only uniquely mapped reads
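        # -F 256 drops secondary alignments; -q 10 keeps reads with MAPQ >= 10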
        (samtools_view.clear_flags().add_flag('-b')
            .add_flag_with_argument('-F', ['256'])
            .add_flag_with_argument('-q', ['10'])
            .add_flag_with_argument('-o', [lib_prefix + '.unique.bam'])
            .generate_cmd({
                'input_file': lib_prefix + '.bam'
            }).run()
        )
    
        novosort.generate_cmd({
            'tmp_dir': tmp_dir,
            'input_file': lib_prefix + '.unique.bam',
            'output_file': lib_prefix + '.sorted.unique.bam',
            'output_log': 'novosort.summary.log'
        }).run()
        
        picard_mark_duplicates.generate_cmd({
            'input_file': lib_prefix + '.sorted.unique.bam',
            'output_file': lib_prefix + '.duprm.sorted.unique.bam',
            'metrics_file': lib_prefix + '.markduplicates.metrics.log',
            'tmp_dir': tmp_dir,
            'output_log': 'Picard_MarkDuplicates.summary.log'
        }).run()
        
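        # -F 12 drops reads that are unmapped or whose mate is unmapped (flag bits 4 + 8)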
        (samtools_view.clear_flags().add_flag('-b')
            .add_flag_with_argument('-F', ['12'])
            .add_flag_with_argument('-o', [lib_prefix + '.unmappedrm.duprm.sorted.unique.bam'])
            .generate_cmd({
                'input_file': lib_prefix + '.duprm.sorted.unique.bam'
            }).run()
        )
    
    # Pipeline Step 5
    if step <= 5:
        #####################################################
        # Pipeline step: Remove blacklisted genomic regions
        bedtools_intersect.generate_cmd({
            'input_file': lib_prefix + '.unmappedrm.duprm.sorted.unique.bam',
            'blacklist_bed': '/mnt/cinder/dfitzgeraldSCRATCH/annotation/hg19_blacklisted/hg19-blacklist.bed',
            'output_file': lib_prefix + '.bl.unmappedrm.duprm.sorted.unique.bam'
        }).run()
    
#    bedtools_bamtobed.generate_cmd({
#        'input_file': lib_prefix + '.bl.unmappedrm.duprm.sorted.unique.bam',
#        'output_file': lib_prefix + '.bl.unmappedrm.duprm.sorted.unique.bed'
#    }).run()
    
    # Pipeline Step 6
    if step <= 6:
        # Separate into single-end strands for peak calling, +/- strand for .tdf generation
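        # SAM flag bits: 64 = first in pair, 128 = second in pair,
        # 16 = read on reverse strand, 32 = mate on reverse strand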
        for sam_flag in [['64', 'readgroup1'], ['128', 'readgroup2'], ['16', 'minusstrand'], ['32', 'plusstrand']]:
            samtools_view.clear_flags().add_flag_with_argument('-bf', [sam_flag[0]]).generate_cmd({
                'input_file': lib_prefix + '.bl.unmappedrm.duprm.sorted.unique.bam'
            }, pipe=(
                bedtools_bamtobed.generate_cmd({
                    'input_file': 'stdin',
                    'output_file': lib_prefix + '.' + sam_flag[1] + '.bl.unmappedrm.duprm.sorted.unique.bed'
                })
            )).run()
    
    # Pipeline Step 7
    if step <= 7:
        # Shift bed files for .tdf generation
        for directionality in ['minusstrand', 'plusstrand']:
            with open(lib_prefix + '.' + directionality + '.bl.unmappedrm.duprm.sorted.unique.bed') as in_bed:
                with open(lib_prefix + '.shifted.' + directionality + '.bl.unmappedrm.duprm.sorted.unique.bed', 'w') as out_bed:
                    for line in in_bed:
                        minus_strand = directionality == 'minusstrand'
                        out_bed.write(get_offset_line(line.rstrip('\n').split('\t'), minus_strand))
            
            # Generate .tdf for each +/- strand individually
            igvtools_sort.generate_cmd({
                'tmp_dir': tmp_dir,
                'input_file': lib_prefix + '.shifted.' + directionality + '.bl.unmappedrm.duprm.sorted.unique.bed',
                'output_file': lib_prefix + '.shifted.' + directionality + '.bl.unmappedrm.duprm.igvsorted.unique.bed'
            }).run()
            
            igvtools_count.generate_cmd({
                'input_file': lib_prefix + '.shifted.' + directionality + '.bl.unmappedrm.duprm.igvsorted.unique.bed',
                'output_file': lib_prefix + '.' + directionality + '.tdf',
                'genome_sizes_file': '/mnt/cinder/dfitzgeraldSCRATCH/annotation/hg19_chrom_sizes/hg19.chrom.sizes'
            }).run()
        
        # Combine +/- bed files, generate .tdf from that
        combine_beds = ('cat ' + lib_prefix + '.shifted.minusstrand.bl.unmappedrm.duprm.sorted.unique.bed '
            + lib_prefix + '.shifted.plusstrand.bl.unmappedrm.duprm.sorted.unique.bed >'
            + lib_prefix + '.combined.shifted.bl.unmappedrm.duprm.sorted.unique.bed')
        subprocess.call(combine_beds, shell=True)
        
        bedtools_merge.generate_cmd({
            'input_file': lib_prefix + '.combined.shifted.bl.unmappedrm.duprm.sorted.unique.bed',
            'output_file': lib_prefix + '.merged.shifted.bl.unmappedrm.duprm.sorted.unique.bed'
        }).run()
        
        igvtools_sort.generate_cmd({
            'tmp_dir': tmp_dir,
            'input_file': lib_prefix + '.merged.shifted.bl.unmappedrm.duprm.sorted.unique.bed',
            'output_file': lib_prefix + '.merged.shifted.bl.unmappedrm.duprm.igvsorted.unique.bed'
        }).run()
        
        igvtools_count.generate_cmd({
            'input_file': lib_prefix + '.merged.shifted.bl.unmappedrm.duprm.igvsorted.unique.bed',
            'output_file': lib_prefix + '.merged.tdf',
            'genome_sizes_file': '/mnt/cinder/dfitzgeraldSCRATCH/annotation/hg19_chrom_sizes/hg19.chrom.sizes'
        }).run()
    
    # Pipeline Step 8
    if step <= 8:
        # Peak Calling
        # Step 6 produces per-read-group beds, so iterate over those here
        for read in ['readgroup1', 'readgroup2']:
            #MACS2 Peak Calling
            macs2_output_dir = fastq_dir + 'macs2_' + read + '_output/'
            subprocess.call('mkdir -p ' + macs2_output_dir, shell=True)
            
            macs2_callpeak.generate_cmd({
                'output_dir': macs2_output_dir,
                'input_bed': lib_prefix + '.' + read + '.bl.unmappedrm.duprm.sorted.unique.bed',
                'lib_prefix': lib_prefix,
                'output_log': 'MACS2_callpeak.' + read + '.summary.log'
            }).run()
            
            # HOMER Peak Calling
            homer_tagdir = fastq_dir + 'HOMER_tagdir_' + read + '/'
            
            homer_maketagdirectory.generate_cmd({
                'input_bed': lib_prefix + '.' + read + '.bl.unmappedrm.duprm.sorted.unique.bed',
                'out_tag_directory': homer_tagdir,
                'output_log': 'HOMER_maketagdir.' + read + '.summary.log'
            }).run()
            
            homer_findpeaks.generate_cmd({
                'tag_directory': homer_tagdir,
                'output_log': 'HOMER_findpeaks.' + read + '.summary.log'
            }).run()
            
            homer_pos2bed.generate_cmd({
                'input_txt': homer_tagdir + 'regions.txt',
                'output_bed': homer_tagdir + lib_prefix + '.regions.bed',
                'output_log': 'HOMER_pos2bed.' + read + '.summary.log'
            }).run()
            
            # SICER Peak Calling
#            sicer_output_dir = fastq_dir + 'SICER_' + read + '_output/'
#            subprocess.call('mkdir -p ' + sicer_output_dir, shell=True)
#            sicer_rb.generate_cmd({
#                'input_dir': fastq_dir,
#                'input_bed': lib_prefix + '.' + read + '.bl.unmappedrm.duprm.sorted.unique.bed',
#                'output_dir': sicer_output_dir,
#                'output_log': 'SICER.' + read + '.summary.log'
#            }).run()
    
    
    ###################################################
    # Pipeline step: Generate .tdf file with IGVTools
#    igvtools_sort.generate_cmd({
#        'tmp_dir': tmp_dir,
#        'input_file': lib_prefix + '.bl.unmappedrm.duprm.sorted.unique.bed',
#        'output_file': lib_prefix + '.bl.unmappedrm.duprm.igvsorted.unique.bed'
#    }).run()
#    
#    igvtools_count.generate_cmd({
#        'input_file': lib_prefix + '.bl.unmappedrm.duprm.igvsorted.unique.bed',
#        'output_file': lib_prefix + '.tdf',
#        'genome_sizes_file': '/mnt/cinder/dfitzgeraldSCRATCH/annotation/hg19_chrom_sizes/hg19.chrom.sizes'
#    }).run()
#    
#    ## Start processing files for peak calling
#    for partial_bed in ['99', '147', '83', '163']:
#        samtools_view.clear_flags().add_flag_with_argument('-bf', [partial_bed]).generate_cmd({
#            'input_file': lib_prefix + '.bl.unmappedrm.duprm.sorted.unique.bam'
#        }, pipe=(
#            bedtools_bamtobed.generate_cmd({
#                'input_file': 'stdin',
#                'output_file': lib_prefix + '.' + partial_bed + '.bl.unmappedrm.duprm.sorted.unique.bed'
#            })
#        )).run()
#    
#    script_offset_ATACseq.generate_cmd({
#        'input_beds_prefix': lib_prefix
#    }).run()
    
    ###### Peak Calling #########
#    fseq_output_dir = fastq_dir + 'fseq_output/'
#    subprocess.call('mkdir -p ' + fseq_output_dir, shell=True)
#    macs2_output_dir = fastq_dir + 'macs2_output/'
#    subprocess.call('mkdir -p ' + macs2_output_dir, shell=True)
    
    # Pipeline step: Peak calling with Fseq
    
    # Pipeline step: Peak calling with MACS2
#    macs2_callpeak.generate_cmd({
#        'output_dir': macs2_output_dir,
#        'input_bed': lib_prefix + '.adjusted.bl.unmappedrm.duprm.sorted.unique.bed',
#        'lib_prefix': lib_prefix
#    }).run()
    
    # Clean up a bit
#    files_to_remove = (['*.clipped.fastq.gz', '*.txt.gz', '*.sai',
#        lib_prefix + '.83.bl.unmappedrm.sorted.bed', lib_prefix + '.99.bl.unmappedrm.sorted.bed',
#        lib_prefix + '.147.bl.unmappedrm.sorted.bed', lib_prefix + '.163.bl.unmappedrm.sorted.bed'])
#    for file_to_rm in files_to_remove:
#        subprocess.call('rm -rf ' + fastq_dir + file_to_rm, shell=True)
    
    
    
    
    
    
    
#    script_recover_fragments.generate_cmd({
#        'input_file':lib_prefix + '.12.sam',
#        'output_file':lib_prefix + '.frag.bed'
#    }).run()
    
    # TODO Run perl script to get alignment stats
    # TODO Run perl script to recover fragments
    
    # Pipeline step: re-gzip files
#    for fastq in fastq_files_prefix:
#        gzip.generate_cmd({}, {
#            'input_file':fastq + '.clipped.fastq'
#        }).run()
    
    
    # Run fastx_clipper
#    Gzip().generate_cmd({}, {
#        'input_file':fastq_files[0]
#    }).run()
#    FastxClipper().generate_cmd({
#        'input_file':'input.fastq.1',
#        'output_file':'input.clipped.fastq.1'
#    }, {}).run()
#    Gunzip().generate_cmd({}, {'input_file':'input.clipped.fastq.1'}).run()
    
    
    # TODO Check to make sure we're in the correct directory
    log.info('ATACseq pipeline ran successfully')
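
Both examples shift BED intervals through a `get_offset_line` helper that is not part of this listing. A minimal sketch of what it plausibly does is given below, assuming tab-separated BED fields (chrom, start, end, ...) and the conventional ATAC-seq Tn5 correction of +4 bp on the plus strand and -5 bp on the minus strand; the offsets and field layout are assumptions, not taken from the source.

# Hypothetical sketch of get_offset_line, matching the call sites above:
#     get_offset_line(line.rstrip('\n').split('\t'), minus_strand)
def get_offset_line(fields, minus_strand):
    # fields: a BED line split on tabs; minus_strand: bool from the caller
    start, end = int(fields[1]), int(fields[2])
    if minus_strand:
        # Assumed Tn5 correction: shift -5 bp on the minus strand
        start, end = max(0, start - 5), max(0, end - 5)
    else:
        # Assumed Tn5 correction: shift +4 bp on the plus strand
        start, end = start + 4, end + 4
    return '\t'.join([fields[0], str(start), str(end)] + fields[3:]) + '\n'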
Example #2
def run_ATACseq_pipeline(software_config_path, fastq_files, fastq_dir, parent_syn_id):
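    # NOTE: single-end variant of the pipeline in Example #1; assumes the same
    # module-level names (log, subprocess, get_offset_line, wrapper classes).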
    log.info('Running ATACseq pipeline')
    
    # Preprocess fastq filenames
    fastq_files_prefix = [fastq_file.split('.')[0] for fastq_file in fastq_files]
    # Get the library prefix
    lib_prefix = '_'.join(fastq_files_prefix[0].split('_')[:3])
    
    # Inject SoftwareConfigService
    PipelineSoftwareBase.set_software_config_service(SoftwareConfigService(software_config_path))
    
    # Create software instances
    fastqc = FastQC()
    bwa_aln = BwaAln()
    samtools_view = SamtoolsView()
    samtools_flagstat = SamtoolsFlagstat()
    
    novosort = Software('novosort')
    bedtools_intersect = Software('bedtools_intersect')
    bedtools_bamtobed = Software('bedtools_bamtobed')
    bedtools_merge = Software('bedtools_merge')
    igvtools_sort = Software('igvtools_sort')
    igvtools_count = Software('igvtools_count')
    script_offset_ATACseq = Software('script_offset_ATACseq')
    macs2_callpeak = Software('MACS2_callpeak')
    picard_mark_duplicates = Software('picard_MarkDuplicates')
    homer_findpeaks = Software('HOMER_findPeaks')
    homer_maketagdirectory = Software('HOMER_makeTagDirectory')
    homer_pos2bed = Software('HOMER_pos2bed')
    sicer_rb = Software('SICER_rb')
    cutadapt = Software('cutadapt_se')
    bwa_samse = Software('bwa_samse')
    
    # Make temporary directory
    tmp_dir = fastq_dir + 'tmp/'
    subprocess.call('mkdir -p ' + tmp_dir, shell=True)
    
    # Make directory for FastQC output
    fastqc_output_dir = fastq_dir + 'fastqc_output/'
    subprocess.call('mkdir -p ' + fastqc_output_dir, shell=True)
    
#    syn_file_raw_fastqs = []
#    for i, fastq in enumerate(fastq_files_prefix):
#        syn_fastq = File(
#            path='',
#            name='Raw ATACseq fastq ' + str(i),
#            parent=parent_syn_id
#        )
        
    ###########################
    # Pipeline step: cutadapt
    # Input files here are already gzipped .fastq
    
    # This is the single-end run of cutadapt (cutadapt_se), once per fastq file
    log.info('Running single-end cutadapt')
    for fastq in fastq_files_prefix:
        cutadapt.generate_cmd({
            'output_file1': fastq + '.clipped.fastq.gz',
            'min_quality_score': '30',
            'quality_base': '33',
            'input_file1': fastq + '.fastq.gz',
            'summary_file': fastq + '.cutadapt.summary.log'
        }).run()
        
        fastqc.generate_cmd({
            'out_dir': fastqc_output_dir,
            'input_file': fastq + '.clipped.fastq.gz'
        }).run()
        
        bwa_aln.generate_cmd({
            'input_file': fastq + '.clipped.fastq.gz',
            'output_file': fastq + '.sai',
            'output_log': 'bwa_aln.summary.log'
        }).run()
    
        bwa_samse.generate_cmd({
            'sai_1': fastq + '.sai',
            'fastq_1': fastq + '.clipped.fastq.gz',
            'output_log': 'bwa_samse.summary.log'
        }, pipe=(
            samtools_view.generate_cmd({
                'input_file': '-',
                'output_file': fastq + '.bam'
            })
        )).run()
        
        samtools_flagstat.generate_cmd({
            'input_file': fastq + '.bam',
            'output_file': fastq + '.bam.flagstat'
        }).run()
    
    flagstats_output_dir = fastq_dir + 'flagstats/'
    subprocess.call('mkdir -p ' + flagstats_output_dir, shell=True)
    subprocess.call('mv *.flagstat ' + flagstats_output_dir, shell=True)
    
    novosort.generate_cmd({
        'tmp_dir': tmp_dir,
        'input_files': ' '.join([fastq + '.bam' for fastq in fastq_files_prefix]),
        'output_file': lib_prefix + '.sorted.bam',
        'output_log': 'novosort.summary.log'
    }).run()
    
    # Proceed with only uniquely mapped reads
    (samtools_view.clear_flags().add_flag('-b')
        .add_flag_with_argument('-F', ['256'])
        .add_flag_with_argument('-q', ['10'])
        .add_flag_with_argument('-o', [lib_prefix + '.sorted.unique.bam'])
        .generate_cmd({
            'input_file': lib_prefix + '.sorted.bam'
        }).run()
    )
    
    picard_mark_duplicates.generate_cmd({
        'input_file': lib_prefix + '.sorted.unique.bam',
        'output_file': lib_prefix + '.duprm.sorted.unique.bam',
        'metrics_file': lib_prefix + '.markduplicates.metrics.log',
        'tmp_dir': tmp_dir,
        'output_log': 'Picard_MarkDuplicates.summary.log'
    }).run()
    
    (samtools_view.clear_flags().add_flag('-b')
        .add_flag_with_argument('-F', ['12'])
        .add_flag_with_argument('-o', [lib_prefix + '.unmappedrm.duprm.sorted.unique.bam'])
        .generate_cmd({
            'input_file': lib_prefix + '.duprm.sorted.unique.bam'
        }).run()
    )
    
    ####################################
    # Pipeline step: recover fragments
    # Whatever that actually means
    
    #####################################################
    # Pipeline step: Generate genome coverage bed files
    
    #####################################################
    # Pipeline step: Remove blacklisted genomic regions
    bedtools_intersect.generate_cmd({
        'input_file': lib_prefix + '.unmappedrm.duprm.sorted.unique.bam',
        'blacklist_bed': '/mnt/cinder/dfitzgeraldSCRATCH/annotation/hg19_blacklisted/hg19-blacklist.bed',
        'output_file': lib_prefix + '.bl.unmappedrm.duprm.sorted.unique.bam'
    }).run()
    
    bedtools_bamtobed.generate_cmd({
        'input_file': lib_prefix + '.bl.unmappedrm.duprm.sorted.unique.bam',
        'output_file': lib_prefix + '.bl.unmappedrm.duprm.sorted.unique.bed'
    }).run()
    
    # Separate into single-end strands for peak calling, +/- strand for .tdf generation
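    # SAM flag bits: 16 = read on reverse strand, 32 = mate on reverse strand;
    # '-bf 0' applies no flag filter, so 'singleend' converts the whole BAM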
    for sam_flag in [['0', 'singleend'], ['16', 'minusstrand'], ['32', 'plusstrand']]:
        samtools_view.clear_flags().add_flag_with_argument('-bf', [sam_flag[0]]).generate_cmd({
            'input_file': lib_prefix + '.bl.unmappedrm.duprm.sorted.unique.bam'
        }, pipe=(
            bedtools_bamtobed.generate_cmd({
                'input_file': 'stdin',
                'output_file': lib_prefix + '.' + sam_flag[1] + '.bl.unmappedrm.duprm.sorted.unique.bed'
            })
        )).run()
    
    # Shift bed files for .tdf generation
    for directionality in ['minusstrand', 'plusstrand']:
        with open(lib_prefix + '.' + directionality + '.bl.unmappedrm.duprm.sorted.unique.bed') as in_bed:
            with open(lib_prefix + '.shifted.' + directionality + '.bl.unmappedrm.duprm.sorted.unique.bed', 'w') as out_bed:
                for line in in_bed:
                    minus_strand = directionality == 'minusstrand'
                    out_bed.write(get_offset_line(line.rstrip('\n').split('\t'), minus_strand))
        
        # Generate .tdf for each +/- strand individually
        igvtools_sort.generate_cmd({
            'tmp_dir': tmp_dir,
            'input_file': lib_prefix + '.shifted.' + directionality + '.bl.unmappedrm.duprm.sorted.unique.bed',
            'output_file': lib_prefix + '.shifted.' + directionality + '.bl.unmappedrm.duprm.igvsorted.unique.bed'
        }).run()
        
        igvtools_count.generate_cmd({
            'input_file': lib_prefix + '.shifted.' + directionality + '.bl.unmappedrm.duprm.igvsorted.unique.bed',
            'output_file': lib_prefix + '.' + directionality + '.tdf',
            'genome_sizes_file': '/mnt/cinder/dfitzgeraldSCRATCH/annotation/hg19_chrom_sizes/hg19.chrom.sizes'
        }).run()
    
    # Combine +/- bed files, generate .tdf from that
    combine_beds = ('cat ' + lib_prefix + '.shifted.minusstrand.bl.unmappedrm.duprm.sorted.unique.bed '
        + lib_prefix + '.shifted.plusstrand.bl.unmappedrm.duprm.sorted.unique.bed >'
        + lib_prefix + '.combined.shifted.bl.unmappedrm.duprm.sorted.unique.bed')
    subprocess.call(combine_beds, shell=True)
    
    bedtools_merge.generate_cmd({
        'input_file': lib_prefix + '.combined.shifted.bl.unmappedrm.duprm.sorted.unique.bed',
        'output_file': lib_prefix + '.merged.shifted.bl.unmappedrm.duprm.sorted.unique.bed'
    }).run()
    
    igvtools_sort.generate_cmd({
        'tmp_dir': tmp_dir,
        'input_file': lib_prefix + '.merged.shifted.bl.unmappedrm.duprm.sorted.unique.bed',
        'output_file': lib_prefix + '.merged.shifted.bl.unmappedrm.duprm.igvsorted.unique.bed'
    }).run()
    
    igvtools_count.generate_cmd({
        'input_file': lib_prefix + '.merged.shifted.bl.unmappedrm.duprm.igvsorted.unique.bed',
        'output_file': lib_prefix + '.merged.tdf',
        'genome_sizes_file': '/mnt/cinder/dfitzgeraldSCRATCH/annotation/hg19_chrom_sizes/hg19.chrom.sizes'
    }).run()
    
    # Peak Calling
    for read_group in ["singleend"]:
        #MACS2 Peak Calling
        macs2_output_dir = fastq_dir + 'macs2_' + read_group + '_output/'
        subprocess.call('mkdir -p ' + macs2_output_dir, shell=True)
        
        macs2_callpeak.generate_cmd({
            'output_dir': macs2_output_dir,
            'input_bed': lib_prefix + '.' + read_group + '.bl.unmappedrm.duprm.sorted.unique.bed',
            'lib_prefix': lib_prefix,
            'output_log': 'MACS2_callpeak.summary.log'
        }).run()
        
        # HOMER Peak Calling
        homer_tagdir = fastq_dir + 'HOMER_tagdir_' + read_group + '/'
        
        homer_maketagdirectory.generate_cmd({
            'input_bed': lib_prefix + '.' + read_group + '.bl.unmappedrm.duprm.sorted.unique.bed',
            'out_tag_directory': homer_tagdir,
            'output_log': 'HOMER_maketagdir.summary.log'
        }).run()
        
        homer_findpeaks.generate_cmd({
            'tag_directory': homer_tagdir,
            'output_log': 'HOMER_findpeaks.summary.log'
        }).run()
        
        homer_pos2bed.generate_cmd({
            'input_txt': homer_tagdir + 'regions.txt',
            'output_bed': homer_tagdir + lib_prefix + '.regions.bed',
            'output_log': 'HOMER_pos2bed.summary.log'
        }).run()
        
        # SICER Peak Calling
        sicer_output_dir = fastq_dir + 'SICER_' + read_group + '_output/'
        subprocess.call('mkdir -p ' + sicer_output_dir, shell=True)
        sicer_rb.generate_cmd({
            'input_dir': fastq_dir,
            'input_bed': lib_prefix + '.' + read_group + '.bl.unmappedrm.duprm.sorted.unique.bed',
            'output_dir': sicer_output_dir,
            'output_log': 'SICER.summary.log'
        }).run()
    
    
    ###################################################
    # Pipeline step: Generate .tdf file with IGVTools
#    igvtools_sort.generate_cmd({
#        'tmp_dir': tmp_dir,
#        'input_file': lib_prefix + '.bl.unmappedrm.duprm.sorted.unique.bed',
#        'output_file': lib_prefix + '.bl.unmappedrm.duprm.igvsorted.unique.bed'
#    }).run()
#    
#    igvtools_count.generate_cmd({
#        'input_file': lib_prefix + '.bl.unmappedrm.duprm.igvsorted.unique.bed',
#        'output_file': lib_prefix + '.tdf',
#        'genome_sizes_file': '/mnt/cinder/dfitzgeraldSCRATCH/annotation/hg19_chrom_sizes/hg19.chrom.sizes'
#    }).run()
#    
#    ## Start processing files for peak calling
#    for partial_bed in ['99', '147', '83', '163']:
#        samtools_view.clear_flags().add_flag_with_argument('-bf', [partial_bed]).generate_cmd({
#            'input_file': lib_prefix + '.bl.unmappedrm.duprm.sorted.unique.bam'
#        }, pipe=(
#            bedtools_bamtobed.generate_cmd({
#                'input_file': 'stdin',
#                'output_file': lib_prefix + '.' + partial_bed + '.bl.unmappedrm.duprm.sorted.unique.bed'
#            })
#        )).run()
#    
#    script_offset_ATACseq.generate_cmd({
#        'input_beds_prefix': lib_prefix
#    }).run()
    
    ###### Peak Calling #########
#    fseq_output_dir = fastq_dir + 'fseq_output/'
#    subprocess.call('mkdir -p ' + fseq_output_dir, shell=True)
#    macs2_output_dir = fastq_dir + 'macs2_output/'
#    subprocess.call('mkdir -p ' + macs2_output_dir, shell=True)
    
    # Pipeline step: Peak calling with Fseq
    
    # Pipeline step: Peak calling with MACS2
#    macs2_callpeak.generate_cmd({
#        'output_dir': macs2_output_dir,
#        'input_bed': lib_prefix + '.adjusted.bl.unmappedrm.duprm.sorted.unique.bed',
#        'lib_prefix': lib_prefix
#    }).run()
    
    # Clean up a bit
#    files_to_remove = (['*.clipped.fastq.gz', '*.txt.gz', '*.sai',
#        lib_prefix + '.83.bl.unmappedrm.sorted.bed', lib_prefix + '.99.bl.unmappedrm.sorted.bed',
#        lib_prefix + '.147.bl.unmappedrm.sorted.bed', lib_prefix + '.163.bl.unmappedrm.sorted.bed'])
#    for file_to_rm in files_to_remove:
#        subprocess.call('rm -rf ' + fastq_dir + file_to_rm, shell=True)
    
    
    
    
    
    
    
#    script_recover_fragments.generate_cmd({
#        'input_file':lib_prefix + '.12.sam',
#        'output_file':lib_prefix + '.frag.bed'
#    }).run()
    
    # TODO Run perl script to get alignment stats
    # TODO Run perl script to recover fragments
    
    # Pipeline step: re-gzip files
#    for fastq in fastq_files_prefix:
#        gzip.generate_cmd({}, {
#            'input_file':fastq + '.clipped.fastq'
#        }).run()
    
    
    # Run fastx_clipper
#    Gzip().generate_cmd({}, {
#        'input_file':fastq_files[0]
#    }).run()
#    FastxClipper().generate_cmd({
#        'input_file':'input.fastq.1',
#        'output_file':'input.clipped.fastq.1'
#    }, {}).run()
#    Gunzip().generate_cmd({}, {'input_file':'input.clipped.fastq.1'}).run()
    
    
    # TODO Check to make sure we're in the correct directory
    log.info('ATACseq pipeline ran successfully')
Example #3
    def __init__(self):
        PipelineSoftwareBase.__init__(self, 'script_recover_fragments')
Example #4
    def __init__(self):
        PipelineSoftwareBase.__init__(self, 'cutadapt_pe')
Example #5
    def __init__(self):
        PipelineSoftwareBase.__init__(self, 'picard_MarkDuplicates')
Example #6
    def __init__(self):
        PipelineSoftwareBase.__init__(self, 'IGVTools')
Example #7
    def __init__(self):
        PipelineSoftwareBase.__init__(self, 'MACS2')
Example #8
    def __init__(self):
        PipelineSoftwareBase.__init__(self, 'bedtools')
Example #9
    def __init__(self):
        PipelineSoftwareBase.__init__(self, 'samtools_flagstat')
Example #10
    def __init__(self):
        PipelineSoftwareBase.__init__(self, 'FSeq')
Example #11
    def __init__(self):
        PipelineSoftwareBase.__init__(self, 'samtools_view')
Example #12
    def __init__(self):
        PipelineSoftwareBase.__init__(self, 'bwa_sampe')
Example #13
    def __init__(self):
        PipelineSoftwareBase.__init__(self, 'bwa_aln')
Example #14
    def __init__(self):
        PipelineSoftwareBase.__init__(self, 'fastqc')
Example #15
    def __init__(self, name):
        PipelineSoftwareBase.__init__(self, name)
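
Examples #3-#15 are one-line constructors; all of their behavior comes from `PipelineSoftwareBase`, which is not included in this listing. The sketch below is a hypothetical reconstruction of the interface implied by the call sites above (`set_software_config_service`, `generate_cmd(params, pipe=...)`, `run`, `clear_flags`, `add_flag`, `add_flag_with_argument`, `get_path`); the `SoftwareConfigService` methods and the `{placeholder}` command-template format are assumptions, not taken from the source.

# Hypothetical reconstruction of PipelineSoftwareBase, inferred from usage.
import subprocess

class PipelineSoftwareBase(object):
    _config_service = None  # injected once via set_software_config_service()

    def __init__(self, name):
        self.name = name    # key into the software config
        self.flags = []     # extra flags built up via the chainable setters
        self.cmd = None     # last command string built by generate_cmd()

    @staticmethod
    def set_software_config_service(service):
        PipelineSoftwareBase._config_service = service

    def get_path(self):
        # Assumed config lookup: software name -> executable path
        return PipelineSoftwareBase._config_service.get_path(self.name)

    def clear_flags(self):
        self.flags = []
        return self

    def add_flag(self, flag):
        self.flags.append(flag)
        return self

    def add_flag_with_argument(self, flag, arguments):
        self.flags.append(flag + ' ' + ' '.join(arguments))
        return self

    def generate_cmd(self, params, pipe=None):
        # Assumed: the config service stores a per-software command template
        # with {placeholder} fields, e.g. '{input_file} -o {output_file}'
        template = PipelineSoftwareBase._config_service.get_cmd_template(self.name)
        self.cmd = ' '.join([self.get_path()] + self.flags + [template.format(**params)])
        if pipe is not None:
            self.cmd += ' | ' + pipe.cmd  # chain into the piped command
        return self

    def run(self):
        subprocess.call(self.cmd, shell=True)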