def remove_aberrant_reads(self, threads=1):
        if not os.path.exists(self.raw_bam):
            raise Exception("raw BAM file does not exist")

        job = jr.JobRunner()

        tmp_bam = "%s/%s.tmp.bam" %(self.output_dir, self.prefix)
        tmp_clean_bam = "%s/%s.tmp.clean.bam" %(self.output_dir, self.prefix)

        if self.is_paired:
            # Filter unmapped and low quality (MAPQ<20) reads
            command = "samtools view -@ %s -F1804 -f2 -q 20 -h" %(threads)\
                       + "-u %s | " %(self.raw_bam)\
                       + "samtools sort -@ %s -o %s -n -" %(threads, tmp_bam)
            job.append([[ command ]])
            job.run()

            # Filter orphan reads (pair was removed), and read pairs
            # mapping to different chromosomes
            command = "samtools fixmate -r %s %s" %(tmp_bam, tmp_clean_bam)
            job.append([[ command ]])
            job.run()
            command = "samtools view -@ %s -F1804 -f2 -u %s | " %(threads, tmp_clean_bam)\
                       + "samtools sort -@ %s -o %s -" %(threads, self.flt_bam)
            job.append([[ command ]])
            job.run()

            job.append([[ "rm %s %s" %(tmp_bam, tmp_clean_bam) ]])
            job.run()
        else:
            # Filter unmapped and low quality (MAPQ<20) reads
            command = "samtools view -@ %s -F1804 -q 20 " %(threads)\
                      + "-u %s -o %s" %(self.raw_bam, self.flt_bam)
            job.append([[ command ]])
            job.run()
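
The -F1804 filter drops reads carrying any of the flags unmapped, mate-unmapped, secondary, QC-fail, or duplicate, and -f2 keeps properly paired reads only. As a sanity check, the mask decomposes into standard SAM flag bits (this is the SAM spec, not pipeline-specific code):

# SAM flag bits excluded by "samtools view -F1804"
UNMAPPED, MATE_UNMAPPED = 0x4, 0x8
SECONDARY, QC_FAIL, DUPLICATE = 0x100, 0x200, 0x400
assert UNMAPPED | MATE_UNMAPPED | SECONDARY | QC_FAIL | DUPLICATE == 1804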
Example #2
def peak_oracle(inputs):
    insert = 0
    if inputs['fraglen'] > 0:
        insert = inputs['fraglen']
    else:
        job = jr.JobRunner()
        for prefix in inputs['prefix_inputs']:
            if inputs['spp']:
                job.append([[
                    "cat %s/%s/bwa_out/*.cc.qc | cut -f3" %
                    (inputs['outdir'], prefix)
                ]])
                insert = insert + int(job.run()[0].rstrip())
            else:
                job.append([[
                    "grep -A2 \"## METRICS CLASS\" %s/%s/bwa_out/InsertSizeMetrics.txt | tail -1 | cut -f1"
                    % (inputs['outdir'], prefix)
                ]])
                insert = insert + int(job.run()[0].rstrip())
        insert = insert // len(inputs['prefix_inputs'])  # integer mean across libraries

    ### PEAK
    run_peak = pc.Macs2PeakCaller(
        "%s/oracle_peaks/pooled_inputs.bed.gz" % (inputs['outdir']),
        "%s/oracle_peaks/pooled_controls.bed.gz" % (inputs['outdir']), insert,
        "BEDPE", "%s/oracle_peaks/macs_output" % (inputs['outdir']),
        inputs['genome'], "hs", inputs['macs2'], inputs['common'])
    run_peak.run()
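
Both branches reduce to "read one number per library, then average". For the Picard branch, a pure-Python sketch of the grep/tail/cut chain above (it returns the first column of the line two below the "## METRICS CLASS" header, matching grep -A2 | tail -1 | cut -f1):

def picard_first_metric(path):
    # Equivalent of: grep -A2 "## METRICS CLASS" FILE | tail -1 | cut -f1
    with open(path) as fh:
        lines = fh.read().splitlines()
    for i, line in enumerate(lines):
        if line.startswith("## METRICS CLASS"):
            return lines[i + 2].split("\t")[0]
    raise ValueError("no METRICS CLASS section in %s" % path)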
Example #3

    def remove_badcigar(self, threads=1):
        job = jr.JobRunner()
        peek = []

        if self.is_paired:
            # Find reads with bad CIGAR strings
            badcigar = "%s/%s.badcigar" %(self.output_dir, self.prefix)
            command = """zcat %s | \
                         awk 'BEGIN{FS="\t"; OFS="\t"}
                              !/^@/ && $6!="*" {
                              cigar=$6; gsub("[0-9]+D","",cigar);
                              n=split(cigar, vals, "[A-Z]");
                              s=0; for(i=1;i<=n;i++) s=s+vals[i];
                              seqlen=length($10);
                              if(s!=seqlen) print $1"\t";}' | \
                         sort | uniq > %s""" %(self.raw_sam, badcigar)
            job.append([[ command ]])
            job.run()

            with open(badcigar) as fp:
                peek = [x for i,x in enumerate(fp) if i<10 and x.rstrip()]

        # Remove the reads with bad CIGAR strings
        if len(peek):
            command = "zcat %s | grep -vF -f %s | " %(self.raw_sam, badcigar)\
                       + "samtools view -@ %s -Su - | " %(threads) \
                       + "samtools sort -@ %s -o %s -" %(threads, self.raw_bam)
        else:
            command = "samtools view -@ %s -Su %s | " %(threads, self.raw_sam)\
                       + "samtools sort -@ %s -o %s -" %(threads, self.raw_bam)
        job.append([[ command ]])
        job.run()

        job.append([[ "rm %s" %(badcigar) ]])
        job.run()
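
For reference, a pure-Python rendering of the awk test above, mirroring its logic rather than full CIGAR semantics: drop deletions (which consume reference, not read bases), sum the remaining operation lengths, and flag the read if the sum disagrees with the sequence length.

import re

def has_bad_cigar(cigar, seq):
    # Mirror the awk: strip "<n>D" operations, then sum every remaining
    # operation length and compare with the read length.
    if cigar == "*":
        return False
    trimmed = re.sub(r"[0-9]+D", "", cigar)
    oplen = sum(int(n) for n in re.findall(r"[0-9]+", trimmed))
    return oplen != len(seq)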
Example #4

    def generate_bed(self, threads=1):
        if not os.path.exists(self.final_bam):
            raise Exception("final BAM file does not exist")

        job = jr.JobRunner()

        if self.is_paired:
            tmp_bam = "%s/%s.tmp.bam" %(self.output_dir, self.prefix)
            command = "samtools sort -@ %s -n -o %s %s" %(threads, tmp_bam, self.final_bam)
            job.append([[ command ]])
            job.run()

            command = "bedtools bamtobed -bedpe -mate1 -i %s | " %(tmp_bam)\
                       + "gzip -c > %s" %(self.bedpe)
            job.append([[ command ]])
            job.run()
            command = """zcat %s | \
                         awk 'BEGIN{OFS="\t"; FS="\t"}
                              { chrom=$1; beg=$2; end=$6;
                                if($2>$5){beg=$5} if($3>$6){end=$3}
                                print chrom,beg,end
                              }' - | sort --parallel=%s -k1,1 -k2,2n | \
                         gzip -c > %s""" %(self.bedpe, threads, self.bed)
            job.append([[ command ]])
            job.run()

            job.append([[ "rm %s" %(tmp_bam) ]])
            job.run()
        else:
            command = "bedtools bamtobed -i %s | cut -f1-3 | " %(self.final_bam)\
                      + "sort --parallel=%s -k1,1 -k2,2n | " %(threads)\
                      + "gzip -c > %s" %(self.bed)
Example #5
    def cross_correlation(self, bedpe, phantompeak_script, cpu_num):
        if not os.path.exists(bedpe):
            raise Exception("BEDPE file does not exist")

        job = jr.JobRunner()

        prefix = bedpe.split(".bedpe.")[0]
        subsample_tags = prefix + ".subsample.tagAlign.gz"

        cc_plot = prefix + "_crossCorrPlot.QC.pdf"
        cc_score = prefix + "_crossCorrScore.QC.txt"

        command = """zcat %s | grep -v 'chrM' | shuf -n 15000000 | \
                     awk 'BEGIN{OFS="\t"}{print $1,$2,$3,"N","1000",$9}' | \
                     gzip -c > %s""" % (bedpe, subsample_tags)
        job.append([[command]])
        job.run()

        command = "Rscript %s " %(phantompeak_script)\
                   + "-c=%s -p=%s -filtchr=chrM " %(subsample_tags, cpu_num)\
                   + "-savp=%s -out=%s" %(cc_plot, cc_score)
        job.append([[command]])
        job.run()

        command = """sed -r 's/,[^\t]+//g' %s | \
                     awk -F ".subsample.tagAlign.gz" '{print $1".bam"$2}' \
                     >> %s.tmp""" % (cc_score, cc_score)
        job.append([[command]])
        job.run()

        job.append([["mv %s.tmp %s" % (cc_score, cc_score)]])
        job.append([["rm %s" % (subsample_tags)]])
        job.run()
Example #6

    def run(self):
        job = jr.JobRunner()
        # narrowPeak
        np_oracle = "%s/%s/macs_output/%s/%s.narrowPeak.gz" % (
            self.out, self.input_file, self.input_file, self.input_file)
        np_psr1 = "%s/%s/macs_output/psr_%s.00/psr_%s.00.narrowPeak.gz" % (
            self.out, self.input_file, self.input_file, self.input_file)
        np_psr2 = "%s/%s/macs_output/psr_%s.01/psr_%s.01.narrowPeak.gz" % (
            self.out, self.input_file, self.input_file, self.input_file)

        # broadPeak
        bp_oracle = "%s/%s/macs_output/%s/%s.broadPeak.gz" % (
            self.out, self.input_file, self.input_file, self.input_file)
        bp_psr1 = "%s/%s/macs_output/psr_%s.00/psr_%s.00.broadPeak.gz" % (
            self.out, self.input_file, self.input_file, self.input_file)
        bp_psr2 = "%s/%s/macs_output/psr_%s.01/psr_%s.01.broadPeak.gz" % (
            self.out, self.input_file, self.input_file, self.input_file)

        # gappedPeak
        gp_oracle = "%s/%s/macs_output/%s/%s.gappedPeak.gz" % (
            self.out, self.input_file, self.input_file, self.input_file)
        gp_psr1 = "%s/%s/macs_output/psr_%s.00/psr_%s.00.gappedPeak.gz" % (
            self.out, self.input_file, self.input_file, self.input_file)
        gp_psr2 = "%s/%s/macs_output/psr_%s.01/psr_%s.01.gappedPeak.gz" % (
            self.out, self.input_file, self.input_file, self.input_file)

        if not os.path.isdir("%s/gappedPeaks/" % (self.out)):
            os.mkdir("%s/gappedPeaks/" % (self.out))

        if not os.path.isdir("%s/broadPeaks/" % (self.out)):
            os.mkdir("%s/broadPeaks/" % (self.out))

        if not os.path.isdir("%s/narrowPeaks/" % (self.out)):
            os.mkdir("%s/narrowPeaks/" % (self.out))


        job.append([["bedtools intersect \
                     -a %s -b %s -f 0.50 -F 0.50 -e |\
                     bedtools intersect \
                     -a stdin -b %s -f 0.50 -F 0.50 -e -u> \
                     %s/narrowPeaks/%s.final.narrowPeak.gz"\
                     %(np_oracle,np_psr1,np_psr2, \
                       self.out, self.input_file)]])

        job.append([["bedtools intersect \
                     -a %s -b %s -f 0.50 -F 0.50 -e |\
                     bedtools intersect \
                     -a stdin -b %s -f 0.50 -F 0.50 -e -u > \
                     %s/broadPeaks/%s.final.broadPeak.gz"\
                     %(bp_oracle,bp_psr1,np_psr2, \
                       self.out, self.input_file)]])
        job.append([["bedtools intersect \
                     -a %s -b %s -f 0.50 -F 0.50 -e |\
                     bedtools intersect \
                     -a stdin -b %s -f 0.50 -F 0.50 -e -u > \
                     %s/gappedPeaks/%s.final.gappedPeak.gz"\
                     %(gp_oracle,gp_psr1,np_psr2, \
                       self.out, self.input_file)]])
        job.run()
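
Each bedtools intersect keeps an oracle peak only when it overlaps a pseudoreplicate peak reciprocally: with -f 0.50 -F 0.50 -e, the overlap must cover at least half of either interval. A sketch of that predicate for one interval pair (my reading of the bedtools flags, not pipeline code):

def passes_either_overlap(a, b, f=0.5, F=0.5):
    # a, b: (start, end) intervals on the same chromosome.
    # -e keeps the pair if overlap >= f*len(a) OR overlap >= F*len(b).
    ov = max(0, min(a[1], b[1]) - max(a[0], b[0]))
    return ov >= f * (a[1] - a[0]) or ov >= F * (b[1] - b[0])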
Example #7
    def run(self):
        job = jr.JobRunner()

        # Shuffle input
        print("Shuffle input")
        shuf_out = os.path.join(self.out, "shuf_%s.bedpe" % (self.prefix))
        job.append([[
            "zcat %s | shuf --random-source=%s > %s" %
            (self.input_file, self.input_file, shuf_out)
        ]])
        job.run()

        # Split into two files
        print("split in two")
        psr_prefix = os.path.join(self.out, "psr_%s." % (self.prefix))
        job.append([[
            "split -d -nl/2 --additional-suffix=\".bedpe\"\
                      %s %s" % (shuf_out, psr_prefix)
        ]])
        job.run()

        # TODO core to be set according to input parameters
        print("sort")
        job = jr.JobRunner()
        job.append([[
            "sort --parallel=%s -S 2G \
                     -k1,1 -k2,2n %s00.bedpe | gzip -c > %s00.bedpe.gz" %
            (self.cpus, psr_prefix, psr_prefix)
        ]])
        job.append([[
            "sort --parallel=%s -S 2G \
                     -k1,1 -k2,2n %s01.bedpe | gzip -c > %s01.bedpe.gz" %
            (self.cpus, psr_prefix, psr_prefix)
        ]])
        #job.append([["zcat %s | sort --parallel=%s -S 2G \
        #             -k1,1 -k2,2n -o %s && gzip -c %s"
        #             %(self.input_file, self.cpus, self.input_file, self.input_file )]])
        job.run()

        # Clean
        job.append([[
            "rm %s00.bedpe %s01.bedpe %s" % (psr_prefix, psr_prefix, shuf_out)
        ]])
        job.run()
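
shuf --random-source=FILE makes the shuffle deterministic: reusing the input itself as the entropy source means reruns yield the same two pseudoreplicates. The same idea in pure Python, as a sketch (seeded shuffle, then an even line split):

import gzip
import random

def pseudoreplicates(bedpe_gz, seed=0):
    # Deterministic shuffle + split into halves, analogous to
    # "shuf --random-source=... | split -nl/2" above.
    with gzip.open(bedpe_gz, "rt") as fh:
        lines = fh.readlines()
    random.Random(seed).shuffle(lines)
    half = len(lines) // 2
    return lines[:half], lines[half:]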
Example #8
def main(inputs):
    with open('%s/inputs.json' % (inputs['outdir']), 'w') as outfile:
        json.dump(inputs, outfile)

    job = jr.JobRunner(cpus=inputs['threads'])

    print("--- Welcome to the pipeline ---")
    if inputs['pooled']:
        print("Warning : The controls libraries will be pooled together.")

        for idx in range(len(inputs['inputs'])):
            job.append([[
                "python %s/run_psychip_pool_input.py %s/inputs.json map %s inputs"
                % (os.path.dirname(
                    os.path.abspath(__file__)), inputs['outdir'], idx)
            ]])
        for idx in range(len(inputs['controls'])):
            job.append([[
                "python %s/run_psychip_pool_input.py %s/inputs.json map %s controls"
                % (os.path.dirname(
                    os.path.abspath(__file__)), inputs['outdir'], idx)
            ]])
        job.run()

        job.append([[
            "python %s/run_psychip_pool_input.py %s/inputs.json pooling" %
            (os.path.dirname(os.path.abspath(__file__)), inputs['outdir'])
        ]])
        job.run()
        for idx in range(len(inputs['inputs']) +
                         1):  # the plus one is for the pooled input
            job.append([[
                "python %s/run_psychip_pool_input.py %s/inputs.json peak %s" %
                (os.path.dirname(
                    os.path.abspath(__file__)), inputs['outdir'], idx)
            ]])
        job.run()
    else:
        for idx in range(len(inputs['inputs'])):
            job.append([[
                "python %s/run_psychip.py %s/inputs.json %s" %
                (os.path.dirname(
                    os.path.abspath(__file__)), inputs['outdir'], idx)
            ]])
        job.run()
        job.append([[
            "python %s/run_psychip_pool_input.py %s/inputs.json pooling" %
            (os.path.dirname(os.path.abspath(__file__)), inputs['outdir'])
        ]])
        job.run()
        job.append([[
            "python %s/run_psychip_pool_input.py %s/inputs.json peak %s" %
            (os.path.dirname(os.path.abspath(__file__)), inputs['outdir'],
             len(inputs['inputs']) + 1)
        ]])
        job.run()
Example #9

    def clean(self):
        # Clean up intermediate files
        job = jr.JobRunner()
        job.append([[ "rm %s" %(self.raw_sam) ]])
        job.append([[ "rm %s" %(self.raw_bam) ]])
        job.append([[ "rm %s" %(self.flt_bam) ]])
        job.run()

        self.raw_sam = None
        self.raw_bam = None
        self.flt_bam = None
Example #10
def peak(inputs, idx):
    job = jr.JobRunner()
    if inputs['fraglen'] > 0:
        insert = inputs['fraglen']
    elif inputs['spp']:
        job.append([[
            "cat %s/%s/bwa_out/*.cc.qc | cut -f3" %
            (inputs['outdir'], inputs['prefix_inputs'][idx])
        ]])
        insert = job.run()[0].rstrip()
    else:
        job.append([[
            "grep -A2 \"## METRICS CLASS\" %s/%s/bwa_out/InsertSizeMetrics.txt | tail -1 | cut -f1"
            % (inputs['outdir'], inputs['prefix_inputs'][idx])
        ]])
        insert = job.run()[0].rstrip()
    ### PEAK
    run_peak = pc.Macs2PeakCaller(
        "%s/%s/macs_input/%s.bed.gz" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]),
        "%s/oracle_peaks/pooled_controls.bed.gz" % (inputs['outdir']), insert,
        "BEDPE", "%s/%s/macs_output/%s" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]), inputs['genome'], "hs",
        inputs['macs2'], inputs['common'])
    run_peak.run()

    run_peak = pc.Macs2PeakCaller(
        "%s/%s/macs_input/psr_%s.00.bedpe.gz" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]),
        "%s/oracle_peaks/pooled_controls.bed.gz" % (inputs['outdir']), insert,
        "BEDPE", "%s/%s/macs_output/psr_%s.00" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]), inputs['genome'], "hs",
        inputs['macs2'], inputs['common'])
    run_peak.run()

    run_peak = pc.Macs2PeakCaller(
        "%s/%s/macs_input/psr_%s.01.bedpe.gz" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]),
        "%s/oracle_peaks/pooled_controls.bed.gz" % (inputs['outdir']), insert,
        "BEDPE", "%s/%s/macs_output/psr_%s.01" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]), inputs['genome'], "hs",
        inputs['macs2'], inputs['common'])
    run_peak.run()

    ### OVERLAP
    run_overlap = ovrlp.Overlap(inputs['prefix_inputs'][idx], inputs['outdir'])
    run_overlap.run()
Example #11
    def get_mapstats(self, raw_bam, final_bam):
        if not os.path.exists(raw_bam):
            raise Exception("raw BAM file does not exist")

        if not os.path.exists(final_bam):
            raise Exception("final BAM file does not exist")

        job = jr.JobRunner()

        raw_bam_qc = raw_bam[:-3] + "flagstat.QC.txt"
        final_bam_qc = final_bam[:-3] + "flagstat.QC.txt"

        command = "samtools flagstat %s > %s"
        job.append([[command % (raw_bam, raw_bam_qc)]])
        job.append([[command % (final_bam, final_bam_qc)]])
        job.run()
Example #12
    def get_library_complexity(self, flt_bam, is_paired=True, threads=1):
        if not os.path.exists(flt_bam):
            raise Exception("filtered BAM file does not exist")

        job = jr.JobRunner()

        prefix = os.path.basename(flt_bam).split(".flt.")[0]

        tmp_bam = "%s/%s.tmp.bam" % (self.output_dir, prefix)
        qc_pbc = "%s/%s.libComplexity.QC.txt" % (self.output_dir, prefix)

        command = "samtools sort -n -@ %s " %(threads)\
                   + "-o %s %s" %(tmp_bam, flt_bam)
        job.append([[command]])
        job.run()

        job.append([["mv %s %s" % (tmp_bam, flt_bam)]])
        job.run()

        # Library Complexity
        # [1] TotalReadPairs
        # [2] DistinctReadPairs
        # [3] OneReadPairs
        # [4] TwoReadPairs
        # [5] NRF=Distinct/Total
        # [6] PBC1=OnePair/Distinct
        # [7] PBC2=OnePair/TwoPair
        command = """echo 1 | \
                     awk '{print "#Total\tDistinct\tOne\tTwo\tNRF\tPBC1\tPBC2"}' \
                     > %s""" % (qc_pbc)
        job.append([[command]])
        job.run()

        if is_paired:
            command = """bedtools bamtobed -bedpe -i %s | \
                         awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}' | """
        else:
            command = """bedtools bamtobed -i %s | " \
                         awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}' | """
        command += """grep -v 'chrM' | sort | uniq -c | \
                      awk 'BEGIN{mt=0;m0=0;m1=0;m2=0; OFS="\t"}
                           ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1}
                           END{print mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'\
                      >> %s"""
        command = command % (flt_bam, qc_pbc)
        job.append([[command]])
        job.run()
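
The awk one-liner condenses to a small frequency computation. A pure-Python equivalent of the metrics it reports, as a sketch (keys would be the position/strand tuples emitted by bamtobed):

from collections import Counter

def complexity_metrics(keys):
    counts = Counter(keys)                             # occurrences per position
    total = sum(counts.values())                       # mt: total read (pair)s
    distinct = len(counts)                             # m0: distinct positions
    one = sum(1 for c in counts.values() if c == 1)    # m1
    two = sum(1 for c in counts.values() if c == 2)    # m2
    nrf = float(distinct) / total                      # NRF
    pbc1 = float(one) / distinct                       # PBC1
    pbc2 = float(one) / two if two else float("inf")   # PBC2
    return total, distinct, one, two, nrf, pbc1, pbc2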
Example #13

    def run(self, cpu_num):
        if cpu_num <= 0:
            raise Exception("The number of CPUs must be > 0!")

        job = jr.JobRunner()

        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        sam_output = "%s/%s.raw.sam.gz" % (self.output_dir, self.prefix)

        command = "bwa mem -M -t %s %s " %(cpu_num, self.genome_index) \
                   + "%s %s " %(self.fastq1, self.fastq2) \
                   + "| gzip -c > %s" %(sam_output)

        job.append([[command]])
        job.run()
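
Usage follows Example #16: construct the mapper with a prefix, the BWA index, an output directory, and the two FASTQ files, then run with a CPU count. The paths below are placeholders:

# Hypothetical paths, mirroring the call in Example #16.
mapper = mp.BwaMapper("R1",              # prefix
                      "hg38",            # BWA index prefix
                      "out/R1/bwa_out",  # output directory
                      "R1_1.fastq.gz",   # FASTQ read 1
                      "R1_2.fastq.gz")   # FASTQ read 2
mapper.run(8)  # -> out/R1/bwa_out/R1.raw.sam.gz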
Example #14
def pool_libraries(inputs):
    if not os.path.isdir("%s/oracle_peaks" % (inputs['outdir'])):
        os.mkdir("%s/oracle_peaks" % (inputs['outdir']))
    job = jr.JobRunner()
    cmd_inputs = "zcat "
    for prefix in inputs['prefix_inputs']:
        cmd_inputs = cmd_inputs + "%s/%s/macs_input/%s.bed.gz " % (
            inputs['outdir'], prefix, prefix)
    cmd_inputs = cmd_inputs + "| sort --parallel=%s -k1,1 -k2,2n | gzip -c > %s/oracle_peaks/pooled_inputs.bed.gz" % (
        inputs['cpus'], inputs['outdir'])
    cmd_controls = "zcat "
    for prefix in inputs['prefix_controls']:
        cmd_controls = cmd_controls + "%s/%s/macs_input/%s.bed.gz " % (
            inputs['outdir'], prefix, prefix)
    cmd_controls = cmd_controls + "| sort --parallel=%s -k1,1 -k2,2n | gzip -c > %s/oracle_peaks/pooled_controls.bed.gz" % (
        inputs['cpus'], inputs['outdir'])
    job.append([[cmd_controls]])
    job.append([[cmd_inputs]])
    job.run()
Example #15

    def remove_artifacts(self, picard_jar, threads = 1, sponge = None):
        if not os.path.exists(self.flt_bam):
            raise Exception("filtered BAM file does not exist!")

        job = jr.JobRunner()
        tmp_bam = "%s/%s.tmp.bam" %(self.output_dir, self.prefix)

        qc_dups = "%s/%s.pcrDups.QC.txt" %(self.output_dir, self.prefix)

        # Mark PCR duplicates and generate QC file with Picard
        command = "java -Xmx4G -jar %s MarkDuplicates " %(picard_jar)\
                   + "INPUT=%s OUTPUT=%s " %(self.flt_bam, tmp_bam)\
                   + "METRICS_FILE=%s ASSUME_SORTED=true " %(qc_dups)\
                   + "VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=false"
        job.append([[ command ]])
        job.run()
        
        job.append([[ "mv %s %s" %(tmp_bam, self.flt_bam) ]])
        job.run()

        pair_opt = ""
        if self.is_paired:
            pair_opt = "-f2"

        if sponge:
            command = "samtools view -@ %s " %(threads)\
                       + "-F1804 %s -h %s | " %(pair_opt, self.flt_bam)\
                       + "grep -vF -f %s - | " %(sponge)\
                       + "samtools view -@ %s -b -o %s -" %(threads, self.final_bam)
            job.append([[ command ]])
        else:
            command = "samtools view -@ %s -F1804 %s -b " %(threads, pair_opt)\
                       + "-o %s %s" %(self.final_bam, self.flt_bam)
            job.append([[ command ]])
        job.run()

        # Make BAM index file
        command = "samtools index %s %s" %(self.final_bam, self.final_bai)
        job.append([[ command ]])
        job.run()
Example #16
def run_main(inputs, idx):
    ### ALIGN CONTROL LIBRARIES
    run_map_control = mp.BwaMapper(
                            inputs['prefix_controls'][idx], #prefix
                            inputs['index'], # bwa index
                            os.path.join(os.path.join(inputs['outdir'],\
                                         inputs['prefix_controls'][idx]),\
                                         "bwa_out"), # output directory
                            inputs['controls'][idx][0], # fastq read 1
                            inputs['controls'][idx][1]) # fastq read 2

    run_map_control.run(inputs['cpus'])  # CPU for BWA
    # Output: R1.raw.sam.gz
    # Format: <prefix>.raw.sam.gz

    ### Post alignment filtering
    run_filter_control = pf.PostFilter(
        "%s/%s/bwa_out/%s.raw.sam.gz" %
        (inputs['outdir'], inputs['prefix_controls'][idx],
         inputs['prefix_controls'][idx]), "%s/%s/bwa_out" %
        (inputs['outdir'], inputs['prefix_controls'][idx]))  # output directory

    ## Step 1: remove reads with bad CIGAR strings
    run_filter_control.remove_badcigar(inputs['cpus'])  # Threads for SAMtools
    # Output: R1.raw.bam
    # Format: <prefix>.raw.bam

    ## Step 2: remove unmapped, low quality, orphan, and supplemental reads
    run_filter_control.remove_aberrant_reads(
        inputs['cpus'])  # Threads for SAMtools
    # Output: R1.flt.bam
    # Format: <prefix>.flt.bam

    ## Step 3: remove PCR duplicates and sponge mapping reads
    ##         (currently, "java -Xmx4G". Please change it if needed)
    run_filter_control.remove_artifacts(
        inputs['picard'],  # path to picard.jar
        inputs['threads'],
        inputs['sponges'])  # sponge name list
    # Output: R1.final.[bam|bai], R1.pcrDups.QC.txt

    ## Step 4: generate BED and BEDPE files
    ##         (those files are *unsorted*)
    run_filter_control.generate_bed(inputs['cpus'])  # Threads for SAMtools
    # Output: R1.bed.gz (for pseudorep)
    # Output: R1.bedpe.gz (bedtool raw output)

    if not os.path.isdir("%s/%s/macs_input/" %
                         (inputs['outdir'], inputs['prefix_controls'][idx])):
        os.mkdir("%s/%s/macs_input/" %
                 (inputs['outdir'], inputs['prefix_controls'][idx]))
    if not os.path.isfile("%s/%s/macs_input/%s.bed.gz" %
                          (inputs['outdir'], inputs['prefix_controls'][idx],
                           inputs['prefix_controls'][idx])):
        shutil.move("%s/%s/bwa_out/%s.bed.gz" %(inputs['outdir'], inputs['prefix_controls'][idx], inputs['prefix_controls'][idx]),\
               "%s/%s/macs_input/%s.bed.gz" %(inputs['outdir'], inputs['prefix_controls'][idx], inputs['prefix_controls'][idx]))
    ### QC part INPUT LIBRARIES
    control_qc = qc.QC(
        "%s/%s/bwa_out/" %
        (inputs['outdir'], inputs['prefix_controls'][idx]))  # output directory

    ## Step 1: Get map stats
    control_qc.get_mapstats(
        "%s/%s/bwa_out/%s.raw.bam" %
        (inputs['outdir'], inputs['prefix_controls'][idx],
         inputs['prefix_controls'][idx]), "%s/%s/bwa_out/%s.final.bam" %
        (inputs['outdir'], inputs['prefix_controls'][idx],
         inputs['prefix_controls'][idx]))  # Output: R1.final.flagstat.QC.txt

    ## Step 2: Compute library complexity
    control_qc.get_library_complexity(
        "%s/%s/bwa_out/%s.flt.bam" %
        (inputs['outdir'], inputs['prefix_controls'][idx],
         inputs['prefix_controls'][idx]),
        inputs['cpus'])  # Threads for SAMtools

    job = jr.JobRunner()
    if inputs['fraglen'] > 0:
        insert = inputs['fraglen']

    elif inputs['spp']:
        ### Find insert size
        control_xcor = xcor.Xcor(
            "%s/%s/bwa_out/%s.final.bam" %
            (inputs['outdir'], inputs['prefix_controls'][idx],
             inputs['prefix_controls'][idx]), "%s/common/run_spp_nodups.R" %
            (os.path.dirname(os.path.abspath(__file__))), inputs['cpus'])
        control_xcor.process()
    else:
        job.append([["java -jar %s CollectInsertSizeMetrics I=%s/%s/bwa_out/%s.final.bam\
                      O=%s/%s/bwa_out/InsertSizeMetrics.txt AS=F H=%s/%s/bwa_out/InsertSizeMetrics.histogram" \
                      %(inputs['picard'],inputs['outdir'],inputs['prefix_controls'][idx],inputs['prefix_controls'][idx],\
                        inputs['outdir'],inputs['prefix_controls'][idx],\
                        inputs['outdir'],inputs['prefix_controls'][idx])]])
        job.run()

    run_filter_control.clean()
    ### END CONTROL LIBRARIES

    ### ALIGN INPUT LIBRARIES
    run_map_input = mp.BwaMapper(
                            inputs['prefix_inputs'][idx], #prefix
                            inputs['index'], # bwa index
                            os.path.join(os.path.join(inputs['outdir'],\
                                         inputs['prefix_inputs'][idx]),\
                                         "bwa_out"), # output directory
                            inputs['inputs'][idx][0], # fastq read 1
                            inputs['inputs'][idx][1]) # fastq read 2

    run_map_input.run(inputs['cpus'])  # CPU for BWA
    # Output: R1.raw.sam.gz
    # Format: <prefix>.raw.sam.gz

    ### Post alignment filtering
    run_filter_input = pf.PostFilter("%s/%s/bwa_out/%s.raw.sam.gz" %(inputs['outdir'], inputs['prefix_inputs'][idx],inputs['prefix_inputs'][idx]), \
                               "%s/%s/bwa_out" %(inputs['outdir'], inputs['prefix_inputs'][idx])) # output directory

    ## Step 1: remove reads with bad CIGAR strings
    run_filter_input.remove_badcigar(inputs['cpus'])  # Threads for SAMtools
    # Output: R1.raw.bam
    # Format: <prefix>.raw.bam

    ## Step 2: remove unmapped, low quality, orphan, and supplemental reads
    run_filter_input.remove_aberrant_reads(
        inputs['cpus'])  # Threads for SAMtools
    # Output: R1.flt.bam
    # Format: <prefix>.flt.bam

    ## Step 3: remove PCR duplicates and sponge mapping reads
    ##         (currently, "java -Xmx4G". Please change it if needed)
    run_filter_input.remove_artifacts(
        inputs['picard'],  # path to picard.jar
        inputs['threads'],
        inputs['sponges'])  # sponge name list
    # Output: R1.final.[bam|bai], R1.pcrDups.QC.txt

    ## Step 4: generate BED and BEDPE files
    ##         (those files are *unsorted*)
    run_filter_input.generate_bed(inputs['cpus'])  # Threads for SAMtools
    # Output: R1.bed.gz (for pseudorep)
    # Output: R1.bedpe.gz (bedtool raw output)

    if not os.path.isdir("%s/%s/macs_input/" %
                         (inputs['outdir'], inputs['prefix_inputs'][idx])):
        os.mkdir("%s/%s/macs_input/" %
                 (inputs['outdir'], inputs['prefix_inputs'][idx]))
    if not os.path.isfile("%s/%s/macs_input/%s.bed.gz" %
                          (inputs['outdir'], inputs['prefix_inputs'][idx],
                           inputs['prefix_inputs'][idx])):
        shutil.move("%s/%s/bwa_out/%s.bed.gz" %(inputs['outdir'], inputs['prefix_inputs'][idx], inputs['prefix_inputs'][idx]),\
               "%s/%s/macs_input/%s.bed.gz" %(inputs['outdir'], inputs['prefix_inputs'][idx], inputs['prefix_inputs'][idx]))

    ### QC part INPUT LIBRARIES
    test_qc = qc.QC(
        "%s/%s/bwa_out/" %
        (inputs['outdir'], inputs['prefix_inputs'][idx]))  # output directory

    ## Step 1: Get map stats
    test_qc.get_mapstats(
        "%s/%s/bwa_out/%s.raw.bam" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]), "%s/%s/bwa_out/%s.final.bam" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]))  # Output: R1.final.flagstat.QC.txt

    ## Step 2: Compute library complexity
    test_qc.get_library_complexity(
        "%s/%s/bwa_out/%s.flt.bam" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]), inputs['cpus'])  # Threads for SAMtools
    # Output: R1.libComplexity.QC.txt
    ## Find insert size
    job = jr.JobRunner()
    if inputs['fraglen'] > 0:
        insert = inputs['fraglen']

    elif inputs['spp']:
        ### Find insert size
        input_xcor = xcor.Xcor(
            "%s/%s/bwa_out/%s.final.bam" %
            (inputs['outdir'], inputs['prefix_inputs'][idx],
             inputs['prefix_inputs'][idx]), "%s/common/run_spp_nodups.R" %
            (os.path.dirname(os.path.abspath(__file__))), inputs['cpus'])
        input_xcor.process()
        job.append([[
            "cat %s/%s/bwa_out/*.cc.qc | cut -f3" %
            (inputs['outdir'], inputs['prefix_inputs'][idx])
        ]])
        insert = int(job.run()[0].rstrip())
        #with open(,'r') as fh:
        #    firstline = fh.readline()
        #    insert = firstline.split()[2] #third column

    else:
        job.append([["java -jar %s CollectInsertSizeMetrics I=%s/%s/bwa_out/%s.final.bam\
                      O=%s/%s/bwa_out/InsertSizeMetrics.txt AS=F H=%s/%s/bwa_out/InsertSizeMetrics.histogram" \
                      %(inputs['picard'],inputs['outdir'],inputs['prefix_inputs'][idx],inputs['prefix_inputs'][idx],\
                        inputs['outdir'],inputs['prefix_inputs'][idx],\
                        inputs['outdir'],inputs['prefix_inputs'][idx])]])
        job.run()
        job.append([[
            "grep -A2 \"## METRICS CLASS\" %s/%s/bwa_out/InsertSizeMetrics.txt | tail -1 | cut -f1"
            % (inputs['outdir'], inputs['prefix_inputs'][idx])
        ]])
        insert = job.run()[0].rstrip()
    run_filter_input.clean()

    ### PSEUDOREP
    run_psr = psr.PseudorepsGenerator("%s/%s/macs_input/%s.bed.gz" %(inputs['outdir'], inputs['prefix_inputs'][idx], inputs['prefix_inputs'][idx]),\
                                      inputs['prefix_inputs'][idx],
                                      "%s/%s/macs_input/" %(inputs['outdir'], inputs['prefix_inputs'][idx]),
                                      inputs['threads'],
                                      inputs['cpus'])

    run_psr.run()

    ### PEAK
    run_peak = pc.Macs2PeakCaller(
        "%s/%s/macs_input/%s.bed.gz" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]), "%s/%s/macs_input/%s.bed.gz" %
        (inputs['outdir'], inputs['prefix_controls'][idx],
         inputs['prefix_controls'][idx]), insert, "BEDPE",
        "%s/%s/macs_output/%s" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]), inputs['genome'], "hs",
        inputs['macs2'], inputs['common'])
    run_peak.run()

    run_peak = pc.Macs2PeakCaller(
        "%s/%s/macs_input/psr_%s.00.bedpe.gz" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]), "%s/%s/macs_input/%s.bed.gz" %
        (inputs['outdir'], inputs['prefix_controls'][idx],
         inputs['prefix_controls'][idx]), insert, "BEDPE",
        "%s/%s/macs_output/psr_%s.00" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]), inputs['genome'], "hs",
        inputs['macs2'], inputs['common'])
    run_peak.run()

    run_peak = pc.Macs2PeakCaller(
        "%s/%s/macs_input/psr_%s.01.bedpe.gz" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]), "%s/%s/macs_input/%s.bed.gz" %
        (inputs['outdir'], inputs['prefix_controls'][idx],
         inputs['prefix_controls'][idx]), insert, "BEDPE",
        "%s/%s/macs_output/psr_%s.01" %
        (inputs['outdir'], inputs['prefix_inputs'][idx],
         inputs['prefix_inputs'][idx]), inputs['genome'], "hs",
        inputs['macs2'], inputs['common'])
    run_peak.run()

    ### OVERLAP
    run_overlap = ovrlp.Overlap(inputs['prefix_inputs'][idx], inputs['outdir'])
    run_overlap.run()
Example #17
def check_arguments(args):
    checker = u.Utils()
    inputs = {}
    try:
        # Check if the input file exists and is well formatted
        if not os.path.isfile(args['f']):
            raise Exception("The input file '%s' does not exist." % (args['f']))
        inputs['inputs'], inputs['controls'], inputs['prefix_inputs'], inputs[
            'prefix_controls'] = input_integrity(args['f'])

        # Check if the output folder already exists
        if os.path.isdir(args['o']):
            raise Exception(
                "mkdir: cannot create directory '%s': File exists." %
                (args['o']))
        else:
            os.makedirs(args['o'])
            inputs['outdir'] = args['o']

        # Check if the BWA index exists
        if args['b'].endswith('.bwt'):
            prefix_index = os.path.splitext(args['b'])[0]
        else:
            prefix_index = args['b']

        for suffix in ['.bwt', '.amb', '.ann', '.pac', '.sa']:
            if not os.path.isfile("%s%s" % (prefix_index, suffix)):
                raise Exception("The BWA index file '%s%s' does not exist." %
                                (prefix_index, suffix))
        inputs['index'] = prefix_index

        # Check if genome file exists
        if not os.path.isfile(args['g']):
            raise Exception("The genome chromInfo file '%s' does not exist." %
                            (args['g']))
        inputs['genome'] = args['g']

        if args['sponges'] is not None:
            if not os.path.isfile(args['sponges']):
                raise Exception("The sponges file '%s' does not exist." %
                                (args['sponges']))
            inputs['sponges'] = args['sponges']
        else:
            inputs['sponges'] = None

        if args['macs2'] is not None:
            if os.path.exists(args['macs2']):
                inputs['macs2'] = args['macs2']
            else:
                raise Exception("Macs2 not found in '%s'" % (args['macs2']))
        else:
            try:
                checker.which('macs2')
                inputs['macs2'] = "macs2"
            except Exception as e:
                raise (e)


#        if not os.path.exists(args['picard']):
#            raise Exception("Picard not found in '%s'" %(args['picard']))
#        else:
#            inputs['picard'] = args['picard']
        inputs['fraglen'] = -1
        if args['fraglen'] is not None:
            if args['spp']:
                raise Exception(
                    "'--spp' and '--fraglen' are mutually exclusive. Please select only one."
                )
            else:
                inputs['fraglen'] = args['fraglen']
        else:
            if args['spp']:
                try:
                    j = jr.JobRunner()
                    j.append([['Rscript ./common/test.R']])
                    j.run()
                    inputs['spp'] = True
                except:
                    raise Exception(
                        "Spp is not currently installed or cannot be loaded.")
            else:
                inputs['spp'] = False
                inputs['fraglen'] = -1
                print('Picard tools will be used to estimate insert size.')

        inputs['threads'] = args['t']
        inputs['cpus'] = args['c']
        inputs['common'] = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "common")
        inputs['picard'] = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "common/picard-tools-1.141/picard.jar")

        # If the numbers of input and control libraries differ, pool the controls.
        if len(inputs['inputs']) != len(inputs['controls']):
            inputs['pooled'] = True
        else:
            inputs['pooled'] = False

        print("Argument Verification completed")
    except Exception as e:
        raise (e)

    return inputs
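
The expected argument keys can be read off the accesses above. A hypothetical invocation (all paths are placeholders; 't' and 'c' are the thread and CPU counts):

# Placeholder values only; see check_arguments for the validation rules.
args = {
    'f': 'samples.txt',           # input/control library table
    'o': 'results',               # output directory (must not pre-exist)
    'b': 'hg38',                  # BWA index prefix (a .bwt path also works)
    'g': 'hg38.chromInfo.txt',    # genome chromInfo file
    'sponges': None,
    'macs2': None,                # fall back to macs2 on $PATH
    'fraglen': None,
    'spp': False,                 # False -> Picard insert-size estimation
    't': 4,
    'c': 8,
}
inputs = check_arguments(args)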
Example #18

    def run(self):
        job = jr.JobRunner()

        #Define the output filenames
        peaks_dirname = self.output_name
        if not os.path.exists(peaks_dirname):
            os.makedirs(peaks_dirname)
        prefix = self.experiment_name.split("/")[-1]
        if prefix.endswith('.bedpe.gz'):
            prefix = prefix[:-9]
        elif prefix.endswith('.bed.gz'):
            prefix = prefix[:-7]

        narrowPeak_fn    = "%s/%s.narrowPeak" %(peaks_dirname, prefix)
        gappedPeak_fn    = "%s/%s.gappedPeak" %(peaks_dirname, prefix)
        broadPeak_fn     = "%s/%s.broadPeak"  %(peaks_dirname, prefix)
        self.narrowPeak_gz_fn = narrowPeak_fn + ".gz"
        self.gappedPeak_gz_fn = gappedPeak_fn + ".gz"
        self.broadPeak_gz_fn  = broadPeak_fn  + ".gz"
        self.narrowPeak_bb_fn = "%s.bb" %(narrowPeak_fn)
        self.gappedPeak_bb_fn = "%s.bb" %(gappedPeak_fn)
        self.broadPeak_bb_fn  = "%s.bb" %(broadPeak_fn)
        self.fc_signal_fn = "%s/%s.fc_signal.bw" \
                             %(peaks_dirname, prefix)
        self.pvalue_signal_fn = "%s/%s.pvalue_signal.bw" \
                                 %(peaks_dirname, prefix)

        # Extract the fragment length estimate from
        # column 3 of the cross-correlation scores file
#        if self.xcor_scores_input_name == "150":
        fraglen=self.xcor_scores_input_name
#        else:
#            with open(self.xcor_scores_input_name,'r') as fh:
#                firstline = fh.readline()
#                fraglen = firstline.split()[2] #third column
#                print "Fraglen %s" %(fraglen)
        #===========================================
        command = '%s callpeak '%(self.macs2) + \
                  '-t %s -c %s '\
                   %(self.experiment_name, self.control_name) + \
                  '-f %s -n %s/%s '\
                   %(self.input_type, peaks_dirname, prefix) + \
                  '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s \
                  --keep-dup all -B --SPMR' %(self.genomesize, fraglen)
        job.append([["%s" %(command)]])
        job.run()
        #print command
        #returncode = common.block_on(command)
        #print "MACS2 exited with returncode %d" %(returncode)
        #assert returncode == 0, "MACS2 non-zero return"

        # Rescale Col5 scores to range 10-1000 to conform \
        # to narrowPeak.as format (score must be <1000)
        rescaled_narrowpeak_fn = utils.common.rescale_scores(
                                 '%s/%s_peaks.narrowPeak' \
                                  %(peaks_dirname, prefix), 
                                  scores_col=5)

        # Sort by Col8 in descending order and replace long peak names in Column 4 with Peak_<peakRank>
        command = "sort -k 8gr,8gr %s |\
                   awk 'BEGIN{OFS=\"\\t\"}{$4=\"Peak_\"NR ; \
                   print $0}' | tee %s | gzip -c > %s" \
                   %(rescaled_narrowpeak_fn, narrowPeak_fn,
                   self.narrowPeak_gz_fn)

        job.append([["%s" %(command)]])
        job.run()

        # remove additional files
        command ="rm -f %s/%s_peaks.xls %s/%s_peaks.bed %s_summits.bed"\
                  %(peaks_dirname, prefix,
                    peaks_dirname, prefix, prefix)
        job.append([["%s" %(command)]])
        job.run()
        #===========================================
        # Generate Broad and Gapped Peaks
        #============================================
        command = '%s callpeak ' %(self.macs2) + \
                  '-t %s -c %s ' \
                  %(self.experiment_name, self.control_name) + \
                  '-f %s -n %s/%s '%(self.input_type, 
                                    peaks_dirname, prefix) + \
                  '-g %s -p 1e-2 --broad --nomodel --shift 0 \
                  --extsize %s --keep-dup all' \
                  %(self.genomesize, fraglen)
        #print command
        job.append([["%s" %(command)]])
        job.run()
        # Rescale Col5 scores to range 10-1000 to conform to broadPeak.as format (score must be <1000)
        rescaled_broadpeak_fn = utils.common.rescale_scores('%s/%s_peaks.broadPeak'
                                               %(peaks_dirname, prefix),
                                               scores_col=5)
        # Sort by Col8 (for broadPeak) or Col14 (for gappedPeak) in descending order and replace long peak names in Column 4 with Peak_<peakRank>
        command = "sort -k 8gr,8gr %s | \
                   awk 'BEGIN{OFS=\"\\t\"}{$4=\"Peak_\"NR ; print $0}'|\
                   tee %s | gzip -c > %s"%(rescaled_broadpeak_fn, 
                                           broadPeak_fn,
                                           self.broadPeak_gz_fn)
        #print command

        job.append([["%s" %(command)]])
        job.run()

        # Rescale Col5 scores to range 10-1000 to conform to gappedPeak.as format (score must be <1000)
        rescaled_gappedpeak_fn = utils.common.rescale_scores('%s/%s_peaks.gappedPeak' 
                                       %(peaks_dirname, prefix),
                                       scores_col=5)

        command = "sort -k 14gr,14gr %s | \
                   awk 'BEGIN{OFS=\"\\t\"}{$4=\"Peak_\"NR ; print $0}'|\
                   tee %s | gzip -c > %s"%(rescaled_gappedpeak_fn,\
                                           gappedPeak_fn,
                                           self.gappedPeak_gz_fn)
        #print command

        job.append([["%s" %(command)]])
        job.run()

        # remove additional file
        job.append([["rm -f %s/%s_peaks.xls %s/%s_peaks.bed %s_summits.bed" %(peaks_dirname,prefix,peaks_dirname,prefix,prefix)]])
        job.run()

        #===========================================
        # For Fold enrichment signal tracks
        #============================================
        # This file is a tab delimited file with 2 columns Col1 (chromosome name), Col2 (chromosome size in bp).
        command = '%s bdgcmp ' %(self.macs2) + \
                  '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
                  '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
                  '--outdir %s -o %s_FE.bdg ' %(peaks_dirname, prefix) + \
                  '-m FE'
        #print command

        job.append([["%s" %(command)]])
        job.run()
        # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
        job.append([["bedtools slop -i %s/%s_FE.bdg -g %s -b 0 |\
                    %s/bedClip stdin %s %s/%s.fc.signal.bedgraph"
                    %(peaks_dirname, prefix, self.chrom_sizes_name, self.common, 
                      self.chrom_sizes_name, peaks_dirname, prefix)]])
        #print command
        job.run()

        #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg
        # Convert bedgraph to bigwig
        command = '%s/bedGraphToBigWig '%(self.common) + \
                  '%s/%s.fc.signal.bedgraph ' %(peaks_dirname, prefix)+\
                  '%s %s' %(self.chrom_sizes_name, self.fc_signal_fn)
        #print command
        job.append([["%s" %(command)]])
        job.run()

        #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph
        #===========================================
        # For -log10(p-value) signal tracks
        #===========================================
        # Compute sval = min(no. of reads in ChIP, no. of reads in control) / 1,000,000
        if (self.input_type=="BED" or self.input_type=="BEDPE") :
            job.append([['gzip -dc %s' %(self.experiment_name)+'|wc -l']])
            chipReads = int(job.run()[0])
            job.append([['gzip -dc %s' %(self.control_name)+'|wc -l']])
            controlReads = int(job.run()[0])
            sval=str(min(float(chipReads), float(controlReads))/1000000)
        else:
            job.append([["samtools idxstats %s | awk '{sum=sum+$3}END{print sum}'"%(self.experiment_name)]])
            chipReads = int(job.run()[0])
            job.append([["samtools idxstats %s | awk '{sum=sum+$3}END{print sum}'"%(self.control_name)]])
            controlReads = int(job.run()[0])
            sval=str(min(float(chipReads), float(controlReads))/1000000)
        #    print sval,chipReads,controlReads
        print "chipReads = %s, controlReads = %s, sval = %s" %(chipReads, controlReads, sval)

        job.append([['%s bdgcmp ' %(self.macs2) + \
                '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
                '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
                '--outdir %s -o %s_ppois.bdg ' %(peaks_dirname, prefix) + \
                '-m ppois -S %s' %(sval)]])
        job.run()
        # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
        job.append([['bedtools slop -i %s/%s_ppois.bdg -g %s -b 0' 
                   %(peaks_dirname, prefix, self.chrom_sizes_name)+ \
                '| %s/bedClip stdin %s %s/%s.pval.signal.bedgraph'
                 %(self.common, self.chrom_sizes_name,peaks_dirname, prefix)]])
        job.run()

        job.append([["rm -rf %s/%s_ppois.bdg" %(peaks_dirname,prefix)]])
        job.run()

        # Convert bedgraph to bigwig
        command = '%s/bedGraphToBigWig ' %(self.common) + \
        '%s/%s.pval.signal.bedgraph ' %(peaks_dirname, prefix) + \
        '%s %s' %(self.chrom_sizes_name, self.pvalue_signal_fn)

        job.append([["%s" %(command)]])
        job.run()
        job.append([["rm -f %s/%s.pval.signal.bedgraph" %(peaks_dirname,prefix)]])
        job.append([["rm -f %s/%s_treat_pileup.bdg %s_control_lambda.bdg" %(peaks_dirname,prefix,prefix)]])
        job.run()
        #===========================================
        # Generate bigWigs from beds to support trackhub visualization of peak files
        #============================================
        narrowPeak_bb_fname = utils.common.bed2bb('%s' %(narrowPeak_fn),
                                        '%s' %(self.chrom_sizes_name),
                                        '%s' %(self.narrowPeak_as_name),
                                         bed_type='bed6+4')
        gappedPeak_bb_fname = utils.common.bed2bb('%s' %(gappedPeak_fn),
                                        '%s' %(self.chrom_sizes_name),
                                        '%s' %(self.gappedPeak_as_name),
                                        bed_type='bed12+3')
        broadPeak_bb_fname =  utils.common.bed2bb('%s' %(broadPeak_fn),
                                        '%s' %(self.chrom_sizes_name),
                                        '%s' %(self.broadPeak_as_name),
                                        bed_type='bed6+3')
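
rescale_scores is applied before each sort so that column 5 fits the UCSC peak formats' 0-1000 score cap. A sketch of what such a linear rescale to 10-1000 might look like (the actual utils.common.rescale_scores implementation is not shown in this listing):

def rescale_scores_sketch(scores, lo=10, hi=1000):
    # Linearly map raw scores into [lo, hi]; a constant column maps to lo.
    mn, mx = min(scores), max(scores)
    if mx == mn:
        return [lo for _ in scores]
    return [int(lo + (s - mn) * (hi - lo) / float(mx - mn)) for s in scores]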
Example #19
    def process(self):
        job = jr.JobRunner()

        self.intermediate_TA_filename = self.input_bam_basename + ".tagAlign"
        if self.paired_end:
            end_infix = 'PE2SE'
        else:
            end_infix = 'SE'
        self.final_TA_filename = self.input_bam_basename + '.' + end_infix + '.tagAlign.gz'

        # ===================
        # Create tagAlign file
        # ===================

        job.append([[
            "bamToBed -i %s | " % (self.input_bam_filename),
            r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
            "| tee %s | " % (self.intermediate_TA_filename),
            "gzip -c > %s" % (self.final_TA_filename)
        ]])
        job.run()

        # ================
        # Create BEDPE file
        # ================
        if self.paired_end:
            self.final_BEDPE_filename = self.input_bam_basename + ".bedpe.gz"
            #need namesorted bam to make BEDPE
            self.final_nmsrt_bam_prefix = self.input_bam_basename + ".nmsrt"
            self.final_nmsrt_bam_filename = self.final_nmsrt_bam_prefix + ".bam"
            job.append([["samtools sort -@ %s -n -o %s %s" \
    %(self.cpus, self.final_nmsrt_bam_filename, self.input_bam_filename)]])
            job.run()
            job.append([[
                "bedtools bamtobed -bedpe -mate1 -i %s | " %
                (self.final_nmsrt_bam_filename),
                "gzip -c > %s" % (self.final_BEDPE_filename)
            ]])
            job.run()
        # =================================
        # Subsample tagAlign file
        # ================================
        NREADS = 15000000
        if self.paired_end:
            end_infix = 'MATE1'
        else:
            end_infix = 'SE'
        self.subsampled_TA_filename = self.input_bam_basename + ".filt.nodup.sample.%d.%s.tagAlign.gz" % (
            NREADS // 1000000, end_infix)
        steps = 'grep -v "chrM" %s | ' %(self.intermediate_TA_filename) + \
            'shuf -n %d | ' %(NREADS)
        if self.paired_end:
            steps += r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}' | """
        steps += 'gzip -c > %s' % (self.subsampled_TA_filename)
        print(steps)
        job.append([[steps]])
        job.run()
        # Calculate Cross-correlation QC scores
        self.CC_scores_filename = self.subsampled_TA_filename + ".cc.qc"
        self.CC_plot_filename = self.subsampled_TA_filename + ".cc.plot.pdf"

        # CC_SCORE FILE format
        # Filename <tab> numReads <tab> estFragLen <tab> corr_estFragLen <tab> PhantomPeak <tab> corr_phantomPeak <tab> argmin_corr <tab> min_corr <tab> phantomPeakCoef <tab> relPhantomPeakCoef <tab> QualityTag
        print("Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" %
              (self.spp, self.subsampled_TA_filename, self.cpus,
               self.CC_plot_filename, self.CC_scores_filename))
        job.append([[
            "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s > /dev/null 2>&1" \
                %(self.spp, self.subsampled_TA_filename, self.cpus, self.CC_plot_filename, self.CC_scores_filename)]])
        job.run()

        job.append([[
            r"""sed -r  's/,[^\t]+//g' %s > %s """ %
            (self.CC_scores_filename, "temp")
        ]])
        job.run()

        job.append([["mv temp %s" % (self.CC_scores_filename)]])
        job.run()
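
Per the CC_SCORE format note above, the fragment-length estimate sits in column 3 of the .cc.qc file, possibly as a comma-separated list (which is why the sed step strips the commas). A small reader, as a sketch:

def read_fraglen(cc_scores_path):
    # Column 3 (index 2) of the tab-separated .cc.qc line is estFragLen;
    # keep the first value if several are listed.
    with open(cc_scores_path) as fh:
        fields = fh.readline().rstrip("\n").split("\t")
    return int(fields[2].split(",")[0])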