示例#1
0
def blacklist_filter(peak, blacklist, keep_irregular_chr, out_dir):
    """Write a gzipped copy of a peak file with blacklisted regions removed.

    The output is named ``<out_dir>/<peak basename>.bfilt.<peak ext>.gz``.

    Args:
        peak: Path to the peak file (plain or gzipped).
        blacklist: Path to the blacklist file; an empty string disables
            filtering.
        keep_irregular_chr: When false, only lines matching the
            ``chr[\\dXY]+\\b`` pattern are kept.
        out_dir: Directory for the output and temporary files.

    Returns:
        Path of the filtered, gzipped peak file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    filtered = '{}.bfilt.{}.gz'.format(prefix, get_ext(peak))

    # Nothing to subtract: just re-compress the input under the new name.
    nothing_to_filter = (
        get_num_lines(peak) == 0
        or blacklist == ''
        or get_num_lines(blacklist) == 0
    )
    if nothing_to_filter:
        run_shell_cmd('zcat -f {} | gzip -nc > {}'.format(peak, filtered))
        return filtered

    # bedtools misbehaves when .gz is given for both -a and -b, so work on
    # uncompressed temporary copies.
    peak_plain = gunzip(peak, 'tmp1', out_dir)
    blacklist_plain = gunzip(blacklist, 'tmp2', out_dir)

    # Optional pipeline stage that drops irregular chromosome names.
    if keep_irregular_chr:
        chr_stage = ''
    else:
        chr_stage = 'grep -P \'chr[\\dXY]+\\b\' | '

    pipeline = (
        'bedtools intersect -nonamecheck -v -a {peak} -b {blacklist} | '
        # Cap the score column (5th) at 1000.
        'awk \'BEGIN{{OFS="\\t"}} '
        '{{if ($5>1000) $5=1000; print $0}}\' | '
        '{chr_stage}'
        'gzip -nc > {filtered}'
    ).format(
        peak=peak_plain,
        blacklist=blacklist_plain,
        chr_stage=chr_stage,
        filtered=filtered,
    )
    run_shell_cmd(pipeline)
    rm_f([peak_plain, blacklist_plain])
    return filtered
def mark_dup_picard(bam, out_dir, java_heap=None):  # shared by both se and pe
    """Mark (but do not remove) duplicates in a BAM with Picard.

    Args:
        bam: Path to the input BAM; a trailing ``filt`` extension left by the
            previous step is stripped from the output prefix.
        out_dir: Directory for the output files.
        java_heap: Value appended to ``-Xmx`` for the JVM heap; ``4G`` is
            used when this is None.

    Returns:
        Tuple ``(dupmark_bam, dup_qc)``: the duplicate-marked BAM path and
        the Picard metrics file path.
    """
    bam_base = os.path.basename(strip_ext_bam(bam))
    # strip extension appended in the previous step
    prefix = strip_ext(os.path.join(out_dir, bam_base), 'filt')
    dupmark_bam = '{}.dupmark.bam'.format(prefix)
    dup_qc = '{}.dup.qc'.format(prefix)

    heap_size = '4G' if java_heap is None else java_heap
    java_heap_param = '-Xmx{}'.format(heap_size)

    args = [
        'java',
        java_heap_param,
        '-XX:ParallelGCThreads=1',
        '-jar',
        '{}'.format(locate_picard()),
        'MarkDuplicates',
        'INPUT={}'.format(bam),
        'OUTPUT={}'.format(dupmark_bam),
        'METRICS_FILE={}'.format(dup_qc),
        'VALIDATION_STRINGENCY=LENIENT',
        'USE_JDK_DEFLATER=TRUE',
        'USE_JDK_INFLATER=TRUE',
        'ASSUME_SORTED=TRUE',
        'REMOVE_DUPLICATES=FALSE',
    ]
    # The command line ends with a single trailing space.
    run_shell_cmd(' '.join(args) + ' ')
    return dupmark_bam, dup_qc
def frac_mito(non_mito_samstat, mito_samstat, out_dir):
    """Compute the fraction of mapped reads that are mitochondrial.

    Both inputs are parsed with ``parse_flagstat_qc``. The result is written
    to ``<out_dir>/<basename>.frac_mito.qc`` as three tab-separated lines:
    ``non_mito_reads``, ``mito_reads`` and ``frac_mito_reads``.

    Args:
        non_mito_samstat: Path to the QC file for non-mitochondrial reads; a
            trailing ``non_mito.samstats.qc`` extension is stripped to build
            the output prefix.
        mito_samstat: Path to the QC file for mitochondrial reads.
        out_dir: Directory for the output file.

    Returns:
        Path of the written ``.frac_mito.qc`` file.

    Raises:
        KeyError: If a parsed QC dict has neither a ``'mapped'`` nor a
            ``'mapped_reads'`` entry.
        ZeroDivisionError: If both read counts are zero.
    """
    def _mapped_count(stat_dict, source):
        # Each dict is resolved independently so that a key found in one
        # file is never silently reused for the other. 'mapped' is tried
        # first, then 'mapped_reads' (kept for backward compatibility with
        # differently named keys).
        for key in ('mapped', 'mapped_reads'):
            if key in stat_dict:
                return stat_dict[key]
        raise KeyError(
            "Neither 'mapped' nor 'mapped_reads' found in {}".format(source))

    prefix = os.path.join(
        out_dir,
        os.path.basename(strip_ext(non_mito_samstat, 'non_mito.samstats.qc')))
    frac_mito_qc = '{}.frac_mito.qc'.format(prefix)

    non_mito_samstat_dict = parse_flagstat_qc(non_mito_samstat)
    mito_samstat_dict = parse_flagstat_qc(mito_samstat)

    Rn = _mapped_count(non_mito_samstat_dict, non_mito_samstat)
    Rm = _mapped_count(mito_samstat_dict, mito_samstat)

    frac = float(Rm) / float(Rn + Rm)
    with open(frac_mito_qc, 'w') as fp:
        fp.write('non_mito_reads\t{}\n'.format(Rn))
        fp.write('mito_reads\t{}\n'.format(Rm))
        fp.write('frac_mito_reads\t{}\n'.format(frac))

    return frac_mito_qc
示例#4
0
def rm_dup_pe(dupmark_bam, nth, out_dir):
    """Filter a duplicate-marked paired-end BAM into a ``.nodup.bam`` file.

    Runs ``samtools view`` with ``-F 1804 -f 2`` so that only reads with
    flag 2 set and none of the flags in 1804 set are kept.

    Args:
        dupmark_bam: Path to the duplicate-marked BAM; a trailing
            ``dupmark`` extension is stripped from the output prefix.
        nth: Value passed to samtools ``-@``.
        out_dir: Directory for the output BAM.

    Returns:
        Path of the filtered BAM.
    """
    bam_base = os.path.basename(strip_ext_bam(dupmark_bam))
    # strip extension appended in the previous step
    prefix = strip_ext(os.path.join(out_dir, bam_base), 'dupmark')
    nodup_bam = '{}.nodup.bam'.format(prefix)

    run_shell_cmd(
        'samtools view -@ {threads} -F 1804 -f 2 -b {src} > {dst}'.format(
            threads=nth,
            src=dupmark_bam,
            dst=nodup_bam,
        ))
    return nodup_bam
def starch_to_bed_gz(starch, out_dir):
    """Decompress a starch archive into a gzipped BED file.

    Required software:
        BEDOPS (tested with v2.4.39): unstarch

    Args:
        starch: Path to the input starch file.
        out_dir: Directory for the output file.

    Returns:
        Path of the written ``.bed.gz`` file.
    """
    stem = os.path.basename(strip_ext(starch))
    bed_gz = '{}.bed.gz'.format(os.path.join(out_dir, stem))
    cmd = 'unstarch {} | gzip -nc > {}'.format(starch, bed_gz)
    run_shell_cmd(cmd)
    return bed_gz
def mark_dup_sambamba(bam, nth, out_dir):  # shared by both se and pe
    """Mark duplicates in a BAM with ``sambamba markdup``.

    sambamba's stderr is redirected into the QC file.

    Args:
        bam: Path to the input BAM; a trailing ``filt`` extension left by the
            previous step is stripped from the output prefix.
        nth: Number of threads passed to sambamba ``-t``.
        out_dir: Directory for the output files.

    Returns:
        Tuple ``(dupmark_bam, dup_qc)``: the duplicate-marked BAM path and
        the path of the file holding sambamba's stderr output.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam)))
    # strip extension appended in the previous step
    prefix = strip_ext(prefix, 'filt')
    dupmark_bam = '{}.dupmark.bam'.format(prefix)
    # The QC path must be built from the prefix; leaving the template
    # unformatted would redirect stderr to a literal '{}.dup.qc' file in the
    # working directory and return that bogus path to the caller.
    dup_qc = '{}.dup.qc'.format(prefix)

    cmd = (
        'sambamba markdup -t {nth} --hash-table-size=17592186044416 '
        '--overflow-list-size=20000000 '
        '--io-buffer-size=256 {bam} {dupmark_bam} 2> {dup_qc}'
    ).format(nth=nth, bam=bam, dupmark_bam=dupmark_bam, dup_qc=dup_qc)
    run_shell_cmd(cmd)
    return dupmark_bam, dup_qc
def rm_dup_pe(dupmark_bam, nth, out_dir):
    """Filter a duplicate-marked paired-end BAM into a ``.nodup.bam`` file.

    Runs ``samtools view`` with ``-F 1804 -f 2``; resource options come from
    ``get_samtools_res_param('view', nth=nth)``.

    Args:
        dupmark_bam: Path to the duplicate-marked BAM; a trailing
            ``dupmark`` extension is stripped from the output prefix.
        nth: Thread count forwarded to ``get_samtools_res_param``.
        out_dir: Directory for the output BAM.

    Returns:
        Path of the filtered BAM.
    """
    bam_base = os.path.basename(strip_ext_bam(dupmark_bam))
    # strip extension appended in the previous step
    prefix = strip_ext(os.path.join(out_dir, bam_base), 'dupmark')
    nodup_bam = '{}.nodup.bam'.format(prefix)

    res_param = get_samtools_res_param('view', nth=nth)
    cmd = 'samtools view -F 1804 -f 2 -b {} {} > {}'.format(
        dupmark_bam, res_param, nodup_bam)
    run_shell_cmd(cmd)
    return nodup_bam
def pbc_qc_se(bam, mito_chr_name, out_dir):
    """Compute library-complexity counts for a single-ended BAM.

    The BAM is converted to BED, reduced to (chrom, start, end, strand),
    lines starting with ``mito_chr_name`` are dropped, and identical
    positions are counted. One tab-separated line is written with:
    ``mt`` (sum of counts), ``m0`` (distinct positions), ``m1`` (positions
    seen once), ``m2`` (positions seen twice), ``m0/mt``, ``m1/m0`` and
    ``m1/m2`` (``-1.0`` when ``m2`` is zero).

    Args:
        bam: Path to the input BAM; a trailing ``dupmark`` extension is
            stripped from the output prefix.
        mito_chr_name: Chromosome name whose reads are excluded.
        out_dir: Directory for the output file.

    Returns:
        Path of the written ``.lib_complexity.qc`` file.
    """
    bam_base = os.path.basename(strip_ext_bam(bam))
    # strip extension appended in the previous step
    prefix = strip_ext(os.path.join(out_dir, bam_base), 'dupmark')
    pbc_qc = '{}.lib_complexity.qc'.format(prefix)

    # Pipeline stages; literal awk braces are doubled for str.format.
    stages = [
        'bedtools bamtobed -i {bam}',
        'awk \'BEGIN{{OFS="\\t"}}{{print $1,$2,$3,$6}}\'',
        'grep -v "^{mito_chr_name}\\s"',
        'sort',
        'uniq -c',
        (
            'awk \'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} '
            '($1==2){{m2=m2+1}} {{m0=m0+1}} '
            '{{mt=mt+$1}} END{{m1_m2=-1.0; '
            'if(m2>0) m1_m2=m1/m2; m0_mt=0; '
            'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
            'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",'
            'mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}}\' > {pbc_qc}'
        ),
    ]
    pipeline = ' | '.join(stages).format(
        bam=bam,
        mito_chr_name=mito_chr_name,
        pbc_qc=pbc_qc,
    )
    run_shell_cmd(pipeline)
    return pbc_qc
def frip_shifted(ta, peak, chrsz, fraglen, out_dir):
    """Write the fraction of shifted reads that overlap peaks.

    Reads are adjusted with ``bedtools slop`` (strand-aware, by
    ``-half_fraglen`` on the left and ``+half_fraglen`` on the right), lines
    with invalid coordinates are dropped, and the number of reads
    intersecting any peak is divided by the line count of ``ta``.

    Args:
        ta: Path to the read alignment file passed to ``bedtools slop``.
        peak: Path to the peak file.
        chrsz: Path to the chromosome sizes file (``-g``).
        fraglen: Fragment length; half of ``fraglen + 1`` is used as the
            shift.
        out_dir: Directory for the output and temporary files.

    Returns:
        Path of the written ``.frip.qc`` file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)
    half_fraglen = (fraglen + 1) / 2

    reads_in_peaks = 0.0
    if get_num_lines(peak) != 0:
        # bedtools misbehaves when .gz is given for -a and -b, so intersect
        # against an uncompressed temporary copy of the peak file.
        peak_plain = gunzip(peak, 'tmp2', out_dir)

        pipeline = (
            'bedtools slop -i {ta} -g {chrsz} '
            '-s -l {left} -r {right} | '
            'awk \'{{if ($2>=0 && $3>=0 && $2<=$3) print $0}}\' | '
            'bedtools intersect -nonamecheck -a stdin -b {peak} '
            '-wa -u | wc -l'
        ).format(
            ta=ta,
            chrsz=chrsz,
            left=-half_fraglen,
            right=half_fraglen,
            peak=peak_plain,
        )
        reads_in_peaks = run_shell_cmd(pipeline)
        rm_f(peak_plain)

    total_reads = get_num_lines(ta)
    fraction = float(reads_in_peaks) / float(total_reads)
    write_txt(frip_qc, str(fraction))
    return frip_qc
def frip(ta, peak, out_dir):
    """Write the fraction of reads that overlap peaks.

    The number of lines of ``ta`` intersecting any peak (``bedtools
    intersect -wa -u``) is divided by the total line count of ``ta``. An
    empty peak file yields a numerator of ``0.0``.

    Args:
        ta: Path to the read alignment file.
        peak: Path to the peak file.
        out_dir: Directory for the output and temporary files.

    Returns:
        Path of the written ``.frip.qc`` file.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)

    tmp_files = []
    reads_in_peaks = 0.0
    if get_num_lines(peak) != 0:
        # bedtools misbehaves when .gz is given for -a and -b, so both
        # inputs are uncompressed to temporary files first.
        ta_plain = gunzip(ta, 'tmp1', out_dir)
        peak_plain = gunzip(peak, 'tmp2', out_dir)

        reads_in_peaks = run_shell_cmd(
            'bedtools intersect -nonamecheck -a {} -b {} -wa -u | wc -l'
            .format(ta_plain, peak_plain))
        tmp_files = [ta_plain, peak_plain]

    total_reads = get_num_lines(ta)
    fraction = float(reads_in_peaks) / float(total_reads)
    write_txt(frip_qc, str(fraction))
    rm_f(tmp_files)
    return frip_qc
def mark_dup_picard(bam, out_dir):  # shared by both se and pe
    """Mark (but do not remove) duplicates in a BAM with Picard.

    The JVM is started with a fixed 4G heap.

    Args:
        bam: Path to the input BAM; a trailing ``filt`` extension left by the
            previous step is stripped from the output prefix.
        out_dir: Directory for the output files.

    Returns:
        Tuple ``(dupmark_bam, dup_qc)``: the duplicate-marked BAM path and
        the Picard metrics file path.
    """
    bam_base = os.path.basename(strip_ext_bam(bam))
    # strip extension appended in the previous step
    prefix = strip_ext(os.path.join(out_dir, bam_base), 'filt')
    dupmark_bam = '{}.dupmark.bam'.format(prefix)
    dup_qc = '{}.dup.qc'.format(prefix)

    # NOTE: the Picard jar path is spliced into the template before
    # str.format runs, so it is subject to format-field parsing too.
    template = ''.join([
        'java -Xmx4G -XX:ParallelGCThreads=1 -jar ',
        locate_picard(),
        ' MarkDuplicates ',
        'INPUT={} OUTPUT={} ',
        'METRICS_FILE={} VALIDATION_STRINGENCY=LENIENT ',
        'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE ',
        'ASSUME_SORTED=true REMOVE_DUPLICATES=false',
    ])
    run_shell_cmd(template.format(bam, dupmark_bam, dup_qc))
    return dupmark_bam, dup_qc
示例#12
0
def pbc_qc_se(bam, mito_chr_name, mem_gb, out_dir):
    """Compute library-complexity counts for a single-ended BAM.

    The BAM is converted to BED, reduced to (chrom, start, end, strand),
    lines starting with ``mito_chr_name`` are dropped, and identical
    positions are counted. One tab-separated line is written with:
    ``mt`` (sum of counts), ``m0`` (distinct positions), ``m1`` (positions
    seen once), ``m2`` (positions seen twice), ``m0/mt``, ``m1/m0`` and
    ``m1/m2`` (``-1.0`` when ``m2`` is zero).

    Args:
        bam: Path to the input BAM; a trailing ``dupmark`` extension is
            stripped from the output prefix.
        mito_chr_name: Chromosome name whose reads are excluded.
        mem_gb: Memory in GB; converted to bytes and handed to
            ``get_gnu_sort_param`` with ``ratio=0.5`` to build sort options.
        out_dir: Directory for the output file.

    Returns:
        Path of the written ``.lib_complexity.qc`` file.
    """
    bam_base = os.path.basename(strip_ext_bam(bam))
    # strip extension appended in the previous step
    prefix = strip_ext(os.path.join(out_dir, bam_base), 'dupmark')
    pbc_qc = '{}.lib_complexity.qc'.format(prefix)

    # awk program that keeps chrom, start, end and strand.
    extract_prog = 'BEGIN{OFS="\\t"}{print $1,$2,$3,$6}'
    # awk program that tallies the per-position counts from `uniq -c`.
    tally_prog = (
        'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} '
        '($1==2){m2=m2+1} {m0=m0+1} '
        '{mt=mt+$1} END{m1_m2=-1.0; '
        'if(m2>0) m1_m2=m1/m2; m0_mt=0; '
        'if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; '
        'printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",'
        'mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}'
    )
    sort_param = get_gnu_sort_param(mem_gb * 1024**3, ratio=0.5)

    pipeline = (
        'bedtools bamtobed -i {} | '
        'awk \'{}\' | '
        'grep -v "^{}\\s" | sort {} | uniq -c | '
        'awk \'{}\' > {}'
    ).format(bam, extract_prog, mito_chr_name, sort_param, tally_prog,
             pbc_qc)
    run_shell_cmd(pipeline)
    return pbc_qc
示例#13
0
def peak_to_bigbed(peak, peak_type, chrsz, out_dir):
    """Convert a peak file into a bigBed file.

    The peak file is sorted, its score column (5th) is clamped to 0-1000,
    it is clipped with ``bedClip`` and finally converted with
    ``bedToBigBed`` using an autoSql (.as) description matching the peak
    type. Temporary files are removed afterwards.

    Args:
        peak: Path to the peak file (plain or gzipped).
        peak_type: One of narrowPeak, regionPeak, broadPeak or gappedPeak
            (case-insensitive).
        chrsz: Path to the chromosome sizes file.
        out_dir: Directory for the output and temporary files.

    Returns:
        Path of the written ``.bb`` file.

    Raises:
        Exception: If ``peak_type`` is not supported.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext(peak)))
    bigbed = '{}.{}.bb'.format(prefix, peak_type)
    as_file = '{}.as'.format(prefix)
    chrsz_tmp = '{}.chrsz.tmp'.format(prefix)
    bigbed_tmp = '{}.bb.tmp'.format(prefix)
    bigbed_tmp2 = '{}.bb.tmp2'.format(prefix)

    kind = peak_type.lower()
    if kind in ('narrowpeak', 'regionpeak'):
        bed_type = 'bed6+4'
        as_file_contents = '''table narrowPeak
"BED6+4 Peaks of signal enrichment based on pooled, normalized (interpreted) data."
(
    string chrom;        "Reference sequence chromosome or scaffold"
    uint   chromStart;   "Start position in chromosome"
    uint   chromEnd;     "End position in chromosome"
    string name;     "Name given to a region (preferably unique). Use . if no name is assigned"
    uint   score;        "Indicates how dark the peak will be displayed in the browser (0-1000) "
    char[1]  strand;     "+ or - or . for unknown"
    float  signalValue;  "Measurement of average enrichment for the region"
    float  pValue;       "Statistical significance of signal value (-log10). Set to -1 if not used."
    float  qValue;       "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used."
    int   peak;         "Point-source called for this peak; 0-based offset from chromStart. Set to -1 if no point-source called."
)
'''
    elif kind == 'broadpeak':
        bed_type = 'bed6+3'
        as_file_contents = '''table broadPeak
"BED6+3 Peaks of signal enrichment based on pooled, normalized (interpreted) data."
(
    string chrom;        "Reference sequence chromosome or scaffold"
    uint   chromStart;   "Start position in chromosome"
    uint   chromEnd;     "End position in chromosome"
    string name;     "Name given to a region (preferably unique). Use . if no name is assigned."
    uint   score;        "Indicates how dark the peak will be displayed in the browser (0-1000)"
    char[1]   strand;     "+ or - or . for unknown"
    float  signalValue;  "Measurement of average enrichment for the region"
    float  pValue;       "Statistical significance of signal value (-log10). Set to -1 if not used."
    float  qValue;       "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used."
)
'''
    elif kind == 'gappedpeak':
        bed_type = 'bed12+3'
        as_file_contents = '''table gappedPeak
"This format is used to provide called regions of signal enrichment based on pooled, normalized (interpreted) data where the regions may be spliced or incorporate gaps in the genomic sequence. It is a BED12+3 format."
    (
    string chrom;   "Reference sequence chromosome or scaffold"
    uint chromStart;    "Pseudogene alignment start position"
    uint chromEnd;      "Pseudogene alignment end position"
    string name;        "Name of pseudogene"
    uint score;          "Score of pseudogene with gene (0-1000)"
    char[1] strand;     "+ or - or . for unknown"
    uint thickStart;    "Start of where display should be thick (start codon)"
    uint thickEnd;      "End of where display should be thick (stop codon)"
    uint reserved;      "Always zero for now"
    int blockCount;     "Number of blocks"
    int[blockCount] blockSizes; "Comma separated list of block sizes"
    int[blockCount] chromStarts; "Start positions relative to chromStart"
    float  signalValue;  "Measurement of average enrichment for the region"
    float  pValue;       "Statistical significance of signal value (-log10). Set to -1 if not used."
    float  qValue;       "Statistical significance with multiple-test correction applied (FDR). Set to -1 if not used."
)
'''
    else:
        raise Exception('Unsupported peak file type {}!'.format(peak_type))
    bed_param = '-type={} -as={}'.format(bed_type, as_file)

    # create temporary .as file
    with open(as_file, 'w') as fp:
        fp.write(as_file_contents)

    commands = [
        # Work on a private copy of the chromosome sizes file.
        'cat {} > {}'.format(chrsz, chrsz_tmp),
        # Sort by chromosome and start, and clamp the score to 0-1000.
        (
            'zcat -f {} | LC_COLLATE=C sort -k1,1 -k2,2n | '
            'awk \'BEGIN{{OFS="\\t"}} {{if ($5>1000) $5=1000; '
            'if ($5<0) $5=0; print $0}}\' > {}'
        ).format(peak, bigbed_tmp),
        'bedClip {} {} {}'.format(bigbed_tmp, chrsz_tmp, bigbed_tmp2),
        'bedToBigBed {} {} {} {}'.format(
            bed_param, bigbed_tmp2, chrsz_tmp, bigbed),
    ]
    for command in commands:
        run_shell_cmd(command)

    # remove temporary files
    rm_f([as_file, chrsz_tmp, bigbed_tmp, bigbed_tmp2])

    return bigbed