Exemplo n.º 1
0
def step3_nucarray(conf_dict,logfile,filelist):
    '''
    Nucleosome-array step of the pipeline.

    Works inside <outputdirectory>/nucarray/ (created if needed, and made the
    current working directory).  Registers this step's output paths under
    conf_dict['Step3_nucarray'], then runs each stage whose output is either
    not recorded in `filelist` (check_filelist(...) == 0) or missing on disk:

      * smoothed/modified center wig   (modify_wig_signal)
      * array calling + awk filtering  (make_array, then awk on length/score)
      * gene-level array annotation    (generate_geneLevel_arrayAnnotation)
      * profile signal on the arrays   (signal_on_aray)
      * bigWig of the position signal  (wigToBigWig)

    generate_position_signal is always executed because its return value is
    needed as the background value (stored as 'bg_value').

    conf_dict : nested configuration mapping, updated in place
    logfile   : log handle/path passed through to wlog / rwlog
    filelist  : progress record passed through to check_filelist / flog
    return    : the same conf_dict object
    '''

    arraydir = conf_dict['General']['outputdirectory'] + 'nucarray/'
    mapping_dir = conf_dict['General']['outputdirectory'] + 'preprocess/'
    createDIR(arraydir)
    os.chdir(arraydir)

    sample = conf_dict['General']['outname']
    step3 = conf_dict['Step3_nucarray']

    # Register every output path of this step (same key order as before).
    for key, suffix in (
            ('modifywig_center', '_center_modify.wig'),
            ('positionwig_center', '_center_position.wig'),
            ('positionbw_center', '_center_position.bw'),
            ('arrayall', '_Nucleosome_Array_all.bed'),
            ('arrayselect', '_Nucleosome_Array.bed'),
            ('gene_array_anno', '_geneLevel_nucarrayAnnotation.bed'),
            ('profilebw_onarray', "_profile_on_Nucleosome_Array.bw")):
        step3[key] = arraydir + sample + suffix

    def needs_run(path):
        # A stage is (re)run when its output was never logged or is gone.
        return check_filelist(filelist, path) == 0 or not os.path.isfile(path)

    # --- modified center wig -------------------------------------------
    if needs_run(step3['modifywig_center']):
        conf_dict['Step1_preprocess']['centerwig'] = mapping_dir + sample + '_center.wig'
        modify_wig_signal(
            conf_dict['Step1_preprocess']['centerwig'],
            step3['modifywig_center'],
            step3['window_size'],
            step3['smooth_bandwidth'])
        flog(step3['modifywig_center'], filelist)
    else:
        wlog('modifywig_center wig exists.', logfile)

    # --- position signal (always computed; yields the background value) --
    bg_center = generate_position_signal(
        step3['modifywig_center'],
        step3['positionwig_center'],
        step3['window_size'])
    step3['bg_value'] = bg_center

    # --- array calling and filtering -----------------------------------
    if needs_run(step3['positionwig_center']):
        make_array(step3['positionwig_center'], step3['arrayall'], bg_center)
        flog(step3['arrayall'], filelist)
        # Keep arrays longer than array_length whose 5th column exceeds array_fold.
        array_cmd = """awk '{if ($3 - $2 > %s && $5 > %s) print $0;}' %s > %s """ % (
            step3['array_length'],
            step3['array_fold'],
            step3['arrayall'],
            step3['arrayselect'])
        rwlog(array_cmd, logfile)
        flog(step3['arrayselect'], filelist)
        wlog('Nucleosome array background value:%s' % bg_center, logfile)

    # --- gene-level annotation of the selected arrays ------------------
    if needs_run(step3['gene_array_anno']):
        generate_geneLevel_arrayAnnotation(
            step3['arrayselect'],
            conf_dict['Step1_preprocess']['gene_annotation'],
            step3['gene_array_anno'])
        flog(step3['gene_array_anno'], filelist)

    # --- profile signal restricted to the selected arrays --------------
    if needs_run(step3['profilebw_onarray']):
        conf_dict['Step1_preprocess']['genome_length_use'] = mapping_dir + '%s_GemomeLengthTmp.genome' % (sample)
        conf_dict['Step1_preprocess']['profilebdg'] = mapping_dir + sample + '_profile.bdg'
        signal_on_aray(
            conf_dict['Step1_preprocess']['profilebdg'],
            step3['arrayselect'],
            step3['profilebw_onarray'],
            conf_dict['Step1_preprocess']['genome_length_use'])
        flog(step3['profilebw_onarray'], filelist)

    # --- position wig -> bigWig ----------------------------------------
    if needs_run(step3['positionbw_center']):
        conf_dict['Step1_preprocess']['genome_length_use'] = mapping_dir + '%s_GemomeLengthTmp.genome' % (sample)
        cmd = 'wigToBigWig %s %s %s' % (
            step3['positionwig_center'],
            conf_dict['Step1_preprocess']['genome_length_use'],
            step3['positionbw_center'])
        rwlog(cmd, logfile)
        flog(step3['positionbw_center'], filelist)

    return conf_dict
Exemplo n.º 2
0
def step5_summary(conf_dict, logfile, filelist):
    '''
    Collect pipeline outputs into summary/ and write the LaTeX QC report.

    Steps, in order:
      1. create <outputdirectory>/summary/ and summary/plots/;
      2. (re)register the result paths of earlier steps in conf_dict, so the
         function also works when earlier steps were skipped on a restart;
      3. `mv` the result files into summary/ and `cp` the QC plots plus the
         features table into summary/plots/ (via rwlog, which is handed the
         shell command and the log file);
      4. read <outname>_Features.txt, derive Pass/Fail verdicts and build
         <outname>_summary.tex; if conf_dict['General']['latex'] == 1 the
         report is compiled with pdflatex and copied to summary/;
      5. remove intermediate files (whole step folders if conf_dict['clean']
         is true, otherwise a fixed list of temporaries) and the progress
         file list.

    conf_dict : nested configuration mapping, updated in place
    logfile   : log handle/path passed through to wlog / rwlog
    filelist  : unused here; kept for a uniform step signature
    return    : the same conf_dict object
    raises    : ValueError if the features table lacks a value the report
                needs
    '''
    outdir = conf_dict['General']['outputdirectory']
    summarydir = outdir + 'summary/'
    mapping_dir = outdir + 'preprocess/'
    arraydir = outdir + 'nucarray/'
    qcdir = outdir + 'QC/'
    createDIR(summarydir)
    os.chdir(summarydir)

    result_folder = outdir + 'summary/plots/'
    createDIR(result_folder)

    outname = conf_dict['General']['outname']
    step1 = conf_dict['Step1_preprocess']
    step2 = conf_dict['Step2_QC']
    step3 = conf_dict['Step3_nucarray']
    plot_custom = int(step2['plotcustom']) == 1

    # ------------------------------------------------------------------
    # confirm result paths
    # ------------------------------------------------------------------
    step1['profilebw'] = mapping_dir + outname + '_profile.bw'
    if conf_dict['General']['seqtype'] == 'PE':
        step1['mapped_bed'] = mapping_dir + outname + '.bed'
    else:
        step1['mapped_bed'] = step1['bed']
    step1['mapped_bb'] = mapping_dir + outname + '.bb'
    step2['ave_tssprofile'] = qcdir + outname + '_Tss_profile.txt'
    step2['fraglen'] = qcdir + outname + '_fraglen.txt'
    step3['arrayselect'] = arraydir + outname + '_Nucleosome_Array.bed'
    step3['gene_array_anno'] = arraydir + outname + '_geneLevel_nucarrayAnnotation.bed'
    step3['profilebw_onarray'] = arraydir + outname + "_profile_on_Nucleosome_Array.bw"
    step3['positionbw_center'] = arraydir + outname + '_center_position.bw'
    step1['centerbw'] = mapping_dir + outname + '_center.bw'

    # ------------------------------------------------------------------
    # collect results
    # ------------------------------------------------------------------
    def collect(path):
        rwlog('mv %s %s' % (path, summarydir), logfile)

    wlog('collect output files', logfile)
    collect(step1['profilebw'])
    collect(step1['mapped_bed'])
    if os.path.exists(step1['mapped_bb']):
        collect(step1['mapped_bb'])
    collect(step2['ave_tssprofile'])
    collect(step2['fraglen'])
    if plot_custom:
        collect(step2['custom_profile'])
    collect(step3['arrayselect'])
    collect(step3['gene_array_anno'])
    collect(step3['profilebw_onarray'])
    collect(step3['positionbw_center'])
    collect(step1['centerbw'])

    # plot material: the report is compiled inside summary/plots/, so the
    # figures and the features table are copied (not moved) next to it.
    for suffix in ('_Features.txt', "_seqCov.pdf", "_profile.pdf",
                   "_ATfrac.pdf", "_fraglen.pdf", "_Nucdep.pdf",
                   "_Nucfuzz.pdf", "_dhs.pdf", "_utr.pdf"):
        rwlog('cp %s %s' % (qcdir + outname + suffix, result_folder), logfile)

    os.chdir(result_folder)

    # ------------------------------------------------------------------
    # read features
    # ------------------------------------------------------------------
    # Each entry: line prefix in the features table -> converter applied to
    # the second whitespace-separated field.  Only values used by the report
    # are parsed.
    feature_parsers = (
        ('seq_coverage', lambda v: round(float(v), 2)),
        ('rot_score', lambda v: round(float(v), 4)),
        ('nuclen', lambda v: int(float(v))),
        ('NFRscore', lambda v: round(float(v), 4)),
        ('PSarray', lambda v: round(float(v), 4)),
        ('enrichment_on_UTR', float),
        ('enrichment_on_DHS', float),
    )
    features_file = outname + '_Features.txt'
    features = {}
    with open(features_file) as inf:
        for line in inf:
            for prefix, convert in feature_parsers:
                if line.startswith(prefix):
                    features[prefix] = convert(line.strip().split()[1])

    # Fail with a clear message instead of an unbound-local error later on.
    missing = [prefix for prefix, _ in feature_parsers
               if prefix not in features]
    if missing:
        raise ValueError('%s lacks required feature(s): %s' %
                         (features_file, ', '.join(missing)))

    seq_coverage = features['seq_coverage']
    rot_score = features['rot_score']
    nuclen = features['nuclen']
    NFRscore = features['NFRscore']
    PSarray = features['PSarray']
    UTR_fold = features['enrichment_on_UTR']
    DHS_fold = features['enrichment_on_DHS']

    # Pass/Fail verdicts; the cutoffs are the ones quoted in the report text.
    rot_judge = "Fail" if rot_score < 0.08 else "Pass"
    nuclen_judge = "Fail" if nuclen < 140 or nuclen > 155 else "Pass"
    NFR_judge = "Pass" if NFRscore >= 0.4 else "Fail"
    PSarray_judge = "Pass" if PSarray <= 0.4 else "Fail"
    UTR_judge = "Fail" if UTR_fold < 1 else "Pass"
    DHS_judge = "Fail" if DHS_fold < 2 else "Pass"

    def tex_basename(path):
        # File name without directories, escaped for LaTeX.
        return strlatexformat(path.split("/")[-1])

    wlog('generate qc documents', logfile)

    # ------------------------------------------------------------------
    # LaTeX document.  Templates are raw strings so that LaTeX backslashes
    # need no Python escaping (and "\usepackage" is valid on Python 3).
    # ------------------------------------------------------------------
    ### initiate
    QCdoc = r"""\documentclass[11pt,a4paper]{article}
\usepackage{tabularx}
\usepackage[english]{babel}
\usepackage{array}
\usepackage{graphicx}
\usepackage{color}
\DeclareGraphicsExtensions{.eps,.png,.pdf,.ps}
\begin{document}
\title{QC and analysis reports for MNase-seq data : %s}

\vspace{-1cm}
\maketitle
\tableofcontents
\newpage
\newpage
\section{Data description}
\begin{quotation}
Table 1 mainly describe the input file and mapping and analysis parameters.
\end{quotation}
\begin{table}[h]
\caption{Data description}\label{bstable}
\begin{tabularx}{\textwidth}{ |X|l| }

""" % (strlatexformat(outname))

    ### table1 prepare parameter
    if conf_dict['General']['seqtype'] == "PE":
        Seqtype = "Paired end"
        if conf_dict['General']['format'].upper() == "FASTQ":
            inputb = conf_dict['General']['inputb'].split("/")[-1]
        else:
            inputb = "NA"
    else:
        inputb = "NA"
        Seqtype = "Single end"
    # Always bound: previously undefined whenever the filter was switched off.
    if int(step1['q30filter']) == 1:
        q30filter = "True"
    else:
        q30filter = "False"
    if plot_custom:
        customRegion = conf_dict['General']['customregion'].split("/")[-1]
    else:
        customRegion = "NA"

    QCdoc += r"""      
\hline
parameter & value  \\
\hline
output name & %s \\
\hline
input file A & %s \\
\hline
input file B & %s \\
\hline
input format & %s  \\
\hline
sequencing type &  %s \\
\hline
genome version (species) & %s \\
\hline
Q30 filter mapped reads & %s \\
\hline
custom region & %s \\
\hline
\end{tabularx}
\end{table}
""" % (strlatexformat(outname),
       strlatexformat(conf_dict['General']['inputa'].split("/")[-1]),
       strlatexformat(inputb), conf_dict['General']['format'].upper(), Seqtype,
       step1['species'], q30filter,
       strlatexformat(customRegion))

    ###  QC component
    QCdoc += r"""
\newpage
\newpage
\section{QC component}
we calculated three key measurements: 1) sequencing coverage, 2) AA/TT/AT dinucleotide frequency and 3) nucleosomal DNA length distribution.
\subsection{Sequencing coverage}
\begin{quotation}
Sequencing coverage provides a direct measurement of the resolution of two features of nucleosome organization, i.e. occupancy and positioning (Struhl and Segal, 2013). Sequencing coverage is defined as: (Number of reads * 194bp)/(Effective genome size).  "Number of reads" is the number of mappable reads after MAPQ filtering (for single end data, for paired end it's the number of fragment).  "194bp" is the total length of nucleosome and linker estimated from historical data. "Effective genome size" is defined as 2.7e9 bps for humans and 1.87e9 bps for mice. Below we plotted the distribution of sequencing coverage of historical data; the sequencing coverage of input data was marked by vertical line: %s. 
\end{quotation}
\begin{figure}[h]
        \caption{Sequencing coverage} \label{fig:profileunion}
        \setlength{\abovecaptionskip}{0pt}
        \setlength{\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\textwidth]{%s}}
\end{figure}

\newpage
\newpage
\subsection{AA/TT/AT di-nucleotide frequency}
\begin{quotation}
The 10-base AA/TT/AT periodicity in nucleosomal DNA provides a measurement of nucleosome rotational positioning, which has been shown to be influenced by DNA sequence (Satchwell, et al., 1986). Mappable reads were sampled down to 10 million and were extended to 147bp in their 3'end direction. Then the aggregate AA/TT/AT di-nucleotide frequency across 4th - 143th bp of the extended reads was calculated (right). We conducted a Fourier transform on the aggregate frequency and used the energy of 10-bp periodicity (defined as rotational score) to show the extent the MNase-seq reads reflect nucleosome organization. Sample with rotational score greater than 0.08 was defined as "Pass" in this measurement, otherwise it's defined as "Fail". The cutoff 0.08 was determined from the distribution of rotational scores from all historical data (left, vertical line marked the rotational score input sample: %s [%s]).  
\end{quotation}
\begin{figure}[h]
        \caption{AA/TT/AT di-nucleotide frequency} \label{fig:profileunion}
        \setlength{\abovecaptionskip}{0pt}
        \setlength{\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\textwidth]{%s}}
\end{figure}

\newpage
\newpage
\subsection{Nucleosomal DNA length distribution}
\begin{quotation}
Nucleosomal DNA length distribution (refer to fragment length or MNase library size) is closely related and thus can reflect the degree of MNase digestion. For paired end sample, fragment length distribution from all mappable fragments was used directly to infer the nucleosomal DNA length distribution. For single end sample, we calculated a start-to-end distance to estimate the nucleosome length distribution: mappable reads were sampled down to 10 million and then we calculated the distribution of the distance from 5'end of each plus strand read to all 5'end of minus strand reads within 250bp downstream. Duplicate reads were discarded in this calculation. After the distribution of nucleosomal DNA length was generated, the length with highest frequency was defined as the estimated nucleosomal DNA length of the input sample (for both paired end and single end, left). Sample with nucleosomal DNA length within 140bp - 155bp was defined as "Pass", otherwise it's defined as "Fail". The cutoff was determined from the distribution of nucleosomal DNA length from all historical data (left, vertical line marked the nucleosomal DNA length of input sample: %s [%s]).  
\end{quotation}
\begin{figure}[h]
        \caption{Nucleosomal DNA length distribution} \label{fig:profileunion}
        \setlength{\abovecaptionskip}{0pt}
        \setlength{\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\textwidth]{%s}}
\end{figure}
""" % (
        seq_coverage,
        outname + "_seqCov.pdf",
        rot_score,
        rot_judge,
        outname + "_ATfrac.pdf",
        nuclen,
        nuclen_judge,
        outname + "_fraglen.pdf",
    )

    if plot_custom:
        customTEXT = " and the custom regions: %s" % (
            strlatexformat(customRegion))
        siteheat_filenames = strlatexformat(
            step2['ave_tssprofile'].split("/")[-1] + r' \& ' +
            step2['custom_profile'].split("/")[-1])
    else:
        customTEXT = ""
        siteheat_filenames = strlatexformat(
            step2['ave_tssprofile'].split("/")[-1])
    QCdoc += r"""
\newpage
\newpage
\subsection{Nucleosome profile on potential functional regions}
\begin{quotation}
CAM generated the average curve and the heatmap of nucleosome organization on promoter regions%s in 10bp resolution. Signal from minus strand regions were reversed in both heatmap and aggregate curve. The signal for each regions were also outputted as matrix: %s.  
\end{quotation}
\begin{figure}[h]
        \caption{Nucleosome profile on potential functional regions} \label{fig:profileunion}
        \setlength{\abovecaptionskip}{0pt}
        \setlength{\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\textwidth]{%s}}
\end{figure}
\newpage
\newpage
\subsection{Nucleosome depletion level and nucleosome fuzziness around TSS}
\begin{quotation}
Based on nucleosome profiles on promoters, CAM generated two scores to describe the nucleosome positioning on promoters. First, nucleosome depletion level described the fold change of the MNase-seq signal of nucleosome free regions compared to the +1 nucleosome and -1 nucleosome. The higher the nucleosome depletion level is, the deeper the nucleosome free region is. Lower nucleosome depletion level associated with weak or none nucleosome free regions, which may indicate reads from open chromatins. Samples with nucleosome depletion level higher than 0.4 was defined as "Pass", otherwise it's defined as "Fail".The cutoff was determined based on the distribution of nucleosome depletion level from all historical data (left, vertical line marked the nucleosome depletion level of input sample: %s [%s]).
Next, nucleosome fuzziness downstream TSS defined whether clear nucleosome positioning pattern was observed from downstream promoters. The nucleosome fuzziness was calculated by the coefficient of variance (CV) of the linker length between the +1, +2, +3 and +4 nucleosomes. The lower the nucleosome fuzziness is, the better nucleosome positioning was observed on promoters. Samples with nucleosome fuzziness lower than 0.4 was defined as "Pass", otherwise it's defined as "Fail".The cutoff was determined based on the distribution of nucleosome fuzziness scores from all historical data (right, vertical line marked the nucleosome fuzziness of input sample: %s [%s]).
\end{quotation}
\begin{figure}[h]
    \begin{minipage}[t]{0.5\linewidth}
        \centering
        \includegraphics[width=1.6in]{%s}
        \caption{nucleosome depletion}
        \label{fig:side:a}
        \end{minipage}
    \begin{minipage}[t]{0.5\linewidth}
        \centering
        \includegraphics[width=1.6in]{%s}
        \caption{nucleosome fuzziness}
        \label{fig:side:b}
    \end{minipage}
\end{figure}

""" % (customTEXT, siteheat_filenames, outname + "_profile.pdf",
       NFRscore, NFR_judge, PSarray, PSarray_judge,
       outname + "_Nucdep.pdf",
       outname + "_Nucfuzz.pdf")
    QCdoc += r"""
\newpage
\newpage
\subsection{Well-positioned nucleosome arrays}
\begin{quotation}
Regions with well-positioned nucleosome arrays are detected as previous described (Zhang, et al., 2014), and the enrichment in potential regulatory regions (downstream promoter and union DNase I hypersensitive sites (DHS sites)) is listed. Enrichment was defined as observed/expected percentage of nucleosome array on promoter ( $>$ 1 for enriched). Expected percentage was equal to the percentage of promoter length compared to the total length of effective genome. Similar approach was applied on union DHS sites.  For each region with well-positioned nucleosome array, its genomic coordinates together with nucleosome profile values were reported in the output file: %s. Nucleosome arrays with fold enrichment of DHS sites less than 2 is regarded as “Fail” in this measurement,while fold enrichment of UTR regions less than 1 is regarded as "Fail", indicating the well-positioned nucleosome arrays are more likely to be caused by random rather than the barrier model.
\end{quotation}
\begin{table}[h]
\caption{Enrichment of well-positioned nucleosome arrays}\label{bstable}
\begin{tabularx}{\textwidth}{ |X|X| }    
\hline
genomic region(Category) &  enrichment \\
\hline
downstream promoter & %s [%s] \\
\hline
union DHS sites &  %s [%s]\\
\hline
\end{tabularx}
\end{table}
\begin{figure}[h]   
  \begin{minipage}[t]{0.5\linewidth}   
    \centering   
    \includegraphics[width=2in]{%s}   
    \caption{enrichment on UTR}   
    \label{fig:side:a}   
  \end{minipage} 
  \begin{minipage}[t]{0.5\linewidth}   
    \centering   
    \includegraphics[width=2in]{%s}   
    \caption{enrichment on DHS}   
    \label{fig:side:b}   
  \end{minipage}
\end{figure}
""" % (tex_basename(step3['profilebw_onarray']),
       UTR_fold, UTR_judge, DHS_fold, DHS_judge,
       outname + "_utr.pdf",
       outname + "_dhs.pdf")
    QCdoc += r"""
\newpage
\newpage
\section{Output list}
\begin{quotation}
All output files were described in the following table
\end{quotation}
\begin{table}[h]
\caption{output list}\label{bstable}
\begin{tabularx}{\textwidth}{ |l|X| }
    
\hline
filename & description  \\
\hline
%s/%s & mapped reads on the genome  \\
\hline
%s & genome-wide nucleosome profile  \\
\hline
%s & genome-wide nucleosome dyad profile  \\
\hline
%s & nucleosome signal on promoter regions  \\
\hline
%s & nucleosome fragment length distribution  \\
""" % (tex_basename(step1['mapped_bed']),
       tex_basename(step1['mapped_bb']),
       tex_basename(step1['profilebw']),
       tex_basename(step1['centerbw']),
       tex_basename(step2['ave_tssprofile']),
       tex_basename(step2['fraglen']))
    if plot_custom:
        QCdoc += r"""
\hline
%s & nucleosome signal on custom regions \\         
""" % (tex_basename(step2['custom_profile']))
    QCdoc += r"""
\hline
%s & well-positioned nucleosome arrays \\
\hline
%s & gene level annotation of nuc-arrays \\
\hline
%s & nucleosome signal on well-positioned nucleosome arrays \\
\hline
%s & nucleosome array score signal on the genome \\
\hline
%s & summary QC report \\
\hline

\end{tabularx}
\end{table} 
\end{document} 

""" % (tex_basename(step3['arrayselect']),
       tex_basename(step3['gene_array_anno']),
       tex_basename(step3['profilebw_onarray']),
       tex_basename(step3['positionbw_center']),
       strlatexformat(outname) + r"\_summary.pdf")

    # ------------------------------------------------------------------
    # write the .tex file and, when available, compile it
    # ------------------------------------------------------------------
    latexfile = outname + '_summary.tex'
    with open(latexfile, 'w') as outf:
        outf.write(QCdoc)
    compile_cmd = "pdflatex %s" % (latexfile)
    publish_cmd = 'cp %s %s' % (outname + '_summary.pdf', summarydir)
    if conf_dict['General']['latex'] == 1:
        # Two passes: the document has a \tableofcontents, which pdflatex
        # only fills in on the run after the one that wrote the .toc file.
        rwlog(compile_cmd, logfile)
        rwlog(compile_cmd, logfile)
        rwlog(publish_cmd, logfile)
        # Drop pdflatex by-products (*_summary.aux/.log/.toc ...), keeping
        # the source, the report and any image/text companions.
        for files in os.listdir(result_folder):
            if os.path.isfile(files) and files[-12:-4] == "_summary":
                if not files[-4:] in ['.tex', '.pdf', '.png', '.txt']:
                    rwlog("rm %s" % (files), logfile)
        wlog(
            'pdflatex was detected in default PATH, generate summary report %s'
            % ('summary/' + outname + '_summary.pdf'),
            logfile)
    else:
        wlog(
            'pdflatex was not detected in default PATH, generate summary report .tex file in summary/plots folder, you can move the whole summary/plots/ folder to the environment with pdflatex installed and run cmd in the plots/ folder: "pdflatex %s"'
            % (outname + '_summary.tex'), logfile)

    # ------------------------------------------------------------------
    # clean up intermediate data
    # ------------------------------------------------------------------
    if conf_dict['clean']:
        wlog(
            '--clean pararmeter was turned on, remove preprocess, nucarray, QC and annotation folders',
            logfile)
        for folder in ('preprocess/', 'nucarray/', 'QC/', 'annotation/'):
            rwlog("rm -r %s" % (outdir + folder), logfile)
    else:
        wlog('--clean pararmeter was turned off, remove internal file only',
             logfile)
        rwlog("rm -r %s" % (outdir + 'annotation/'), logfile)
        # Each temporary is listed once; a repeated entry would only make
        # the second `rm` fail on an already-deleted file.
        for tmpname in ('extbed_tmp.bed',
                        'centerbed_tmp.bed',
                        'rawbed_sortbychrm_tmp.bed',
                        'clipsortbdg_TMPbed2bw.bdg',
                        'freglen_tmp.txt',
                        outname + '.bed.tmp',
                        outname + '_center.wig',
                        outname + '_GemomeLengthTmp.genome',
                        outname + '_profile.bdg'):
            rwlog("rm %s " % (outdir + 'preprocess/' + tmpname), logfile)

    rwlog("rm %s " % (outdir + 'progress_filelist.txt'), logfile)

    # NOTE(review): the message says "Step4" although this function is
    # step5_summary; left unchanged because log consumers may rely on it.
    wlog('Step4 summary DONE, check %s for final outputs' % (summarydir),
         logfile)

    return conf_dict
Exemplo n.º 3
0
def step2_QC(conf_dict, logfile, filelist):
    '''
    QC step
    mapping stat
    4 QC plots

    Works inside <outputdirectory>/QC/ (created if needed; the process
    changes its working directory to it) and produces, in order:
    the TSS profile, the optional custom region profile, the AT fraction
    table, the fragment length table, and finally the QC plots (QCplots.r)
    together with the <outname>_Features.txt summary.

    Each intermediate output is regenerated only when it is not recorded in
    filelist (check_filelist(...) == 0) or is missing on disk, so an
    interrupted run can be restarted. Path entries of conf_dict are
    re-derived here for the same reason.

    Parameters:
        conf_dict : nested configuration dictionary; the sections 'General',
                    'Step1_preprocess', 'Step2_QC', 'Step3_nucarray' and the
                    key 'rscript' are read, several path entries are written.
        logfile   : log target handed to wlog / rwlog.
        filelist  : progress record handed to check_filelist / flog.

    Returns:
        conf_dict, updated in place.
    '''
    # start
    ### create  QC dir and conduct QC
    general = conf_dict['General']
    qcdir = general['outputdirectory'] + 'QC/'
    mapping_dir = general['outputdirectory'] + 'preprocess/'
    arraydir = general['outputdirectory'] + 'nucarray/'
    createDIR(qcdir)
    os.chdir(qcdir)

    ### start QC plots

    ### TSS profile

    t1 = time.time()
    outname = general['outname']
    qc_conf = conf_dict['Step2_QC']
    # restore dictionary information that may be skipped due to process interruption and restart
    qc_conf['ave_tssprofile'] = qcdir + outname + '_Tss_profile.txt'
    step1_conf = conf_dict['Step1_preprocess']
    step1_conf['profilebw'] = mapping_dir + outname + '_profile.bw'
    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement and is also valid as a function call.
    print(qc_conf['ave_tssprofile'])
    if check_filelist(filelist, qc_conf['ave_tssprofile']) == 0 \
            or not os.path.isfile(qc_conf['ave_tssprofile']):
        tssprofile(step1_conf['profilebw'],
                   qc_conf['ave_tssprofile'],
                   step1_conf['gene_annotation'],
                   qc_conf['upstreamtss'],
                   qc_conf['downstreamtss'])
        flog(qc_conf['ave_tssprofile'], filelist)
        wlog('tss profile done, time: %s' % (time.time() - t1), logfile)
    else:
        wlog('tss profile has been finished.', logfile)

    ### custom region profile
    if int(qc_conf['plotcustom']) == 1:
        t2 = time.time()
        # restore dictionary information that may be skipped due to process interruption and restart
        qc_conf['custom_profile'] = qcdir + outname + '_CustomRegion_profile.txt'
        if check_filelist(filelist, qc_conf['custom_profile']) == 0 \
                or not os.path.isfile(qc_conf['custom_profile']):
            customprofile(step1_conf['profilebw'],
                          qc_conf['custom_profile'],
                          general['customregion'],
                          qc_conf['customregion_dis'])
            wlog('custom region profile done, time: %s' % (time.time() - t2),
                 logfile)
            flog(qc_conf['custom_profile'], filelist)
        else:
            # logfile was missing from this call, unlike every other wlog call
            wlog('custom region profile has been finished.', logfile)

    ### AT frac
    t3 = time.time()
    # restore dictionary information that may be skipped due to process interruption and restart
    qc_conf['at_frac'] = qcdir + outname + '_ATfrac.txt'
    step1_conf['sdplus1bpbed'] = mapping_dir + outname + '_sdplus1bp.bed'
    step1_conf['minus1bpbed'] = mapping_dir + outname + '_minus1bp.bed'
    step1_conf['genome_length_use'] = \
        mapping_dir + '%s_GemomeLengthTmp.genome' % (outname)
    genome_annotation_dir = general['outputdirectory'] + 'annotation/'
    # genome file name without its directory and without its last extension
    fasta_name_parts = step1_conf['genome_fasta'].split('/')[-1].split('.')
    if len(fasta_name_parts) > 1:
        SPNAME = ".".join(fasta_name_parts[:-1])
    else:
        SPNAME = fasta_name_parts[0]
    if step1_conf['usefa'] == 1:
        step1_conf['genome2bit'] = \
            genome_annotation_dir + SPNAME + '_genome2bit.2bit'
    else:
        step1_conf['genome2bit'] = step1_conf['genome_fasta']

    if check_filelist(filelist, qc_conf['at_frac']) == 0 \
            or not os.path.isfile(qc_conf['at_frac']):
        ATfrac = ATprofile(step1_conf['sdplus1bpbed'],
                           step1_conf['genome2bit'])
        # ATprofile's return type is not visible here, so plain indexing is
        # kept; rows are labelled starting from 4 (i + 4).
        with open(qc_conf['at_frac'], 'w') as outf2:
            for i in range(len(ATfrac)):
                outf2.write("\t".join(map(str, [i + 4, ATfrac[i]])) + "\n")
        wlog('AT profile done, time: %s' % (time.time() - t3), logfile)
        flog(qc_conf['at_frac'], filelist)
    else:
        wlog('AT profile has finished.', logfile)

    ### RR distance /frag len
    t4 = time.time()
    qc_conf['fraglen'] = qcdir + outname + '_fraglen.txt'
    fraglen_pending = check_filelist(filelist, qc_conf['fraglen']) == 0 \
        or not os.path.isfile(qc_conf['fraglen'])

    if general['seqtype'] == "SE":
        if fraglen_pending:
            nucleosome_length = reads_reads_distance(
                step1_conf['sdplus1bpbed'],
                step1_conf['minus1bpbed'],
                step1_conf['genome_length_use'],
                qc_conf['reads_distance_range'])
            with open(qc_conf['fraglen'], 'w') as outf3:
                for i in range(len(nucleosome_length)):
                    outf3.write(
                        "\t".join(map(str, [i, nucleosome_length[i]])) + "\n")
            flog(qc_conf['fraglen'], filelist)
            wlog('nucleosomal DNA length done, time: %s' % (time.time() - t4),
                 logfile)
    else:
        if fraglen_pending:
            # for non-SE data the table is copied from the preprocess directory
            rwlog(
                'cp %s %s' %
                (mapping_dir + 'freglen_tmp.txt',
                 qcdir + outname + '_fraglen.txt'),
                logfile)
            flog(qc_conf['fraglen'], filelist)
            wlog('nucleosomal DNA length done, time: %s' % (time.time() - t4),
                 logfile)

    ###  feature
    t5 = time.time()

    if step1_conf['species'] in ['hg38', 'hg19']:
        effective_gs = 2.7e9
    elif step1_conf['species'] in ['mm10', 'mm9']:
        effective_gs = 1.87e9
    else:
        effective_gs = int(
            calculate_genome_length(step1_conf['genome_length_use']))

    # seq coverage
    try:
        seq_coverage = float(qc_conf['map_reads']) * 194 / effective_gs
    except (KeyError, TypeError, ValueError):
        # map_reads is absent or unusable (e.g. after a restart):
        # count the lines of the bed file instead
        coverage_count_cmd = 'wc -l %s' % (mapping_dir + outname + '.bed')
        seq_coverage = float(int(
            sp(coverage_count_cmd)[0].split()[0])) * 194 / effective_gs
    # array number
    array_conf = conf_dict['Step3_nucarray']
    array_conf['arrayselect'] = arraydir + outname + '_Nucleosome_Array.bed'
    array_count_cmd = 'wc -l %s' % (array_conf['arrayselect'])
    array_num = int(sp(array_count_cmd)[0].split()[0])

    # array on UTR 3kb
    array_on_utr, total_utr_length = generate_utr_overlap(
        array_conf['arrayselect'], step1_conf['gene_annotation'], 3000)
    array_on_DHS, total_DHS_length = generate_DHS_overlap(
        array_conf['arrayselect'], step1_conf['union_dhs'])
    # fold = (fraction of arrays in the regions) / (fraction of genome covered by the regions)
    utr_fold = round((float(array_on_utr) / array_num) /
                     (float(total_utr_length) / effective_gs), 4)
    DHS_fold = round((float(array_on_DHS) / array_num) /
                     (float(total_DHS_length) / effective_gs), 4)

    ### generate QC plots
    cmd = 'Rscript %s %s %s %s %s %s %s %s %s' % (
        conf_dict['rscript'] + 'QCplots.r',
        outname,
        qc_conf['upstreamtss'],
        qc_conf['downstreamtss'],
        qc_conf['plotcustom'],
        seq_coverage,
        qc_conf['reads_distance_range'],
        utr_fold,
        DHS_fold)
    wlog(cmd, logfile)
    # the last four whitespace-separated tokens of the first element
    # returned by sp() are taken as the four scores
    rot_score, nuclen, NFRscore, PSarray = sp(cmd)[0].split()[-4:]

    features_path = qcdir + outname + '_Features.txt'
    feature_rows = [
        ('seq_coverage', seq_coverage),
        ('rot_score', rot_score),
        ('nuclen', nuclen),
        ('NFRscore', NFRscore),
        ('PSarray', PSarray),
        ('array_on_utr', array_on_utr),
        ('array_on_DHS', array_on_DHS),
        ('array_num', array_num),
        ('total_utr_length', total_utr_length),
        ('total_DHS_length', total_DHS_length),
        ('effective_gs', effective_gs),
        ('enrichment_on_UTR', utr_fold),
        ('enrichment_on_DHS', DHS_fold),
    ]
    with open(features_path, 'w') as outf4:
        for feature_row in feature_rows:
            outf4.write("\t".join(map(str, feature_row)) + "\n")
    flog(features_path, filelist)

    wlog("QC plots and feature summary done, time: %s" % (time.time() - t5),
         logfile)
    return conf_dict
Exemplo n.º 4
0
def step0_integrate_data(conf_dict, logfile):
    '''
    step0 integrate data 
    check and complement parameter

    Validates the user configuration and fills in derived entries:
    input format ('General'/'format'), whether the custom region profile is
    plotted ('Step2_QC'/'plotcustom'), default annotation paths
    (gene_annotation, genome_length, union_dhs), the genome file type
    ('Step1_preprocess'/'usefa') and pdflatex availability
    ('General'/'latex'). Relative paths are prefixed with
    conf_dict['General']['startdir'].

    Fatal problems are reported through ewlog, recoverable ones through
    wlog. NOTE(review): ewlog is defined elsewhere; its messages say
    "CAM exit", so it presumably aborts the run - confirm.

    Parameters:
        conf_dict : nested configuration dictionary, modified in place.
        logfile   : log target handed to wlog / ewlog.

    Returns:
        conf_dict, updated in place.
    '''
    wlog("Start CAM", logfile)
    wlog("Step0: Data integrate", logfile)

    general = conf_dict['General']

    ### check output name
    if "/" in general['outname']:
        # Single-quoted literal: with double quotes the embedded "/" ended
        # the string and the expression became a str / str division.
        ewlog(
            'outname is the name of all your output result, cannot contain "/", current outname is  %s'
            % (general['outname']), logfile)

    ### check data path of inputa
    if "~" in general['inputa']:
        ewlog(
            'require absolute path for input file, input file cannot contain "~", current input file is %s'
            % (general['inputa']), logfile)
    if not general['inputa'].startswith('/'):
        general['inputa'] = general['startdir'] + general['inputa']
    if not os.path.isfile(general['inputa']):
        ewlog("input file %s not found" % (general['inputa']), logfile)

    ### check data format
    if general['seqtype'] == "PE":
        wlog('sequencing type is set as PE(paired end)', logfile)
    elif general['seqtype'] == "SE":
        wlog('sequencing type is set as SE(single end)', logfile)
    else:
        ewlog(
            'seqtype can only be SE/PE, current value is %s' %
            (general['seqtype']), logfile)

    if general['inputa'].endswith('.fastq'):
        general['format'] = 'fastq'
        wlog('input file is raw sequencing file (fastq)', logfile)
        if general['seqtype'] == "PE":
            if general['inputb'].strip() == "":
                ewlog(
                    '2nd part of input file (inputb, fastq) is left blank while seqtype is PE (paired end). Please make sure you correctly input your data and specify the sequencing type. CAM exit',
                    logfile)
            elif not general['inputb'].endswith('.fastq'):
                ewlog(
                    '2nd part of input file (inputb) does not endwith .fastq. Please make sure you correctly input your data. CAM exit',
                    logfile)
            ### check data path of inputb
            if "~" in general['inputb']:
                ewlog(
                    'require absolute path for input file(part2), input file cannot contain "~", current input file(part2) is %s. CAM exit'
                    % (general['inputb']), logfile)
            if not general['inputb'].startswith('/'):
                general['inputb'] = general['startdir'] + general['inputb']
            if not os.path.isfile(general['inputb']):
                ewlog(
                    "input file(part2) %s not found. CAM exit" %
                    (general['inputb']), logfile)
        elif general['seqtype'] == "SE":
            if general['inputb'].strip() != "":
                wlog(
                    '2nd part of input file(inputb) is not left blank while seqtype is SE (single end), ignore inputb parameter',
                    logfile)

    elif general['inputa'].endswith('.sam'):
        general['format'] = 'sam'
        wlog('input file is aligned sequencing file (sam)', logfile)
        if general['inputb'].strip() != "":
            wlog(
                '2nd part of input file(inputb) is not left blank while input file is aligned sam file, ignore inputb parameter',
                logfile)
    elif general['inputa'].endswith('.bed'):
        general['format'] = 'bed'
        wlog('input file is aligned sequencing file (bed)', logfile)
        if general['inputb'].strip() != "":
            wlog(
                '2nd part of input file(inputb) is not left blank while input file is aligned bed file, ignore inputb parameter',
                logfile)
    else:
        ewlog(
            'input file is not in a proper format (fastq/sam/bed), current input file is %s. CAM exit'
            % (general['inputa']), logfile)

    ### check custom region
    # plotcustom starts enabled and is switched off by any failed check below
    qc_conf = conf_dict['Step2_QC']
    qc_conf['plotcustom'] = 1
    if general['customregion'].strip() == "":
        wlog('no custom region input, custom region profile will be skipped',
             logfile)
        qc_conf['plotcustom'] = 0
    if "~" in general['customregion']:
        wlog(
            'require absolute path for custom region, custom region cannot contain "~", current input file is %s, custom region profile will be skipped'
            % (general['customregion']), logfile)
        qc_conf['plotcustom'] = 0
    if not general['customregion'].startswith('/'):
        general['customregion'] = general['startdir'] + general['customregion']
    if not os.path.isfile(general['customregion']):
        wlog(
            "custom region %s not found, custom region profile will be skipped"
            % (general['customregion']), logfile)
        qc_conf['plotcustom'] = 0
    # elif: the content check (checkbed) only runs on a file that exists
    elif not general['customregion'].endswith('.bed') or checkbed(
            general['customregion']) == 0:
        wlog(
            'custom region is not in bed format or not endswith .bed, current name of custom region is %s, custom region profile will be skipped'
            % (general['customregion']), logfile)
        qc_conf['plotcustom'] = 0

    if qc_conf['plotcustom'] == 1:
        if int(qc_conf['customregion_dis']) > 5000 or int(
                qc_conf['customregion_dis']) < 200:
            # logfile used to sit inside the format tuple, which broke the
            # formatting and left wlog without its logfile argument
            wlog(
                "distance of custom region length (customregion_dis parameter) should be set between 200 ~ 5000, current value is %s, set customregion_dis to default 1000bp"
                % (qc_conf['customregion_dis']), logfile)
            qc_conf['customregion_dis'] = 1000

    ### check species parameter
    step1_conf = conf_dict['Step1_preprocess']
    species = step1_conf['species']
    if species == "":
        ewlog("species is not given, CAM exit", logfile)
    elif not species in ['hg38', 'hg19', 'mm10', 'mm9']:
        ewlog(
            "species should be chose from [hg38,hg19,mm10,mm9](case sensitive), current species is %s, CAM exit"
            % (species), logfile)
    else:
        anno_dir = conf_dict['default_anno_dir']
        # "detected" is reported only when all three annotation files exist
        annotation_found = True
        if not os.path.isfile(anno_dir + species + '_refgenes.txt'):
            ewlog(
                "gene annotation file for genome version: %s (file should be %s_refgenes.txt) is not detected, make sure you already have (don't remove) corresponded genome version installed, CAM exit"
                % (species, species), logfile)
            annotation_found = False
        if not os.path.isfile(anno_dir + species + '.genome'):
            ewlog(
                "genome length file for genome version: %s (file should be %s.genome) is not detected, make sure you already have corresponded genome version installed, CAM exit"
                % (species, species), logfile)
            annotation_found = False
        if not os.path.isfile(anno_dir + 'DHS_' + species + '.bed'):
            ewlog(
                "union DHS file for genome version: %s (file should be DHS_%s.bed) is not detected, make sure you already have corresponded genome version installed, CAM exit"
                % (species, species), logfile)
            annotation_found = False
        if annotation_found:
            wlog(
                "corresponded annotation files for %s it detected" %
                (species), logfile)

    ### use default annotation file, if species is correctly inputted
    step1_conf['gene_annotation'] = \
        conf_dict['default_anno_dir'] + step1_conf['species'] + '_refgenes.txt'
    step1_conf['genome_length'] = \
        conf_dict['default_anno_dir'] + step1_conf['species'] + '.genome'
    step1_conf['union_dhs'] = \
        conf_dict['default_anno_dir'] + 'DHS_' + step1_conf['species'] + '.bed'

    ### mapping index and 2bit
    if step1_conf['genome_fasta'] == "":
        ewlog("genome_fasta is not given, CAM exit", logfile)
    else:
        if "~" in step1_conf['genome_fasta']:
            ewlog(
                "genome_fasta file: %s is not absolute path, CAM exit" %
                (step1_conf['genome_fasta']), logfile)
        if not step1_conf['genome_fasta'].startswith("/"):
            step1_conf['genome_fasta'] = \
                general['startdir'] + step1_conf['genome_fasta']
        if os.path.isfile(step1_conf['genome_fasta']):
            # usefa = 1 for a fasta genome, 0 for a 2bit genome
            if step1_conf['genome_fasta'].endswith('.fa') and checkfa(
                    step1_conf['genome_fasta']) == 1:
                wlog(
                    "genome_fasta file: %s is detected (fasta format)" %
                    (step1_conf['genome_fasta']), logfile)
                step1_conf['usefa'] = 1
            elif step1_conf['genome_fasta'].endswith('.2bit'):
                wlog(
                    "genome_fasta file: %s is detected (2bit format)" %
                    (step1_conf['genome_fasta']), logfile)
                step1_conf['usefa'] = 0
            else:
                ewlog(
                    "genome_fasta file: %s is not in fasta(.fa) or 2bit(.2bit) format, CAM exit"
                    % (step1_conf['genome_fasta']), logfile)

        else:
            ewlog(
                "genome_fasta file: %s is not a regular file, CAM exit" %
                (step1_conf['genome_fasta']), logfile)

    ### check options
    try:
        int(step1_conf['mapping_p'])
    except (ValueError, TypeError):
        ewlog(
            'mapping_p should be int, current value is %s' %
            (step1_conf['mapping_p']), logfile)
    # trim3end is a number of bases, so only negative values are rejected.
    # A copy-pasted q30filter check that also forced it to 0/1 was removed.
    if int(step1_conf['trim3end']) < 0:
        ewlog(
            'trim3end should be greater/equal than 0, current value is %s' %
            (step1_conf['trim3end']), logfile)

    if not int(step1_conf['q30filter']) in [0, 1]:
        ewlog(
            'q30filter measurement can only be 0/1, current value is %s' %
            (step1_conf['q30filter']), logfile)
    if not int(step1_conf['rpm']) in [0, 1]:
        ewlog(
            'rpm function can only be 0/1, current value is %s' %
            (step1_conf['rpm']), logfile)

    if not int(qc_conf['sample_down_reads']) > 1000000:
        ewlog(
            'sample_down_reads should greater than 1000000 to make sure enough reads for QC step, current value is %s'
            % (qc_conf['sample_down_reads']), logfile)

    # the three settings below are adjusted to their minimum instead of aborting
    if not int(qc_conf['upstreamtss']) > 500:
        wlog(
            'upstreamtss should be greater than 500 , current value is %s, CAM adjust upstreamtss to 500bp'
            % (qc_conf['upstreamtss']), logfile)
        qc_conf['upstreamtss'] = 500

    if not int(qc_conf['downstreamtss']) > 500:
        wlog(
            'downstreamtss should be greater than 500 , current value is %s, CAM adjust downstreamtss to 500bp'
            % (qc_conf['downstreamtss']), logfile)
        qc_conf['downstreamtss'] = 500

    if not int(qc_conf['reads_distance_range']) > 150:
        wlog(
            'reads_distance_range should be greater than 150 , current value is %s, CAM adjust reads_distance_range to 150bp'
            % (qc_conf['reads_distance_range']), logfile)
        qc_conf['reads_distance_range'] = 150
    wlog('check options: DONE ', logfile)

    ### check pdflatex
    # an empty first element from sp() is taken to mean pdflatex is unavailable
    if sp('pdflatex --help')[0] == "":
        wlog(
            'pdflatex was not installed, CAM is still processing but no summary QC report generated',
            logfile)
        general['latex'] = 0
    else:
        general['latex'] = 1

    wlog('Step0 Data integrate DONE', logfile)

    return conf_dict
Exemplo n.º 5
0
def step1_preprocess(conf_dict,logfile,filelist):
    '''
    main data processing step, including mapping, transformation
    for fastq format : 
        bowtie mapping,
        q30 filter (optional),
        transform to bed 
    for sam format:
        q30 filter (optional),
        transform to bed    
    ''' 
    ### create annotation dir
    t0 = time.time()

    wlog('generate annotation files',logfile)
    genome_annotation_dir = conf_dict['General']['outputdirectory'] + 'annotation/'
    createDIR(genome_annotation_dir)
    os.chdir(genome_annotation_dir)
    
    SPNAME_raw = conf_dict['Step1_preprocess']['genome_fasta'].split('/')[-1].split('.')
    if len(SPNAME_raw) > 1:
        SPNAME = ".".join(SPNAME_raw[:-1])
    else:
        SPNAME = SPNAME_raw[0]
    if conf_dict['Step1_preprocess']['usefa'] == 1:

        genome2bit_gen_cmd = 'faToTwoBit %s %s_genome2bit.2bit'%(conf_dict['Step1_preprocess']['genome_fasta'],SPNAME)
        rwlog(genome2bit_gen_cmd,logfile)
        conf_dict['Step1_preprocess']['genome2bit'] = genome_annotation_dir + SPNAME + '_genome2bit.2bit'

    else:
        conf_dict['Step1_preprocess']['genome2bit'] = conf_dict['Step1_preprocess']['genome_fasta']
        fa_gen_cmd = 'twoBitToFa %s %s_genomefa.fa'%(conf_dict['Step1_preprocess']['genome2bit'],SPNAME)
        rwlog(fa_gen_cmd,logfile)
        conf_dict['Step1_preprocess']['genome_fasta'] = genome_annotation_dir + SPNAME + '_genomefa.fa'
            
    if conf_dict['General']['format'] == 'fastq':
        wlog('check bowtie index',logfile)
        if conf_dict['Step1_preprocess']['mapindex'] == "":
            wlog("no bowtie mapindex inputted",logfile)
            generate_index = 1
        elif not os.path.isfile( conf_dict['Step1_preprocess']['mapindex']+'.1.ebwt' ):
            wlog("cannot find inputted bowtie index file : %s "%(conf_dict['Step1_preprocess']['mapindex']+'.1.ebwt'),logfile)
            generate_index = 1
        else:
            generate_index = 0
        if generate_index == 1:
            wlog('generate bowtie index',logfile)
            if conf_dict['Step1_preprocess']['usefa'] == 1:
                genome2bit_gen_cmd = 'faToTwoBit %s %s_genome2bit.2bit'%(conf_dict['Step1_preprocess']['genome_fasta'],SPNAME)
                rwlog(genome2bit_gen_cmd,logfile)
                conf_dict['Step1_preprocess']['genome2bit'] = genome_annotation_dir + SPNAME + '_genome2bit.2bit'            
            else:
                conf_dict['Step1_preprocess']['genome2bit'] = conf_dict['Step1_preprocess']['genome_fasta']
                fa_gen_cmd = 'twoBitToFa %s %s_genomefa.fa'%(conf_dict['Step1_preprocess']['genome2bit'],SPNAME)
                rwlog(fa_gen_cmd,logfile)
                conf_dict['Step1_preprocess']['genome_fasta'] = genome_annotation_dir + SPNAME + '_genomefa.fa'
            mapindex_gen_cmd = 'bowtie-build %s %s_bowtie_index'%(conf_dict['Step1_preprocess']['genome_fasta'],SPNAME)
            rwlog(mapindex_gen_cmd,logfile)
            conf_dict['Step1_preprocess']['mapindex'] = genome_annotation_dir + SPNAME + '_bowtie_index'
            wlog('generate bowtie index done: %s'%(conf_dict['Step1_preprocess']['mapindex']),logfile)
        else:
            wlog('use user inputted bowtie index',logfile)
        
    wlog("prepare annotation done, time: %s"%(time.time()-t0),logfile)
    ### create mapping dir 
    mapping_dir = conf_dict['General']['outputdirectory'] + 'preprocess/'
    createDIR(mapping_dir)
    
    ### check reads file format , 
    ## start mapping step if format is fastq
    os.chdir(mapping_dir)

    if conf_dict['General']['format'] == 'fastq':
        ### sam file name
        conf_dict['Step1_preprocess']['sam'] = mapping_dir + conf_dict['General']['outname'] + '.sam'
        conf_dict['Step1_preprocess']['maplog'] = mapping_dir + conf_dict['General']['outname'] + '.bowtieout'
        # check the progress for mapping
        if check_filelist(filelist,conf_dict['Step1_preprocess']['sam'])==0 or not os.path.isfile(conf_dict['Step1_preprocess']['sam']):
            wlog('Now start mapping in %s , all mapping result will be here'%(mapping_dir),logfile)
            t1 = time.time()
            ### judge fastq reads length
            ReadLengthA = fastq_reads_length(conf_dict['General']['inputa'])
            if ReadLengthA[1] == 'difflen':
                wlog('WARNING: reads length in input fastq %s is not consistant'%(conf_dict['General']['inputa']),logfile)
            if conf_dict['General']['seqtype'] == 'PE':
                ReadLengthB = fastq_reads_length(conf_dict['General']['inputb'])
                if ReadLengthB[1] == 'difflen':
                    wlog('WARNING: reads length in input fastq (part2) %s is not consistant, nucpipe exit'%(conf_dict['General']['inputb']),logfile)
                if ReadLengthA[0] != ReadLengthB[0]:
                    wlog('WARNING: read length in 2 part of input fastq file is different',logfile)
                ReadLength = min(ReadLengthA[0] ,ReadLengthB[0])
            else:
                ReadLength = ReadLengthA[0]
            wlog('reads length is detected as %s'%(str(ReadLength)),logfile)
            
            ### check : read_length - trim3end > 18 
            if int(conf_dict['Step1_preprocess']['trim3end']) == 0:
                pass
            else:
                read_left_length = ReadLength - int(conf_dict['Step1_preprocess']['trim3end'])
                if read_left_length < 18:
                    ewlog('user set trim3end length is %s, the left reads length is %sbp, less than 18bp, CAM exit'%(conf_dict['Step1_preprocess']['trim3end'],str(read_left_length)),logfile)
            
            ### check bowtie software
            if sp('which bowtie')[0].strip() == "":
                ewlog('bowtie is not detected in default PATH, make sure you installed bowtie and export it into default PATH',logfile)
            # SE / PE bowtie mapping
            if conf_dict['General']['seqtype'] == 'PE':
                wlog('seqtype is PE (paired end), mapping with bowtie paired end mode',logfile)
                mapping_cmd = 'bowtie -X %s -3 %s --chunkmbs 256 -m 1 -p %s -S %s -1 %s -2 %s  %s 2>&1 >>/dev/null |tee -a %s'%( \
                               conf_dict['Step1_preprocess']['fragment_length_limit'],\
                               conf_dict['Step1_preprocess']['trim3end'], \
                               conf_dict['Step1_preprocess']['mapping_p'], \
                               conf_dict['Step1_preprocess']['mapindex'], \
                               conf_dict['General']['inputa'], \
                               conf_dict['General']['inputb'], \
                               conf_dict['Step1_preprocess']['sam'], \
                               conf_dict['Step1_preprocess']['maplog'])
                rwlog(mapping_cmd,logfile)
            elif conf_dict['General']['seqtype'] == 'SE':
                wlog('seqtype is SE (single end), mapping with bowtie single end mode',logfile)
                mapping_cmd = 'bowtie -3 %s --chunkmbs 256 -m 1 -p %s -S %s  %s  %s   >>/dev/null |tee -a %s'%( \
                               conf_dict['Step1_preprocess']['trim3end'], \
                               conf_dict['Step1_preprocess']['mapping_p'], \
                               conf_dict['Step1_preprocess']['mapindex'], \
                               conf_dict['General']['inputa'], \
                               conf_dict['Step1_preprocess']['sam'], \
                               conf_dict['Step1_preprocess']['maplog'])
                rwlog(mapping_cmd,logfile)
                # write the finished file into progress_filelist.txt
                flog(conf_dict['Step1_preprocess']['sam'],filelist)
            else:
                ewlog('wrong seqtype, current seqtype is %s'%(conf_dict['General']['seqtype']))
            wlog('mapping done, time: %s'%(time.time()-t1),logfile)
        else:
            wlog('mapping has been finished.',logfile)
    ### for sam/bed file, skip mapping step        
    elif conf_dict['General']['format'] == 'sam':
        wlog('input file format is sam, skip mapping step',logfile)
        conf_dict['Step1_preprocess']['sam'] = conf_dict['General']['inputa']
    elif conf_dict['General']['format'] == 'bed':
        wlog('input file format is bed, skip mapping step',logfile)
    else: 
        ewlog('input file is not in a proper format (fastq/sam/bed), current input file is %s. CAM exit'%(conf_dict['General']['inputa']),logfile)                

    ### transform sam to bed
    
    t2 = time.time()
    if conf_dict['General']['seqtype']=='PE':
        conf_dict['Step1_preprocess']['bed'] = mapping_dir + conf_dict['General']['outname'] + '_PEtoSE.bed'
    else:
        conf_dict['Step1_preprocess']['bed'] = mapping_dir + conf_dict['General']['outname'] + '.bed'
    if conf_dict['General']['format'] == 'fastq' or conf_dict['General']['format'] == 'sam':
        # check the progress for samtobed
        if check_filelist(filelist,conf_dict['Step1_preprocess']['bed'])==0 or not os.path.isfile(conf_dict['Step1_preprocess']['bed']):
            map_reads, fraglen_Dict = transform_sam2bed(conf_dict['Step1_preprocess']['sam'],\
                                                        conf_dict['Step1_preprocess']['bed'],\
                                                        conf_dict['General']['seqtype'],\
                                                        conf_dict['Step1_preprocess']['q30filter'],\
                                                        conf_dict['Step1_preprocess']['fragment_length_limit'])
            # write the finished file into progress_filelist.txt
            flog(conf_dict['Step1_preprocess']['bed'],filelist)

    else:
        ## for PEbed input, convert to SEbed, output only one strand for the fragments with only 1bp; for SEbed input, check format
        # check the progress for samtobed
        # finished_files = open(filelist).readlines()
        # finished_files = [i.strip() for i in finished_files]
        # status = 1
        # if conf_dict['Step1_preprocess']['bed'] not in finished_files:
        #     status = 0
        # print status
        # print check_filelist(filelist,conf_dict['Step1_preprocess']['bed'])==0
        # sys.exit(1)
        # print check_filelist(filelist,conf_dict['Step1_preprocess']['bed'])==0
        # sys.exit(1)
        if check_filelist(filelist,conf_dict['Step1_preprocess']['bed'])==0 or not os.path.isfile(conf_dict['Step1_preprocess']['bed']):
            map_reads, fraglen_Dict = transform_bed2bed(conf_dict['General']['inputa'],\
                                                        conf_dict['Step1_preprocess']['bed'],\
                                                        conf_dict['General']['seqtype'],\
                                                        conf_dict['Step1_preprocess']['q30filter'],\
                                                        conf_dict['Step1_preprocess']['fragment_length_limit'])
            flog(conf_dict['Step1_preprocess']['bed'],filelist)
            conf_dict['Step2_QC']['map_reads'] = map_reads
            conf_dict['Step2_QC']['freglen_dict'] = fraglen_Dict
            outf3 = open(mapping_dir+'freglen_tmp.txt','w')
            for i in sorted(conf_dict['Step2_QC']['freglen_dict'].keys()):
                newll = [i,conf_dict['Step2_QC']['freglen_dict'][i]]
                outf3.write("\t".join(map(str,newll))+"\n")
            outf3.close()
            flog(mapping_dir+'freglen_tmp.txt',filelist)
    # fraglen_Dict should be write into files

    transform_time = time.time() -t2
    wlog('transforming done, time: %s'%(transform_time),logfile)
    
    t3 = time.time()
    
    ### generate extbw, +1bpbw, -1bpbw, 10M +1bp bed, 10M -1bp bed
    conf_dict['Step1_preprocess']['profilebw'] = mapping_dir + conf_dict['General']['outname'] + '_profile.bw'
    conf_dict['Step1_preprocess']['profilebdg'] = mapping_dir + conf_dict['General']['outname'] + '_profile.bdg'
    conf_dict['Step1_preprocess']['centerwig'] = mapping_dir + conf_dict['General']['outname'] + '_center.wig'
    conf_dict['Step1_preprocess']['sdplus1bpbed'] = mapping_dir + conf_dict['General']['outname'] + '_sdplus1bp.bed'
    conf_dict['Step1_preprocess']['minus1bpbed'] = mapping_dir + conf_dict['General']['outname'] + '_minus1bp.bed'

    conf_dict['Step1_preprocess']['genome_length_use'] = mapping_dir + '%s_GemomeLengthTmp.genome'%(conf_dict['General']['outname'])
    correct_genome_length(conf_dict['Step1_preprocess']['genome_length'],conf_dict['Step1_preprocess']['genome_length_use'])
    # check the progress for bed2bw
    if check_filelist(filelist,conf_dict['Step1_preprocess']['centerwig'])==0 or not os.path.isfile(conf_dict['Step1_preprocess']['centerwig']):
        if not conf_dict['Step2_QC'].has_key('map_reads'):
            coverage_count_cmd = 'wc -l %s' % conf_dict['Step1_preprocess']['bed']
            # print coverage_count_cmd
            # print os.system(coverage_count_cmd)
            # print os.popen(coverage_count_cmd).read()
            conf_dict['Step2_QC']['map_reads'] = int(sp(coverage_count_cmd)[0].split()[0])
        print conf_dict['Step2_QC']['map_reads']            
        bed2allbw(conf_dict['Step1_preprocess']['bed'],\
                  conf_dict['Step1_preprocess']['genome_length_use'],\
                  conf_dict['Step1_preprocess']['profilebdg'],\
                  conf_dict['Step1_preprocess']['profilebw'],\
                  conf_dict['Step1_preprocess']['centerwig'],\
                  conf_dict['Step1_preprocess']['minus1bpbed'],\
                  conf_dict['Step1_preprocess']['sdplus1bpbed'],\
                  conf_dict['Step2_QC']['map_reads'],\
                  conf_dict['Step2_QC']['sample_down_reads'],\
                  conf_dict['Step1_preprocess']['rpm'])
        flog(conf_dict['Step1_preprocess']['centerwig'],filelist)
    else:
        wlog('pileup has been finished.',logfile)
    bed2bw_time = time.time() -t3
    wlog('pileup done, time: %s'%(bed2bw_time),logfile)
    # transform bed to bb
    # for fastq input, samtool view to transform sam to bed
    # than sort the bed file, and transform to bb
    if conf_dict['General']['seqtype']=='PE':
        conf_dict['Step1_preprocess']['mapped_bed'] = mapping_dir + conf_dict['General']['outname'] + '.bed'
        if check_filelist(filelist,conf_dict['Step1_preprocess']['mapped_bed'])==0  or not os.path.isfile(conf_dict['Step1_preprocess']['mapped_bed']):
            if conf_dict['General']['format']!='bed':
                cmd = 'samtools view %s -Sb | bamToBed -i stdin > %s' % (conf_dict['Step1_preprocess']['sam'],conf_dict['Step1_preprocess']['mapped_bed'])
                rwlog(cmd,logfile)
                flog(conf_dict['Step1_preprocess']['mapped_bed'],filelist)
            else:
                cmd = 'cp %s %s' % (conf_dict['General']['inputa'],conf_dict['Step1_preprocess']['mapped_bed'])
                rwlog(cmd,logfile)
                flog(conf_dict['Step1_preprocess']['mapped_bed'],filelist)
    else:
        conf_dict['Step1_preprocess']['mapped_bed'] = conf_dict['Step1_preprocess']['bed']
    
    conf_dict['Step1_preprocess']['mapped_bb'] = mapping_dir + conf_dict['General']['outname'] + '.bb'

    # cmd = 'awk \'{FS="\\t";OFS="\\t";if (NF==6 && $2>=0 && $3>=0) print $0;}\' %s | sort -k1,1 -k2,2n > %s.tmp' % (conf_dict['Step1_preprocess']['mapped_bed'],conf_dict['Step1_preprocess']['mapped_bed'])
    # try :
    if check_filelist(filelist,'%s.tmp' % conf_dict['Step1_preprocess']['mapped_bed'])==0 or not os.path.isfile(conf_dict['Step1_preprocess']['mapped_bed']):
        cmd = 'awk %s{FS="\\t";OFS="\\t";if (NF==6 && $2>=0 && $3>=0) print $0;}%s %s | sort -k1,1 -k2,2n > %s.tmp' % ("'","'",conf_dict['Step1_preprocess']['mapped_bed'],conf_dict['Step1_preprocess']['mapped_bed'])
        rwlog(cmd,logfile)
        flog('%s.tmp' % conf_dict['Step1_preprocess']['mapped_bed'],filelist)
    if sp('which bedToBigBed')[0].strip() == "":
        ewlog('bedToBigBed is not detected in default PATH, make sure you installed bowtie and export it into default PATH',logfile)
    else:
        if check_filelist(filelist,conf_dict['Step1_preprocess']['mapped_bb'])==0 or not os.path.isfile(conf_dict['Step1_preprocess']['mapped_bb']):
            cmd = 'bedToBigBed %s.tmp %s %s' % (conf_dict['Step1_preprocess']['mapped_bed'],conf_dict['Step1_preprocess']['genome_length_use'],conf_dict['Step1_preprocess']['mapped_bb'])
            rwlog(cmd,logfile)
            flog(conf_dict['Step1_preprocess']['mapped_bb'],filelist)
    # cmd = 'rm %s.tmp ' % conf_dict['Step1_preprocess']['mapped_bed']
    # rwlog(cmd,logfile)
    # except:
        # print 'Failed to transform bed to bb.'

    return conf_dict
Exemplo n.º 6
0
def main():
    """Command-line entry point: validate the config, then run the CAM steps.

    Reads the parsed command-line arguments (``config``, ``fover``, ``Clean``
    and ``task`` are the attributes used here), loads the configuration with
    ``read_conf``, normalises ``outname`` / ``outputdirectory``, creates and
    enters the output directory, and then runs the pipeline steps in order:

    * Step0 (preparation) and Step1 (data-preprocess) always run, although
      Step1 is skipped when its final ``.bb`` file already exists and ``-f``
      is off.
    * ``task >= 2`` additionally runs the nucleosome-array step.
    * ``task >= 3`` additionally runs the QC step.
    * ``task == 4`` additionally runs the summary step.

    Exits with status 1 (via ``sys.exit``) when ``outname`` is blank or
    contains ``~``, when ``outputdirectory`` contains ``~``, or when the
    output directory path already exists as a regular file.
    """
    args = parse_args()

    conf_dict = read_conf(args.config)
    general = conf_dict['General']
    ### read raw path of output dir, the startdir will be used when the input file is not in absolute path
    general['startdir'] = os.getcwd() + '/'

    ### check output name and dir from input parameter
    if general['outname'] == "":
        print('your outname cannot be left blank,exit')
        sys.exit(1)
    if "." in general['outname']:
        oldname = general['outname']
        newname = oldname.replace(".", "-")
        general['outname'] = newname
        print('replace outname from %s to %s for latex summary' % (oldname,
                                                                   newname))
    if general['outputdirectory'] == "":
        general['outputdirectory'] = general['outname']
        print('output directory is blank, use outname as directory name and set output directory in current folder')
    if "~" in general['outname']:
        print('ERROR: ~ cannot appeared in outname, current outname is %s' % (
            general['outname']))
        sys.exit(1)
    if "~" in general['outputdirectory']:
        print('ERROR: require absolute path for outputdirectory')
        sys.exit(1)
    if not general['outputdirectory'].endswith('/'):
        general['outputdirectory'] += '/'
    if not general['outputdirectory'].startswith('/'):
        # relative output directories are anchored at the launch directory
        general['outputdirectory'] = general['startdir'] + general['outputdirectory']

    ### create output dir
    if os.path.isfile(general['outputdirectory'].rstrip("/")):
        print('ERROR: name of your output dir %s is exist as a file, cannot create a dir,CAM exit' % (
            general['outputdirectory'].rstrip("/")))
        sys.exit(1)
    elif os.path.isdir(general['outputdirectory']):
        if not args.fover:
            print('name of your output dir %s is exist as a dir, overwrite function (-f) is turned off, write output result in existing dir' % (
                general['outputdirectory']))
        else:
            print('name of your output dir %s is exist as a dir, overwrite function (-f) is turned on, write output result in existing dir' % (
                general['outputdirectory']))
    else:
        # os.makedirs instead of an unquoted shell `mkdir`: it copes with
        # spaces/metacharacters in the path, creates missing parents, and
        # raises immediately on failure instead of failing silently.
        os.makedirs(general['outputdirectory'])

    ### move to output dir
    os.chdir(general['outputdirectory'])
    ## cp config file to output folder
    if args.config.startswith('~') or args.config.startswith('/'):
        cmd = 'cp %s .' % (args.config)
    else:
        cmd = 'cp %s .' % (general['startdir'] + args.config)
    CMD(cmd)
    ### specify the main progress log file
    logfile = general['outputdirectory'] + 'progress_log.txt'
    filelist = general['outputdirectory'] + 'progress_filelist.txt'
    ## remove existing log file / progress list when overwrite (-f) is on
    if os.path.isfile(logfile) and args.fover:
        CMD('rm %s' % logfile)
    if os.path.isfile(filelist) and args.fover:
        CMD('rm %s' % filelist)
    # presumably makes sure the progress file list exists before any step
    # reads it -- TODO confirm against wlog
    wlog('', filelist)
    ### Rscript location
    conf_dict['rscript'] = os.path.join(CAMpipe.__path__[0], "Rscript/")
    conf_dict['clean'] = args.Clean

    ### default annotation location
    conf_dict['default_anno_dir'] = os.path.join(CAMpipe.__path__[0],
                                                 "annotation/")

    def out_path(subdir, suffix):
        # Build <outputdirectory><subdir><outname><suffix>; conf_dict is read
        # at call time so any change made by an earlier step is honoured.
        return (conf_dict['General']['outputdirectory'] + subdir +
                conf_dict['General']['outname'] + suffix)

    ### main step for CAM, see individual script for detail note.
    # preparing step, integrate parameter, prepare for following step
    wlog("Step0: preparation", logfile)
    t = time.time()
    step0_integrate_data(conf_dict, logfile)
    step0time = time.time() - t
    wlog("running time for Step0: %s" % (step0time), logfile)

    # main data processing step, including mapping, generate bigwig file
    wlog("Step1: data-preprocess", logfile)
    # the final product is looked up in preprocess/ and also in summary/
    final_file = out_path('preprocess/', '.bb')
    final_file_bp = out_path('summary/', '.bb')
    if not args.fover and (os.path.isfile(final_file)
                           or os.path.isfile(final_file_bp)):
        print('Step1 has been finished.')
    else:
        t = time.time()
        step1_preprocess(conf_dict, logfile, filelist)
        step1time = time.time() - t
        wlog("running time for Step1: %s" % (step1time), logfile)

    # parse the requested task level once, at the point of first use
    task = int(args.task)

    # nucarray step, including nucarray detection, annotation of detected nucarray
    # (reported to the user as "Step2", implemented by step3_nucarray)
    if task >= 2:
        wlog("Step2: call well-positioned nucleosome array (analysis)",
             logfile)
        final_file = out_path('nucarray/', '_center_position.bw')
        final_file_bp = out_path('summary/', '_center_position.bw')
        if not args.fover and (os.path.isfile(final_file)
                               or os.path.isfile(final_file_bp)):
            print('Step2 has been finished.')
        else:
            t = time.time()
            step3_nucarray(conf_dict, logfile, filelist)
            step3time = time.time() - t
            wlog("running time for Step2: %s" % (step3time), logfile)

    # QC and analysis step (reported as "Step3", implemented by step2_QC)
    if task >= 3:
        wlog("Step3: QC and nucleosome profile", logfile)
        final_file = out_path('QC/', '_Features.txt')
        qc_dir = conf_dict['General']['outputdirectory'] + 'QC/'
        # count the PDFs in QC/ directly instead of parsing `ls` output;
        # dot-files are skipped because plain `ls` does not list them
        files_num = 0
        if os.path.isdir(qc_dir):
            files_num = sum(1 for name in os.listdir(qc_dir)
                            if name.endswith('.pdf')
                            and not name.startswith('.'))
        # the step counts as finished only with the feature table and 8 PDFs
        if not args.fover and (os.path.isfile(final_file)
                               and files_num == 8):
            print('Step3 has been finished.')
        else:
            t = time.time()
            step2_QC(conf_dict, logfile, filelist)
            step2time = time.time() - t
            wlog("running time for Step3: %s" % (step2time), logfile)

    # summary step (reported as "Step4", implemented by step5_summary)
    if task == 4:
        wlog("Step4: summary and output", logfile)
        final_file = out_path('summary/', '_summary.pdf')
        # NOTE(review): unlike Steps 1-3 this check ignores -f (args.fover),
        # so an existing summary PDF is never regenerated -- confirm whether
        # that is intended before changing it.
        if os.path.isfile(final_file):
            print('Step4 has been finished.')
        else:
            t = time.time()
            step5_summary(conf_dict, logfile, filelist)
            step5time = time.time() - t
            wlog("running time for Step4: %s" % (step5time), logfile)