Example #1
def annotationmain(cosmic, clinvar, g1000, 
                   ref_ens,
                   vcf, sample, snp_filter, indel_filter,
                   output, logger_annotation_process, logger_annotation_errors, calling):
    callings = calling.split('\t')
    dict_cos, dict_clin, dict_g1000 = read_database(cosmic, clinvar, g1000)
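    # assumption: each dict maps a variant key to one comma-separated
    # annotation record (annotation_v below splits these records on ',')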
    for call in callings:
        # choose the output-file prefix from the VCF filename; the generic
        # 'raw_variants.vcf' pattern is tested last so that the more specific
        # SNP/indel filenames take precedence
        for pattern, prefix in (('raw_variants_SNP.vcf', 'raw_SNP'),
                                ('raw_variants_indel.vcf', 'raw_indel'),
                                ('filter_SNP.vcf', 'filter_SNP'),
                                ('filter_indel.vcf', 'filter_indel'),
                                ('raw_variants.vcf', 'raw_variants')):
            if pattern in vcf:
                annotated_csv = output + '/' + sample + '.' + prefix + '.' + call + '.txt'
                annotated_csv_add = output + '/' + sample + '.' + prefix + '.' + call + '_PASS.txt'
                non_rs = output + '/' + sample + '.' + prefix + '_non_rs.txt'
                non_cos = output + '/' + sample + '.' + prefix + '_non_cos.txt'
                stats_file = output + '/' + sample + '.' + prefix + '_' + call + '_stats.txt'
                break
        # annotate only if the VCF exists (the missing-file case is logged once,
        # for the GATK call)
        if not os.path.isfile(vcf) and call == 'GATK':
            store_annotation_logs('null', logger_annotation_errors, vcf + " does not exist!\n")
        else:
            # annotate the variants against the three databases
            annotation_v(dict_cos, dict_clin, dict_g1000, vcf, annotated_csv, stats_file, snp_filter,
                         indel_filter, sample, logger_annotation_process, call)
            # append the reference (Ensembl) annotation to build the *_PASS table
            fill_table(annotated_csv, annotated_csv_add, ref_ens, call)
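
A minimal usage sketch (all paths and the filter limits are hypothetical; the
two logger arguments are the log directory, as in Example #3 below):

annotationmain('/db/cosmic.csv', '/db/clinvar.csv', '/db/g1000.csv',
               '/db/ref_ens.txt',
               '/out/S1/germline_vc/S1.raw_variants.vcf', 'S1',
               snp_limit, indel_limit,  # from read_vcf_filter(), see Example #3
               '/out/S1/annotation', '/out/S1/log', '/out/S1/log',
               'GATK')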
Example #2
def annotation_v(dict_cos, dict_clin, dict_g1000, variant_vcf, annotated_csv,
                 stats_file, snp_filter, indel_filter, sample_name, logger_annotation_process, calling):
    key_list = []
    num_in_clinvar = 0
    num_in_cosmic = 0
    num_in_g1000 = 0
    num_unmatch = 0
    output = open(annotated_csv, 'w')
    for record in read_vcf(variant_vcf, output, sample_name, calling, snp_filter, indel_filter):
        key, key1, value = record[0], record[1], record[2]
        unmatch = 0
        # drop duplicate variant
        if key in key_list:
            continue
        if key in dict_clin:
            new = '\t'.join(value) + '\t' + dict_clin[key].replace(',', '\t') + '\t'
            num_in_clinvar += 1
        else:
            new = '\t'.join(value) + '\t' + '-\t'*3 + define_hgvs(value[1], value[2], value[3], value[4]) + '\t-\t'
            unmatch += 1
        if key in dict_cos:
            new += dict_cos[key].replace(',', '\t') + '\t'
            num_in_cosmic += 1
        elif key1 in dict_cos:
            new += dict_cos[key1].replace(',', '\t') + '\t'
            num_in_cosmic += 1
        else:
            new += '-\t'*13
            unmatch += 1
        if key in dict_g1000:
            new += dict_g1000[key].replace(',', '\t') + '\n'
            num_in_g1000 += 1
        else:
            new += '-\t'*6 + '-\n'
            unmatch += 1
        # drop the variant if it matched none of the three databases
        if unmatch == 3:
            num_unmatch += 1
        else:
            output.write(new)
            key_list.append(key)
    output.close()
    store_annotation_logs(logger_annotation_process,
                          'null', '{0} has {1} variants.'.format(sample_name, len(key_list)))
    store_annotation_logs(logger_annotation_process,
                          'null', '{0} has {1} variants in COSMIC database'.format(sample_name, num_in_cosmic))
    store_annotation_logs(logger_annotation_process,
                          'null', '{0} has {1} variants in Clinvar database'.format(sample_name, num_in_clinvar))
    store_annotation_logs(logger_annotation_process,
                          'null', '{0} has {1} variants in G1000 database'.format(sample_name, num_in_g1000))
    store_annotation_logs(logger_annotation_process, 'null',
                          '{0} has {1} variants unmatched in all three databases.'.format(sample_name, num_unmatch))
    stats_out = open(stats_file, 'w')
    stats_out.write('#The sample has %s variants.\n' % len(key_list))
    stats_out.write('#Type\tvariants\n')
    stats_out.write('Variants\t%s\n' % len(key_list))
    stats_out.write('COSMIC\t%s\n' % num_in_cosmic)
    stats_out.write('Clinvar\t%s\n' % num_in_clinvar)
    stats_out.write('G1000\t%s\n' % num_in_g1000)
    stats_out.write('unmatch\t%s\n' % num_unmatch)
    stats_out.close()
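
For reference, annotation_v consumes 3-tuples from read_vcf and writes a small
tab-separated stats file; a sketch of both (the field meanings are assumptions
inferred from the lookups above, and the counts are made-up examples):

# each record yielded by read_vcf: (key, key1, value)
#   key   - primary lookup key shared by the ClinVar/COSMIC/G1000 dicts
#   key1  - alternative key tried against the COSMIC dict when 'key' misses
#   value - list of VCF columns; value[1:5] feed define_hgvs()
#
# resulting stats file (tab-separated), e.g.:
#   #The sample has 1234 variants.
#   #Type      variants
#   Variants   1234
#   COSMIC     210
#   Clinvar    57
#   G1000      980
#   unmatch    120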
Example #3
def main_run_germline_variant_calling(path_sampleID_sub):
    time_start1 = time.time()
    fields = path_sampleID_sub[0].split('\t')
    source = fields[0]
    sample = fields[1]
    if len(fields) > 2:
        tailname = fields[2]
        sample = sample + '_' + tailname
    # parameters
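    # (path_sampleID_sub[1:55] supplies the 54 settings below, in this fixed order)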
    (output, fastqc_dir, primers_file, exome_target_bed, min_read_len,
     common_seq1, common_seq2, num_threads, edit_dist, min_mapq, max_soft_clip,
     max_dist, memory_size, snp_filter, indel_filter, ref_ens, bwa_dir,
     samtools_dir, umitools_dir, gatk_dir, ref_index_name, ref_fa_file,
     total_ref_fa_file, total_ref_fa_dict, known_sites, erc, db_cosmic,
     db_clinvar, db_g1000, test_level, exome_target, calling, tabix, bgzip,
     bcftools_dir, varsan2_dir, strelka2_dir, total_ref_chrom_fa_file,
     datasets_dir, smcounter, mtdepth, rpb, ncpu, minbq, minmq, hplen,
     mismatchthr, mtdrop, maxmt, primerdist, bedtandemrepeats,
     bedrepeatmaskersubset, bedtools_dir, renew) = path_sampleID_sub[1:55]
    # check the output
    out_dir = output + '/' + sample
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # pipeline log directory; the logger arguments passed around below are
    # simply this directory path
    log_dir = out_dir + '/' + 'log'
    if not os.path.isdir(log_dir):
        try:
            os.makedirs(log_dir)
        except OSError as e:
            # errno 17 (EEXIST): another process created the directory first
            if e.errno != 17:
                raise
    logger_pipeline_process = log_dir
    logger_pipeline_errors = log_dir
    # time cost
    time_start = time.time()
    module = "QC"

    read1 = source + '/' + sample + '_R1_001.fastq.gz'
    read2 = source + '/' + sample + '_R2_001.fastq.gz'

    #if tools in ['all', 'qc']:
    if 'qc' in tools or 'all' in tools:
        print("Test QC module!\n")
        # qc_dir
        qc_dir = out_dir + '/' + 'QC'
        if not os.path.exists(qc_dir):
            os.makedirs(qc_dir)

        logger_statistics_process = log_dir
        logger_statistics_errors = log_dir
        qc_result1, qc_result2 = qc_raw_reads(fastqc_dir, qc_dir, sample,
                                              module, read1, read2,
                                              logger_statistics_process,
                                              logger_statistics_errors)
        # check the quality of the raw reads
        if float(qc_result1[7].strip('%')) > 70 and float(
                qc_result2[7].strip('%')) > 70:
            print(
                "The ratios of read1 and read2 with Q30 quality are both higher than 70%."
            )
        else:
            exit(
                "The ratio of read1 or read2 with Q30 quality is 70% or lower!"
            )
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--QC of reads is completed after {1} min.\n".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        store_statistics_logs(
            logger_statistics_process, 'null',
            "--{0}--QC of reads is completed after {1} min.\n".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################################
    # trim
    ##########################################################################################
    # time cost
    time_start = time.time()
    # undetermined_dir
    undetermined_dir = out_dir + '/' + 'undetermined'
    trimmed1 = undetermined_dir + '/' + sample + '_R1_undetermined.fastq'
    trimmed2 = undetermined_dir + '/' + sample + '_R2_undetermined.fastq'
    stats_file = undetermined_dir + '/' + sample + '_basic_stats.txt'
    # if tools in ['all', 'trim']:
    if 'trim' in tools or 'all' in tools:
        print("please check the QC subprocess result--the min read length!")
        print("The cutoff of the min read length is the default: {0}".format(
            min_read_len))
        print("Test trim module!\n\n\n")
        # mkdir undetermined_dir
        if not os.path.exists(undetermined_dir):
            os.makedirs(undetermined_dir)

        logger_trim_process = log_dir
        logger_trim_errors = log_dir

        trim_read_pairs(read1, read2, trimmed1, trimmed2, min_read_len,
                        common_seq1, common_seq2, stats_file,
                        logger_trim_process, logger_trim_errors)
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--Trim of reads is completed after {1} min.\n".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        store_trim_logs(
            logger_trim_process, 'null',
            "--{0}--Trimming of reads is completed after {1} min.\n".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################################
    # align
    ##########################################################################################
    # time cost
    time_start = time.time()
    # aligned_dir
    aligned_dir = out_dir + '/' + 'aligned'
    trim_read1 = out_dir + '/' + 'undetermined' + '/' + sample + '_R1_undetermined.fastq'
    trim_read2 = out_dir + '/' + 'undetermined' + '/' + sample + '_R2_undetermined.fastq'

    out_file = aligned_dir + '/' + sample + '_aligned.sam'
    # if tools in ['all', 'align']:
    if 'align' in tools or 'all' in tools:
        print("please check the Trim subprocess result--undetermined.fastq!")
        print("Test align module!\n")
        if not os.path.exists(aligned_dir):
            os.makedirs(aligned_dir)
        logger_bwa_process = log_dir
        logger_bwa_errors = log_dir
        align_reads_bwa(bwa_dir, samtools_dir, ref_fa_file, ref_index_name,
                        exome_target_bed, total_ref_fa_file, trim_read1,
                        trim_read2, out_file, num_threads, logger_bwa_process,
                        logger_bwa_errors, renew)
        store_align_logs(
            logger_bwa_process, 'null',
            "--{0}--Alignment of reads is completed after {1} min.".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--Align of reads is completed after {1} min.".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    # #########################################################################################
    # post_align
    # #########################################################################################
    # time cost
    time_start = time.time()
    # post_aligned_dir
    filtered_dir = out_dir + '/' + 'filtered'
    # out_file from the align
    alignment_sam = out_file
    out_file1 = filtered_dir + '/' + sample + '_tmp.sam'
    stats_file = filtered_dir + '/' + sample + '_align_stats.txt'
    primer_stats_file = filtered_dir + '/' + sample + '_primer_stats.csv'
    out_file2 = filtered_dir + '/' + sample + '_filtered.sam'

    # if tools in ['all', 'post_align']:
    if 'post_align' in tools or 'all' in tools:
        print("please check the Algin subprocess result--aligned.sam!")
        print("Test post align module!\n")
        if not os.path.exists(filtered_dir):
            os.makedirs(filtered_dir)
        logger_filter_process = log_dir
        logger_filter_errors = log_dir
        filter_alignment_samtools(samtools_dir, alignment_sam, min_mapq,
                                  max_soft_clip, out_file1, stats_file,
                                  logger_filter_process, logger_filter_errors)
        identify_gs_primers(samtools_dir, out_file1, primers_file, max_dist,
                            out_file2, stats_file, primer_stats_file,
                            logger_filter_process, logger_filter_errors)
        store_filter_logs(
            logger_filter_process, 'null',
            "--{0}--Post Alignment of reads is completed after {1} min.".
            format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--Post_align of reads is completed after {1} min.".format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################################
    # barcode clustering
    ##########################################################################################
    # time cost
    time_start = time.time()
    # clustering_dir
    clustered_dir = out_dir + '/' + 'clustered'
    filtered_sam = out_dir + '/' + 'filtered' + '/' + sample + '_filtered.sam'
    filtered_bam = clustered_dir + '/' + sample + '_filtered.bam'
    sorted_bam = clustered_dir + '/' + sample + '_filtered_sorted.bam'
    umitool_stats = clustered_dir + '/' + sample + '_deduplicated'
    #umitool_stats = clustered_dir + '/' + sample + '_group.tsv'
    umis_sam = clustered_dir + '/' + sample + '_umis.sam'
    #if tools in ['all', 'cluster']:
    if 'cluster' in tools or 'all' in tools:
        print("please check the post algin subprocess result--filtered.sam!")
        print("Test cluster module!\n")
        if not os.path.exists(clustered_dir):
            os.makedirs(clustered_dir)
        logger_umi_process = log_dir
        logger_umi_errors = log_dir
        umitool(samtools_dir, umitools_dir, filtered_sam, filtered_bam,
                sorted_bam, umitool_stats, umis_sam, edit_dist,
                logger_umi_process, logger_umi_errors)
        store_cluster_logs(
            logger_umi_process, 'null',
            "--{0}--UMIs tools clustering of reads is completed after {1} min."
            .format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--Cluster of reads is completed after {1} min.".format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################################
    # reformat
    ##########################################################################################
    # time cost
    time_start = time.time()
    # reformated_dir
    reformated_dir = out_dir + '/' + 'reformated'
    alignment_sam = out_dir + '/' + 'clustered' + '/' + sample + '_umis.sam'
    output_sam = reformated_dir + '/' + sample + '_vcready.sam'
    # if tools in ['all', 'reformat']:
    if 'reformat' in tools or 'all' in tools:
        print("please check the cluster subprocess result--umis.sam!")
        print("Test reformat module!\n")
        if not os.path.exists(reformated_dir):
            os.makedirs(reformated_dir)
        logger_reformat_process = log_dir
        logger_reformat_errors = log_dir
        reformat_sam(alignment_sam, output_sam, logger_reformat_process,
                     logger_reformat_errors)
        store_reformat_logs(
            logger_reformat_process, 'null',
            '--{0}--Reformatting of the alignment SAM file is completed after {1} min.'
            .format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--Reformat alignment SAM file is completed after {1} min.'.
            format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    # #########################################################################################
    # Germline variant calling
    # #########################################################################################
    # time cost
    time_start = time.time()
    # germline_vc_dir
    germline_vc_dir = out_dir + '/' + 'germline_vc'
    # turn the comma-separated known-sites list into GATK '--known-sites' arguments
    known_sites = known_sites.replace(',',
                                      ' --known-sites ' + datasets_dir + '/')
    known_sites = datasets_dir + '/' + known_sites
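    # worked example (hypothetical names): with datasets_dir = '/data/sets' and
    # known_sites = 'dbsnp.vcf,mills.vcf', the two lines above produce
    # '/data/sets/dbsnp.vcf --known-sites /data/sets/mills.vcf',
    # i.e. a ready-made GATK --known-sites argument string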
    vready_sam = out_dir + '/' + 'reformated' + '/' + sample + '_vcready.sam'
    # marked_bqsr_bam = germline_vc_dir + '/' + sample + '_sorted.MarkDuplicates.BQSR.bam'
    if os.path.basename(exome_target_bed) != 'all':
        exon_interval = germline_vc_dir + '/' + 'target_interval.list'
    else:
        exon_interval = 'all'
    # if tools in ['all', 'variant_call']:
    if 'variant_call' in tools or 'all' in tools:
        print("please check the reformat subprocess result--vcready.sam!")
        print("Test variant_call module!\n")
        if not os.path.exists(germline_vc_dir):
            os.makedirs(germline_vc_dir)
        logger_germline_vc_process = log_dir
        logger_germline_vc_errors = log_dir

        bqsr = 'n'
        bam_to_variant, bqsr_bam_to_variant = sam_to_bam(
            gatk_dir, samtools_dir, vready_sam, sample, germline_vc_dir,
            memory_size, exome_target_bed, total_ref_fa_file,
            total_ref_fa_dict, known_sites, logger_germline_vc_process,
            logger_germline_vc_errors, bqsr, renew)
        callings = calling.split(',')
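        # 'calling' may name several callers at once, e.g. 'GATK,strelka2'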
        # if calling == 'GATK':
        if 'GATK' in callings:
            germline_variant_calling(gatk_dir, bam_to_variant, sample,
                                     germline_vc_dir, memory_size,
                                     total_ref_fa_file, exon_interval, erc,
                                     snp_filter, indel_filter,
                                     logger_germline_vc_process,
                                     logger_germline_vc_errors)
        # elif calling == 'strelka2':
        if 'strelka2' in callings:
            strelka2_call(strelka2_dir, bgzip, tabix, total_ref_chrom_fa_file,
                          germline_vc_dir, sample, bam_to_variant,
                          exome_target_bed, logger_germline_vc_process,
                          logger_germline_vc_errors, renew)
        # elif calling == 'samtools':
        if 'samtools' in callings:
            samtools_call(samtools_dir, bcftools_dir, bam_to_variant, sample,
                          germline_vc_dir, total_ref_fa_file,
                          logger_germline_vc_process,
                          logger_germline_vc_errors)
        # elif calling == 'varscan2':
        if 'varscan2' in callings:
            varsan2_call(samtools_dir, varsan2_dir, total_ref_fa_file,
                         germline_vc_dir, sample, bam_to_variant,
                         logger_germline_vc_process, logger_germline_vc_errors)
        if 'smcounter' in callings:
            # str.rstrip() strips a character set, not a suffix; slice it off instead
            if bam_to_variant.endswith('.MarkDuplicates.RG.bam'):
                bam_to_variant = bam_to_variant[:-len('.MarkDuplicates.RG.bam')] + '.bam'
            threshold = 0
            logfile = germline_vc_dir + '/smcountlog'
            smcounter_call(smcounter, germline_vc_dir + '/' + sample,
                           bam_to_variant, exome_target_bed, mtdepth, rpb,
                           ncpu, minbq, minmq, hplen, mismatchthr, mtdrop,
                           maxmt, primerdist, threshold, total_ref_fa_file,
                           bedtandemrepeats, bedrepeatmaskersubset,
                           bedtools_dir, logfile, logger_germline_vc_process,
                           logger_germline_vc_errors, renew)
        store_germline_vc_logs(
            logger_germline_vc_process, 'null',
            '--{0}--Germline variant calling is completed after {1} min.'.
            format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--variant_calling is completed after {1} min.'.format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    # #########################################################################################
    # Annotation variant calling
    # #########################################################################################
    # time cost
    time_start = time.time()
    # Annotation dir
    annotation_dir = out_dir + '/' + 'annotation'
    raw_vcf = germline_vc_dir + '/' + sample + '.raw_variants.vcf'
    snp_vcf = germline_vc_dir + '/' + sample + '.raw_variants_SNP.vcf'
    indel_vcf = germline_vc_dir + '/' + sample + '.raw_variants_indel.vcf'
    # annotation
    # if tools in ['all', 'annotation']:
    if 'annotation' in tools or 'all' in tools:
        print("please check the variant_call subprocess result--VCF!")
        print("Test annotation module!\n")
        # Annotation dir
        if not os.path.exists(annotation_dir):
            os.makedirs(annotation_dir)
        logger_annotation_process = log_dir
        logger_annotation_errors = log_dir
        snp_limit, indel_limit = read_vcf_filter(snp_filter, indel_filter)
        callings = calling.split(',')
        for callingsub in callings:
            annotationmain(db_cosmic, db_clinvar, db_g1000, ref_ens, raw_vcf,
                           sample, snp_limit, indel_limit, annotation_dir,
                           logger_annotation_process, logger_annotation_errors,
                           callingsub)
        if 'GATK' in callings:
            callingsub = 'GATK'
            annotationmain(db_cosmic, db_clinvar, db_g1000, ref_ens, snp_vcf,
                           sample, snp_limit, indel_limit, annotation_dir,
                           logger_annotation_process, logger_annotation_errors,
                           callingsub)
            annotationmain(db_cosmic, db_clinvar, db_g1000, ref_ens, indel_vcf,
                           sample, snp_limit, indel_limit, annotation_dir,
                           logger_annotation_process, logger_annotation_errors,
                           callingsub)

        store_annotation_logs(
            logger_annotation_process, 'null',
            '--{0}--Annotation of variants is completed after {1} min.'.
            format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--Annotation of variants is completed after {1} min.'.format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################################
    # statistics of the variant calling pipeline
    ##########################################################################################
    # time cost
    time_start = time.time()
    # statistics dir
    statistics_dir = out_dir + '/' + 'statistics'
    if not os.path.exists(statistics_dir):
        os.makedirs(statistics_dir)
    # statistics of the clean reads
    # statistics trim dir
    statistics_trim_dir = statistics_dir + '/' + 'trim_QC'
    if not os.path.exists(statistics_trim_dir):
        os.makedirs(statistics_trim_dir)
    # if tools in ['all', 'statis']:
    if 'statis' in tools or 'all' in tools:
        print("please check the others subprocess results!")
        print("Test statistics module!\n")
        #if tools == 'statis':
        logger_statistics_process = log_dir
        logger_statistics_errors = log_dir
        module = "Trim"
        trim_result1, trim_result2 = qc_raw_reads(fastqc_dir,
                                                  statistics_trim_dir, sample,
                                                  module, trimmed1, trimmed2,
                                                  logger_statistics_process,
                                                  logger_statistics_errors)
        # statistics of the alignment
        module1 = "Align"
        align_sorted_bam = statistics_depth_coverage(
            samtools_dir, out_file, statistics_dir, sample, module1,
            exome_target, exome_target_bed, logger_statistics_process,
            logger_statistics_errors, renew)
        align_statistics = statistics_sam_bam(samtools_dir, align_sorted_bam,
                                              statistics_dir, sample, module1,
                                              logger_statistics_process,
                                              logger_statistics_errors, renew)
        # statistics of the filter step
        # (the cluster module used to build the filtered, sorted BAM, but this
        # has been replaced by UMI-tools)
        module2 = "Filter"
        filtered_sorted_bam = statistics_depth_coverage(
            samtools_dir, filtered_sam, statistics_dir, sample, module2,
            exome_target, exome_target_bed, logger_statistics_process,
            logger_statistics_errors, renew)
        filter_statistics = statistics_sam_bam(samtools_dir,
                                               filtered_sorted_bam,
                                               statistics_dir, sample, module2,
                                               logger_statistics_process,
                                               logger_statistics_errors, renew)
        # statistics of the UMI-tools step
        module3 = "Cluster_reformat"
        cr_sorted_bam = statistics_depth_coverage(
            samtools_dir, vready_sam, statistics_dir, sample, module3,
            exome_target, exome_target_bed, logger_statistics_process,
            logger_statistics_errors, renew)
        cr_statistics = statistics_sam_bam(samtools_dir, cr_sorted_bam,
                                           statistics_dir, sample, module3,
                                           logger_statistics_process,
                                           logger_statistics_errors, renew)
        # statistics of the per-base MT depth
        statistics_mtdepth_coverage(germline_vc_dir, statistics_dir, sample,
                                    exome_target, logger_statistics_process,
                                    logger_statistics_errors)
        # merge the sorted bam
        merge_statistics_sam_bam(logger_statistics_process,
                                 logger_statistics_errors, statistics_dir,
                                 sample, ','.join([module1, module2,
                                                   module3]), align_statistics,
                                 filter_statistics, cr_statistics)
        store_statistics_logs(
            logger_statistics_process, 'null',
            '--{0}--Statistics is completed after {1} min.'.format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--Statistics is completed after {1} min.'.format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--Pipeline : {0} is completed after {1} min.'.format(
                sample, ('%.2f' % ((time.time() - time_start1) / 60))))
        # statistics the time cost
        process_log = out_dir + '/' + 'log' + '/' + 'process.log'
        statistics_time(statistics_dir, sample, process_log,
                        logger_statistics_process, logger_statistics_errors)
        print("--" * 20 + '\n\n')
    return 'Pipeline : {0} is completed after {1} min.'.format(
        sample, ('%.2f' % ((time.time() - time_start1) / 60)))
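
A minimal driver sketch (assumed, not part of the original source): element 0 of
each job is the tab-separated 'source<TAB>sample[<TAB>tailname]' string and the
remaining 54 elements are the pipeline settings unpacked at the top of the
function; pipeline_params below is a placeholder for that tuple.

from multiprocessing import Pool

samples = ['/data/run1\tSampleA', '/data/run1\tSampleB\tL001']
jobs = [[line] + list(pipeline_params) for line in samples]
with Pool(processes=2) as pool:
    for message in pool.map(main_run_germline_variant_calling, jobs):
        print(message)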