Example #1
def Loc_paired(read, loc_dict):
    '''
    Record the location information for paired-end sequencing
    Return:
    loc_dict = {chrid: {loc1: coverage, ...}, ...}
    loc1: plus strand (++ or --) 5' pos + minus strand (+- or -+) 5' pos
    '''

    # Using the read class In: paired unique mapping read  Out: [flag,strand,chrom,pos1,CIGAR,pos2,insert,seq,score]
    read_s = RI(read)
    read_info = read_s.extract_information()
    if len(read_info) == 0:
        return loc_dict
    strand, chrom, pos, pos2, insert = read_info[1], read_info[2], read_info[3], read_info[5], read_info[6]

    # record each pair once, using only the mate with a positive insert size
    if strand == '+-' or strand == '-+':
        return loc_dict

    site = pos + '_' + str(int(pos) + int(insert) - 1)
    if chrom in loc_dict:
        if site in loc_dict[chrom]:
            loc_dict[chrom][site] += 1
        else:
            loc_dict[chrom][site] = 1
    else:
        loc_dict[chrom] = {}
        loc_dict[chrom][site] = 1
    return loc_dict
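The nested-dict update above is the whole bookkeeping scheme: one counter per (chromosome, fragment) key. A minimal, self-contained sketch of the same pattern (the RI read class is external, so pre-parsed values are fed in directly; all names here are illustrative):

def add_site(loc_dict, chrom, site):
    '''Increment the coverage counter for a (chrom, site) pair.'''
    loc_dict.setdefault(chrom, {})
    loc_dict[chrom][site] = loc_dict[chrom].get(site, 0) + 1
    return loc_dict

loc = {}
# two fragments with the same start and insert size collapse onto one key
for pos, insert in [(100, 150), (100, 150), (300, 120)]:
    add_site(loc, 'chr1', str(pos) + '_' + str(pos + insert - 1))
print(loc)  # {'chr1': {'100_249': 2, '300_419': 1}}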
Example #2
def Loc_paired(read, loc_dict):
    '''
    Record the location information for paired-end sequencing
    Return:
    loc_dict = {chrid: {loc1: coverage, ...}, ...}
    loc1: plus strand (++ or --) 5' pos + minus strand (+- or -+) 5' pos
    '''

    # Using the read class In: paired unique mapping read  Out: [flag,strand,chrom,pos1,CIGAR,pos2,insert,seq,score]
    read_s = RI(read)
    read_info = read_s.extract_information()
    if len(read_info) == 0:
        return loc_dict
    strand, chrom, pos, pos2, insert = read_info[1], read_info[2], read_info[3], read_info[5], read_info[6]


    # record each pair once, using only the mate with a positive insert size
    if strand == '+-' or strand == '-+':
        return loc_dict

    site = pos + '_' + str(int(pos) + int(insert) - 1)
    if chrom in loc_dict:
        if site in loc_dict[chrom]:
            loc_dict[chrom][site] += 1
        else:
            loc_dict[chrom][site] = 1
    else:
        loc_dict[chrom] = {}
        loc_dict[chrom][site] = 1
    return loc_dict
Example #3
def Loc_single(read, loc_dict):
    '''
    Record the location information for single-end sequencing
    Return:
    loc_dict = {chrid: {loc1: coverage, ...}, ...}
    loc1: 5' pos + strand
    '''

    # Using the read class  In: single unique mapping read  Out: [flag,strand,chrom,pos,CIGAR,seq,score]
    read_s = RI(read)
    read_info = read_s.extract_information()
    if len(read_info) == 0:
        return loc_dict

    strand, chrom, pos, seq = read_info[1], read_info[2], read_info[3], read_info[5]

    # Shift minus-strand reads to their 5' end (len(seq) - 1 bp downstream)
    if strand == '-+':
        pos = int(pos) + len(seq) - 1

    site = strand + '_' + str(pos)
    if chrom in loc_dict:
        if site in loc_dict[chrom]:
            loc_dict[chrom][site] += 1
        else:
            loc_dict[chrom][site] = 1
    else:
        loc_dict[chrom] = {}
        loc_dict[chrom][site] = 1
    return loc_dict
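For single-end data the key encodes strand and 5' position, with minus-strand reads shifted so the key points at the biological 5' end rather than the leftmost mapped base. A small illustration (strand codes as in the function above):

def five_prime_site(strand, pos, seq):
    '''Build the strand-aware 5-prime site key used by Loc_single.'''
    if strand == '-+':  # minus-strand read: its 5' end is the rightmost base
        pos = int(pos) + len(seq) - 1
    return strand + '_' + str(pos)

print(five_prime_site('++', 1000, 'ACGT' * 25))  # ++_1000
print(five_prime_site('-+', 1000, 'ACGT' * 25))  # -+_1099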
Example #4
def Loc_single(read, loc_dict):
    '''
    Record the location information for single-end sequencing
    Return:
    loc_dict = {chrid: {loc1: coverage, ...}, ...}
    loc1: 5' pos + strand
    '''

    # Using the read class  In: single unique mapping read  Out: [flag,strand,chrom,pos,CIGAR,seq,score]
    read_s = RI(read)
    read_info = read_s.extract_information()
    if len(read_info) == 0:
        return loc_dict

    strand, chrom, pos, seq = read_info[1], read_info[2], read_info[3], read_info[5]

    # Shift minus-strand reads to their 5' end (len(seq) - 1 bp downstream)
    if strand == '-+':
        pos = int(pos) + len(seq) - 1

    site = strand + '_' + str(pos)
    if chrom in loc_dict:
        if site in loc_dict[chrom]:
            loc_dict[chrom][site] += 1
        else:
            loc_dict[chrom][site] = 1
    else:
        loc_dict[chrom] = {}
        loc_dict[chrom][site] = 1
    return loc_dict
Example #5
 def get_read_info(self):
     '''
     Using the read class
     If the read is uniquely mapped (or uniquely and properly paired), we will get an information list.
     If not, we will get an empty list ([]).
     In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
     In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
     '''
     read_s = RI(self.read)
     read_info = read_s.extract_information()
     return read_info
Example #6
 def get_read_info(self):
     """
     Using the read class
     If the read is uniquely mapped (or uniquely and properly paired), we will get an information list.
     If not, we will get an empty list ([]).
     In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
     In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
     """
     read_s = RI(self.read, self.bsmb)
     read_info = read_s.extract_information()
     return read_info
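Callers can tell the two layouts apart by length: seven fields for single-end reads, nine for paired-end. A sketch of such a dispatcher (field order taken from the docstring above; the helper name is illustrative):

def unpack_read_info(read_info):
    '''Map the positional read_info list onto named fields.'''
    if not read_info:
        return None  # not uniquely mapped (or not properly paired)
    if len(read_info) == 7:  # single-end layout
        keys = ['flag', 'strand', 'chr', 'pos', 'CIGAR', 'seq', 'score']
    else:  # paired-end layout, nine fields
        keys = ['flag', 'strand', 'chr', 'pos1', 'CIGAR',
                'pos2', 'insert', 'seq', 'score']
    return dict(zip(keys, read_info))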
Example #7
def run(args):
    """
    Alternative module: Use the strategy in Bis-SNP to trim 5' bisulfite conversion failures
    """
    options = args.parse_args()

    if len(options.sam_file) == 0:
        error("Missing the SAM file, use -s or --sam option.")
    else:
        options.sam_file = options.sam_file.split(',')
    for s in options.sam_file:
        if not os.path.isfile(s):
            error("Can't open the SAM file: " + s)
            sys.exit(1)

    if len(options.ref_file) == 0:
        error("Missing the reference genome fasta file, use -r or --ref option.")
    else:
        if not os.path.isfile(options.ref_file):
            error("Can't open the ref file: " + options.ref_file)

    if len(options.samtools) != 0:
        if options.samtools[-1] != '/':
            options.samtools += '/'

    if len(options.name) == 0:
        error("Missing the output file name, use -n or --name options.")

    sam_inf = options.sam_file
    ref_file = options.ref_file
    bsm = options.bsm
    s_path = options.samtools
    name = options.name
    remove_overlap = options.remove_overlap
    filter_dup = options.filter_dup
    p_poisson = options.p_poisson
    gsize = options.gsize
    not_mapping = options.not_mapping

    info("Get the all parameter!!")

    #check the input mapping files
    sam_format, read_inf = check.check_mapping_file_flag(sam_inf[0], s_path)
    pre_flag = read_inf.readline().split('\t')[1]
    if 'p' in pre_flag:
        single_on = False
        info("The input mapping files are paired-end sequencing!")
    else:
        single_on = True
        info("The input mapping files are single-end sequencing!")

    loc_dict = {}
    if filter_dup:
        ## if filter_dup is True, duplicate reads will be assessed and shown in Dup_dis.pdf
        info("The filter_dup has been set True.")
        info("Assess the duplicate reads...")
        for sam in sam_inf:
            #check the input mapping files
            sam_format, read_inf = check.check_mapping_file(sam, s_path)
            if single_on:
                for read in read_inf:
                    loc_dict = LI.Loc_single(read, loc_dict, bsm)
            else:
                for read in read_inf:
                    loc_dict = LI.Loc_paired(read, loc_dict, bsm)
        max_cov = DR.duplicate_report(loc_dict, gsize, p_poisson, name)
        info('Get the duplicate reads distribution!')
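        # max_cov is the duplicate-coverage ceiling: presumably the smallest
        # depth whose Poisson tail probability (lambda = reads / gsize) falls
        # below p_poisson; positions stacked deeper are treated as PCR
        # duplicates (a sketch of this computation follows Example #11).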

    #get reference information
    ref = GR.get_ref(ref_file)
    trim_position = []

    filter_duplicate_reads = 0
    filter_nonuniform_trim_bp = 0
    filter_nonuniform_trim_bp_CG = 0
    filter_remove_overlap_bp = 0
    filter_not_mapping_reads = 0
    all_reads = 0
    not_mapping_reads = 0
    all_mapping_bp = 0

    ##filter the 5' bisulfite failure
    for sam in sam_inf:
        out_sam = sam[:-4] + '_' + name + '_filter.sam'
        out = open(out_sam, 'w')
        #check the input mapping files
        record_mate = {}
        sam_format, read_inf = check.check_mapping_file_header(sam, s_path)

        for read in read_inf:
            #for sam header
            if read.startswith('@'):
                out.write(read)
                continue
            else:
                all_reads += 1  ##record the read number (2013-06-20)

            #Get the read information for trimming
            #If the read isn't unique mapping, we will get an empty list ([]).
            #In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
            #In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
            read_info = RI(read, bsm).extract_information()

            if len(read_info) == 0:
                not_mapping_reads += 1
                if not_mapping:  #keep the not_unique mapping reads (or not paired mapping)
                    out.write(read)
                else:
                    filter_not_mapping_reads += 1  ##record the not mapping read number (2013-06-20)
                continue

            if len(loc_dict) > 0:  #the --filter_dup has been set True, have to remove duplicate reads
                duplicate, loc_dict = DF(read_info, loc_dict, max_cov, single_on)
            else:
                duplicate = False

            if single_on:
                all_mapping_bp += len(read_info[5])  ##record the mapping read basepair (2013-06-20)
            else:
                all_mapping_bp += len(read_info[7])  ##record the mapping read basepair (2013-06-20)

            record_mate, trim_position, filter_nonuniform_trim_bp_CG, filter_duplicate_reads, filter_remove_overlap_bp = NF.nonuniform_filter(
                read, out, read_info, ref, remove_overlap, duplicate,
                single_on, record_mate, trim_position,
                filter_nonuniform_trim_bp_CG, filter_duplicate_reads,
                filter_remove_overlap_bp)
        out.close()
        del record_mate
    NR.nonuniform_generator(trim_position, name)

    for i in range(len(trim_position)):
        filter_nonuniform_trim_bp += i * trim_position[i]
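    # trim_position is a histogram: trim_position[i] counts reads that lost
    # i bp at the 5' end, so the weighted sum above gives the total trimmed
    # basepairs (e.g. [950, 30, 15] -> 0*950 + 1*30 + 2*15 = 60 bp).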

    ##produce the filter report
    info('Produce the report file...')
    report_out = open(name + "_BSeQC_nonuniform_filter_report.txt", 'w')
    report_out.write('Total reads: %d\n' % all_reads)
    if single_on:
        report_out.write('Not unique mapping reads: %d(%.2f%s all reads)\n' %
                         (not_mapping_reads,
                          float(not_mapping_reads) / all_reads * 100, "%"))
        report_out.write(
            'Unique mapping reads: %d(%.2f%s all reads)\n' %
            ((all_reads - not_mapping_reads),
             float(all_reads - not_mapping_reads) / all_reads * 100, "%"))
        report_out.write(
            'Skip not unique mapping reads: %d(%.2f%s all reads)\n' %
            (filter_not_mapping_reads,
             float(filter_not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('In unique mapping reads:\n')
        report_out.write('All unique mapping basepairs: %d\n' % all_mapping_bp)
        report_out.write(
            'Filter Duplicate reads: %d(%.2f%s of unique mapping reads)\n' %
            (filter_duplicate_reads, float(filter_duplicate_reads) /
             (all_reads - not_mapping_reads) * 100, "%"))
        report_out.write(
            "Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n"
            % (filter_nonuniform_trim_bp,
               float(filter_nonuniform_trim_bp) / all_mapping_bp * 100, "%"))
        report_out.write(
            "Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n"
            %
            (filter_nonuniform_trim_bp_CG,
             float(filter_nonuniform_trim_bp_CG) / all_mapping_bp * 100, "%"))

    else:
        report_out.write('Not unique paired mapping reads: %d(%.2f%s)\n' %
                         (not_mapping_reads,
                          float(not_mapping_reads) / all_reads * 100, "%"))
        report_out.write(
            'Unique paired mapping reads: %d(%.2f%s)\n' %
            ((all_reads - not_mapping_reads),
             float(all_reads - not_mapping_reads) / all_reads * 100, "%"))
        report_out.write(
            'Skip not paired unique mapping reads: %d(%.2f%s)\n' %
            (filter_not_mapping_reads,
             float(filter_not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('In unique paired mapping reads:\n')
        report_out.write('All unique paired mapping basepairs: %d\n' %
                         all_mapping_bp)
        report_out.write(
            'Filter Duplicate reads: %d(%.2f%s of unique paired mapping reads)\n'
            % (filter_duplicate_reads, float(filter_duplicate_reads) /
               (all_reads - not_mapping_reads) * 100, "%"))
        report_out.write(
            "Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n"
            % (filter_nonuniform_trim_bp,
               float(filter_nonuniform_trim_bp) / all_mapping_bp * 100, "%"))
        report_out.write(
            "Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n"
            %
            (filter_nonuniform_trim_bp_CG,
             float(filter_nonuniform_trim_bp_CG) / all_mapping_bp * 100, "%"))
        report_out.write(
            'Filter overlapped basepairs: %d(%.2f%s of unique paired mapping basepairs)\n'
            % (filter_remove_overlap_bp,
               float(filter_remove_overlap_bp) / all_mapping_bp * 100, "%"))
    report_out.close()
    info('Get the report file!')
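The report block repeats the '%d(%.2f%s ...)' pattern a dozen times, which invites misplaced parentheses in the percentage arithmetic. A small helper that centralizes the computation (and guards a zero denominator) keeps those lines short; a sketch, with illustrative names:

def count_pct(count, total):
    '''Format a count with its percentage of total, e.g. '42(3.10%)'.'''
    pct = float(count) / total * 100 if total else 0.0
    return '%d(%.2f%%)' % (count, pct)

# usage:
# report_out.write('Filter Duplicate reads: %s of unique mapping reads\n'
#                  % count_pct(filter_duplicate_reads,
#                              all_reads - not_mapping_reads))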
Example #8
def filter_sam(sam_inf, strand_t, read_l, single_on, name, s_path, auto, remove_overlap, loc_dict, max_cov,
               not_mapping):
    '''
    Trim the mapping files with the biased positions of every length in every strand,
    which are saved in the variable strand_t.
    '''
    filter_duplicate_reads = 0
    filter_mbias_trim_bp = 0
    filter_remove_overlap_bp = 0
    filter_not_mapping_reads = 0
    all_reads = 0
    not_mapping_reads = 0
    all_mapping_bp = 0
    for sam in sam_inf:
        out_sam = sam[:-4] + '_' + name + '_filter.sam'
        out = open(out_sam, 'w')

        #check the input mapping files
        sam_format, read_inf = check.check_mapping_file_header(sam, s_path)

        #scan every read to qc_filter
        for read in read_inf:
            #for sam header
            if read.startswith('@'):
                out.write(read)
                continue
            else:
                all_reads += 1  ##record the read number (2013-06-20)

            #Get the read information for trimming
            #If the read isn't unique mapping, we will get an empty list ([]).
            #In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
            #In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
            read_info = RI(read).extract_information()
            if len(read_info) == 0:
                not_mapping_reads += 1
                if not_mapping:         #keep the not_unique mapping reads
                    out.write(read)
                else:
                    filter_not_mapping_reads += 1  ##record the not mapping read number (2013-06-20)
                continue

            if len(loc_dict) > 0: #the --filter_dup has been set True, have to remove duplicate reads
                duplicate, loc_dict = DF(read_info, loc_dict, max_cov, single_on)
            else:
                duplicate = False

            if single_on:
                all_mapping_bp += len(read_info[5])     ##record the mapping read basepair (2013-06-20)
                if auto:
                    if read_l[0] != '':
                        original_length = int(read_l[sam_inf.index(sam)])
                    else:
                        original_length = ''
                    filter_mbias_trim_bp, filter_duplicate_reads = SF(read, strand_t, out, read_info, original_length,
                        duplicate, filter_mbias_trim_bp, filter_duplicate_reads)
                else:
                    if not duplicate and len(loc_dict) > 0:
                        out.write(read)                 #not trimming, only output not_duplicate reads
                    else:
                        filter_duplicate_reads += 1     ##record the duplicate read (2013-06-20)
            else:
                all_mapping_bp += len(read_info[7])     ##record the mapping read basepair (2013-06-20)
                if auto or remove_overlap:
                    if read_l[0] != '':
                        original_length = [int(i) for i in read_l[sam_inf.index(sam)].split('-')]
                    else:
                        original_length = ''
                    filter_mbias_trim_bp, filter_duplicate_reads, filter_remove_overlap_bp = PF(read, strand_t, out,
                        read_info, original_length, auto, remove_overlap, duplicate, filter_mbias_trim_bp,
                        filter_duplicate_reads, filter_remove_overlap_bp)
                else:
                    if not duplicate and len(loc_dict) > 0:
                        out.write(read)                  #not trimming, only output not_duplicate reads
                    else:
                        filter_duplicate_reads += 1     ##record the duplicate read (2013-06-20)
        out.close()

    ##produce the filter report
    info('Produce the report file...')
    report_out = open(name + "_BSeQC_mbias_filter_report.txt", 'w')
    report_out.write('Total reads: %d\n' % all_reads)
    if single_on:
        report_out.write('Not unique mapping reads: %d(%.2f%s all reads)\n' % (
        not_mapping_reads, float(not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Unique mapping reads: %d(%.2f%s all reads)\n' % (
        (all_reads - not_mapping_reads), float(all_reads - not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Skip not unique mapping reads: %d(%.2f%s all reads)\n' % (
        filter_not_mapping_reads, float(filter_not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('In unique mapping reads:\n')
        report_out.write('All unique mapping basepairs: %d\n' % all_mapping_bp)
        report_out.write('Filter Duplicate reads: %d(%.2f%s of unique mapping reads)\n' % (
        filter_duplicate_reads, float(filter_duplicate_reads) / (all_reads - not_mapping_reads) * 100, "%"))
        report_out.write('Filter Mbias basepairs: %d(%.2f%s of unique mapping basepairs)\n' % (
        filter_mbias_trim_bp, float(filter_mbias_trim_bp) / all_mapping_bp * 100, "%"))

    else:
        report_out.write('Not unique paired mapping reads: %d(%.2f%s)\n' % (
        not_mapping_reads, float(not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Unique paired mapping reads: %d(%.2f%s)\n' % (
        (all_reads - not_mapping_reads), float(all_reads - not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Skip not paired unique mapping reads: %d(%.2f%s)\n' % (
        filter_not_mapping_reads, float(filter_not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('In unique paired mapping reads:\n')
        report_out.write('All unique paired mapping basepairs: %d\n' % all_mapping_bp)
        report_out.write('Filter Duplicate reads: %d(%.2f%s of unique paired mapping reads)\n' % (
        filter_duplicate_reads, float(filter_duplicate_reads) / (all_reads - not_mapping_reads) * 100, "%"))
        report_out.write('Filter Mbias basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
        filter_mbias_trim_bp, float(filter_mbias_trim_bp) / all_mapping_bp * 100, "%"))
        report_out.write('Filter overlapped basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
        filter_remove_overlap_bp, float(filter_remove_overlap_bp) / all_mapping_bp * 100, "%"))
    report_out.close()
    info('Get the report file!')
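read_l evidently holds one original-read-length spec per input SAM file: a bare number for single-end data, a 'len1-len2' pair for the two mates of paired-end data, and '' for unknown. A sketch of that parsing rule in isolation:

def parse_read_length(spec, single_on):
    '''Parse a read-length spec: '100' (single-end) or '100-90' (paired).'''
    if spec == '':
        return ''  # unknown original length, as in the code above
    if single_on:
        return int(spec)
    return [int(x) for x in spec.split('-')]

print(parse_read_length('100', True))      # 100
print(parse_read_length('100-90', False))  # [100, 90]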
Example #9
def record_site(read, ref, bsm, dige_site):
    '''
    Record the location of the restriction enzyme digestion site in the read,
    and the methylation level at that site.
    :param read: a SAM record line
    :param ref: reference genome dict, {chrom: sequence}
    :param bsm: bisulfite mapping tool name (passed through to RI)
    :param dige_site: digestion-site record (not used in this function)
    '''


    #Using the read class
    #If the read is unique mapping or unique and  paired mapping, we will get a information list.
    #If not, we will get a empty list ([]).
    #In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
    #In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
    read_s = RI(read, bsm)
    read_info = read_s.extract_information()
    strand = ''
    site_meth_list = []
    reverse_strand = ['-+', '+-']

    if len(read_info) == 0:
        return read_info, site_meth_list


    strand, chr, pos, seq = read_info[1], read_info[2], int(read_info[3]) - 1, read_info[-2]
    #print strand, chr, pos, seq

    if len(chr.split('_')) > 1:   #skip non-standard 'chr*_*' chromosomes
        return read_info, site_meth_list

    readlen = len(seq)
    refseq = ref[chr][pos - 1: pos + readlen + 1]


    if strand not in reverse_strand:
        #get the location of the restriction enzyme digestion site
        dige_5_index = refseq.find('CCGG')
        dige_3_index = refseq.rfind('CCGG')

    else:
        #if the strand is '-+' (single-end) or '+-', reverse the seq and refseq
        seq = seq[::-1]
        refseq = refseq[::-1]
        dige_5_index = refseq.find('GGCC')
        dige_3_index = refseq.rfind('GGCC')

    if dige_3_index == -1 or dige_5_index == -1:
        warning("Can't find MspI site in the read: %s" %read.rstrip())
        #return dige_dict
        return read_info, site_meth_list


    # get the location of the restriction enzyme digestion site
    #dige_dict[strand]['s'][0].append(dige_5_index + 1)
    #dige_dict[strand]['e'][0].append(dige_3_index + 1)


    # get the methylation state of the restriction enzyme digestion site
    if strand[1] == '-':
        #strand: +- or --, check the third nucleotide of C-CGG
        dige_5_meth, dige_5_all = get_meth_state(strand, seq, refseq, dige_5_index + 1)
        dige_3_meth, dige_3_all = get_meth_state(strand, seq, refseq, dige_3_index + 1)
    else:
        #strand: ++ or -+, check the second nucleotide of C-CGG
        dige_5_meth, dige_5_all = get_meth_state(strand, seq, refseq, dige_5_index)
        dige_3_meth, dige_3_all = get_meth_state(strand, seq, refseq, dige_3_index)



    #dige_dict[strand]['s'][1] = [sum(x) for x in zip(dige_dict[strand]['s'][1], [dige_5_meth, dige_5_all])]
    #dige_5_meth_list = [dige_5_meth] + [0] * 10
    #dige_5_all_list = [dige_5_all] + [0] * 10
    #dige_3_meth_list = [0] * 11
    #dige_3_all_list = [0] * 11
    #calculate the methylation states of 10 nucleotides after dige_5_index
    #for i in range(1, 11):
        #meth_5, all_5 = get_meth_state(strand, seq, refseq, dige_5_index + 2 + i)  # +2 to scan the nucleotide after CCGG
        #meth_3, all_3 = get_meth_state(strand, seq, refseq, len(seq) - i)
        #dige_5_meth_list[i] = meth_5
        #dige_5_all_list[i] = all_5
        #dige_3_meth_list[10 - i] = meth_3
        #dige_3_all_list[10 - i] = all_3
    #site_meth_list = [dige_5_meth_list, dige_5_all_list, dige_3_meth_list, dige_3_all_list, dige_5_index, len(seq)]
    site_meth_list = [dige_5_meth, dige_5_all, 0, 0, dige_5_index, len(seq)]
    if dige_3_index != 0 and dige_3_index != dige_5_index:
        if strand[1] == '+':
            #check the 3' digestion site for '++' and '-+'
            if len(seq[(dige_3_index - 1):(dige_3_index + 3)]) == 4:
                if (strand == '++' and seq[dige_3_index + 2] == 'A') or (strand == '-+' and seq[dige_3_index + 2] == 'T'):
                    site_meth_list = [dige_5_meth, dige_5_all, dige_3_meth, dige_3_all, dige_5_index, dige_3_index]
                    #dige_dict[strand]['e'][1] = [sum(x) for x in zip(dige_dict[strand]['e'][1], [dige_3_meth, dige_3_all])]
                    #dige_dict[strand]['e'] = [sum(x) for x in zip(dige_dict[strand]['e'], [dige_3_meth, dige_3_all])]
                    #for i in range(1, 11):
                    #    meth_3, all_3 = get_meth_state(strand, seq, refseq, dige_3_index - 1 - i)
                    #    dige_3_meth_list[10 - i] = meth_3
                    #    dige_3_all_list[10 - i] = all_3
                    #dige_3_meth_list[-1] = dige_3_meth
                    #dige_3_all_list[-1] = dige_3_all
                    #site_meth_list = [dige_5_meth_list, dige_5_all_list, dige_3_meth_list, dige_3_all_list, dige_5_index, dige_3_index]
                    #print "end_repair_3", refseq[dige_3_index:(dige_3_index + 4)], dige_3_meth_list, dige_3_all_list


        else:
            site_meth_list = [dige_5_meth, dige_5_all, dige_3_meth, dige_3_all, dige_5_index, dige_3_index]
            #dige_dict[strand]['e'][1] = [sum(x) for x in zip(dige_dict[strand]['e'][1], [dige_3_meth, dige_3_all])]
            #dige_dict[strand]['e'] = [sum(x) for x in zip(dige_dict[strand]['e'], [dige_3_meth, dige_3_all])]
            #for i in range(1, 11):
            #    meth_3, all_3 = get_meth_state(strand, seq,refseq, dige_3_index - 1 - i)
            #    dige_3_meth_list[10 - i] = meth_3
            #    dige_3_all_list[10 - i] = all_3
            #dige_3_meth_list[-1] = dige_3_meth
            #dige_3_all_list[-1] = dige_3_all
            #site_meth_list = [dige_5_meth_list, dige_5_all_list, dige_3_meth_list, dige_3_all_list, dige_5_index, dige_3_index]
            #print 'bbbb', dige_3_meth_list, dige_3_all_list

    #return dige_dict
    return read_info, site_meth_list
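The MspI recognition sequence is CCGG; for reads on the reverse strands the function scans the reversed read and reference, so the literal to search becomes GGCC. A self-contained illustration of locating the outermost sites in a reference slice:

def mspi_sites(refseq, reverse=False):
    '''Return (5-prime-most, 3-prime-most) indices of the MspI site, or -1 if absent.'''
    if reverse:
        refseq = refseq[::-1]  # scan the reversed sequence, as above
    target = 'GGCC' if reverse else 'CCGG'
    return refseq.find(target), refseq.rfind(target)

print(mspi_sites('TTCCGGAACCGGTT'))        # (2, 8)
print(mspi_sites('TTCCGGAACCGGTT', True))  # (2, 8) -- palindromic slice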
Example #10
def record_site(read, ref, bsm, dige_site):
    '''
    Record the location of the restriction enzyme digestion site in the read,
    and the methylation level at that site.
    :param read: a SAM record line
    :param ref: reference genome dict, {chrom: sequence}
    :param bsm: bisulfite mapping tool name (passed through to RI)
    :param dige_site: digestion-site record (not used in this function)
    '''

    #Using the read class
    #If the read is unique mapping or unique and  paired mapping, we will get a information list.
    #If not, we will get a empty list ([]).
    #In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
    #In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
    read_s = RI(read, bsm)
    read_info = read_s.extract_information()
    strand = ''
    site_meth_list = []
    reverse_strand = ['-+', '+-']

    if len(read_info) == 0:
        return read_info, site_meth_list

    strand, chr, pos, seq = read_info[1], read_info[2], int(
        read_info[3]) - 1, read_info[-2]
    #print strand, chr, pos, seq

    if len(chr.split('_')) > 1:  #skip non-standard 'chr*_*' chromosomes
        return read_info, site_meth_list

    readlen = len(seq)
    refseq = ref[chr][pos - 1:pos + readlen + 1]

    if strand not in reverse_strand:
        #get the location of restriction enzyme digestion site
        dige_5_index = refseq.find('CCGG')
        dige_3_index = refseq.rfind('CCGG')

    else:
        # if the strand is '-+' (single-end) or '+-', reverse the seq and refseq
        seq = seq[::-1]
        refseq = refseq[::-1]
        dige_5_index = refseq.find('GGCC')
        dige_3_index = refseq.rfind('GGCC')

    if dige_3_index == -1 or dige_5_index == -1:
        warning("Can't find MspI site in the read: %s" % read.rstrip())
        #return dige_dict
        return read_info, site_meth_list

    # get the location of the restriction enzyme digestion site
    #dige_dict[strand]['s'][0].append(dige_5_index + 1)
    #dige_dict[strand]['e'][0].append(dige_3_index + 1)

    # get the methylation state of the restriction enzyme digestion site
    if strand[1] == '-':
        #strand: +- or --, check the third nucleotide of C-CGG
        dige_5_meth, dige_5_all = get_meth_state(strand, seq, refseq,
                                                 dige_5_index + 1)
        dige_3_meth, dige_3_all = get_meth_state(strand, seq, refseq,
                                                 dige_3_index + 1)
    else:
        #strand: ++ or -+, check the second nucleotide of C-CGG
        dige_5_meth, dige_5_all = get_meth_state(strand, seq, refseq,
                                                 dige_5_index)
        dige_3_meth, dige_3_all = get_meth_state(strand, seq, refseq,
                                                 dige_3_index)

    #dige_dict[strand]['s'][1] = [sum(x) for x in zip(dige_dict[strand]['s'][1], [dige_5_meth, dige_5_all])]
    #dige_5_meth_list = [dige_5_meth] + [0] * 10
    #dige_5_all_list = [dige_5_all] + [0] * 10
    #dige_3_meth_list = [0] * 11
    #dige_3_all_list = [0] * 11
    #calculate the methylation states of 10 nucleotides after dige_5_index
    #for i in range(1, 11):
    #meth_5, all_5 = get_meth_state(strand, seq, refseq, dige_5_index + 2 + i)  # +2 to scan the nucleotide after CCGG
    #meth_3, all_3 = get_meth_state(strand, seq, refseq, len(seq) - i)
    #dige_5_meth_list[i] = meth_5
    #dige_5_all_list[i] = all_5
    #dige_3_meth_list[10 - i] = meth_3
    #dige_3_all_list[10 - i] = all_3
    #site_meth_list = [dige_5_meth_list, dige_5_all_list, dige_3_meth_list, dige_3_all_list, dige_5_index, len(seq)]
    site_meth_list = [dige_5_meth, dige_5_all, 0, 0, dige_5_index, len(seq)]
    if dige_3_index != 0 and dige_3_index != dige_5_index:
        if strand[1] == '+':
            #check the 3' digestion site for '++' and '-+'
            if len(seq[(dige_3_index - 1):(dige_3_index + 3)]) == 4:
                if (strand == '++' and seq[dige_3_index + 2] == 'A') or (
                        strand == '-+' and seq[dige_3_index + 2] == 'T'):
                    site_meth_list = [
                        dige_5_meth, dige_5_all, dige_3_meth, dige_3_all,
                        dige_5_index, dige_3_index
                    ]
                    #dige_dict[strand]['e'][1] = [sum(x) for x in zip(dige_dict[strand]['e'][1], [dige_3_meth, dige_3_all])]
                    #dige_dict[strand]['e'] = [sum(x) for x in zip(dige_dict[strand]['e'], [dige_3_meth, dige_3_all])]
                    #for i in range(1, 11):
                    #    meth_3, all_3 = get_meth_state(strand, seq, refseq, dige_3_index - 1 - i)
                    #    dige_3_meth_list[10 - i] = meth_3
                    #    dige_3_all_list[10 - i] = all_3
                    #dige_3_meth_list[-1] = dige_3_meth
                    #dige_3_all_list[-1] = dige_3_all
                    #site_meth_list = [dige_5_meth_list, dige_5_all_list, dige_3_meth_list, dige_3_all_list, dige_5_index, dige_3_index]
                    #print "end_repair_3", refseq[dige_3_index:(dige_3_index + 4)], dige_3_meth_list, dige_3_all_list

        else:
            site_meth_list = [
                dige_5_meth, dige_5_all, dige_3_meth, dige_3_all, dige_5_index,
                dige_3_index
            ]
            #dige_dict[strand]['e'][1] = [sum(x) for x in zip(dige_dict[strand]['e'][1], [dige_3_meth, dige_3_all])]
            #dige_dict[strand]['e'] = [sum(x) for x in zip(dige_dict[strand]['e'], [dige_3_meth, dige_3_all])]
            #for i in range(1, 11):
            #    meth_3, all_3 = get_meth_state(strand, seq,refseq, dige_3_index - 1 - i)
            #    dige_3_meth_list[10 - i] = meth_3
            #    dige_3_all_list[10 - i] = all_3
            #dige_3_meth_list[-1] = dige_3_meth
            #dige_3_all_list[-1] = dige_3_all
            #site_meth_list = [dige_5_meth_list, dige_5_all_list, dige_3_meth_list, dige_3_all_list, dige_5_index, dige_3_index]
            #print 'bbbb', dige_3_meth_list, dige_3_all_list

    #return dige_dict
    return read_info, site_meth_list
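get_meth_state is external; from the call sites it plausibly returns a (methylated, covered) pair for the cytosine at the given index. On a bisulfite-converted plus-strand read, a 'C' retained over a reference 'C' indicates methylation, while a 'T' indicates conversion (unmethylated). A hypothetical sketch under that assumption:

def get_meth_state_sketch(strand, seq, refseq, idx):
    '''Hypothetical (methylated, covered) call for the cytosine at idx.'''
    # strand-specific base handling (G/A on reverse strands) omitted here
    if idx < 0 or idx >= len(seq) or refseq[idx] != 'C':
        return 0, 0  # no informative cytosine at this position
    if seq[idx] == 'C':
        return 1, 1  # C retained -> methylated
    if seq[idx] == 'T':
        return 0, 1  # C->T conversion -> unmethylated
    return 0, 0      # other mismatch, not informative

print(get_meth_state_sketch('++', 'TCGG', 'CCGG', 1))  # (1, 1)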
Example #11
def run(args):
    """
    Alternative module: Use the strategy in Bis-SNP to trim 5' bisulfite conversion failures
    """
    options = args.parse_args()

    if len(options.sam_file) == 0:
        error("Missing the SAM file, use -s or --sam option.")
    else:
        options.sam_file = options.sam_file.split(',')
    for s in options.sam_file:
        if not os.path.isfile(s):
            error("Can't open the SAM file: " + s)
            sys.exit(1)

    if len(options.ref_file) == 0:
        error("Missing the reference genome fasta file, use -r or --ref option.")
    else:
        if not os.path.isfile(options.ref_file):
            error("Can't open the ref file: " + options.ref_file)

    if len(options.samtools) != 0:
        if options.samtools[-1] != '/':
            options.samtools += '/'

    if len(options.name) == 0:
        error("Missing the output file name, use -n or --name options.")

    sam_inf = options.sam_file
    ref_file = options.ref_file
    bsm = options.bsm
    s_path = options.samtools
    name = options.name
    remove_overlap = options.remove_overlap
    filter_dup = options.filter_dup
    p_poisson = options.p_poisson
    gsize = options.gsize
    not_mapping = options.not_mapping

    info("Get the all parameter!!")

    #check the input mapping files
    sam_format, read_inf = check.check_mapping_file_flag(sam_inf[0], s_path)
    pre_flag = read_inf.readline().split('\t')[1]
    if 'p' in pre_flag:
        single_on = False
        info("The input mapping files are paired-end sequencing!")
    else:
        single_on = True
        info("The input mapping files are single-end sequencing!")

    loc_dict = {}
    if filter_dup:
        ## if filter_dup is True, duplicate reads will be assessed and shown in Dup_dis.pdf
        info("The filter_dup has been set True.")
        info("Assess the duplicate reads...")
        for sam in sam_inf:
            #check the input mapping files
            sam_format, read_inf = check.check_mapping_file(sam, s_path)
            if single_on:
                for read in read_inf:
                    loc_dict = LI.Loc_single(read, loc_dict, bsm)
            else:
                for read in read_inf:
                    loc_dict = LI.Loc_paired(read, loc_dict, bsm)
        max_cov = DR.duplicate_report(loc_dict, gsize, p_poisson, name)
        info('Get the duplicate reads distribution!')

    #get reference information
    ref = GR.get_ref(ref_file)
    trim_position = []

    filter_duplicate_reads = 0
    filter_nonuniform_trim_bp = 0
    filter_nonuniform_trim_bp_CG = 0
    filter_remove_overlap_bp = 0
    filter_not_mapping_reads = 0
    all_reads = 0
    not_mapping_reads = 0
    all_mapping_bp = 0

    ##filter the 5' bisulfite failure
    for sam in sam_inf:
        out_sam = sam[:-4] + '_' + name + '_filter.sam'
        out = open(out_sam, 'w')
        #check the input mapping files
        record_mate = {}
        sam_format, read_inf = check.check_mapping_file_header(sam, s_path)

        for read in read_inf:
            #for sam header
            if read.startswith('@'):
                out.write(read)
                continue
            else:
                all_reads += 1  ##record the read number (2013-06-20)

            #Get the read information for trimming
            #If the read isn't unique mapping, we will get an empty list ([]).
            #In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
            #In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
            read_info = RI(read, bsm).extract_information()

            if len(read_info) == 0:
                not_mapping_reads += 1
                if not_mapping:         #keep the not_unique mapping reads (or not paired mapping)
                    out.write(read)
                else:
                    filter_not_mapping_reads += 1  ##record the not mapping read number (2013-06-20)
                continue

            if len(loc_dict) > 0: #the --filter_dup has been set True, have to remove duplicate reads
                duplicate, loc_dict = DF(read_info, loc_dict, max_cov, single_on)
            else:
                duplicate = False

            if single_on:
                all_mapping_bp += len(read_info[5])   ##record the mapping read basepair (2013-06-20)
            else:
                all_mapping_bp += len(read_info[7])   ##record the mapping read basepair (2013-06-20)

            record_mate, trim_position, filter_nonuniform_trim_bp_CG, filter_duplicate_reads, filter_remove_overlap_bp = NF.nonuniform_filter(
                read, out, read_info, ref, remove_overlap, duplicate, single_on,
                record_mate, trim_position, filter_nonuniform_trim_bp_CG,
                filter_duplicate_reads, filter_remove_overlap_bp)
        out.close()
        del record_mate
    NR.nonuniform_generator(trim_position, name)

    for i in range(len(trim_position)):
        filter_nonuniform_trim_bp += i * trim_position[i]

    ##produce the filter report
    info('Produce the report file...')
    report_out = open(name + "_BSeQC_nonuniform_filter_report.txt", 'w')
    report_out.write('Total reads: %d\n' % all_reads)
    if single_on:
        report_out.write('Not unique mapping reads: %d(%.2f%s all reads)\n' % (
            not_mapping_reads, float(not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Unique mapping reads: %d(%.2f%s all reads)\n' % (
            (all_reads - not_mapping_reads), float(all_reads - not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Skip not unique mapping reads: %d(%.2f%s all reads)\n' % (
            filter_not_mapping_reads, float(filter_not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('In unique mapping reads:\n')
        report_out.write('All unique mapping basepairs: %d\n' % all_mapping_bp)
        report_out.write('Filter Duplicate reads: %d(%.2f%s of unique mapping reads)\n' % (
            filter_duplicate_reads, float(filter_duplicate_reads) / (all_reads - not_mapping_reads) * 100, "%"))
        report_out.write("Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
            filter_nonuniform_trim_bp, float(filter_nonuniform_trim_bp) / all_mapping_bp * 100, "%"))
        report_out.write("Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
            filter_nonuniform_trim_bp_CG, float(filter_nonuniform_trim_bp_CG) / all_mapping_bp * 100, "%"))

    else:
        report_out.write('Not unique paired mapping reads: %d(%.2f%s)\n' % (
            not_mapping_reads, float(not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Unique paired mapping reads: %d(%.2f%s)\n' % (
            (all_reads - not_mapping_reads), float(all_reads - not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Skip not paired unique mapping reads: %d(%.2f%s)\n' % (
            filter_not_mapping_reads, float(filter_not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('In unique paired mapping reads:\n')
        report_out.write('All unique paired mapping basepairs: %d\n' % all_mapping_bp)
        report_out.write('Filter Duplicate reads: %d(%.2f%s of unique paired mapping reads)\n' % (
            filter_duplicate_reads, float(filter_duplicate_reads) / (all_reads - not_mapping_reads) * 100, "%"))
        report_out.write("Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
            filter_nonuniform_trim_bp, float(filter_nonuniform_trim_bp) / all_mapping_bp * 100, "%"))
        report_out.write("Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
            filter_nonuniform_trim_bp_CG, float(filter_nonuniform_trim_bp_CG) / all_mapping_bp * 100, "%"))
        report_out.write('Filter overlapped basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
            filter_remove_overlap_bp, float(filter_remove_overlap_bp) / all_mapping_bp * 100, "%"))
    report_out.close()
    info('Get the report file!')
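Both run() examples obtain max_cov from DR.duplicate_report, which presumably derives a coverage ceiling from a Poisson model of random fragment placement over the genome. A sketch of how such a cutoff could be computed from the genome size and a p-value, assuming scipy is available (names are illustrative):

from scipy.stats import poisson

def poisson_max_cov(n_reads, gsize, p_poisson):
    '''Smallest coverage k whose Poisson tail probability drops below p.'''
    lam = float(n_reads) / gsize  # expected reads per genomic position
    k = 1
    while poisson.sf(k, lam) >= p_poisson:
        k += 1
    return k

# e.g. 10M reads on a 3 Gb genome with p = 1e-5
print(poisson_max_cov(10 ** 7, 3 * 10 ** 9, 1e-5))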