Пример #1
0
def parse_mapping_stringent(mapping, assembly, mm,
                            ends=False, scaffolds=False, max_cov=100):
    """
    Create a paired-read dictionary from a SAM file, keeping only
    stringently mapped reads.

    - reads with more than `mm` mismatches are skipped
    - reads spanning 10 bp or less are skipped
    - max_cov limits the number of reads stored: a read is skipped when
      every position it covers already has coverage >= max_cov
    * pairs[read] = [bit, mate, mapping[scaffold] = [map, map2, ...], fastq]
    *  map = [overlap, mismatches, sam_info]
    *  sam_info = all sam fields except read sequence + quality

    Parameters:
        mapping: path to the SAM file, or False if mapping failed
        assembly: dict keyed by scaffold id; info[1] of each value is used
            as the number of positions to track (presumably the scaffold
            length - confirm against caller)
        mm: maximum number of mismatches allowed per read
        ends: False, or a distance; when set, reads lying entirely between
            `ends` and `scaffolds[scaffold][1] - ends` are skipped
        scaffolds: dict keyed by scaffold id, only read when `ends` is set
        max_cov: coverage cap used to limit stored reads

    Returns:
        (pairs, header) where header is the list of stripped '@' lines.
        ({}, []) is returned when `mapping` is False.
    """
    pairs = {}
    header = []
    # make sure that mapping was successful
    if mapping is False:
        return pairs, header
    # s2c[scaffold] = one [coverage, False, False] record per position;
    # only the first element (coverage) is read in this function
    s2c = {
        scaffold_id: [[0, False, False] for _ in range(0, info[1])]
        for scaffold_id, info in assembly.items()
    }
    # the context manager guarantees the SAM file handle is closed
    with open(mapping) as sam:
        for line in sam:
            if line.startswith('@'):
                header.append(line.strip())
                continue
            line = line.strip().split()
            if not line:
                # blank lines carry no alignment
                continue
            # only include stringently mapped reads
            mismatches = map_tool.count_mismatches(line)
            if mismatches > mm:
                continue
            read, bit, scaffold, start = line[0:4]
            bit, start = int(bit), int(start)
            r = [start, start + len(line[9]) - 1]
            # make sure read is > 10 bp long
            if r[1] - r[0] < 10:
                continue
            fastq = map_tool.sam2fastq(line)
            # keep every SAM field except the sequence and quality strings
            info = [line[0:9], line[11:]]
            if '/' in read:
                read = read.rsplit('/', 1)[0]
            # 0x40 is the SAM "first segment in the template" flag; testing
            # the bit directly also works for small flag values, where
            # indexing the bin() string would fail
            if bit & 0x40:
                read, mate = '%s_1' % (read), '%s_2' % (read)
            else:
                read, mate = '%s_2' % (read), '%s_1' % (read)
            if ends is not False and (
                    r[0] > ends and r[1] < scaffolds[scaffold][1] - ends):
                continue
            # skip the read only if even its least-covered position is
            # already at the coverage cap
            if min([i[0] for i in s2c[scaffold][r[0]:r[1]]]) >= max_cov:
                continue
            s2c = add_coverage(scaffold, assembly, r, s2c, line, window=0)
            pairs = add_read(pairs, read, info, r, bit, mate, fastq,
                             mismatches, scaffold)
    return pairs, header
Пример #2
0
def check_mm(sam, window, read_length):
    """
    Check whether a read has mismatches inside the window at the start
    or end of the read.

    Returns True when:
    - map_tool.count_mismatches(sam) returns False
    - mm_positions_from_md(sam, read_length) returns True
    - any mismatch position is <= window or >= read_length - window

    Returns False when the read has no mismatches, when
    mm_positions_from_md returns False, or when no mismatch falls
    inside either window.
    """
    n_mismatches = map_tool.count_mismatches(sam)
    # identity check first: a False count must not be confused with 0
    if n_mismatches is False:
        return True
    if n_mismatches == 0:
        return False
    positions = mm_positions_from_md(sam, read_length)
    # the helper may answer directly with a boolean verdict
    if positions is False or positions is True:
        return positions
    return any(
        p <= window or p >= (read_length - window) for p in positions)
Пример #3
0
def parse_mapping_errors(mapping, s2errors, s2windows):
    """
    Create a paired-read dictionary from a SAM file, keeping only reads
    for which the read or its mate maps to an error window.

    * pairs[read] = [bit, mate, mapping[scaffold] = [map, map2, ...], fastq]
    *  map = [overlap, mismatches, sam_info]
    *  sam_info = all sam fields except read sequence + quality

    Parameters:
        mapping: path to the SAM file
        s2errors, s2windows: passed through to map2window(), which
            decides whether the read/mate ranges hit an error window

    Returns:
        pairs dictionary as built by add_read()
    """
    pairs = {}
    # the context manager guarantees the SAM file handle is closed
    with open(mapping) as sam:
        for line in sam:
            if line.startswith('@'):
                continue
            line = line.strip().split()
            if not line:
                # blank lines carry no alignment
                continue
            read, bit, scaffold, start = line[0:4]
            bit, start = int(bit), int(start)
            r = [start, start + len(line[9]) - 1]
            m_scaffold = line[6]
            # SAM writes RNEXT as '=' when the mate is on the same
            # reference as the read, so treat '=' like a matching name
            if m_scaffold == '=' or m_scaffold == scaffold:
                mstart = int(line[7])
                # mate length is not in this record; the read's own
                # length is used as an approximation
                mate_r = [mstart, mstart + len(line[9]) - 1]
            else:
                mate_r = [False, False]
            # make sure read or mate overlaps with an error window
            if map2window(scaffold, s2windows, s2errors, r, mate_r) is False:
                continue
            mismatches = map_tool.count_mismatches(line)
            fastq = map_tool.sam2fastq(line)
            # keep every SAM field except the sequence and quality strings
            info = [line[0:9], line[11:]]
            if '/' in read:
                read = read.rsplit('/', 1)[0]
            # 0x40 is the SAM "first segment in the template" flag; testing
            # the bit directly also works for small flag values, where
            # indexing the bin() string would fail
            if bit & 0x40:
                read, mate = '%s_1' % (read), '%s_2' % (read)
            else:
                read, mate = '%s_2' % (read), '%s_1' % (read)
            pairs = add_read(pairs, read, info, r, bit, mate, fastq,
                             mismatches, scaffold)
    return pairs
Пример #4
0
def parse_mapping(mapping, ends=False, scaffolds=False):
    """
    Create a paired-read dictionary from a SAM file.

    * pairs[read] = [bit, mate, mapping[scaffold] = [map, map2, ...], fastq]
    *  map = [overlap, mismatches, sam_info]
    *  sam_info = all sam fields except read sequence + quality

    Parameters:
        mapping: path to the SAM file
        ends: False, or a distance; when set, reads lying entirely between
            `ends` and `scaffolds[scaffold][1] - ends` are skipped
        scaffolds: dict keyed by scaffold id, only read when `ends` is set

    Returns:
        (pairs, header) where header is the list of stripped '@' lines
    """
    pairs = {}
    header = []
    # the context manager guarantees the SAM file handle is closed
    with open(mapping) as sam:
        for line in sam:
            if line.startswith('@'):
                header.append(line.strip())
                continue
            line = line.strip().split()
            if not line:
                # blank lines carry no alignment
                continue
            read, bit, scaffold, start = line[0:4]
            bit, start = int(bit), int(start)
            r = [start, start + len(line[9]) - 1]
            mismatches = map_tool.count_mismatches(line)
            fastq = map_tool.sam2fastq(line)
            # keep every SAM field except the sequence and quality strings
            info = [line[0:9], line[11:]]
            if '/' in read:
                read = read.rsplit('/', 1)[0]
            # 0x40 is the SAM "first segment in the template" flag; testing
            # the bit directly also works for small flag values, where
            # indexing the bin() string would fail
            if bit & 0x40:
                read, mate = '%s_1' % (read), '%s_2' % (read)
            else:
                read, mate = '%s_2' % (read), '%s_1' % (read)
            if ends is not False and (
                    r[0] > ends and r[1] < scaffolds[scaffold][1] - ends):
                continue
            # the first record seen for a read fixes its bit, mate and fastq
            entry = pairs.setdefault(read, [bit, mate, {}, fastq])
            entry[2].setdefault(scaffold, []).append([r, mismatches, info])
    return pairs, header
Пример #5
0
def copies(mapping, s2bins, rna, min_rna=800, mismatches=0):
    """
    Compare rRNA gene coverage with bin coverage and print a
    tab-separated table with an estimated copy number per rRNA scaffold.

    1. determine bin coverage
    2. determine rRNA gene coverage
    3. compare

    Parameters:
        mapping: iterable of SAM lines (header and alignments)
        s2bins: scaffold-to-bin input, parsed by parse_s2bins()
        rna: rRNA gene input, parsed by parse_rna()
        min_rna: minimum rRNA gene length, passed to parse_rna() and
            shown in the column headers
        mismatches: maximum mismatches allowed for a read to be counted

    Returns:
        None; the table is written to stdout.
    """
    cov = {}  # cov[scaffold] = [bases, length]
    s2bins, bins2s = parse_s2bins(s2bins)
    rna_cov = parse_rna(rna, s2bins, min_rna)
    s2bins, bins2s = filter_missing_rna(s2bins, bins2s, rna_cov)
    # count bases mapped to scaffolds and rRNA gene regions
    for line in mapping:
        line = line.strip().split()
        if not line:
            # blank lines carry no information
            continue
        if line[0].startswith('@'):
            # get scaffold lengths from @SQ records; only binned
            # scaffolds are tracked
            if line[0].startswith('@SQ'):
                scaffold = line[1].split(':', 1)[1]
                length = int(line[2].split(':')[1])
                if scaffold in s2bins and scaffold not in cov:
                    cov[scaffold] = [0, length]
            # header lines are never alignments, so stop here
            continue
        # check mismatch threshold
        mm = count_mismatches(line)
        if mm is False or mm > mismatches:
            continue
        # check that scaffold is in bin
        scaffold, bases = line[2], len(line[9])
        if scaffold not in cov:
            continue
        cov[scaffold][0] += bases
        rna_cov = rna_bases(rna_cov, scaffold, bases, line)
    print('# mismatches threshold: %s' % (mismatches))
    # NOTE(review): the header names 9 columns ('bin info' included) but
    # each row below carries 8 values - confirm the intended layout
    header = ['#rRNA scaffold', 'rRNA genes >=%sbp on scaffold' % (min_rna), \
            'rRNA coverage', \
            'bin', 'bin info', 'bin coverage', \
            'rRNAs >=%sbp in bin' % (min_rna), \
            'rRNA coverage/bin coverage', \
            'estimated number of copies']
    print('\t'.join(header))
    for bin_id, scaffolds in bins2s.items():
        # total number of rRNA genes found on all scaffolds of this bin
        rna_count = sum(
            len(rna_cov[s][2]) for s in scaffolds if s in rna_cov)
        for s in scaffolds:
            if s not in rna_cov:
                continue
            # counts[0] / counts[1] is used as coverage and counts[2] holds
            # the genes (presumably [bases, length, genes] - confirm)
            counts = rna_cov[s]
            bin_cov = calc_bin_cov(scaffolds, cov)
            num_genes = len(counts[2])
            rna_coverage = float(counts[0]) / float(counts[1])
            if bin_cov == 0:
                rna_div_bin = 0
            else:
                rna_div_bin = float(rna_coverage / bin_cov)
            # the estimate is the largest of the numeric indicators; the
            # gene count is compared, not the whole counts record
            est = int(max(rna_count, num_genes, rna_div_bin))
            out = [
                s, num_genes, rna_coverage, bin_id, bin_cov, rna_count,
                rna_div_bin, est
            ]
            print('\t'.join([str(i) for i in out]))