def parse_mapping_stringent(mapping, assembly, mm,
                            ends=False, scaffolds=False, max_cov=100):
    """
    Create a paired-read dictionary from a SAM file, keeping only
    stringently mapped reads.

    A read is skipped when:
    - its mismatch count (map_tool.count_mismatches) is greater than `mm`
    - it is 10 bp or shorter
    - `ends` is given and the read starts after position `ends` and stops
      before scaffolds[scaffold][1] - ends (presumably the scaffold
      length - TODO confirm), i.e. it does not reach either scaffold end
    - every position it covers already has coverage >= `max_cov`

    Parameters:
    * mapping   = path to the SAM file, or False if mapping failed
    * assembly  = assembly[scaffold] = info; info[1] is used as the number
                  of positions tracked for the scaffold
    * mm        = maximum number of mismatches allowed per read
    * ends      = distance from the scaffold ends to keep (False = no filter)
    * scaffolds = scaffolds[scaffold][1] is read when `ends` is set
    * max_cov   = coverage at which further reads are no longer stored

    Returns (pairs, header):
    * pairs is built by add_read(); per the original notes:
      pairs[read] = [bit, mate, mapping[scaffold] = [map, map2, ...], fastq]
      map = [overlap, mismatches, sam_info]
      sam_info = all sam fields except read + quality
    * header = list of stripped '@' header lines
    ({}, []) is returned when `mapping` is False.
    """
    pairs = {}
    header = []
    # make sure that mapping was successful
    if mapping is False:
        return pairs, header
    # s2c[scaffold] = one [coverage, False, False] entry per position;
    # the entries are updated by add_coverage()
    s2c = {
        name: [[0, False, False] for _ in range(info[1])]
        for name, info in assembly.items()
    }
    # the context manager closes the SAM file even if parsing raises
    with open(mapping) as sam:
        for line in sam:
            if line.startswith('@'):
                header.append(line.strip())
                continue
            line = line.strip().split()
            # only include stringently mapped reads
            mismatches = map_tool.count_mismatches(line)
            if mismatches > mm:
                continue
            read, bit, scaffold, start = line[0:4]
            bit, start = int(bit), int(start)
            r = [start, start + len(line[9]) - 1]
            # make sure read is > 10 bp long
            if r[1] - r[0] < 10:
                continue
            fastq = map_tool.sam2fastq(line)
            info = [line[0:9], line[11:]]
            if '/' in read:
                read = read.rsplit('/', 1)[0]
            # 0x40 is the SAM "first segment in the template" flag; testing
            # the bit directly also works for flags below 16, where
            # bin(bit)[-7] would raise IndexError
            if bit & 0x40:
                read, mate = '%s_1' % (read), '%s_2' % (read)
            else:
                read, mate = '%s_2' % (read), '%s_1' % (read)
            if ends is not False and \
                    (r[0] > ends and r[1] < scaffolds[scaffold][1] - ends):
                continue
            # skip the read once its least-covered position reached max_cov
            if min(pos[0] for pos in s2c[scaffold][r[0]:r[1]]) >= max_cov:
                continue
            s2c = add_coverage(scaffold, assembly, r, s2c, line, window=0)
            pairs = add_read(pairs, read, info, r, bit, mate,
                             fastq, mismatches, scaffold)
    return pairs, header
def check_mm(sam, window, read_length):
    """
    Report whether a read has mismatches near its beginning or end.

    Returns True when:
    - map_tool.count_mismatches(sam) returns False
    - mm_positions_from_md(sam, read_length) returns True
    - any mismatch position is <= window or >= read_length - window

    Returns False when:
    - the read has no mismatches
    - mm_positions_from_md(sam, read_length) returns False
    - no mismatch position falls inside either end window
    """
    n_mismatches = map_tool.count_mismatches(sam)
    if n_mismatches is False:
        return True
    if n_mismatches == 0:
        return False
    positions = mm_positions_from_md(sam, read_length)
    # the helper may answer directly with a boolean instead of positions
    if positions is True or positions is False:
        return positions
    return any(
        p <= window or p >= (read_length - window)
        for p in positions
    )
def parse_mapping_errors(mapping, s2errors, s2windows):
    """
    Create a paired-read dictionary from a SAM file, keeping only reads
    for which map2window() does not return False (per the original notes:
    the read or its mate overlaps with an error window).

    Parameters:
    * mapping   = path to the SAM file
    * s2errors  = passed through to map2window()
    * s2windows = passed through to map2window()

    The mate interval handed to map2window() is [False, False] when the
    mate maps to a different scaffold; otherwise it starts at PNEXT and is
    given the same length as the read itself.

    Returns pairs, built by add_read(); per the original notes:
    * pairs[read] = [bit, mate, mapping[scaffold] = [map, map2, ...], fastq]
    * map = [overlap, mismatches, sam_info]
    * sam_info = all sam fields except read + quality
    """
    pairs = {}
    # the context manager closes the SAM file even if parsing raises
    with open(mapping) as sam:
        for line in sam:
            if line.startswith('@'):
                continue
            line = line.strip().split()
            read, bit, scaffold, start = line[0:4]
            bit, start = int(bit), int(start)
            r = [start, start + len(line[9]) - 1]
            m_scaffold = line[6]
            # SAM writes RNEXT as '=' when the mate maps to the same
            # reference as the read; resolve it before comparing
            if m_scaffold == '=':
                m_scaffold = scaffold
            if scaffold != m_scaffold:
                mate_r = [False, False]
            else:
                mstart = int(line[7])
                mate_r = [mstart, mstart + len(line[9]) - 1]
            # make sure read or mate overlaps with an error window
            if map2window(scaffold, s2windows, s2errors, r, mate_r) is False:
                continue
            mismatches = map_tool.count_mismatches(line)
            fastq = map_tool.sam2fastq(line)
            info = [line[0:9], line[11:]]
            if '/' in read:
                read = read.rsplit('/', 1)[0]
            # 0x40 is the SAM "first segment in the template" flag; testing
            # the bit directly also works for flags below 16, where
            # bin(bit)[-7] would raise IndexError
            if bit & 0x40:
                read, mate = '%s_1' % (read), '%s_2' % (read)
            else:
                read, mate = '%s_2' % (read), '%s_1' % (read)
            pairs = add_read(pairs, read, info, r, bit, mate,
                             fastq, mismatches, scaffold)
    return pairs
def parse_mapping(mapping, ends=False, scaffolds=False):
    """
    Create a paired-read dictionary from a SAM file.

    Parameters:
    * mapping   = path to the SAM file
    * ends      = when not False, skip reads that start after position
                  `ends` and stop before scaffolds[scaffold][1] - ends
                  (presumably the scaffold length - TODO confirm)
    * scaffolds = scaffolds[scaffold][1] is read when `ends` is set

    Returns (pairs, header):
    * pairs[read] = [bit, mate, mapping[scaffold] = [map, map2, ...], fastq]
    * map = [[start, stop], mismatches, sam_info]
    * sam_info = [sam fields 0-8, sam fields 11 onward]
      (all sam fields except read + quality)
    * header = list of stripped '@' header lines

    Read names have any '/...' suffix removed and '_1' or '_2' appended
    according to the "first segment" flag; mate carries the other suffix.
    """
    pairs = {}
    header = []
    # the context manager closes the SAM file even if parsing raises
    with open(mapping) as sam:
        for line in sam:
            if line.startswith('@'):
                header.append(line.strip())
                continue
            line = line.strip().split()
            read, bit, scaffold, start = line[0:4]
            bit, start = int(bit), int(start)
            r = [start, start + len(line[9]) - 1]
            mismatches = map_tool.count_mismatches(line)
            fastq = map_tool.sam2fastq(line)
            info = [line[0:9], line[11:]]
            if '/' in read:
                read = read.rsplit('/', 1)[0]
            # 0x40 is the SAM "first segment in the template" flag; testing
            # the bit directly also works for flags below 16, where
            # bin(bit)[-7] would raise IndexError
            if bit & 0x40:
                read, mate = '%s_1' % (read), '%s_2' % (read)
            else:
                read, mate = '%s_2' % (read), '%s_1' % (read)
            if ends is not False and \
                    (r[0] > ends and r[1] < scaffolds[scaffold][1] - ends):
                continue
            if read not in pairs:
                pairs[read] = [bit, mate, {}, fastq]
            pairs[read][2].setdefault(scaffold, []).append(
                [r, mismatches, info])
    return pairs, header
def copies(mapping, s2bins, rna, min_rna=800, mismatches=0):
    """
    Estimate rRNA gene copy number and print a tab-separated report.

    1. determine bin coverage
    2. determine rRNA gene coverage
    3. compare

    Parameters:
    * mapping    = iterable of SAM lines (header and alignments)
    * s2bins     = passed to parse_s2bins(), which returns the
                   scaffold -> bin and bin -> scaffolds lookups
    * rna        = passed to parse_rna() together with min_rna
    * min_rna    = rRNA length threshold; also shown in the column titles
    * mismatches = alignments with more mismatches than this are ignored

    Nothing is returned; the threshold, a header row and one row per
    rRNA-carrying scaffold are printed to stdout.
    """
    cov = {}  # cov[scaffold] = [bases, length]
    s2bins, bins2s = parse_s2bins(s2bins)
    rna_cov = parse_rna(rna, s2bins, min_rna)
    s2bins, bins2s = filter_missing_rna(s2bins, bins2s, rna_cov)
    # count bases mapped to scaffolds and rRNA gene regions
    for line in mapping:
        line = line.strip().split()
        # blank lines carry no fields to inspect
        if not line:
            continue
        # get scaffold lengths
        if line[0].startswith('@'):
            if line[0].startswith('@SQ'):
                # maxsplit=1 keeps scaffold names that contain ':' intact
                scaffold = line[1].split(':', 1)[1]
                length = int(line[2].split(':', 1)[1])
                # only track scaffolds that are binned
                if scaffold in s2bins and scaffold not in cov:
                    cov[scaffold] = [0, length]
            # header lines are never alignments
            continue
        # check mismatch threshold
        mm = count_mismatches(line)
        if mm is False or mm > mismatches:
            continue
        # check that scaffold is in bin
        scaffold, bases = line[2], len(line[9])
        if scaffold not in cov:
            continue
        cov[scaffold][0] += bases
        rna_cov = rna_bases(rna_cov, scaffold, bases, line)
    print('# mismatches threshold: %s' % (mismatches))
    # NOTE(review): the header names nine columns ('bin info' included)
    # but each row below carries eight values - confirm intended layout
    header = ['#rRNA scaffold', 'rRNA genes >=%sbp on scaffold' % (min_rna),
              'rRNA coverage',
              'bin', 'bin info', 'bin coverage',
              'rRNAs >=%sbp in bin' % (min_rna),
              'rRNA coverage/bin coverage',
              'estimated number of copies']
    print('\t'.join(header))
    for bin_name, scaffolds in list(bins2s.items()):
        rna_count = sum(len(rna_cov[s][2]) for s in scaffolds if s in rna_cov)
        for s in scaffolds:
            if s not in rna_cov:
                continue
            # counts[0] / counts[1] is the rRNA coverage; counts[2] holds
            # the rRNA genes found on the scaffold
            counts = rna_cov[s]
            bin_cov = calc_bin_cov(bins2s[bin_name], cov)
            num_genes = len(counts[2])
            rna_coverage = float(counts[0]) / float(counts[1])
            if bin_cov == 0:
                rna_div_bin = 0
            else:
                rna_div_bin = float(rna_coverage / bin_cov)
            # compare numbers only: the original put the `counts` list in
            # this max(), which raises TypeError on Python 3
            est = int(max([rna_count, num_genes, rna_div_bin]))
            out = [s, num_genes, rna_coverage, bin_name, bin_cov,
                   rna_count, rna_div_bin, est]
            print('\t'.join([str(i) for i in out]))