def zoom_region(args):
    """Print read/mate statistics for a window on one transcript.

    The PSL file ``args.psl`` is grepped for ``args.transcript``; the
    matching lines are parsed with ``eva.read_psl_hits(..., 'ref')`` and
    the hits stored under the transcript name are classified against the
    closed window ``[args.start, args.end]``.

    Args:
        args: object exposing ``start``, ``end``, ``transcript`` and ``psl``.

    Returns:
        None. The command line and seven counters are printed to stdout.
    """
    region_s = args.start
    region_e = args.end
    tx = args.transcript

    def inside(hit):
        # True when the hit lies completely within the window.
        return hit.rstart >= region_s and hit.rend <= region_e

    def boundaries_crossed(hit):
        # 0, 1 or 2: how many of the two window edges the hit spans.
        return (int(hit.rstart <= region_s <= hit.rend) +
                int(hit.rstart <= region_e <= hit.rend))

    # NOTE(review): the transcript name is interpolated into a shell command
    # unquoted and is treated by grep as a pattern; confirm names are safe.
    cmd = 'grep %s %s' % (tx, args.psl)
    print(cmd)
    hit_lines = runInShell(cmd).split('\n')
    hits = eva.read_psl_hits(hit_lines, 'ref')

    n_reads = 0
    n_single = 0
    n_in = 0
    n_pairs = 0
    n_pairs_in_out = 0
    n_junction = 0
    n_junction_pairs = 0

    # .get(): a transcript without any hit yields all-zero counters instead
    # of a KeyError.
    for h in hits.get(tx, []):
        n_reads += 1
        n_junction += boundaries_crossed(h)
        if not inside(h):
            continue
        n_in += 1

        # Mates are only examined for reads that are inside the window.
        mate_hit = eva.find_hit(hits, get_mate_id(h.qname), tx)
        if mate_hit is None:
            n_single += 1
            continue
        if mate_hit.rend < region_s or mate_hit.rstart > region_e:
            n_pairs_in_out += 1
        if inside(mate_hit):
            # Each in-window pair is seen once per mate, so it counts twice.
            n_pairs += 1
        n_junction_pairs += boundaries_crossed(mate_hit)

    print('All hit reads: %d ' % n_reads)
    print('All single reads: %d ' % n_single)
    print('Reads in the region: %d ' % n_in)
    print('Reads in the junction: %d ' % n_junction)
    print('Pairs in the region: %d ' % n_pairs)
    print('Pairs in the junction: %d ' % n_junction_pairs)
    print('One in region, one out: %d ' % n_pairs_in_out)
def draw_dot(args):
    """Write ``<args.transcript>.dot``, a Graphviz view of PSL block matches.

    The PSL file ``args.psl`` is grepped for ``args.transcript`` and parsed
    with ``eva.read_psl_hits(..., 'query')``. Every block start/end on the
    query becomes a "milestone"; consecutive milestones cut the query into
    blocks, each marked covered or not by ``check_covered``. Covered query
    blocks are then projected onto every other transcript the query hits,
    and one chain of boxes per transcript is written to the dot file
    (solid boxes for real blocks, dashed boxes as spacers).

    Args:
        args: object exposing ``tx`` (FASTA path), ``psl`` (PSL path) and
            ``transcript`` (query transcript name).

    Returns:
        None. The graph is written to ``<args.transcript>.dot`` and
        progress information is printed to stdout.
    """
    query = args.transcript
    tx = FastaFile(args.tx)

    # NOTE(review): the name is interpolated into a shell command unquoted
    # and is treated by grep as a pattern; confirm names are safe.
    cmd = 'grep ' + query + ' ' + args.psl
    hit_lines = runInShell(cmd).split('\n')
    print(cmd)
    raw_hits = eva.read_psl_hits(hit_lines, 'query')
    # .get(): no hit at all produces an empty graph rather than a KeyError.
    hits = raw_hits.get(query, [])
    print(hits)
    print('-----------------------------')

    # Only hits of the query against *other* transcripts take part.
    cross_hits = [h for h in hits
                  if h.qname == query and h.qname != h.rname]

    # Every starting/ending point on the query transcript is a 'milestone'.
    # Positions are unique, so the sort below orders purely by position.
    seen_positions = set()
    milestones = []
    for h in cross_hits:
        for i in range(h.n_blocks):
            q_start = h.q_block_starts[i]
            q_end = q_start + h.block_sizes[i]
            for pos, kind in ((q_start, 'start'), (q_end, 'end')):
                if pos not in seen_positions:
                    seen_positions.add(pos)
                    milestones.append((pos, kind, h.rname, i))
    milestones.sort()

    blocks = {query: []}
    tx_cursors = {query: 0}     # index of the next undrawn block per transcript
    tx_node_ids = {query: 0}    # id of the last node drawn per transcript
    n_solid_blocks = 0

    # Determine all blocks of the query transcript
    print(milestones)
    last_index = len(milestones) - 2
    for i in range(len(milestones) - 1):
        cur_m = milestones[i][0]
        next_m = milestones[i + 1][0]
        if i == 0 and cur_m > 0:
            # Leading stretch before the first milestone.
            blocks[query].append(Block(query, -1, 0, cur_m, False))
        is_covered = check_covered(query, hits, cur_m, next_m)
        if is_covered:
            b = Block(query, n_solid_blocks, cur_m, next_m, is_covered)
            b.matched_block = b
            n_solid_blocks += 1
        else:
            b = Block(query, -1, cur_m, next_m, is_covered)
        blocks[query].append(b)
        if i == last_index:
            # Trailing stretch: it ends at the FASTA length, the same value
            # the guard compares against.
            query_len = tx.get_seq_len(query)
            if next_m < query_len:
                blocks[query].append(
                    Block(query, -1, next_m, query_len, False))

    # Determine the blocks of 'reference' transcripts
    for h in cross_hits:
        rname = h.rname
        ref_blocks = []
        blocks[rname] = ref_blocks
        tx_cursors[rname] = 0
        tx_node_ids[rname] = 0
        for i in range(h.n_blocks):
            r_start = h.r_block_starts[i]
            r_end = r_start + h.block_sizes[i]
            q_start = h.q_block_starts[i]
            q_end = q_start + h.block_sizes[i]
            diff = r_start - q_start    # query -> reference coordinate shift

            if i == 0 and r_start > 0:
                ref_blocks.append(
                    Block(rname, len(ref_blocks), 0, r_start, False))

            # For those blocks covered by this transcript, add them
            for qb in blocks[query]:
                if qb.start >= q_start and qb.end <= q_end:
                    ref_blocks.append(Block(rname, qb.id, qb.start + diff,
                                            qb.end + diff, True, qb))
                if qb.end >= q_end:
                    break

            # Add the 'not hit' block in between the hit blocks
            if i < h.n_blocks - 1 and r_end < h.r_block_starts[i + 1]:
                ref_blocks.append(Block(rname, len(ref_blocks), r_end,
                                        h.r_block_starts[i + 1], False))
            # If all blocks do not cover till the end, add the remaining
            # area as a 'not hit' block
            if i == h.n_blocks - 1 and r_end < h.rlen:
                ref_blocks.append(
                    Block(rname, len(ref_blocks), r_end, h.rlen, False))

    with open(query + '.dot', 'w') as dot:

        def add_node(t, b, solid):
            # Append one box to transcript t's chain and link it to the
            # previous box. A solid box consumes block b (cursor advances);
            # a dashed box is only a spacer.
            name = nice_name(t)
            node_id = tx_node_ids[t] + 1
            color = get_color(query, t, b)
            if solid:
                dot.write('\t%s_%d [fixedsize=true, width=2, '
                          'style="filled", fillcolor=%s, shape=box, '
                          'label="%d: %d~%d"] \n'
                          % (name, node_id, color,
                             b.end - b.start, b.start, b.end))
            else:
                dot.write('\t%s_%d [fixedsize=true, width=2, '
                          'style="filled,dashed", fillcolor=%s, shape=box, '
                          'label=""] \n' % (name, node_id, color))
            dot.write('\t%s_%d -> %s_%d \n'
                      % (name, tx_node_ids[t], name, node_id))
            tx_node_ids[t] = node_id
            if solid:
                tx_cursors[t] += 1

        dot.write('digraph g { \n\trankdir = LR \n')

        # Head node of every non-empty chain: transcript name and length.
        for t in blocks:
            if blocks[t]:
                dot.write('\t%s_0 [fixedsize=true, width=3, style=filled, '
                          'fillcolor=%s, shape=box, label="%s:%d"] \n'
                          % (nice_name(t), 'orange', t, tx.get_seq_len(t)))

        print('n_solid_blocks  %d' % n_solid_blocks)
        for i in range(n_solid_blocks + 1):
            # Check whether there is a 'not hit' block before hit block i
            has_dummy = False
            all_covered = True
            for t, tx_blocks in blocks.items():
                if tx_cursors[t] < len(tx_blocks):
                    b = tx_blocks[tx_cursors[t]]
                    if not b.is_covered:
                        has_dummy = True
                        all_covered = False
                    if b.id != i:
                        has_dummy = True

            # For each transcript, draw until the current 'hit block' i
            for t, tx_blocks in blocks.items():
                if tx_cursors[t] >= len(tx_blocks):
                    continue
                b = tx_blocks[tx_cursors[t]]
                if not has_dummy:
                    # Every chain is sitting on block i: draw it everywhere.
                    add_node(t, b, True)
                elif b.is_covered:
                    if not all_covered:
                        # Spacer matching an uncovered block elsewhere.
                        add_node(t, b, False)
                    add_node(t, b, b.id == i)
                else:
                    # Draw the uncovered block, then whatever follows it.
                    add_node(t, b, True)
                    if tx_cursors[t] < len(tx_blocks):
                        b = tx_blocks[tx_cursors[t]]
                        add_node(t, b, b.is_covered and b.id == i)

        dot.write('} \n')

    print('Check file %s.dot' % query)
def _consecutive_read_spans(read_hits):
    """Return |rstart difference| for each two hits with consecutive integer read names."""
    by_id = {}
    for hit in read_hits:
        try:
            by_id[int(hit.qname)] = hit
        except (TypeError, ValueError):
            # Read names that are not plain integers cannot be paired by id.
            continue
    # Looking ids up numerically finds every consecutive pair; sorting the
    # names as strings would separate e.g. '98' and '99' whenever '980' is
    # also present.
    return [abs(by_id[read_id].rstart - by_id[read_id - 1].rstart)
            for read_id in sorted(by_id) if read_id - 1 in by_id]


def zoom_tx(tx_name, ref, blat_psl, ctg_or_read):
    """Collect the PSL hits mentioning a transcript and summarise them.

    ``blat_psl`` is grepped for ``tx_name`` and the matching lines are parsed
    with ``eva.read_psl_hits(..., 'query')``; the first hit stored for each
    key is kept. When ``ctg_or_read`` is ``'read'`` and at least one hit
    exists, a text summary is built: hit count, pairs of consecutive read
    ids, a histogram of ``qlen - n_match``, multi-block hits, and the
    mean / deviation of the pair spans (via ``get_mean`` and ``dev``).

    Args:
        tx_name: transcript name to look for.
        ref: path handed to ``FastaFile``; must contain ``tx_name``.
        blat_psl: path of the PSL file to grep.
        ctg_or_read: ``'read'`` enables the summary; any other value
            returns an empty summary.

    Returns:
        ``(summary, tx_hits)``: the summary string (possibly empty) and the
        hits ordered by ``rstart``, ties by ``n_blocks``. ``('', [])`` when
        grep produced no output.
    """
    # NOTE(review): the name is interpolated into a shell command unquoted
    # and is treated by grep as a pattern; confirm names are safe.
    lines = runInShell('grep ' + tx_name + ' ' + blat_psl)
    if not lines.strip():
        # Same shape as the normal return so callers can always unpack it.
        return '', []

    hits = eva.read_psl_hits(lines.split('\n'), 'query')
    tx_seq = FastaFile(ref).seqs[tx_name]
    tx_hits = [query_hits[0] for query_hits in hits.values()]

    summary = ''
    if ctg_or_read == 'read':
        # The stable sort at the end keeps this name order among hits that
        # tie on both rstart and n_blocks.
        tx_hits.sort(key=lambda x: x.qname)

        if tx_hits:
            unmatched_hist = [0] * 6    # index k: hits with qlen - n_match == k
            n_over_5 = 0
            n_multi_block = 0
            for h in tx_hits:
                unmatched = h.qlen - h.n_match
                if 0 <= unmatched <= 5:
                    unmatched_hist[unmatched] += 1
                elif unmatched > 5:
                    n_over_5 += 1
                if h.n_blocks > 1:
                    n_multi_block += 1

            spans = _consecutive_read_spans(tx_hits)
            mean_span = get_mean(spans)

            summary = ''.join([
                'Transcript: %s\n' % tx_name,
                'Tx length: %s\n' % len(tx_seq),
                '# of reads: %d\n' % len(tx_hits),
                '# of pairs: 2 * %d\n' % len(spans),
                'Full match: %d\n' % unmatched_hist[0],
                '1 mismatch: %d\n' % unmatched_hist[1],
                '2 mismatch: %d\n' % unmatched_hist[2],
                '3 mismatch: %d\n' % unmatched_hist[3],
                '4 mismatch: %d\n' % unmatched_hist[4],
                '5 mismatch: %d\n' % unmatched_hist[5],
                '>5 mismatch: %d\n' % n_over_5,
                '>1 blocks: %d\n' % n_multi_block,
                'Mean span: %.2f\n' % mean_span,
                'Std dev: %.2f\n' % dev(spans, mean_span),
            ])

    # One stable sort on (rstart, n_blocks) gives the same order as sorting
    # by n_blocks and then by rstart.
    tx_hits.sort(key=lambda x: (x.rstart, x.n_blocks))
    return summary, tx_hits