def _contig_profile(alignment, platform, genome_len):
    """
    Builds a per-position profile of the target from the given alignments.

    Returns a (profile, aln_errors) pair: a list with one Profile per
    target position (genome_len entries) and the err_rate of every
    alignment in input order. The platform argument is not used here
    (the error-rate filter it was meant for is disabled).
    """
    profile = [Profile() for _ in range(genome_len)]
    aln_errors = []

    for aln in alignment:
        aln_errors.append(aln.err_rate)

        #the target is shifted against the already shifted query
        shifted_qry = shift_gaps(aln.trg_seq, aln.qry_seq)
        shifted_trg = shift_gaps(shifted_qry, aln.trg_seq)

        pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(shifted_trg, shifted_qry):
            is_insertion = trg_nuc == "-"
            if is_insertion:
                #an insertion is recorded at the preceding target position
                pos -= 1
            if pos >= genome_len:
                #positions past the end continue from the beginning
                pos -= genome_len

            column = profile[pos]
            if is_insertion:
                column.insertions[qry_nuc] += 1
            else:
                column.nucl = trg_nuc
                column.matches[qry_nuc] += 1

            pos += 1

    return profile, aln_errors
def _get_bubble_seqs(alignment, platform, profile, partition, contig_info):
    """
    Splits the contig into bubbles at the partition points and collects
    the query segments (branches) of every alignment that fall into them.

    Returns a list of Bubble objects, one per interval between
    consecutive partition points (including the contig ends), or an
    empty list if the partition is empty. The platform argument is not
    used here.
    """
    if not partition:
        return []

    ctg_len = contig_info.length
    boundaries = [0] + partition + [ctg_len]

    bubbles = []
    for left, right in zip(boundaries, boundaries[1:]):
        bubble = Bubble(contig_info.id, left)
        bubble.consensus = "".join(p.nucl for p in profile[left:right])
        bubbles.append(bubble)

    for aln in alignment:
        cur_bubble = bisect(partition, aln.trg_start % ctg_len)
        next_start = boundaries[cur_bubble + 1]
        #contig ends only count as bubble borders on non-circular contigs
        at_ctg_start = cur_bubble == 0 and contig_info.type != "circular"
        at_ctg_end = (aln.trg_end > partition[-1] and
                      contig_info.type != "circular")

        seg_start = None
        seen_boundary = False
        pos = aln.trg_start
        for col, trg_nuc in enumerate(aln.trg_seq):
            if trg_nuc == "-":
                #gap columns do not advance the target position
                continue
            if pos >= ctg_len:
                pos -= ctg_len

            if pos >= next_start or pos == 0:
                #the segment before the first boundary is kept only if
                #the alignment starts in the first bubble of such a contig
                if seen_boundary or at_ctg_start:
                    segment = aln.qry_seq[seg_start:col].replace("-", "")
                    branch = fp.to_acgt(segment)
                    bubbles[cur_bubble].branches.append(branch)

                seen_boundary = True
                cur_bubble = bisect(partition, pos)
                next_start = boundaries[cur_bubble + 1]
                seg_start = col

            pos += 1

        if at_ctg_end:
            tail = aln.qry_seq[seg_start:].replace("-", "")
            branch = fp.to_acgt(tail)
            bubbles[-1].branches.append(branch)

    return bubbles
def _get_bubble_seqs(alignment, profile, partition, contig_id):
    """
    Splits the contig into bubbles at the partition points and collects
    the query segments (branches) of every alignment that fall into them.

    The contig length is taken from trg_len of the first alignment.
    Returns a list of Bubble objects, or an empty list if either the
    partition or the alignment list is empty.
    """
    if not (partition and alignment):
        return []

    boundaries = [0] + partition + [alignment[0].trg_len]

    bubbles = []
    for left, right in zip(boundaries, boundaries[1:]):
        bubble = Bubble(contig_id, left)
        bubble.consensus = "".join(p.nucl for p in profile[left:right])
        bubbles.append(bubble)

    last_landmark = partition[-1]
    for aln in alignment:
        cur_bubble = bisect(partition, aln.trg_start)
        next_start = boundaries[cur_bubble + 1]
        starts_in_first = cur_bubble == 0
        reaches_last = aln.trg_end > last_landmark

        seg_start = None
        seen_boundary = False
        pos = aln.trg_start
        for col, trg_nuc in enumerate(aln.trg_seq):
            if trg_nuc == "-":
                #gap columns do not advance the target position
                continue

            if pos >= next_start or pos == 0:
                #the segment before the first boundary is kept only if
                #the alignment starts in the first bubble
                if seen_boundary or starts_in_first:
                    segment = aln.qry_seq[seg_start:col].replace("-", "")
                    branch = fp.to_acgt(segment)
                    bubbles[cur_bubble].branches.append(branch)

                seen_boundary = True
                cur_bubble = bisect(partition, pos)
                next_start = boundaries[cur_bubble + 1]
                seg_start = col

            pos += 1

        if reaches_last:
            tail = aln.qry_seq[seg_start:].replace("-", "")
            branch = fp.to_acgt(tail)
            bubbles[-1].branches.append(branch)

    return bubbles
def _contig_profile(alignment, platform): """ Computes alignment profile """ if not alignment: return [] genome_len = alignment[0].trg_len aln_errors = [] profile = [Profile() for _ in range(genome_len)] #max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"] for aln in alignment: #if aln.err_rate > max_aln_err: continue aln_errors.append(aln.err_rate) #after gap shifting it is possible that #two gaps are aligned against each other qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq) trg_seq = shift_gaps(qry_seq, aln.trg_seq) trg_pos = aln.trg_start for trg_nuc, qry_nuc in zip(trg_seq, qry_seq): if trg_nuc == "-": trg_pos -= 1 if trg_pos >= genome_len: trg_pos -= genome_len #total += 1 prof_elem = profile[trg_pos] if trg_nuc == "-" and qry_nuc != "-": prof_elem.insertions[aln.qry_id] += qry_nuc else: prof_elem.nucl = trg_nuc prof_elem.matches[qry_nuc] += 1 trg_pos += 1 #print "len", genome_len, "median coverage", cov_threshold #print "total bases: ", total, "discarded bases: ", discarded #print "filtered", float(discarded) / total #print "" return profile, aln_errors
def _compute_profile(alignment, platform, genome_len):
    """
    Builds per-position alignment statistics for the target.

    Alignments with err_rate above the platform's "max_aln_error" or
    with a query sequence shorter than "min_polish_aln_len" (both read
    from cfg.vals) are skipped. Returns a (profile, aln_errors) pair:
    one ProfileInfo per target position and the err_rate of every
    alignment that was used.
    """
    max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"]
    min_aln_len = cfg.vals["min_polish_aln_len"]

    profile = [ProfileInfo() for _ in range(genome_len)]
    aln_errors = []

    for aln in alignment:
        too_noisy = aln.err_rate > max_aln_err
        if too_noisy or len(aln.qry_seq) < min_aln_len:
            continue
        aln_errors.append(aln.err_rate)

        #the target is shifted against the already shifted query
        shifted_qry = shift_gaps(aln.trg_seq, aln.qry_seq)
        shifted_trg = shift_gaps(shifted_qry, aln.trg_seq)

        pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(shifted_trg, shifted_qry):
            if trg_nuc == "-":
                #an insertion is counted at the preceding target position
                pos -= 1
            if pos >= genome_len:
                #positions past the end continue from the beginning
                pos -= genome_len
            column = profile[pos]
            pos += 1

            if trg_nuc == "-":
                column.num_inserts += 1
                continue

            column.nucl = trg_nuc
            column.coverage += 1
            if qry_nuc == "-":
                column.num_deletions += 1
            elif trg_nuc != qry_nuc:
                column.num_missmatch += 1

    return profile, aln_errors
def _compute_profile(alignment, ref_sequence): """ Computes alignment profile """ if len(alignment) == 0: raise Exception("No alignmemnts!") genome_len = alignment[0].trg_len #max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"] min_aln_len = cfg.vals["min_polish_aln_len"] aln_errors = [] #filtered = 0 profile = [ProfileInfo() for _ in range(genome_len)] for i in range(genome_len): profile[i].nucl = ref_sequence[i] for aln in alignment: #if aln.err_rate > max_aln_err or len(aln.qry_seq) < min_aln_len: if len(aln.qry_seq) < min_aln_len: #filtered += 1 continue aln_errors.append(aln.err_rate) qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq) trg_seq = shift_gaps(qry_seq, aln.trg_seq) trg_pos = aln.trg_start for trg_nuc, qry_nuc in zip(trg_seq, qry_seq): if trg_nuc == "-": trg_pos -= 1 #if trg_pos >= genome_len: # trg_pos -= genome_len prof_elem = profile[trg_pos] if trg_nuc == "-": prof_elem.insertions[aln.qry_id] += qry_nuc #prof_elem.num_inserts += 1 else: #prof_elem.nucl = trg_nuc prof_elem.coverage += 1 if qry_nuc == "-": prof_elem.num_deletions += 1 elif trg_nuc != qry_nuc: prof_elem.num_missmatch += 1 trg_pos += 1 for i in range(genome_len): for ins_read, ins_str in profile[i].insertions.items(): profile[i].propagated_ins += 1 span = len(ins_str) for j in range(max(0, i - span), i): profile[j].propagated_ins += 1 for j in range(i + 1, min(i + span + 1, genome_len)): profile[j].propagated_ins += 1 #logger.debug("Filtered: {0} out of {1}".format(filtered, len(alignment))) return profile, aln_errors
def get_simple_repeats(repeat_graph, alignments_file, edge_seqs):
    """
    Collects repeats of multiplicity two together with their reads.

    A candidate is an unbranching path whose first edge is repetitive
    and not self-complement, and which has exactly two distinct,
    non-repetitive edges entering its first node and leaving its last
    node. For every candidate the reads aligned to the path are
    gathered, as well as the reads that enter it from each input edge
    and leave it through each output edge. Candidates without reads,
    or without reads for both inputs and both outputs, are dropped.

    Returns a dictionary mapping path id to RepeatInfo.
    """
    MULT = 2

    def _oriented_seq(seq_name):
        #first character of the name is the strand sign,
        #the rest is the key into edge_seqs
        sequence = edge_seqs[seq_name[1:]]
        if seq_name[0] == "-":
            sequence = fp.reverse_complement(sequence)
        return sequence

    #select repeat paths with MULT unique inputs and outputs
    candidates = []
    candidate_edge_ids = set()
    for path in repeat_graph.get_unbranching_paths():
        if not path[0].repetitive or path[0].self_complement:
            continue

        has_repetitive_neighbor = False
        inputs = set()
        for in_edge in path[0].node_left.in_edges:
            inputs.add(in_edge.edge_id)
            if in_edge.repetitive:
                has_repetitive_neighbor = True

        outputs = set()
        for out_edge in path[-1].node_right.out_edges:
            outputs.add(out_edge.edge_id)
            if out_edge.repetitive:
                has_repetitive_neighbor = True

        if (has_repetitive_neighbor or len(inputs) != MULT or
                len(outputs) != MULT):
            continue

        candidates.append((path, inputs, outputs))
        candidate_edge_ids.update(e.edge_id for e in path)

    #keep only the reads that touch at least one candidate edge
    relevant_alns = [
        read_aln for read_aln in iter_alignments(alignments_file)
        if any(e.edge_id in candidate_edge_ids for e in read_aln)
    ]

    path_ids = {}
    next_path_id = 1
    repeats_dict = {}
    for path, inputs, outputs in candidates:
        first_id = path[0].edge_id
        last_id = path[-1].edge_id
        if first_id not in path_ids:
            #the negated last edge is registered under the negated id
            path_ids[first_id] = next_path_id
            path_ids[-last_id] = -next_path_id
            next_path_id += 1
        path_id = path_ids[first_id]

        path_edge_ids = set(e.edge_id for e in path)
        inner_reads = []
        input_reads = defaultdict(list)
        output_reads = defaultdict(list)
        for read_aln in relevant_alns:
            if not any(e.edge_id in path_edge_ids for e in read_aln):
                continue

            inner_reads.append(read_aln[0].overlap.cur_id)
            for prev_edge, next_edge in zip(read_aln, read_aln[1:]):
                enters_repeat = (prev_edge.edge_id in inputs and
                                 next_edge.edge_id == first_id)
                if enters_repeat:
                    input_reads[prev_edge.edge_id].append(
                        prev_edge.overlap.cur_id)

                leaves_repeat = (prev_edge.edge_id == last_id and
                                 next_edge.edge_id in outputs)
                if leaves_repeat:
                    output_reads[next_edge.edge_id].append(
                        next_edge.overlap.cur_id)

        if (not inner_reads or len(input_reads) != MULT or
                len(output_reads) != MULT):
            continue

        #add edges sequences:
        sequences = {}
        for edge_id in chain(input_reads, output_reads):
            graph_edge = repeat_graph.edges[edge_id]
            sequences[edge_id] = _oriented_seq(
                graph_edge.edge_sequences[0].edge_seq_name)

        #the template is the concatenation of all path edges
        sequences["template"] = "".join(
            [_oriented_seq(e.edge_sequences[0].edge_seq_name) for e in path])

        repeats_dict[path_id] = RepeatInfo(path_id,
                                           [e.edge_id for e in path],
                                           inner_reads, input_reads,
                                           output_reads, sequences, MULT)

    return repeats_dict