def extend_scaffold(contig):
    """
    Seeds a new scaffold with the given contig and grows it in both
    directions by following entries of ``connections``.

    Uses names from the enclosing scope: ``visited``, ``counter``,
    ``scaffolds``, ``connections`` and ``contig_index``.
    The scaffold is registered in ``scaffolds`` right away, so it is kept
    even if it never grows beyond the seed contig.
    """
    visited.add(contig)
    scf_name = "ragout-scaffold-{0}".format(counter[0])
    counter[0] += 1
    scf = Scaffold.with_contigs(scf_name, contig.blocks[0],
                                contig.blocks[-1], [contig])
    scaffolds.append(scf)

    # Grow past the right end while it has a recorded connection
    while scf.right in connections:
        adjacent = connections[scf.right].end
        candidates = contig_index[abs(adjacent)]
        # exactly one contig is expected to be indexed under this block
        assert len(candidates) == 1

        neighbor = candidates[0]
        if neighbor in visited:
            break

        if neighbor.blocks[0] == adjacent:
            # neighbor joins in its stored orientation
            scf.contigs.append(neighbor)
            scf.right = neighbor.blocks[-1]
        elif -neighbor.blocks[-1] == adjacent:
            # neighbor joins flipped; the sign is set on the shared object
            scf.contigs.append(neighbor)
            scf.contigs[-1].sign = -1
            scf.right = -neighbor.blocks[0]
        else:
            # the adjacent block is not at either end of the neighbor
            break
        visited.add(neighbor)

    # Grow past the left end; same walk with negated block signs
    while -scf.left in connections:
        adjacent = -connections[-scf.left].end
        candidates = contig_index[abs(adjacent)]
        assert len(candidates) == 1

        neighbor = candidates[0]
        if neighbor in visited:
            break

        if neighbor.blocks[-1] == adjacent:
            scf.contigs.insert(0, neighbor)
            scf.left = neighbor.blocks[0]
        elif -neighbor.blocks[0] == adjacent:
            scf.contigs.insert(0, neighbor)
            scf.contigs[0].sign = -1
            scf.left = -neighbor.blocks[-1]
        else:
            break
        visited.add(neighbor)
def _update_scaffolds(scaffolds, perm_container, ancestral=False): """ Updates scaffolds wrt to given permutations """ perm_index = defaultdict(list) if not ancestral: for perm in perm_container.target_perms: perm_index[(perm.chr_name, perm.repeat_id)].append(perm) else: for perm in perm_container.ancestor_perms: perm_index[(perm.chr_name, perm.repeat_id)].append(perm) new_scaffolds = [] for scf in scaffolds: new_contigs = [] for contig in scf.contigs: inner_perms = [] for new_perm in perm_index[(contig.perm.chr_name, contig.perm.repeat_id)]: if (contig.perm.seq_start <= new_perm.seq_start < contig.perm.seq_end): inner_perms.append(new_perm) assert (contig.perm.seq_start < new_perm.seq_end <= contig.perm.seq_end) if not inner_perms: logger.debug("Lost: {0}".format(contig.perm)) continue inner_perms.sort(key=lambda p: p.seq_start, reverse=contig.sign < 0) gap_length = contig.link.gap for new_perm in inner_perms: gap_length -= new_perm.length() new_link = Link(gap_length, contig.link.supporting_genomes) new_contigs.append(Contig.with_perm(new_perm, contig.sign, new_link)) new_contigs[-1].link = contig.link new_scaffolds.append(Scaffold.with_contigs(scf.name, None, None, new_contigs)) return new_scaffolds
def _update_scaffolds(scaffolds, perm_container): """ Updates scaffolds wrt to given permutations """ perm_index = defaultdict(list) for perm in perm_container.target_perms: perm_index[(perm.chr_name, perm.repeat_id)].append(perm) new_scaffolds = [] for scf in scaffolds: new_contigs = [] for contig in scf.contigs: inner_perms = [] for new_perm in perm_index[(contig.perm.chr_name, contig.perm.repeat_id)]: if (contig.perm.seq_start <= new_perm.seq_start < contig.perm.seq_end): inner_perms.append(new_perm) assert (contig.perm.seq_start < new_perm.seq_end <= contig.perm.seq_end) if not inner_perms: logger.debug("Lost: {0}".format(contig.perm)) continue inner_perms.sort(key=lambda p: p.seq_start, reverse=contig.sign < 0) gap_length = contig.link.gap for new_perm in inner_perms: gap_length -= new_perm.length() new_link = Link(gap_length, contig.link.supporting_genomes) new_contigs.append( Contig.with_perm(new_perm, contig.sign, new_link)) new_contigs[-1].link = contig.link if len(new_contigs): new_scaffolds.append( Scaffold.with_contigs(scf.name, None, None, new_contigs)) return new_scaffolds
def _merge_consecutive_contigs(scaffolds):
    """
    Merges consecutive contig fragments originating from a same contig

    Neighbouring contigs with the same sign and ``chr_name`` whose
    sequence ranges touch are fused into one contig. The fused contig
    works on a deep copy of the first fragment's permutation and carries
    the link of the last fragment merged into it.
    Returns a new list of scaffolds.
    """
    merged_scaffolds = []
    total_contigs = 0
    for scaffold in scaffolds:
        merged = []
        run_sign, run_perm, run_link = None, None, None

        for fragment in scaffold.contigs:
            extended = False
            if (run_sign == fragment.sign
                    and fragment.perm.chr_name == run_perm.chr_name):
                if (run_sign > 0
                        and run_perm.seq_end == fragment.perm.seq_start):
                    # forward strand: fragment continues the run rightwards
                    run_perm.seq_end = fragment.perm.seq_end
                    run_perm.blocks.extend(fragment.perm.blocks)
                    extended = True
                elif (run_sign < 0
                        and run_perm.seq_start == fragment.perm.seq_end):
                    # reverse strand: fragment precedes the run in sequence
                    run_perm.seq_start = fragment.perm.seq_start
                    run_perm.blocks = fragment.perm.blocks + run_perm.blocks
                    extended = True

            if not extended:
                # close the current run (if any) and open a new one
                if run_perm:
                    merged.append(
                        Contig.with_perm(run_perm, run_sign, run_link))
                run_perm = deepcopy(fragment.perm)

            # the run always carries the sign/link of its latest fragment
            run_sign = fragment.sign
            run_link = fragment.link

        if run_perm:
            merged.append(Contig.with_perm(run_perm, run_sign, run_link))

        total_contigs += len(merged)
        merged_scaffolds.append(
            Scaffold.with_contigs(scaffold.name, None, None, merged))

    logger.debug("Merging consequtive contigs: {0} left".format(total_contigs))
    return merged_scaffolds
def _insert_from_graph(graph_file, scaffolds_in, max_path_len):
    """
    Inserts contigs found on unique overlap-graph paths between
    neighbouring scaffold contigs.

    graph_file   -- passed to ``_load_dot`` to load the overlap graph
    scaffolds_in -- scaffolds to refine (objects with ``name``
                    and ``contigs``)
    max_path_len -- forwarded to ``_get_unique_path``

    Returns a new list with one scaffold per input scaffold. A path is
    used only if none of its nodes (with the first character stripped)
    names a contig that is already placed in some input scaffold.
    Scaffolds with a single contig are copied through unchanged and
    empty scaffolds stay empty.
    """
    graph = _load_dot(graph_file)
    logger.debug("Loaded overlap graph with {0} nodes".format(len(graph)))

    # names of all contigs that already have a place in some scaffold
    ordered_contigs = set()
    for scf in scaffolds_in:
        ordered_contigs.update(cnt.name for cnt in scf.contigs)

    new_scaffolds = []
    for scf in scaffolds_in:
        new_scf = Scaffold(scf.name)
        new_scaffolds.append(new_scf)

        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            new_scf.contigs.append(prev_cont)

            #find unique path
            path_nodes = _get_unique_path(graph, prev_cont,
                                          new_cont, max_path_len)
            if not path_nodes:
                continue

            #check path consistency
            # presumably node[0] is an orientation sign -- verify against
            # _get_unique_path
            for node in path_nodes:
                if node[1:] in ordered_contigs:
                    logger.debug("Path inconsistency {0} -- {1}: {2}"
                                 .format(prev_cont, new_cont, node))
                    break
            else:
                #insert contigs along the path
                # "from_sting" is the name used by the project API here
                new_scf.contigs.extend(Contig.from_sting(node)
                                       for node in path_nodes)

        # The pair loop only emits the left contig of each pair, so the
        # final contig is added here. It is taken from the scaffold itself:
        # the loop variable is unbound or stale when the scaffold has fewer
        # than two contigs.
        if scf.contigs:
            new_scf.contigs.append(scf.contigs[-1])
    return new_scaffolds
def _merge_consecutive_contigs(scaffolds):
    """
    Merges consecutive contig fragments originating from a same contig

    Walks every scaffold and joins adjacent contigs that have equal sign,
    equal ``chr_name`` and touching sequence coordinates. Permutations are
    deep-copied before being extended, and each joined contig takes the
    link of the last fragment that went into it.
    """
    result = []
    contigs_left = 0
    for scf in scaffolds:
        collapsed = []
        prev_sign = prev_perm = prev_link = None

        for cnt in scf.contigs:
            perm = cnt.perm
            joined = False
            if prev_sign == cnt.sign and perm.chr_name == prev_perm.chr_name:
                # the two cases are mutually exclusive (sign > 0 vs sign < 0)
                joins_right = (prev_sign > 0
                               and prev_perm.seq_end == perm.seq_start)
                joins_left = (prev_sign < 0
                              and prev_perm.seq_start == perm.seq_end)
                if joins_right:
                    prev_perm.seq_end = perm.seq_end
                    prev_perm.blocks.extend(perm.blocks)
                    joined = True
                if joins_left:
                    prev_perm.seq_start = perm.seq_start
                    prev_perm.blocks = perm.blocks + prev_perm.blocks
                    joined = True

            if not joined:
                # emit whatever was accumulated and start over from a copy
                if prev_perm:
                    collapsed.append(
                        Contig.with_perm(prev_perm, prev_sign, prev_link))
                prev_perm = deepcopy(perm)

            prev_sign = cnt.sign
            prev_link = cnt.link

        # emit the trailing accumulated contig
        if prev_perm:
            collapsed.append(
                Contig.with_perm(prev_perm, prev_sign, prev_link))

        contigs_left += len(collapsed)
        result.append(Scaffold.with_contigs(scf.name, None, None, collapsed))

    logger.debug("Merging consequtive contigs: {0} left".format(contigs_left))
    return result
def extend_scaffold(contig):
    """
    Seeds a scaffold with the given contig and grows it to the right and
    then to the left by following ``adjacencies``.

    Uses names from the enclosing scope: ``visited``, ``counter``,
    ``scaffolds``, ``adjacencies``, ``contig_index`` and
    ``correct_distances``.

    A lone seed is stored only if its right end has an "infinity"
    adjacency pointing at its own left end. Otherwise the scaffold is
    stored only if it grew to more than one contig.
    """
    visited.add(contig)
    scf_name = "ragout-scaffold-{0}".format(counter[0])
    counter[0] += 1
    scf = Scaffold.with_contigs(scf_name, contig.left_end(),
                                contig.right_end(), [contig])

    # seed whose right end is joined to its own left end by an infinity
    # adjacency: keep it as-is, there is nothing to extend
    if scf.right in adjacencies:
        closing = adjacencies[scf.right]
        if closing.block == scf.left and closing.infinity:
            scaffolds.append(scf)
            return

    # Rightward walk
    while scf.right in adjacencies and not adjacencies[scf.right].infinity:
        adj = adjacencies[scf.right]
        block = adj.block
        distance = adj.distance
        genomes = adj.supporting_genomes

        neighbor = contig_index[abs(block)]
        if neighbor in visited:
            break
        # the adjacent block must be one of the neighbor's ends
        if block not in [neighbor.left_end(), neighbor.right_end()]:
            break

        if neighbor.left_end() == block:
            scf.contigs.append(neighbor)
        else:
            scf.contigs.append(neighbor.reverse_copy())

        flank = scf.contigs[-2].right_gap() + scf.contigs[-1].left_gap()
        if correct_distances:
            gap = distance - flank
        else:
            gap = distance
        scf.contigs[-2].link = Link(gap, genomes)

        scf.right = scf.contigs[-1].right_end()
        # the original contig is marked, even if a reversed copy was added
        visited.add(neighbor)

    # Leftward walk
    while scf.left in adjacencies and not adjacencies[scf.left].infinity:
        adj = adjacencies[scf.left]
        block = adj.block
        distance = adj.distance
        genomes = adj.supporting_genomes

        neighbor = contig_index[abs(block)]
        if neighbor in visited:
            break
        if block not in [neighbor.right_end(), neighbor.left_end()]:
            break

        if neighbor.right_end() == block:
            scf.contigs.insert(0, neighbor)
        else:
            scf.contigs.insert(0, neighbor.reverse_copy())

        flank = scf.contigs[0].right_gap() + scf.contigs[1].left_gap()
        if correct_distances:
            gap = distance - flank
        else:
            gap = distance
        scf.contigs[0].link = Link(gap, genomes)

        scf.left = scf.contigs[0].left_end()
        visited.add(neighbor)

    if len(scf.contigs) > 1:
        scaffolds.append(scf)
def _merge_scaffolds(big_scaffolds, small_scaffolds):
    """
    Performs the final merging step

    For every pair of neighbouring non-repeat contigs of a big scaffold,
    looks the pair up in the small scaffolds. If both sit in the same
    small scaffold with matching relative orientation, the contigs lying
    between them there are inserted into the big scaffold, provided none
    of them is itself a unique contig of the big scaffolds.
    Returns a new list of scaffolds.
    """
    diff_scaffold = 0
    diff_orientation = 0
    inconsistent = 0

    successes = 0
    failures = 0
    inserted = 0
    missing = 0

    # how many times each permutation occurs in each scaffold set
    big_count = defaultdict(int)
    for scf in big_scaffolds:
        for cnt in scf.contigs:
            big_count[cnt.perm] += 1

    small_count = defaultdict(int)
    for scf in small_scaffolds:
        for cnt in scf.contigs:
            small_count[cnt.perm] += 1

    # a permutation is a repeat if it occurs twice within either set
    repeats = set()
    for occurrences in (big_count, small_count):
        repeats.update(perm for perm, num in occurrences.items() if num > 1)
    big_unique = set(perm for perm, num in big_count.items() if num == 1)

    # position of every non-repeat contig inside the small scaffolds
    small_index = {}
    for scf in small_scaffolds:
        for pos, cnt in enumerate(scf.contigs):
            if cnt.perm in repeats:
                continue
            assert cnt.perm not in small_index
            small_index[cnt.perm] = (scf, pos)

    merged = []
    for big_scf in big_scaffolds:
        big_contigs = big_scf.contigs
        new_contigs = []
        non_repeats = [idx for idx, cnt in enumerate(big_contigs)
                       if cnt.perm not in repeats]

        for left_idx, right_idx in zip(non_repeats[:-1], non_repeats[1:]):
            left_cnt = big_contigs[left_idx]
            right_cnt = big_contigs[right_idx]

            consistent = False
            weak_contigs = None
            link_to_change = None

            if (left_cnt.perm not in small_index
                    or right_cnt.perm not in small_index):
                missing += 1
            else:
                left_scf, left_pos = small_index[left_cnt.perm]
                right_scf, right_pos = small_index[right_cnt.perm]

                big_sign = left_cnt.sign == right_cnt.sign
                small_sign = (left_scf.contigs[left_pos].sign ==
                              right_scf.contigs[right_pos].sign)

                if left_scf != right_scf:
                    diff_scaffold += 1
                elif big_sign != small_sign:
                    diff_orientation += 1
                else:
                    consistent = True
                    same_dir = left_pos < right_pos
                    if not same_dir:
                        left_pos, right_pos = right_pos, left_pos

                    small_contigs = left_scf.contigs
                    weak_contigs = small_contigs[left_pos + 1:right_pos]
                    if any(c.perm in big_unique for c in weak_contigs):
                        inconsistent += 1
                        consistent = False

                    link_to_change = copy(small_contigs[left_pos].link)
                    #reverse complement
                    if weak_contigs and not same_dir:
                        # links are re-threaded so that each reversed
                        # contig carries the link of its new successor
                        link_to_change = copy(
                            small_contigs[right_pos - 1].link)
                        weak_contigs = [c.reverse_copy()
                                        for c in weak_contigs[::-1]]
                        for pw, nw in zip(weak_contigs[:-1],
                                          weak_contigs[1:]):
                            pw.link = copy(nw.link)
                        weak_contigs[-1].link = copy(
                            small_contigs[left_pos].link)

            new_contigs.append(left_cnt)
            if consistent and weak_contigs:
                # note: the link is changed on the big scaffold's own contig
                left_cnt.link = link_to_change
                new_contigs.extend(weak_contigs)
                successes += 1
                inserted += len(weak_contigs)
            else:
                new_contigs.extend(big_contigs[left_idx + 1:right_idx])
                failures += 1

        if len(new_contigs) > 1:
            # the pair loop ran, so right_cnt is the last non-repeat contig
            new_contigs.append(right_cnt)
            rebuilt = Scaffold(big_scf.name)
            rebuilt.contigs = new_contigs
            merged.append(rebuilt)
        else:   #because of repeats
            merged.append(big_scf)

    logger.debug("Fail: not found: %d", missing)
    logger.debug("Fail: different scaffolds: %d", diff_scaffold)
    logger.debug("Fail: different orientatilns: %d", diff_orientation)
    logger.debug("Fail: inconsistent: %d", inconsistent)
    logger.debug("Total success: %d", successes)
    logger.debug("Total fail: %d", failures)
    logger.debug("Total inserted: %d", inserted)

    num_contigs = sum(len(scf.contigs) for scf in merged)
    logger.debug("Result: %d contigs in %d scaffolds",
                 num_contigs, len(merged))
    return merged
def extend_scaffold(contig):
    """
    Seeds a scaffold with the given contig and grows it to the right and
    then to the left by following ``adjacencies``.

    Uses names from the enclosing scope: ``visited``, ``counter``,
    ``scaffolds``, ``adjacencies``, ``contig_index`` and
    ``correct_distances``.

    A lone seed is stored only if its right end has an "infinity"
    adjacency pointing at its own left end. Otherwise the scaffold is
    stored only if it grew to more than one contig.

    Errors raised while computing a gap are propagated to the caller:
    continuing after a failure would build a link from an unbound or
    stale ``gap`` value.
    """
    visited.add(contig)
    scf_name = "ragout-scaffold-{0}".format(counter[0])
    counter[0] += 1
    scf = Scaffold.with_contigs(scf_name, contig.left_end(),
                                contig.right_end(), [contig])

    already_complete = (scf.right in adjacencies and
                        adjacencies[scf.right].block == scf.left and
                        adjacencies[scf.right].infinity)
    if already_complete:
        scaffolds.append(scf)
        return

    #go right
    while scf.right in adjacencies and not adjacencies[scf.right].infinity:
        adj_block = adjacencies[scf.right].block
        adj_distance = adjacencies[scf.right].distance
        adj_supporting_genomes = adjacencies[scf.right].supporting_genomes

        # NOTE: per the original author, a KeyError may be raised here
        # when ref.indels = False
        contig = contig_index[abs(adj_block)]
        if contig in visited:
            break

        # the adjacent block must be one of the contig's ends
        if adj_block not in [contig.left_end(), contig.right_end()]:
            break

        if contig.left_end() == adj_block:
            scf.contigs.append(contig)
        else:
            scf.contigs.append(contig.reverse_copy())

        flank = scf.contigs[-2].right_gap() + scf.contigs[-1].left_gap()
        gap = adj_distance - flank if correct_distances else adj_distance
        scf.contigs[-2].link = Link(gap, adj_supporting_genomes)

        scf.right = scf.contigs[-1].right_end()
        # the original contig is marked, even if a reversed copy was added
        visited.add(contig)

    #go left
    while scf.left in adjacencies and not adjacencies[scf.left].infinity:
        adj_block = adjacencies[scf.left].block
        adj_distance = adjacencies[scf.left].distance
        adj_supporting_genomes = adjacencies[scf.left].supporting_genomes

        contig = contig_index[abs(adj_block)]
        if contig in visited:
            break

        if adj_block not in [contig.right_end(), contig.left_end()]:
            break

        if contig.right_end() == adj_block:
            scf.contigs.insert(0, contig)
        else:
            scf.contigs.insert(0, contig.reverse_copy())

        flank = scf.contigs[0].right_gap() + scf.contigs[1].left_gap()
        gap = adj_distance - flank if correct_distances else adj_distance
        scf.contigs[0].link = Link(gap, adj_supporting_genomes)

        scf.left = scf.contigs[0].left_end()
        visited.add(contig)

    if len(scf.contigs) > 1:
        scaffolds.append(scf)
def merge(big_scaffolds, small_scaffolds):
    """
    The only function here

    Refines ``big_scaffolds`` with contigs from ``small_scaffolds``.
    For each pair of neighbouring contigs of a big scaffold that both
    occur (by name) in the same small scaffold, the contigs lying between
    them in the small scaffold are inserted, provided that

    * there is at least one contig in between,
    * none of them already occurs in a big scaffold, and
    * the relative orientation of the pair agrees in both scaffolds.

    Returns a new list with one scaffold per big scaffold. Scaffolds with
    fewer than two contigs are copied through unchanged.
    """
    logger.info("Merging two iterations")
    big_index = set()
    for scf in big_scaffolds:
        big_index.update(c.name for c in scf.contigs)

    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            # contig names are expected to be unique across small scaffolds
            assert contig.name not in small_index
            small_index[contig.name] = (scf, pos)

    new_scafflods = []
    for scf in big_scaffolds:
        result = []
        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            result.append(prev_cont)

            try:
                scf_prev, begin = small_index[prev_cont.name]
                scf_new, end = small_index[new_cont.name]
            except KeyError:
                # one of the two contigs is absent from the small scaffolds
                continue
            if scf_prev.name != scf_new.name:
                continue

            assert end != begin
            same_dir = begin < end
            if not same_dir:
                end, begin = begin, end

            contigs = scf_prev.contigs[begin + 1:end]
            # nothing to insert, or the candidates are already placed
            if not contigs or any(c.name in big_index for c in contigs):
                continue

            if ((prev_cont.sign == new_cont.sign) !=
                    (scf_prev.contigs[begin].sign ==
                     scf_prev.contigs[end].sign)):
                continue

            if not same_dir:
                # NOTE(review): relies on c.reverse() returning the
                # reversed contig -- confirm against Contig
                contigs = [c.reverse() for c in contigs[::-1]]

            #keeping gap from new contigs
            result[-1].link = scf_prev.contigs[begin].link
            result.extend(contigs)

        # The pair loop only emits the left contig of each pair, so the
        # final contig is added here. It is taken from the scaffold itself:
        # the loop variable is unbound or stale when the scaffold has fewer
        # than two contigs.
        if scf.contigs:
            result.append(scf.contigs[-1])

        s = Scaffold(scf.name)
        s.contigs = result
        new_scafflods.append(s)

    return new_scafflods
def _merge_scaffolds(big_scaffolds, small_scaffolds):
    """
    Performs the final merging step

    For every pair of neighbouring non-repeat contigs of a big scaffold,
    looks the pair up in the small scaffolds. If both sit in the same
    small scaffold with matching relative orientation, the contigs lying
    between them there are inserted into the big scaffold, provided none
    of them is itself a unique contig of the big scaffolds.

    A pair with nothing to insert is counted as a failure, and the big
    scaffold's own contigs between the pair (and its link) are kept.

    Returns a new list of scaffolds.
    """
    count_diff_scaf = 0
    count_diff_orient = 0
    count_inconsistent = 0

    total_success = 0
    total_fail = 0
    total_inserted = 0
    not_found = 0

    # how many times each permutation occurs in each scaffold set
    big_count = defaultdict(int)
    for scf in big_scaffolds:
        for c in scf.contigs:
            big_count[c.perm] += 1

    small_count = defaultdict(int)
    for scf in small_scaffolds:
        for c in scf.contigs:
            small_count[c.perm] += 1

    # a permutation is a repeat if it occurs twice within either set
    repeats = set(seq for (seq, count) in
                  chain(big_count.items(), small_count.items()) if count > 1)
    big_unique = set(seq for (seq, count) in big_count.items() if count == 1)

    # position of every non-repeat contig inside the small scaffolds
    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            if contig.perm not in repeats:
                assert contig.perm not in small_index
                small_index[contig.perm] = (scf, pos)

    new_scafflods = []
    for big_scf in big_scaffolds:
        new_contigs = []
        # enumerate() avoids the Python-2-only xrange
        non_repeats = [idx for idx, cnt in enumerate(big_scf.contigs)
                       if cnt.perm not in repeats]

        for left_idx, right_idx in zip(non_repeats[:-1], non_repeats[1:]):
            left_cnt = big_scf.contigs[left_idx]
            right_cnt = big_scf.contigs[right_idx]

            consistent = False
            # reset per pair so nothing leaks from a previous iteration
            weak_contigs = None
            link_to_change = None
            if (left_cnt.perm in small_index and
                    right_cnt.perm in small_index):
                consistent = True
                left_scf, left_pos = small_index[left_cnt.perm]
                right_scf, right_pos = small_index[right_cnt.perm]

                big_sign = left_cnt.sign == right_cnt.sign
                small_sign = (left_scf.contigs[left_pos].sign ==
                              right_scf.contigs[right_pos].sign)

                if left_scf != right_scf:
                    count_diff_scaf += 1
                    consistent = False
                elif big_sign != small_sign:
                    count_diff_orient += 1
                    consistent = False
                else:
                    same_dir = left_pos < right_pos
                    if not same_dir:
                        left_pos, right_pos = right_pos, left_pos

                    weak_contigs = left_scf.contigs[left_pos + 1:right_pos]
                    if any(c.perm in big_unique for c in weak_contigs):
                        count_inconsistent += 1
                        consistent = False

                    if not same_dir:
                        weak_contigs = [c.reverse_copy()
                                        for c in weak_contigs[::-1]]
                    # NOTE(review): the link object is shared with the small
                    # scaffold, and links of reversed contigs are not
                    # re-threaded -- confirm whether that is intended
                    link_to_change = left_scf.contigs[left_pos].link
            else:
                not_found += 1

            new_contigs.append(left_cnt)
            # Insert only when there is something to insert. Otherwise the
            # big scaffold's contigs between the pair would be dropped.
            if consistent and weak_contigs:
                new_contigs[-1].link = link_to_change
                new_contigs.extend(weak_contigs)
                total_success += 1
                total_inserted += len(weak_contigs)
            else:
                new_contigs.extend(big_scf.contigs[left_idx + 1:right_idx])
                total_fail += 1

        if len(new_contigs) > 1:
            # the pair loop ran, so right_cnt is the last non-repeat contig
            new_contigs.append(right_cnt)
            s = Scaffold(big_scf.name)
            s.contigs = new_contigs
            new_scafflods.append(s)
        else:   #because of repeats
            new_scafflods.append(big_scf)

    logger.debug("Fail: not found: {0}".format(not_found))
    logger.debug("Fail: different scaffolds: {0}".format(count_diff_scaf))
    logger.debug("Fail: different orientatilns: {0}".format(count_diff_orient))
    logger.debug("Fail: inconsistent: {0}".format(count_inconsistent))
    logger.debug("Total success: {0}".format(total_success))
    logger.debug("Total fail: {0}".format(total_fail))
    logger.debug("Total inserted: {0}".format(total_inserted))

    num_contigs = sum(len(scf.contigs) for scf in new_scafflods)
    logger.debug("Result: {0} contigs in {1} scaffolds"
                 .format(num_contigs, len(new_scafflods)))
    return new_scafflods
def merge(big_scaffolds, small_scaffolds):
    """
    Refines ``big_scaffolds`` with contigs from ``small_scaffolds``.

    For each pair of neighbouring contigs of a big scaffold that both
    occur (by name) in the same small scaffold, the contigs lying between
    them in the small scaffold are inserted, provided that

    * there is at least one contig in between,
    * none of them already occurs in a big scaffold, and
    * the relative orientation of the pair agrees in both scaffolds.

    If the pair appears in the opposite order in the small scaffold, the
    inserted contigs are reversed in order and rebuilt with negated sign.

    Returns a new list with one scaffold per big scaffold. Scaffolds with
    fewer than two contigs are copied through unchanged.
    """
    logger.info("Merging two iterations")
    big_index = set()
    for scf in big_scaffolds:
        big_index.update(c.name for c in scf.contigs)

    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            # contig names are expected to be unique across small scaffolds
            assert contig.name not in small_index
            small_index[contig.name] = (scf, pos)

    new_scafflods = []
    for scf in big_scaffolds:
        result = []
        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            result.append(prev_cont)

            try:
                scf_prev, begin = small_index[prev_cont.name]
                scf_new, end = small_index[new_cont.name]
            except KeyError:
                # one of the two contigs is absent from the small scaffolds
                continue
            if scf_prev.name != scf_new.name:
                continue

            assert end != begin
            same_dir = begin < end
            if not same_dir:
                end, begin = begin, end

            contigs = scf_prev.contigs[begin + 1:end]
            # nothing to insert, or the candidates are already placed
            if not contigs or any(c.name in big_index for c in contigs):
                continue

            if ((prev_cont.sign == new_cont.sign) !=
                    (scf_prev.contigs[begin].sign ==
                     scf_prev.contigs[end].sign)):
                continue

            if not same_dir:
                # third Contig argument is passed as 0, as in the original
                contigs = [Contig(c.name, -c.sign, 0)
                           for c in contigs[::-1]]
            result.extend(contigs)

        # The pair loop only emits the left contig of each pair, so the
        # final contig is added here. It is taken from the scaffold itself:
        # the loop variable is unbound or stale when the scaffold has fewer
        # than two contigs.
        if scf.contigs:
            result.append(scf.contigs[-1])

        s = Scaffold(scf.name)
        s.contigs = result
        new_scafflods.append(s)

    return new_scafflods