def _insert_from_graph_experement(graph_file, scaffolds_in, max_path_len):
    """
    Experimental gap-filling: for each pair of neighboring contigs in a
    scaffold, looks for a unique path in the overlap graph and inserts the
    contigs lying on that path between the neighbors.

    NOTE(review): "experement" / "_get_unigue_path_experiment" /
    "Contig.from_sting" are misspelled project names — kept as-is because
    renaming would break callers elsewhere.
    """
    graph = _load_dot(graph_file)

    # names of all contigs that are already placed in some scaffold
    ordered_contigs = set()
    for scf in scaffolds_in:
        ordered_contigs |= {c.name for c in scf.contigs}

    new_scaffolds = []
    for scf in scaffolds_in:
        out_scf = Scaffold(scf.name)
        new_scaffolds.append(out_scf)

        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            out_scf.contigs.append(prev_cont)

            #find unique path
            path_nodes = _get_unigue_path_experiment(graph, prev_cont,
                                                     new_cont, max_path_len,
                                                     ordered_contigs)
            if not path_nodes:
                continue

            #insert contigs along the path
            for node in path_nodes:
                out_scf.contigs.append(Contig.from_sting(node))

        # close the scaffold with its last contig
        out_scf.contigs.append(new_cont)

    return new_scaffolds
def _insert_from_graph(graph, scaffolds_in, max_path_len, contigs_fasta):
    """
    Inserts contigs from the assembly graph into scaffolds.

    For each pair of neighboring contigs, asks _get_cut_vertices for signed
    node names ("+name" / "-name") lying between them; each such node is
    materialized as a Contig (length taken from contigs_fasta) and inserted
    into the output scaffold.  Returns the list of new Scaffold objects.
    """
    new_scaffolds = []
    ordered_contigs = set()
    for scf in scaffolds_in:
        # NOTE: here c.name is called — unlike the sibling functions where
        # .name is read as an attribute; presumably a different Contig type
        ordered_contigs |= set([c.name() for c in scf.contigs])
    # reversed copy of the graph, needed by the cut-vertex search
    reverse_graph = graph.reverse()
    for scf in scaffolds_in:
        new_scaffolds.append(Scaffold(scf.name))
        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            new_scaffolds[-1].contigs.append(prev_cont)

            #find contigs to insert
            path_nodes = _get_cut_vertices(graph, reverse_graph, prev_cont,
                                           new_cont, max_path_len,
                                           ordered_contigs)
            if not path_nodes:
                continue

            #insert contigs along the path
            # snapshot before mutating prev_cont.link below
            supp_genomes = prev_cont.link.supporting_genomes
            prev_cont.link.supporting_assembly = True
            prev_cont.link.gap = config.vals["min_scaffold_gap"]
            for node in path_nodes:
                # node is a signed name: leading "+"/"-" encodes strand
                sign = 1 if node[0] == "+" else -1
                name = node[1:]
                new_contig = Contig.with_sequence(name,
                                                  len(contigs_fasta[name]),
                                                  sign)
                new_contig.link.supporting_assembly = True
                new_contig.link.gap = config.vals["min_scaffold_gap"]
                new_contig.link.supporting_genomes = supp_genomes
                new_scaffolds[-1].contigs.append(new_contig)
        # close the scaffold with its last contig
        new_scaffolds[-1].contigs.append(scf.contigs[-1])
    return new_scaffolds
def _insert_from_graph(graph_file, scaffolds_in, max_path_len):
    """
    Puts contigs from the overlap graph into scaffold gaps: for each pair of
    neighboring contigs a unique connecting path is searched for, checked for
    consistency (it must not pass through already-placed contigs), and the
    contigs on it are inserted between the neighbors.

    NOTE(review): a sibling function in this file also defines
    _insert_from_graph with a different signature — whichever definition
    comes later in the module shadows the other; verify which one is intended.
    """
    graph = _load_dot(graph_file)
    logger.debug("Loaded overlap graph with {0} nodes".format(len(graph)))

    # names of all contigs that are already placed in some scaffold
    ordered_contigs = set()
    for scf in scaffolds_in:
        ordered_contigs |= {c.name for c in scf.contigs}

    new_scaffolds = []
    for scf in scaffolds_in:
        out_scf = Scaffold(scf.name)
        new_scaffolds.append(out_scf)

        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            out_scf.contigs.append(prev_cont)

            #find unique path
            path_nodes = _get_unique_path(graph, prev_cont, new_cont,
                                          max_path_len)
            if not path_nodes:
                continue

            #check path consistency: a path node must not name a contig
            #that is already placed somewhere
            consistent = True
            for node in path_nodes:
                if node[1:] in ordered_contigs:
                    logger.debug("Path inconsistency {0} -- {1}: {2}"
                                 .format(prev_cont, new_cont, node))
                    consistent = False
                    break
            if not consistent:
                continue

            #insert contigs along the path
            for node in path_nodes:
                out_scf.contigs.append(Contig.from_sting(node))

        # close the scaffold with its last contig
        out_scf.contigs.append(new_cont)

    return new_scaffolds
def _merge_scaffolds(big_scaffolds, small_scaffolds):
    """
    Performs the final merging step.

    Contigs that the 'small' scaffolds place between two consecutive
    non-repetitive contigs of a 'big' scaffold are inserted into the big
    scaffold, provided both agree on scaffold membership and relative
    orientation, and none of the inserted contigs is unique to the big set.
    Returns the list of merged Scaffold objects; failure modes are counted
    and reported via logger.debug.
    """
    count_diff_scaf = 0
    count_diff_orient = 0
    count_inconsistent = 0

    total_success = 0
    total_fail = 0
    total_inserted = 0
    not_found = 0

    # occurrence counts per permutation id; anything seen more than once
    # (in either set) is treated as a repeat and never used as an anchor
    big_count = defaultdict(int)
    for scf in big_scaffolds:
        for c in scf.contigs:
            big_count[c.perm] += 1

    small_count = defaultdict(int)
    for scf in small_scaffolds:
        for c in scf.contigs:
            small_count[c.perm] += 1

    repeats = set(seq for (seq, count) in
                  chain(list(big_count.items()), list(small_count.items()))
                  if count > 1)
    big_unique = set(seq for (seq, count) in big_count.items()
                     if count == 1)

    # non-repeat contig -> (small scaffold, position inside it)
    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            if contig.perm not in repeats:
                assert contig.perm not in small_index
                small_index[contig.perm] = (scf, pos)

    new_scaffolds = []
    for big_scf in big_scaffolds:
        new_contigs = []
        non_repeats = [i for i in range(len(big_scf.contigs))
                       if big_scf.contigs[i].perm not in repeats]

        for left_idx, right_idx in zip(non_repeats[:-1], non_repeats[1:]):
            left_cnt = big_scf.contigs[left_idx]
            right_cnt = big_scf.contigs[right_idx]

            consistent = False
            weak_contigs = None
            link_to_change = None
            if (left_cnt.perm in small_index and
                    right_cnt.perm in small_index):
                consistent = True
                left_scf, left_pos = small_index[left_cnt.perm]
                right_scf, right_pos = small_index[right_cnt.perm]

                big_sign = left_cnt.sign == right_cnt.sign
                small_sign = (left_scf.contigs[left_pos].sign ==
                              right_scf.contigs[right_pos].sign)

                if left_scf != right_scf:
                    count_diff_scaf += 1
                    consistent = False
                elif big_sign != small_sign:
                    count_diff_orient += 1
                    consistent = False
                else:
                    same_dir = left_pos < right_pos
                    if not same_dir:
                        left_pos, right_pos = right_pos, left_pos

                    weak_contigs = left_scf.contigs[left_pos + 1:right_pos]
                    # a contig unique to the big set must not be re-inserted
                    if any(c.perm in big_unique for c in weak_contigs):
                        count_inconsistent += 1
                        consistent = False

                    link_to_change = copy(left_scf.contigs[left_pos].link)

                    #reverse complement
                    if weak_contigs and not same_dir:
                        link_to_change = copy(
                            left_scf.contigs[right_pos - 1].link)
                        weak_contigs = [c.reverse_copy()
                                        for c in weak_contigs[::-1]]
                        # shift links one position to match the new order
                        for pw, nw in zip(weak_contigs[:-1],
                                          weak_contigs[1:]):
                            pw.link = copy(nw.link)
                        weak_contigs[-1].link = copy(
                            left_scf.contigs[left_pos].link)
            else:
                not_found += 1

            new_contigs.append(left_cnt)
            if consistent and weak_contigs:
                new_contigs[-1].link = link_to_change
                new_contigs.extend(weak_contigs)
                total_success += 1
                total_inserted += len(weak_contigs)
            else:
                # keep whatever the big scaffold had between the anchors
                new_contigs.extend(big_scf.contigs[left_idx + 1:right_idx])
                total_fail += 1

        if len(new_contigs) > 1:
            new_contigs.append(right_cnt)
            s = Scaffold(big_scf.name)
            s.contigs = new_contigs
            new_scaffolds.append(s)
        else:
            #because of repeats
            new_scaffolds.append(big_scf)

    logger.debug("Fail: not found: %d", not_found)
    logger.debug("Fail: different scaffolds: %d", count_diff_scaf)
    # fixed typo in the log message ("orientatilns")
    logger.debug("Fail: different orientations: %d", count_diff_orient)
    logger.debug("Fail: inconsistent: %d", count_inconsistent)
    logger.debug("Total success: %d", total_success)
    logger.debug("Total fail: %d", total_fail)
    logger.debug("Total inserted: %d", total_inserted)

    num_contigs = 0
    for scf in new_scaffolds:
        num_contigs += len(scf.contigs)
    logger.debug("Result: %d contigs in %d scaffolds",
                 num_contigs, len(new_scaffolds))

    return new_scaffolds
def merge(big_scaffolds, small_scaffolds):
    """
    Merges scaffolds from two iterations: contigs that the small (previous)
    scaffolds place between two neighbors of a big scaffold are inserted,
    provided both iterations agree on scaffold membership, order and
    relative orientation.
    """
    logger.info("Merging two iterations")

    # every contig name present in the big scaffolds
    big_index = set()
    for scf in big_scaffolds:
        for cnt in scf.contigs:
            big_index.add(cnt.name)

    # contig name -> (small scaffold, position inside it); names must be
    # unique across the small scaffolds
    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            assert contig.name not in small_index
            small_index[contig.name] = (scf, pos)

    count = 0
    new_scaffolds = []
    for scf in big_scaffolds:
        result = []
        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            result.append(prev_cont)

            # both neighbors must be known to the small iteration
            try:
                scf_prev, begin = small_index[prev_cont.name]
                scf_new, end = small_index[new_cont.name]
            except KeyError:
                continue
            # ... and lie on the same small scaffold
            if scf_prev.name != scf_new.name:
                continue

            assert end != begin
            same_dir = begin < end
            if not same_dir:
                begin, end = end, begin

            # skip if any in-between contig is already placed by the
            # big iteration, or if there is nothing to insert
            consistent = True
            for c in scf_prev.contigs[begin + 1:end]:
                if c.name in big_index:
                    consistent = False
                    break
            if not consistent or end - begin == 1:
                continue

            # relative orientations must agree between the two iterations
            big_same_sign = prev_cont.sign == new_cont.sign
            small_same_sign = (scf_prev.contigs[begin].sign ==
                               scf_prev.contigs[end].sign)
            if big_same_sign != small_same_sign:
                continue

            count += end - begin - 1
            fill = scf_prev.contigs[begin + 1:end]
            if not same_dir:
                # reverse order and flip strands
                fill = [Contig(c.name, -c.sign, 0) for c in fill[::-1]]
            result.extend(fill)

        # close the scaffold with its last contig
        result.append(new_cont)
        out_scf = Scaffold(scf.name)
        out_scf.contigs = result
        new_scaffolds.append(out_scf)

    return new_scaffolds