def _reestimate_distances(graph, scaffolds, contigs_fasta):
    """
    Re-estimates the gaps between neighbouring contigs using the overlap graph.

    For every pair of adjacent contigs in a scaffold:
      * if the graph has a direct edge between their signed names, the gap
        becomes the negated integer value of that edge's "label";
      * otherwise a path is requested from _shortest_path. If one is returned,
        the gap becomes the summed length of the intermediate contigs minus
        the integer "label" of every edge along the path. If none is
        returned, the gap is left as it was.

    The gaps are written in place to the `link.gap` of the left contig.
    """
    # Signed names (both orientations) of every scaffolded contig; they are
    # handed to _shortest_path as its set of restricted nodes.
    restricted_nodes = set()
    for scf in scaffolds:
        for contig in scf.contigs:
            restricted_nodes.update(("+" + contig.name(),
                                     "-" + contig.name()))

    for scf in scaffolds:
        for left, right in zip(scf.contigs, scf.contigs[1:]):
            src = left.signed_name()
            dst = right.signed_name()

            if graph.has_edge(src, dst):
                left.link.gap = -int(graph[src][dst]["label"])
                continue

            path = _shortest_path(graph, src, dst, restricted_nodes)
            if not path:
                continue

            # node names carry a one-character sign prefix, hence node[1:]
            inner_length = sum(len(contigs_fasta[node[1:]])
                               for node in path[1:-1])
            total_overlap = sum(int(graph[n1][n2]["label"])
                                for n1, n2 in zip(path, path[1:]))
            left.link.gap = inner_length - total_overlap
def update_gaps(scaffolds):
    """
    Final gap adjustment (meant to be run at the very end).

    For each pair of neighbouring contigs, the gap stored on the left
    contig's link is reduced by the left contig's right_gap() plus the right
    contig's left_gap(). The last contig of a scaffold is not modified.
    """
    for scaffold in scaffolds:
        contigs = scaffold.contigs
        for left, right in zip(contigs, contigs[1:]):
            left.link.gap -= left.right_gap() + right.left_gap()
def _make_unplaced_fasta(self):
    """
    Collects the stretches of the input fragments that are not covered by any
    scaffolded contig and stores them as a {name: sequence} dictionary in
    self.unplaced_fasta.

    A fragment that is not used at all keeps its own name; a partial leftover
    is named "<fragment>[<start>:<end>]".
    """
    covered = defaultdict(list)
    for scf in self.scaffolds:
        for ctg in scf.contigs:
            name, start, end = ctg.name_with_coords()
            covered[name].append((start, end))

    unplaced = {}
    for name in self.fragments_fasta:
        sequence = self.fragments_fasta[name]
        total = len(sequence)

        # Zero-length sentinels at both ends let the leading and trailing
        # leftovers fall out of the same "space between neighbours" scan.
        ranges = sorted(covered[name] + [(0, 0), (total, total)])
        for (_, gap_start), (gap_end, _) in zip(ranges, ranges[1:]):
            if gap_start < gap_end:
                if gap_start == 0 and gap_end == total:
                    label = name
                else:
                    label = name + "[{0}:{1}]".format(gap_start, gap_end)
                unplaced[label] = sequence[gap_start:gap_end]

    self.unplaced_fasta = unplaced
def alternating_cycle(self, node_1, node_2):
    """
    Determines if there is a cycle of alternating colors that goes through
    the given red-supported (!) edge.

    Paths produced by self._alternating_paths are examined in order; paths of
    length 2 are skipped. A path is accepted when every edge at an odd
    position (second, fourth, ...) is supported by the target genome only,
    and the edges at even positions (first, third, ...) share at least one
    supporting genome.

    Returns half the node count of the first accepted path, or None when no
    path is accepted.
    """
    for path in self._alternating_paths(node_1, node_2):
        assert len(path) % 2 == 0
        if len(path) == 2:
            continue

        edges = list(zip(path[:-1], path[1:]))

        # second, fourth, ... edges: must be supported by the target only
        inner_support = [self.genomes_support(u, v) for u, v in edges[1::2]]
        target_only = [set(genomes) == {self.target}
                       for genomes in inner_support]
        if not all(target_only):
            continue

        # first, third, ... edges: need a genome common to all of them
        outer_support = [self.genomes_support(u, v) for u, v in edges[0::2]]
        if set(outer_support[0]).intersection(*outer_support):
            #self._check_distances(path)
            return len(path) // 2

    return None
def _build_adj_graph(self):
    """
    Builds an undirected graph over contig ends from self.old_scaffolds and
    stores it in self.adj_graph.

    Three kinds of edges are added for every scaffold, in this order:
    right end -> left end of each pair of neighbouring contigs, left end ->
    right end of each contig, and one edge joining the right end of the last
    contig with the left end of the first one.
    """
    graph = nx.Graph()
    for scaffold in self.old_scaffolds:
        contigs = scaffold.contigs

        # adjacencies between neighbouring contigs
        for left, right in zip(contigs, contigs[1:]):
            graph.add_edge(left.right_end(), right.left_end())

        # the two ends of one contig
        for contig in contigs:
            graph.add_edge(contig.left_end(), contig.right_end())

        #chromosome ends
        graph.add_edge(contigs[-1].right_end(), contigs[0].left_end())

    self.adj_graph = graph
def output_links(scaffolds, out_links):
    """
    Writes a plain-text table describing the adjacencies of every scaffold.

    Scaffolds are written to the file `out_links` ordered by name. Each table
    lists, per contig, its signed name, start position within the scaffold,
    length, the gap that follows it and the support of its link. Columns are
    left-aligned and padded to the widest cell plus a fixed spacing.
    """
    header = ["sequence", "start", "length", "gap", "support"]
    col_gap = 4

    with open(out_links, "w") as fout:
        for scf in sorted(scaffolds, key=lambda s: s.name):
            rows = []
            offset = 0
            for contig in scf.contigs:
                start = offset
                offset = start + contig.length() + contig.link.gap
                support = _support_to_string(contig.link)
                rows.append([contig.signed_name(), str(start),
                             str(contig.length()), str(contig.link.gap),
                             support])

            # widest cell of every column, header included
            widths = [max(map(len, column)) for column in zip(header, *rows)]
            rule = "-" * (sum(widths) + col_gap * len(widths))

            rendered = ["".join(cell.ljust(width + col_gap)
                                for cell, width in zip(cells, widths))
                        for cells in [header] + rows]

            lines = [rule, scf.name, rule, rendered[0], rule]
            lines.extend(rendered[1:])
            lines.append(rule)
            fout.write("\n".join(lines) + "\n\n")
def _genome_distance(self, genome_1, genome_2):
    """
    Calculates breakpoint distance between two genomes.

    Every pair of consecutive blocks (left, right) in a permutation of a
    genome contributes one adjacency, stored as the sorted pair
    (-left.signed_id(), right.signed_id()). The adjacencies of a genome are
    collected as a set over all of its permutations in self.perms_by_genome.

    Returns the size of the smaller adjacency set minus the number of
    adjacencies shared by both genomes.
    """
    def adjacencies(genome):
        # set of sorted signed-id pairs, one per pair of neighbouring blocks
        return {
            tuple(sorted((-left.signed_id(), right.signed_id())))
            for perm in self.perms_by_genome[genome]
            for left, right in zip(perm.blocks, perm.blocks[1:])
        }

    adjacencies_1 = adjacencies(genome_1)
    adjacencies_2 = adjacencies(genome_2)
    return (min(len(adjacencies_1), len(adjacencies_2))
            - len(adjacencies_1 & adjacencies_2))
def _build_bp_graph(self):
    """
    Builds the 2-colored breakpoint multigraph of contig ends and stores it
    in self.bp_graph. No repeats assumed!

    Edges tagged scf_set="old" come from self.old_scaffolds; edges tagged
    scf_set="new" come from self.new_scaffolds, restricted to contigs whose
    name also occurs in the old scaffolds. Each scaffold additionally gets
    one edge with infinity=True that joins its last and first contig ends.
    """
    old_names = {cnt.name()
                 for scf in self.old_scaffolds
                 for cnt in scf.contigs}

    ###creating 2-colored breakpoint graph
    bp_graph = nx.MultiGraph()

    for scf in self.old_scaffolds:
        for left, right in zip(scf.contigs, scf.contigs[1:]):
            bp_graph.add_edge(left.right_end(), right.left_end(),
                              scf_set="old", link=copy(left.link),
                              scf_name=scf.name, infinity=False)
        #chromosome ends
        bp_graph.add_edge(scf.contigs[-1].right_end(),
                          scf.contigs[0].left_end(),
                          scf_set="old", infinity=True)

    for scf in self.new_scaffolds:
        # position of the first contig that is also present in old scaffolds
        anchor_pos = next((idx for idx, contig in enumerate(scf.contigs)
                           if contig.name() in old_names), None)
        if anchor_pos is None:
            continue

        # work on copies: links are modified below while skipping contigs
        first_ctg = deepcopy(scf.contigs[anchor_pos])
        prev_cont = first_ctg

        for next_cont in scf.contigs[anchor_pos + 1:]:
            if next_cont.name() in old_names:
                bp_graph.add_edge(prev_cont.right_end(),
                                  next_cont.left_end(),
                                  scf_set="new", link=copy(prev_cont.link),
                                  scf_name=scf.name, infinity=False)
                prev_cont = deepcopy(next_cont)
                continue

            # Contig unknown to the old scaffolds: absorb its length and gap
            # into the running link and keep only the genomes that support
            # both links.
            prev_cont.link.gap += next_cont.length() + next_cont.link.gap
            shared_genomes = (set(prev_cont.link.supporting_genomes) &
                              set(next_cont.link.supporting_genomes))
            prev_cont.link.supporting_genomes = list(shared_genomes)

        bp_graph.add_edge(prev_cont.right_end(), first_ctg.left_end(),
                          scf_set="new", infinity=True, link=None)

    self.bp_graph = bp_graph
def _update_scaffolds(scaffolds, perm_container):
    """
    Rebuilds scaffolds with respect to the target permutations of
    `perm_container`.

    Every contig is replaced by the target permutations that share its
    (chr_name, repeat_id) and start inside its coordinate range, ordered
    according to the contig's sign. Consecutive replacements are linked with
    the distance between them; the last one inherits a copy of the original
    contig's link. Contigs without any matching permutation are logged and
    dropped, and scaffolds left without contigs are omitted.

    Returns the list of rebuilt scaffolds.
    """
    perms_by_origin = defaultdict(list)
    for perm in perm_container.target_perms:
        perms_by_origin[(perm.chr_name, perm.repeat_id)].append(perm)

    updated = []
    for scf in scaffolds:
        rebuilt = []
        for contig in scf.contigs:
            origin = (contig.perm.chr_name, contig.perm.repeat_id)

            contained = []
            for candidate in perms_by_origin[origin]:
                starts_inside = (contig.perm.seq_start <= candidate.seq_start
                                 < contig.perm.seq_end)
                if starts_inside:
                    contained.append(candidate)
                    assert (contig.perm.seq_start < candidate.seq_end
                            <= contig.perm.seq_end)

            if not contained:
                logger.debug("Lost: %s", str(contig.perm))
                continue

            # reversed contigs are traversed from the highest coordinate down
            contained = sorted(contained, key=lambda p: p.seq_start,
                               reverse=contig.sign < 0)

            for current, following in zip(contained, contained[1:]):
                if contig.sign > 0:
                    gap_length = following.seq_start - current.seq_end
                else:
                    gap_length = current.seq_start - following.seq_end
                support = [GenChrPair(current.genome_name, current.chr_name)]
                rebuilt.append(Contig.with_perm(current, contig.sign,
                                                Link(gap_length, support)))

            rebuilt.append(Contig.with_perm(contained[-1], contig.sign,
                                            copy(contig.link)))

        if rebuilt:
            updated.append(Scaffold.with_contigs(scf.name, None, None,
                                                 rebuilt))
    return updated
def _insert_from_graph(graph, scaffolds_in, max_path_len, contigs_fasta):
    """
    Inserts contigs from the assembly graph into scaffolds.

    For every pair of neighbouring contigs, _get_cut_vertices is asked for
    the graph nodes to place between them. When it returns any, a new contig
    is created per node and inserted in order; the link of the left contig
    and of each inserted contig is marked as assembly-supported and its gap
    is set to config.vals["min_scaffold_gap"]. The links of the input contigs
    are modified in place.

    Returns a list of new scaffolds, one per input scaffold.
    """
    ordered_contigs = set()
    for scf in scaffolds_in:
        ordered_contigs.update(c.name() for c in scf.contigs)

    reverse_graph = graph.reverse()

    result = []
    for scf in scaffolds_in:
        extended = Scaffold(scf.name)
        result.append(extended)

        for prev_cont, new_cont in zip(scf.contigs, scf.contigs[1:]):
            extended.contigs.append(prev_cont)

            #find contigs to insert
            path_nodes = _get_cut_vertices(graph, reverse_graph, prev_cont,
                                           new_cont, max_path_len,
                                           ordered_contigs)
            if not path_nodes:
                continue

            #insert contigs along the path
            supp_genomes = prev_cont.link.supporting_genomes
            prev_cont.link.supporting_assembly = True
            prev_cont.link.gap = config.vals["min_scaffold_gap"]

            for node in path_nodes:
                # node names are "<sign><contig name>"
                orientation = 1 if node[0] == "+" else -1
                contig_name = node[1:]
                inserted = Contig.with_sequence(
                    contig_name, len(contigs_fasta[contig_name]), orientation)
                inserted.link.supporting_assembly = True
                inserted.link.gap = config.vals["min_scaffold_gap"]
                inserted.link.supporting_genomes = supp_genomes
                extended.contigs.append(inserted)

        # the pair loop never appends the final contig
        extended.contigs.append(scf.contigs[-1])

    return result
def _fix_gaps(self):
    """
    Handles negative gaps and ensures that gap values are within the
    defined range.

    Assembly-supported links: a negative gap is resolved by trimming that
    many bases from the right of the left contig, and the gap is clamped to
    be non-negative.

    All other links: runs of "N" at the junction are trimmed from both
    contigs and added to the gap, which is then raised to at least
    config.vals["min_scaffold_gap"].
    """
    def oriented_seq(contig):
        seq_name, seg_start, seg_end = contig.name_with_coords()
        fragment = self.fragments_fasta[seq_name][seg_start:seg_end]
        return reverse_complement(fragment) if contig.sign < 0 else fragment

    def junction_ns(left_contig, right_contig):
        left_seq = oriented_seq(left_contig)
        right_seq = oriented_seq(right_contig)

        # trailing Ns of the left sequence; its first base is never examined
        tail = len(left_seq) - 1
        while tail > 0 and left_seq[tail].upper() == "N":
            tail -= 1
        trailing = len(left_seq) - 1 - tail

        # leading Ns of the right sequence; its last base is never examined
        leading = 0
        stop = len(right_seq) - 1
        while leading < stop and right_seq[leading].upper() == "N":
            leading += 1

        return trailing, leading

    for scf in self.scaffolds:
        for left, right in zip(scf.contigs, scf.contigs[1:]):
            if left.link.supporting_assembly:
                left.trim_right(max(0, -left.link.gap))
                left.link.gap = max(0, left.link.gap)
                continue

            left_ns, right_ns = junction_ns(left, right)
            left.trim_right(left_ns)
            right.trim_left(right_ns)
            left.link.gap += left_ns + right_ns
            left.link.gap = max(left.link.gap,
                                config.vals["min_scaffold_gap"])
def _merge_scaffolds(big_scaffolds, small_scaffolds):
    """
    Performs the final merging step.

    Walks over every scaffold of `big_scaffolds` and, between each pair of
    consecutive non-repeat contigs, tries to insert the contigs that lie
    between the same two contigs in `small_scaffolds`. Contigs are matched
    through their `perm` attribute; a perm seen more than once in either
    scaffold set is treated as a repeat.

    An insertion happens only when both flanking contigs are found in the
    same small scaffold, their relative orientation agrees between the two
    sets, there is at least one contig between them, and none of those
    contigs is a perm that occurs exactly once in the big scaffolds.
    Otherwise the original contigs between the pair are kept.

    Returns a new list of scaffolds. A big scaffold for which nothing could
    be rebuilt is returned as the original object. Counters describing the
    outcome are written to the debug log.
    """
    # statistics, only used for the debug log at the end
    count_diff_scaf = 0
    count_diff_orient = 0
    count_inconsistent = 0
    total_success = 0
    total_fail = 0
    total_inserted = 0
    not_found = 0

    # number of occurrences of every perm in each scaffold set
    big_count = defaultdict(int)
    for scf in big_scaffolds:
        for c in scf.contigs:
            big_count[c.perm] += 1

    small_count = defaultdict(int)
    for scf in small_scaffolds:
        for c in scf.contigs:
            small_count[c.perm] += 1

    # a perm is a repeat if it occurs more than once within either set
    repeats = set(seq for ( seq, count) in
                  chain(list(big_count.items()), list(small_count.items()))
                  if count > 1)
    big_unique = set(seq for (seq, count) in big_count.items() if count == 1)

    # non-repeat perm -> (small scaffold, position inside it)
    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            if contig.perm not in repeats:
                assert contig.perm not in small_index
                small_index[contig.perm] = (scf, pos)

    new_scafflods = []
    for big_scf in big_scaffolds:
        new_contigs = []
        # indices of the contigs of this scaffold that are not repeats
        non_repeats = [
            i for i in range(len(big_scf.contigs))
            if big_scf.contigs[i].perm not in repeats
        ]

        for left_idx, right_idx in zip(non_repeats[:-1], non_repeats[1:]):
            left_cnt = big_scf.contigs[left_idx]
            right_cnt = big_scf.contigs[right_idx]

            consistent = False
            weak_contigs = None
            link_to_change = None
            if (left_cnt.perm in small_index and
                    right_cnt.perm in small_index):
                consistent = True
                left_scf, left_pos = small_index[left_cnt.perm]
                right_scf, right_pos = small_index[right_cnt.perm]

                # True when the two flanking contigs have equal signs
                big_sign = left_cnt.sign == right_cnt.sign
                small_sign = (left_scf.contigs[left_pos].sign ==
                              right_scf.contigs[right_pos].sign)

                if left_scf != right_scf:
                    count_diff_scaf += 1
                    consistent = False
                elif big_sign != small_sign:
                    count_diff_orient += 1
                    consistent = False
                else:
                    # make left_pos < right_pos, remembering whether the
                    # small scaffold runs in the same direction as the big one
                    same_dir = left_pos < right_pos
                    if not same_dir:
                        left_pos, right_pos = right_pos, left_pos

                    # contigs strictly between the two anchors
                    weak_contigs = left_scf.contigs[left_pos + 1:right_pos]
                    if any(c.perm in big_unique for c in weak_contigs):
                        count_inconsistent += 1
                        consistent = False

                    link_to_change = copy(left_scf.contigs[left_pos].link)
                    # opposite direction: reverse-complement the inserted
                    # contigs and shift every link by one position so that
                    # each link again follows its contig
                    if weak_contigs and not same_dir:
                        link_to_change = copy(
                            left_scf.contigs[right_pos - 1].link)
                        weak_contigs = [
                            c.reverse_copy() for c in weak_contigs[::-1]
                        ]
                        for pw, nw in zip(weak_contigs[:-1],
                                          weak_contigs[1:]):
                            pw.link = copy(nw.link)
                        weak_contigs[-1].link = copy(
                            left_scf.contigs[left_pos].link)
            else:
                not_found += 1

            new_contigs.append(left_cnt)
            if consistent and weak_contigs:
                # replaces the link of left_cnt (the input contig object)
                new_contigs[-1].link = link_to_change
                new_contigs.extend(weak_contigs)
                total_success += 1
                total_inserted += len(weak_contigs)
            else:
                # keep whatever was between the anchors in the big scaffold
                new_contigs.extend(big_scf.contigs[left_idx + 1:right_idx])
                total_fail += 1

        # NOTE(review): contigs located before the first or after the last
        # non-repeat contig are never appended to new_contigs, so they are
        # absent from a rebuilt scaffold -- confirm this is intended.
        if len(new_contigs) > 1:
            # right_cnt is the right anchor of the last processed pair
            new_contigs.append(right_cnt)
            s = Scaffold(big_scf.name)
            s.contigs = new_contigs
            new_scafflods.append(s)
        else:   # because of repeats
            new_scafflods.append(big_scf)

    logger.debug("Fail: not found: %d", not_found)
    logger.debug("Fail: different scaffolds: %d", count_diff_scaf)
    logger.debug("Fail: different orientatilns: %d", count_diff_orient)
    logger.debug("Fail: inconsistent: %d", count_inconsistent)
    logger.debug("Total success: %d", total_success)
    logger.debug("Total fail: %d", total_fail)
    logger.debug("Total inserted: %d", total_inserted)

    num_contigs = 0
    for scf in new_scafflods:
        num_contigs += len(scf.contigs)
    logger.debug("Result: %d contigs in %d scaffolds",
                 num_contigs, len(new_scafflods))

    return new_scafflods
def iter_pairs(self):
    """
    Yields every pair of consecutive blocks as (previous, next), in order.
    Nothing is yielded when there are fewer than two blocks.
    """
    yield from zip(self.blocks[:-1], self.blocks[1:])