def _write_contig_info_files(output_dirpath, dict_edges, contig_info, edge_contigs):
    """Write the ``contigInfo``, ``edgeInfo`` and ``medianCov`` assignments to the output directory."""
    # contig_info.json is opened in append mode, so whatever the file already holds is kept.
    with open(join(output_dirpath, 'contig_info.json'), 'a') as handle:
        handle.write("contigInfo=" + json.dumps(contig_info) + ";\n")
    with open(join(output_dirpath, 'edges_base_info.json'), 'w') as handle:
        handle.write("edgeInfo=" + json.dumps(edge_contigs) + ";")
        handle.write("medianCov=" + json.dumps(calculate_median_cov(dict_edges)) + ";\n")


def create_contig_info(dict_edges, input_dirpath, output_dirpath, contig_edges, edges_by_component,
                       edges_by_repeat_component, edges_by_ref_component, assembler):
    """Collect per-contig information from the assembler output and save it as JSON.

    The contig table is obtained from the Canu, Flye or SPAdes specific parser,
    depending on *assembler*.  Every contig entry is extended with:

    * ``g`` / ``rep_g`` / ``ref_g`` -- the value stored in *edges_by_component*,
      *edges_by_repeat_component* and *edges_by_ref_component* for the first
      edge of the contig (in contig order) that is present in the respective
      mapping;
    * ``num_edges`` -- the number of edges of the contig, as a string.

    In addition, every edge (and its matching edge) that is present in
    *dict_edges* gets the list of contigs it belongs to.

    Args:
        dict_edges: mapping of edge id to edge object.
        input_dirpath: directory passed to the assembler specific parser.
        output_dirpath: directory where ``contig_info.json`` and
            ``edges_base_info.json`` are written.
        contig_edges: contig-to-edges data, only used by the SPAdes parser.
        edges_by_component: mapping of edge id to its component value.
        edges_by_repeat_component: same, for the repeat graph.
        edges_by_ref_component: same, for the reference-based graph.
        assembler: assembler name used to select the parser.

    Returns:
        None.  When no contig information is available, empty ``contigInfo``
        and ``edgeInfo`` lists are written instead.
    """
    contig_info = None
    if is_canu(assembler):
        contig_info = parse_canu_contigs_info(input_dirpath)
    elif is_flye(assembler):
        contig_info = parse_flye_contigs_info(input_dirpath)
    elif is_spades(assembler):
        contig_info = parse_spades_contigs_info(input_dirpath, contig_edges)

    if not contig_info:
        _write_contig_info_files(output_dirpath, dict_edges, [], [])
        return

    component_lookups = (
        ('g', edges_by_component),
        ('rep_g', edges_by_repeat_component),
        ('ref_g', edges_by_ref_component),
    )
    edge_contigs = defaultdict(set)
    for contig, data in contig_info.items():
        edges = data['edges']
        assigned_keys = set()
        seen_edge_names = set()
        # Walk the edges in contig order (skipping repeats) so that the
        # component recorded for the contig does not depend on hash order.
        for edge_name in edges:
            if edge_name in seen_edge_names:
                continue
            seen_edge_names.add(edge_name)

            edge_id = get_edge_agv_id(edge_name)
            if edge_id in dict_edges:
                edge_contigs[edge_id].add(contig)
            match_edge_id = get_match_edge_id(edge_id)
            if match_edge_id in dict_edges:
                edge_contigs[match_edge_id].add(contig)

            # Only the first edge found in each mapping defines the component.
            for key, component_by_edge in component_lookups:
                if key not in assigned_keys and edge_id in component_by_edge:
                    data[key] = component_by_edge[edge_id]
                    assigned_keys.add(key)
        data['num_edges'] = str(len(edges))
        contig_info[contig] = data

    # Sets are not JSON serializable.
    serializable_edge_contigs = dict((edge_id, list(contigs)) for edge_id, contigs in edge_contigs.items())
    _write_contig_info_files(output_dirpath, dict_edges, contig_info, serializable_edge_contigs)
    return
def parse_alignments(alignments_fpath, json_output_dirpath):
    """Parse a tab-separated alignment report and write ``reference.json``.

    The first line of the file is treated as a header and skipped.  A line with
    at least six tab-separated columns is an alignment record
    (``S1 E1 S2 E2 Reference Contig ...``); when ``S2 > E2`` the contig id is
    replaced by its matching edge id.  A line starting with ``relocation``,
    ``transloc`` or ``invers`` is attached as a misassembly note to the most
    recent alignment record; such a line is ignored when no alignment record
    precedes it.

    For every reference sequence the alignments are sorted by start/end and
    stretches longer than ``GAP_THRESHOLD`` between the running maximum end and
    the next start are reported as gaps.

    Args:
        alignments_fpath: path to the alignment report.
        json_output_dirpath: directory where ``reference.json`` is (re)written
            with the ``chromGaps`` and ``chromAligns`` assignments.
    """
    gaps_info = defaultdict(list)
    chrom_alignments = defaultdict(list)
    ms_info = defaultdict(list)
    aligns_by_chroms = defaultdict(list)
    misassembly_prefixes = ("relocation", "transloc", "invers")
    last_alignment = None  # (chrom, start, end) of the latest alignment record

    # S1 E1 S2 E2 Reference Contig IDY Ambiguous Best_group
    with open(alignments_fpath) as f:
        next(f, None)  # skip the header line
        for line in f:
            # Drop the line break first so it cannot end up inside the last column.
            fs = line.rstrip('\r\n').split('\t')
            if len(fs) > 5:
                start, end, start2, end2, chrom, edge_id = fs[:6]
                start, end = int(start), int(end)
                if int(start2) > int(end2):
                    edge_id = get_match_edge_id(edge_id)
                chrom_alignments[chrom].append((start, end, edge_id))
                last_alignment = (chrom, start, end)
            elif line.startswith(misassembly_prefixes) and last_alignment is not None:
                ms_info[last_alignment].append(line.strip())

    for chrom, alignments in chrom_alignments.items():
        alignments.sort(key=lambda x: (x[0], x[1]))
        prev_end = 0
        for start, end, edge_id in alignments:
            if start - prev_end > GAP_THRESHOLD:
                gaps_info[chrom].append((prev_end, start - 1))
            prev_end = max(prev_end, end)
            aligns_by_chroms[chrom].append({
                's': start,
                'e': end,
                'edge': edge_id,
                'ms': ';'.join(ms_info.get((chrom, start, end), [])),
            })

    with open(join(json_output_dirpath, 'reference.json'), 'w') as handle:
        handle.write("chromGaps=" + json.dumps(gaps_info) + ";\n")
        handle.write("chromAligns=" + json.dumps(aligns_by_chroms) + ";\n")
def _gfa_overlap_length(cigar):
    """Return the summed length of the ``M`` and ``I`` operations of a CIGAR string (0 if none)."""
    return sum(int(length) for length, op in re.findall(r'(\d+)([A-Za-z=])', cigar) if op in ('M', 'I'))


def parse_gfa(gfa_fpath, min_edge_len, input_dirpath=None, assembler=None):
    """Parse a GFA file into a dictionary of edges.

    ``S`` records become a forward and a reverse-complement ``Edge`` when the
    segment length is known and is at least *min_edge_len*.  The length is the
    ``LN`` tag when present, otherwise the length of the sequence field (``*``
    means unknown).  Coverage is the ``DP`` tag, or ``KC`` divided by the
    segment length (but not less than 1); it defaults to 1.

    ``L`` and ``E`` records define links between oriented edges.  The overlap
    of a link is the summed length of the ``M``/``I`` operations of its
    alignment field and is stored on both edges of the link once the whole file
    has been read, so the order of records in the file does not matter.

    Args:
        gfa_fpath: path to the GFA file.
        min_edge_len: minimal segment length for an edge to be kept.
        input_dirpath: assembler output directory; used for Canu only.
        assembler: assembler name.

    Returns:
        The dictionary of edges returned by ``construct_graph``.
    """
    dict_edges = dict()
    predecessors = defaultdict(list)
    successors = defaultdict(list)

    print("Parsing " + gfa_fpath + "...")
    links = []
    edge_overlaps = defaultdict(dict)
    with open(gfa_fpath) as f:
        for line in f:
            record_type = line[0]
            if record_type not in ('S', 'L', 'E'):
                continue
            fs = line.split()

            if record_type == 'S':
                name, sequence = fs[1], fs[2]
                seq_len = None if sequence == '*' else len(sequence)
                tags = dict((tag.split(':')[0].lower(), tag.split(':')[-1]) for tag in fs[3:])
                # Read the length tag first: it is needed for the k-mer based
                # coverage when the sequence itself is not stored in the file.
                if "ln" in tags:
                    seq_len = int(tags["ln"])  ## sequence length
                cov = 1
                if "dp" in tags:
                    cov = float(tags["dp"])  ## coverage depth
                elif "kc" in tags and seq_len:
                    cov = max(1, int(tags["kc"]) / float(seq_len))  ## k-mer count / edge length
                if seq_len and seq_len >= min_edge_len:
                    edge_num = get_edge_num(name)
                    for signed_num in (edge_num, -edge_num):
                        edge_id = get_edge_agv_id(signed_num)
                        dict_edges[edge_id] = Edge(edge_id, signed_num, seq_len, cov, element_id=edge_id)
                continue

            if record_type == 'L':
                # L from from_orient to to_orient overlap [tags]
                from_name, from_orient, to_name, to_orient = fs[1:5]
                cigar = fs[5] if len(fs) > 5 else '*'
            else:
                # E * 2+ 65397+ 21 68$ 0 47 47M
                from_name, to_name = fs[2], fs[3]
                from_orient, to_orient = from_name[-1], to_name[-1]
                from_name, to_name = from_name[:-1], to_name[:-1]
                cigar = fs[8] if len(fs) > 8 else fs[-1]

            edge1 = get_edge_agv_id(get_edge_num(from_name))
            edge2 = get_edge_agv_id(get_edge_num(to_name))
            if from_orient == '-':
                edge1 = get_match_edge_id(edge1)
            if to_orient == '-':
                edge2 = get_match_edge_id(edge2)

            links.append((edge1, edge2))
            overlap = _gfa_overlap_length(cigar)
            if overlap:
                edge_overlaps[edge1][edge2] = overlap
                edge_overlaps[edge2][edge1] = overlap

    # Attach overlaps only now, when all segments and all links are known.
    for edge_id, overlaps in edge_overlaps.items():
        if edge_id not in dict_edges:
            continue
        for overlapped_edge, overlap in overlaps.items():
            dict_edges[edge_id].overlaps.append((edge_id_to_name(overlapped_edge), overlapped_edge, overlap))

    ### gfa retains only canonical links
    add_reverse_links = is_spades(assembler) or is_abyss(assembler)
    for edge1, edge2 in links:
        if edge1 != edge2:
            predecessors[edge2].append(edge1)
            successors[edge1].append(edge2)
        if add_reverse_links:
            # add the link between the matching edges, in the opposite direction
            rc_from, rc_to = get_match_edge_id(edge2), get_match_edge_id(edge1)
            if rc_from != rc_to:
                predecessors[rc_to].append(rc_from)
                successors[rc_from].append(rc_to)

    if assembler == "canu" and input_dirpath:
        dict_edges = parse_canu_unitigs_info(input_dirpath, dict_edges)
    dict_edges = construct_graph(dict_edges, predecessors, successors)
    print("Finish parsing.")
    return dict_edges
def _weakly_connected_subgraphs(graph):
    """Return independent copies of the weakly connected components of *graph*."""
    legacy_splitter = getattr(nx, 'weakly_connected_component_subgraphs', None)
    if legacy_splitter is not None:
        return list(legacy_splitter(graph))
    # networkx >= 2.4 no longer provides weakly_connected_component_subgraphs
    return [graph.subgraph(nodes).copy() for nodes in nx.weakly_connected_components(graph)]


def process_graph(g, undirected_g, dict_edges, edges_by_nodes, two_way_edges, output_dirpath, suffix, assembler,
                  base_graph=None, contig_edges=None, chrom_names=None, edge_by_chrom=None, mapping_info=None):
    """Split the graph into viewer parts for one display mode and save them.

    The mode is selected by *suffix*:

    * ``"ref"`` -- one sub-graph per chromosome of *chrom_names* (in natural
      order), built from the edges of *edge_by_chrom*; the chromosome of every
      produced part is appended to ``reference.json`` as ``chromosomes``.
    * ``"contig"`` -- one sub-graph per contig of *contig_edges*, built from the
      contig edges and their matching edges; the contig of every produced part
      is written to ``contig_info.json`` as ``contigs``.
    * ``"repeat"`` / ``"def"`` -- one sub-graph per weakly connected component
      of *g*.  For Flye, temporary edges are added first so that an edge and its
      matching edge end up in the same component.

    Args:
        g: directed graph.
        undirected_g: graph used to look up node neighbours irrespective of
            edge direction.
        dict_edges: mapping of edge id to edge object.
        edges_by_nodes, two_way_edges: passed through to ``split_graph``.
        output_dirpath: directory for the JSON output.
        suffix: display mode, see above.
        assembler: assembler name.
        base_graph: graph passed to ``split_graph`` in the ``repeat``/``def``
            modes; defaults to *g*.
        contig_edges: mapping of contig to ``(_, _, edge_id)`` tuples.
        chrom_names: chromosome names for the reference mode.
        edge_by_chrom: mapping of chromosome to edge ids.
        mapping_info: passed through to ``split_graph`` and ``save_graph``.

    Returns:
        The value returned by ``save_graph``.
    """
    last_idx = 0
    parts_info = dict()
    graph = []
    modified_dict_edges = dict()
    loop_edges = defaultdict(set)
    hanging_nodes = []
    connected_nodes = []
    enters = []
    exits = []
    base_graph = base_graph or g
    chrom_list = []
    contig_list = []
    complex_component = False

    if suffix == "ref":
        if chrom_names:
            ## create graph for reference-based mode
            for chrom in natural_sort(chrom_names):
                # use only edges mapped to the chromosome
                graph_component = nx.DiGraph()
                for edge_id in set(edge_by_chrom[chrom]):
                    # same filter as in the contig mode: skip edges that are not in the graph
                    if edge_id in dict_edges:
                        graph_component.add_edge(dict_edges[edge_id].start, dict_edges[edge_id].end)
                viewer_data, last_idx, sub_complex_component = \
                    split_graph(graph_component, g, undirected_g, dict_edges, modified_dict_edges, loop_edges,
                                edges_by_nodes, two_way_edges, last_idx, parts_info,
                                mapping_info=mapping_info, chrom=chrom)
                parts_info = viewer_data.parts_info
                graph.extend(viewer_data.g)
                chrom_list.extend([chrom] * len(viewer_data.g))
                complex_component = complex_component or sub_complex_component
        with open(join(output_dirpath, 'reference.json'), 'a') as handle:
            handle.write("chromosomes=" + json.dumps(chrom_list) + ";\n")

    elif contig_edges and suffix == "contig":
        ## create graph for contig-focused mode
        for contig, edges in contig_edges.items():
            graph_component = nx.DiGraph()
            edge_ids = set()
            for _, _, edge_id in edges:
                edge_ids.add(edge_id)
                edge_ids.add(get_match_edge_id(edge_id))
            filtered_edge_ids = set()
            for edge_id in edge_ids:
                if edge_id in dict_edges:
                    graph_component.add_edge(dict_edges[edge_id].start, dict_edges[edge_id].end)
                    filtered_edge_ids.add(edge_id)
            viewer_data, last_idx, sub_complex_component = \
                split_graph(graph_component, g, undirected_g, dict_edges, modified_dict_edges, loop_edges,
                            edges_by_nodes, two_way_edges, last_idx, parts_info,
                            contig_edges=filtered_edge_ids)
            parts_info = viewer_data.parts_info
            contig_list.extend([contig] * len(viewer_data.g))
            graph.extend(viewer_data.g)
        with open(join(output_dirpath, 'contig_info.json'), 'w') as handle:
            handle.write("contigs=" + json.dumps(contig_list) + ";\n")

    elif suffix == "repeat" or suffix == "def":
        fake_edges = []
        if is_flye(assembler):
            ## add fake edges to keep forward and reverse complement components of an edge together
            for edge_id, edge in dict_edges.items():
                if edge_id.startswith("rc"):
                    continue
                if suffix == "repeat" and not edge.repetitive:
                    continue
                match_edge_id = get_match_edge_id(edge_id)
                if match_edge_id not in dict_edges:
                    continue
                match_edge = dict_edges[match_edge_id]
                match_edge_nodes = (match_edge.start, match_edge.end)
                # nothing to do if the edge is already adjacent to its matching edge
                if any(node in undirected_g.neighbors(edge.start) for node in match_edge_nodes):
                    continue
                if any(node in undirected_g.neighbors(edge.end) for node in match_edge_nodes):
                    continue
                g.add_edge(edge.end, match_edge.start)
                g.add_edge(edge.start, match_edge.end)
                fake_edges.append((edge.start, match_edge.end))
                fake_edges.append((edge.end, match_edge.start))

        # split graph into connected components
        connected_components = _weakly_connected_subgraphs(g)
        # the components are copies, so the fake edges can be dropped from g right away
        if fake_edges:
            g.remove_edges_from(fake_edges)
        for graph_component in connected_components:
            viewer_data, last_idx, sub_complex_component = \
                split_graph(graph_component, base_graph, undirected_g, dict_edges, modified_dict_edges, loop_edges,
                            edges_by_nodes, two_way_edges, last_idx, parts_info, fake_edges=fake_edges,
                            find_hanging_nodes=suffix == "def", is_repeat_graph=suffix == "repeat")
            parts_info = viewer_data.parts_info
            graph.extend(viewer_data.g)
            hanging_nodes.extend(viewer_data.hanging_nodes)
            connected_nodes.extend(viewer_data.connected_nodes)
            enters.extend(viewer_data.enters)
            exits.extend(viewer_data.exits)

    edges_by_component = save_graph(graph, hanging_nodes, connected_nodes, enters, exits, dict_edges,
                                    modified_dict_edges, loop_edges, parts_info, output_dirpath, suffix,
                                    complex_component=complex_component, mapping_info=mapping_info,
                                    chrom_list=chrom_list, contig_list=contig_list)
    return edges_by_component
def parse_mapping_info(mapping_fpath, json_output_dir, dict_edges):
    """Assign edges to chromosomes and colour the edges accordingly.

    Every line of *mapping_fpath* is split on whitespace and read as
    ``edge length start end strand chrom chrom_length ref_start ref_end ...``;
    lines with fewer than nine columns are skipped, as are edges that are not
    present in *dict_edges*.

    An edge is assigned to a chromosome when its mappings to that chromosome
    cover (overlaps counted once) at least 90% of the edge.  For an assigned
    edge the mappings are merged along the reference: a new block is started
    when the distance to the previous mapping reaches ``min(5000, 5% of the
    edge length)``, blocks shorter than 500 bp are dropped and the three longest
    blocks are stored in the ``aligns`` attribute of the edge and of its
    matching edge.

    Colours are stored in the ``chrom`` attribute of the edges, joined with
    ``:``.  With a single primary chromosome the colour reflects the position
    of the longest block on it; otherwise every primary chromosome has its own
    colour.  Chromosomes whose name contains ``alt``, ``random`` or ``chrUn``
    (and edges whose position is unknown) are grey.

    Args:
        mapping_fpath: path to the mapping file.
        json_output_dir: directory of ``reference.json``; the ``chrom_lengths``
            and ``edgeMappingInfo`` assignments are appended to that file.
        dict_edges: mapping of edge id to edge object; edges are updated in
            place.

    Returns:
        A tuple ``(mapping_info, non_alt_chroms, edge_by_chrom)``: the list of
        chromosomes per edge id, the list of primary chromosome names and the
        set of edge ids per chromosome.
    """
    min_block_len = 500  # merged reference blocks shorter than this are ignored
    default_color = '#808080'

    mapping_info = defaultdict(set)
    edge_mappings = defaultdict(lambda: defaultdict(list))
    edge_lengths = dict()
    chrom_lengths = dict()
    with open(mapping_fpath) as f:
        for line in f:
            # contig_1 257261 14 160143 - chr13 924431 196490 356991 147365 161095 60 tp:A:P cm:i:14049 s1:i:147260 s2:i:4375 dv:f:0.0066
            fs = line.split()
            if len(fs) < 9:
                continue  # blank or truncated line
            edge_id = get_edge_agv_id(get_edge_num(fs[0]))
            start, end = int(fs[2]), int(fs[3])
            edge_lengths[edge_id] = int(fs[1])
            chrom, chrom_len = fs[5], int(fs[6])
            ref_start, ref_end = int(fs[7]), int(fs[8])
            chrom_lengths[chrom] = chrom_len
            edge_mappings[edge_id][chrom].append((start, end, ref_start, ref_end))

    chroms_by_edge = defaultdict(set)
    edge_by_chrom = defaultdict(set)
    chrom_names = set()
    best_aligns = defaultdict(dict)
    for edge_id, mappings_by_chrom in edge_mappings.items():
        if edge_id not in dict_edges:
            # the edge is not part of the graph, so there is nothing to annotate
            continue
        match_edge_id = get_match_edge_id(edge_id)
        # assign an edge to a chromosome if more than 90% of edge aligned to the chromosome
        len_threshold = 0.9 * edge_lengths[edge_id]
        gap_threshold = min(5000, 0.05 * edge_lengths[edge_id])
        for chrom, mappings in mappings_by_chrom.items():
            # calculate covered length (do not count overlaps)
            mappings.sort(key=lambda x: (x[0], -x[1]))
            covered_len = 0
            last_pos = 0
            for start, end, _, _ in mappings:
                start = max(start, last_pos)
                covered_len += max(0, end - start + 1)
                last_pos = max(last_pos, end + 1)
            if covered_len < len_threshold:
                continue

            chroms_by_edge[edge_id].add(chrom)
            chrom_names.add(chrom)
            edge_by_chrom[chrom].add(edge_id)

            # merge the mappings into blocks along the reference
            mappings.sort(key=lambda x: (x[2], -x[3]))
            aligns = []
            last_ref_pos = 0
            align_s, align_e = 0, 0
            for _, _, ref_start, ref_end in mappings:
                ref_start = max(ref_start, last_ref_pos)
                last_ref_pos = max(last_ref_pos, ref_end + 1)
                if not align_s:
                    align_s = ref_start
                if align_e and ref_start - align_e >= gap_threshold:
                    # break alignments if the gap is too long
                    if align_e - align_s >= min_block_len:
                        aligns.append((chrom, align_s, align_e))
                    align_s = ref_start
                align_e = ref_end - 1
            if align_e and align_e - align_s >= min_block_len:
                aligns.append((chrom, align_s, align_e))
            aligns.sort(reverse=True, key=lambda x: x[2] - x[1])

            edge_alignment = chrom + ":"
            if aligns:
                best_aligns[edge_id][chrom] = aligns[0][1]
            for align in aligns[:3]:  # store top 3 alignments for each edge
                edge_alignment += " %s-%s," % (format_pos(align[1]), format_pos(align[2]))
            # drop the last character (the trailing comma, or the colon when there are no blocks)
            edge_alignment = edge_alignment[:-1]
            dict_edges[edge_id].aligns[chrom] = edge_alignment
            if match_edge_id in dict_edges:
                dict_edges[match_edge_id].aligns[chrom] = edge_alignment

    chrom_len_dict = OrderedDict((chrom, chrom_lengths[chrom]) for chrom in natural_sort(chrom_names))
    non_alt_chroms = [c for c in chrom_names if 'alt' not in c and 'random' not in c and 'chrUn' not in c]
    chrom_order = OrderedDict((chrom, i) for i, chrom in enumerate(natural_sort(non_alt_chroms)))
    color_list = ['#e6194b', '#3cb44b', '#ffe119', '#1792d4', '#f58231', '#911eb4', '#46f0f0', '#f032e6',
                  '#d2f53c', '#fabebe', '#00dbb1', '#dba2ff', '#aa6e28', '#83360e', '#800000', '#003bff',
                  '#808000', '#8d73d4', '#000080', '#806680', '#51205a', '#558859', '#d1a187', '#87a1d1',
                  '#87a1d1', '#afd187']

    # an edge and its matching edge share the chromosome assignment
    edge_chroms = defaultdict(set)
    for edge_id, chroms in chroms_by_edge.items():
        match_edge_id = get_match_edge_id(edge_id)
        for chrom in chroms:
            edge_chroms[edge_id].add(chrom)
            edge_chroms[match_edge_id].add(chrom)
            if match_edge_id in dict_edges:
                edge_by_chrom[chrom].add(match_edge_id)

    is_single_chrom = len(chrom_order) == 1
    for edge_id, chroms in edge_chroms.items():
        if edge_id not in dict_edges:
            continue
        mapping_info[edge_id] = list(chroms)
        colors = set()
        for chrom in chroms:
            color = default_color
            if chrom in chrom_order:
                if is_single_chrom:
                    # color an edge according to its position in reference;
                    # the position may be known for the matching edge only
                    pos = best_aligns.get(edge_id, {}).get(chrom)
                    if pos is None:
                        pos = best_aligns.get(get_match_edge_id(edge_id), {}).get(chrom)
                    if pos is not None:
                        color = get_rainbow_color(pos, chrom_len_dict[chrom])
                else:
                    color = color_list[chrom_order[chrom] % len(color_list)]
            colors.add(color)
        if len(colors) <= 5:
            dict_edges[edge_id].chrom = ':'.join(list(colors))
        else:
            dict_edges[edge_id].chrom = 'white:red:black:red:black:white'

    with open(join(json_output_dir, "reference.json"), 'a') as handle:
        handle.write("chrom_lengths=" + json.dumps(chrom_len_dict) + ";\n")
        handle.write("edgeMappingInfo=" + json.dumps(mapping_info) + ";\n")
    return mapping_info, non_alt_chroms, edge_by_chrom