def parse_canu_unitigs_info(input_dirpath, dict_edges):
    tiginfo_fpath = find_file_by_pattern(input_dirpath, ".unitigs.layout.tigInfo")
    if not is_empty_file(tiginfo_fpath):
        with open(tiginfo_fpath) as f:
            for i, line in enumerate(f):
                if i == 0:
                    header = line.strip().split()
                    repeat_col = header.index("sugRept") if "sugRept" in header else None
                    cov_col = header.index("coverage") if "coverage" in header else None
                    if repeat_col is None or cov_col is None:
                        break
                    continue
                fs = line.strip().split()
                edge_id = get_edge_agv_id(get_edge_num(fs[0]))
                rc_edge_id = get_edge_agv_id(-get_edge_num(fs[0]))
                if edge_id in dict_edges:
                    coverage = int(float(fs[cov_col]))
                    dict_edges[edge_id].cov = coverage
                    dict_edges[rc_edge_id].cov = coverage
                    if fs[repeat_col] == "yes":
                        dict_edges[edge_id].repetitive = True
                        dict_edges[rc_edge_id].repetitive = True
                # else:
                #     print("Warning! Edge %s is not found!" % edge_id)
    return dict_edges

def get_edges_from_gfa(gfa_fpath, output_dirpath, min_edge_len):
    if not gfa_fpath:
        return None

    input_edges_fpath = join(dirname(gfa_fpath), get_filename(gfa_fpath) + ".fasta")
    edges_fpath = join(output_dirpath, basename(input_edges_fpath))
    if not is_empty_file(gfa_fpath) and not can_reuse(edges_fpath, files_to_check=[gfa_fpath]):
        print("Extracting edge sequences from " + gfa_fpath + "...")
        with open(edges_fpath, "w") as out:
            with open(gfa_fpath) as f:
                for line in f:
                    if line.startswith('S'):
                        fs = line.strip().split()
                        seq_name = fs[1]
                        seq = None
                        if is_acgt_seq(fs[2]):
                            seq = fs[2]
                        elif len(fs) >= 4 and is_acgt_seq(fs[3]):
                            seq = fs[3]
                        if seq and len(seq) >= min_edge_len:
                            out.write(">%s\n" % get_edge_agv_id(get_edge_num(seq_name)))
                            out.write(seq)
                            out.write("\n")
    if is_empty_file(edges_fpath) and not is_empty_file(input_edges_fpath):
        with open(edges_fpath, "w") as out:
            with open(input_edges_fpath) as f:
                for line in f:
                    if line.startswith('>'):
                        seq_name = line.strip().split()[0][1:]
                        out.write(">%s\n" % get_edge_agv_id(get_edge_num(seq_name)))
                    else:
                        out.write(line)
    return edges_fpath

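# Illustrative usage sketch for get_edges_from_gfa (hypothetical paths, not part
# of the pipeline): extract edge sequences from a GFA into a FASTA file, falling
# back to the "<graph name>.fasta" file expected next to the GFA when the S-lines
# do not carry sequences.
#
#   edges_fasta = get_edges_from_gfa("flye_out/assembly_graph.gfa",
#                                    output_dirpath="agv_output", min_edge_len=100)
#   # -> "agv_output/assembly_graph.fasta" with records renamed via get_edge_agv_id()
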
def parse_abyss_dot(dot_fpath, min_edge_len):
    '''digraph adj {
    graph [k=50]
    edge [d=-49]
    "3+" [l=99 C=454]
    "3-" [l=99 C=454]
    '''
    dict_edges = dict()
    predecessors = defaultdict(list)
    successors = defaultdict(list)

    edge_pattern = r'"?(?P<edge_id>\d+)(?P<edge_sign>[\+\-])"? (?P<info>.+)'
    link_pattern = r'"?(?P<start>\d+)(?P<start_sign>[\+\-])"? -> "?(?P<end>\d+)(?P<end_sign>[\+\-])"?'
    info_pattern = r'l=(?P<edge_len>\d+) C=(?P<edge_cov>\d+)'
    with open(dot_fpath) as f:
        for line in f:
            if 'l=' in line:  # "3+" [l=99 C=454]
                match = re.search(edge_pattern, line)
                if not match or len(match.groups()) < 3:
                    continue
                edge_id, edge_sign, info = \
                    match.group('edge_id'), match.group('edge_sign'), match.group('info')
                edge_name = (edge_sign if edge_sign != '+' else '') + edge_id
                edge_id = get_edge_agv_id(edge_name)
                match = re.search(info_pattern, info)
                if match and len(match.groups()) == 2:
                    cov = max(1, int(match.group('edge_cov')))
                    edge_len = max(1, int(float(match.group('edge_len'))))
                    if edge_len >= min_edge_len:
                        edge = Edge(edge_id, edge_name, edge_len, cov, element_id=edge_id)
                        dict_edges[edge_id] = edge
            if '->' in line:  # "3+" -> "157446-" [d=-45]
                match = re.search(link_pattern, line)
                if not match or len(match.groups()) < 2:
                    continue
                start, start_sign, end, end_sign = \
                    match.group('start'), match.group('start_sign'), \
                    match.group('end'), match.group('end_sign')
                start_edge_id = get_edge_agv_id((start_sign if start_sign == '-' else '') + start)
                end_edge_id = get_edge_agv_id((end_sign if end_sign == '-' else '') + end)
                predecessors[end_edge_id].append(start_edge_id)
                successors[start_edge_id].append(end_edge_id)

    dict_edges = construct_graph(dict_edges, predecessors, successors)
    return dict_edges

def parse_flye_dot(dot_fpath, min_edge_len):
    dict_edges = dict()

    pattern = r'"?(?P<start>\d+)"? -> "?(?P<end>\d+)"? \[(?P<info>.+)]'
    label_pattern = r'id (?P<edge_id>\-*.+) (?P<edge_len>[0-9\.]+)k (?P<coverage>\d+)'
    with open(dot_fpath) as f:
        for line in f:
            if 'label =' in line:
                # "7" -> "29" [label = "id 1\l53k 59x", color = "black"] ;
                line = line.replace('\\l', ' ')
                match = re.search(pattern, line)
                if not match or len(match.groups()) < 3:
                    continue
                start, end, info = match.group('start'), match.group('end'), match.group('info')
                params_dict = dict(param.split(' = ') for param in info.split(', ') if '=' in param)
                # label = params_dict.get('label')
                color = params_dict.get('color', 'black').strip().replace('"', '')  # default to black if no color attribute
                line = line.replace(' ,', ',')
                match = re.search(label_pattern, info)
                if match and match.group('edge_id'):
                    edge_id = get_edge_agv_id(match.group('edge_id'))
                    cov = max(1, int(match.group('coverage')))
                    edge_len = max(1, int(float(match.group('edge_len')) * 1000))
                    if edge_len < min_edge_len:
                        continue
                    edge = Edge(edge_id, match.group('edge_id'), edge_len, cov, element_id=edge_id)
                    edge.color = color
                    if edge.color != "black":
                        edge.repetitive = True
                    edge.start, edge.end = int(start), int(end)
                    if 'dir = both' in line:
                        edge.two_way = True
                    dict_edges[edge_id] = edge
    dict_edges = calculate_multiplicities(dict_edges)
    return dict_edges

def parse_flye_assembly_info(input_dirpath, dict_edges):
    contig_edges = defaultdict(list)
    info_fpath = join(input_dirpath, "assembly_info.txt")
    if is_empty_file(info_fpath):
        print("Warning! assembly_info.txt is not found, information about contigs will not be provided")
        return contig_edges
    with open(info_fpath) as f:
        for i, line in enumerate(f):
            if i == 0:
                # header = line.strip().split()
                continue
            fs = line.strip().split()
            contig = fs[0]
            path = fs[-1]
            edges = path.split(',')
            start = 0
            for edge_name in edges:
                edge_id = get_edge_agv_id(edge_name)
                if edge_id in dict_edges:
                    edge_len = dict_edges[edge_id].length
                    contig_edges[contig].append((str(start), str(start + edge_len), edge_id))
                    start += edge_len
    return contig_edges

def parse_spades_paths(input_dirpath, dict_edges):
    contig_edges = defaultdict(list)
    paths_fpath = join(input_dirpath, "scaffolds.paths")
    if is_empty_file(paths_fpath):
        print("Warning! %s is not found, information about scaffold paths will not be provided" % paths_fpath)
        return contig_edges
    # NODE_1_length_8242890_cov_19.815448
    # 1893359+,1801779-,1893273-,400678-,1892977+,1869659-,1892443+,272108+,1694470+,1893863+
    with open(paths_fpath) as f:
        contig = None
        start = 0
        for line in f:
            if line.strip().endswith("'"):
                contig = None  # skip reverse-complement paths
            elif line.startswith("NODE"):
                contig = line.strip()
                start = 0
                continue
            elif contig:
                edges = line.strip().replace(';', '').split(',')
                for edge_name in edges:
                    edge_num = int(edge_name[:-1])
                    if edge_name[-1] == '-':
                        edge_num *= -1
                    edge_id = get_edge_agv_id(edge_num)
                    if edge_id in dict_edges:
                        edge_len = dict_edges[edge_id].length
                        contig_edges[contig].append((str(start), str(start + edge_len), edge_id))
                        start += edge_len
                start += 10  # NNNNNNNNNN
    return contig_edges

def create_contig_info(dict_edges, input_dirpath, output_dirpath, contig_edges,
                       edges_by_component, edges_by_repeat_component,
                       edges_by_ref_component, assembler):
    contig_info = None
    if is_canu(assembler):
        contig_info = parse_canu_contigs_info(input_dirpath)
    elif is_flye(assembler):
        contig_info = parse_flye_contigs_info(input_dirpath)
    elif is_spades(assembler):
        contig_info = parse_spades_contigs_info(input_dirpath, contig_edges)
    if not contig_info:
        with open(join(output_dirpath, 'contig_info.json'), 'a') as handle:
            handle.write("contigInfo=" + json.dumps([]) + ";\n")
        with open(join(output_dirpath, 'edges_base_info.json'), 'w') as handle:
            handle.write("edgeInfo=" + json.dumps([]) + ";")
            handle.write("medianCov=" + json.dumps(calculate_median_cov(dict_edges)) + ";\n")
        return

    edge_contigs = defaultdict(set)
    for contig, data in contig_info.items():
        subgraph = None
        repeat_subgraph = None
        ref_subgraph = None
        edges = data['edges']
        for edge_name in set(edges):
            edge_id = get_edge_agv_id(edge_name)
            if edge_id in dict_edges:
                edge_contigs[edge_id].add(contig)
                match_edge_id = get_match_edge_id(edge_id)
                if match_edge_id in dict_edges:
                    edge_contigs[match_edge_id].add(contig)
                if not subgraph and edge_id in edges_by_component:
                    data['g'] = edges_by_component[edge_id]
                if not repeat_subgraph and edge_id in edges_by_repeat_component:
                    data['rep_g'] = edges_by_repeat_component[edge_id]
                if not ref_subgraph and edge_id in edges_by_ref_component:
                    data['ref_g'] = edges_by_ref_component[edge_id]
        data['num_edges'] = str(len(edges))
        contig_info[contig] = data

    for edge_id in edge_contigs:
        edge_contigs[edge_id] = list(edge_contigs[edge_id])
    with open(join(output_dirpath, 'contig_info.json'), 'a') as handle:
        handle.write("contigInfo=" + json.dumps(contig_info) + ";\n")
    with open(join(output_dirpath, 'edges_base_info.json'), 'w') as handle:
        handle.write("edgeInfo=" + json.dumps(edge_contigs) + ";")
        handle.write("medianCov=" + json.dumps(calculate_median_cov(dict_edges)) + ";\n")
    return

def format_edges_file(input_fpath, output_dirpath):
    if is_empty_file(input_fpath):
        return None
    edges_fpath = join(output_dirpath, "edges.fasta")
    if not can_reuse(edges_fpath, files_to_check=[input_fpath]):
        with open(input_fpath) as f:
            with open(edges_fpath, "w") as out_f:
                for line in f:
                    if line.startswith('>'):
                        edge_id = get_edge_agv_id(get_edge_num(line[1:]))
                        out_f.write(">%s\n" % edge_id)
                    else:
                        out_f.write(line)
    return edges_fpath

def parse_canu_assembly_info(input_dirpath, dict_edges):
    contig_edges = defaultdict(list)
    unitigs_fpath = find_file_by_pattern(input_dirpath, ".unitigs.bed")
    if is_empty_file(unitigs_fpath):
        print("Warning! Unitigs.bed is not found, information about contigs will not be provided")
        return contig_edges
    with open(unitigs_fpath) as f:
        for line in f:
            fs = line.strip().split()
            contig, start, end, unitig = fs[:4]
            edge_id = get_edge_agv_id(get_edge_num(unitig))
            if edge_id in dict_edges:
                contig_id = get_canu_id(contig)
                contig_edges[contig_id].append((start, end, edge_id))
    return contig_edges

def parse_gfa(gfa_fpath, min_edge_len, input_dirpath=None, assembler=None):
    dict_edges = dict()
    predecessors = defaultdict(list)
    successors = defaultdict(list)
    g = nx.DiGraph()

    print("Parsing " + gfa_fpath + "...")
    # gfa = gfapy.Gfa.from_file(gfa_fpath, vlevel = 0)
    links = []
    edge_overlaps = defaultdict(dict)
    with open(gfa_fpath) as f:
        for line in f:
            record_type = line[0]
            if record_type == 'S':
                fs = line.split()
                name, seq_len = fs[1], len(fs[2])
                if fs[2] == '*':
                    seq_len = None
                add_fields = fs[3:] if len(fs) > 3 else []
                add_info = dict((tag.split(':')[0].lower(), tag.split(':')[-1]) for tag in add_fields)
                if "ln" in add_info:
                    seq_len = int(add_info["ln"])  ## sequence length (read before coverage so "*" sequences do not break the KC-based estimate)
                cov = 1
                if "dp" in add_info:
                    cov = float(add_info["dp"])  ## coverage depth
                elif "kc" in add_info:
                    cov = max(1, int(add_info["kc"]) / seq_len)  ## k-mer count / edge length
                if seq_len and seq_len >= min_edge_len:
                    edge_id = get_edge_agv_id(get_edge_num(name))
                    edge = Edge(edge_id, get_edge_num(name), seq_len, cov, element_id=edge_id)
                    dict_edges[edge_id] = edge
                    for overlapped_edge, overlap in edge_overlaps[edge_id].items():
                        dict_edges[edge_id].overlaps.append(
                            (edge_id_to_name(overlapped_edge), overlapped_edge, overlap))
                    rc_edge_id = get_edge_agv_id(-get_edge_num(name))
                    rc_edge = Edge(rc_edge_id, -get_edge_num(name), seq_len, cov, element_id=rc_edge_id)
                    dict_edges[rc_edge_id] = rc_edge
                    for overlapped_edge, overlap in edge_overlaps[rc_edge_id].items():
                        dict_edges[rc_edge_id].overlaps.append(
                            (edge_id_to_name(overlapped_edge), overlapped_edge, overlap))
            if record_type != 'L' and record_type != 'E':
                continue
            if record_type == 'L':
                _, from_name, from_orient, to_name, to_orient = line.split()[:5]
            else:
                # E * 2+ 65397+ 21 68$ 0 47 47M
                from_name, to_name = line.split()[2], line.split()[3]
                from_orient, to_orient = from_name[-1], to_name[-1]
                from_name, to_name = from_name[:-1], to_name[:-1]
            edge1 = get_edge_agv_id(get_edge_num(from_name))
            edge2 = get_edge_agv_id(get_edge_num(to_name))
            if from_orient == '-':
                edge1 = get_match_edge_id(edge1)
            if to_orient == '-':
                edge2 = get_match_edge_id(edge2)
            overlap = 0
            overlap_operations = re.split(r'(\d+)', line.split()[-1].strip())
            for i in range(0, len(overlap_operations) - 1, 1):
                if not overlap_operations[i]:
                    continue
                if overlap_operations[i + 1] == 'M' or overlap_operations[i + 1] == 'I':
                    overlap += int(overlap_operations[i])
            links.append((from_name, from_orient, to_name, to_orient, overlap))
            if overlap:
                edge_overlaps[edge1][edge2] = overlap
                edge_overlaps[edge2][edge1] = overlap

    ### gfa retains only canonical links
    for link in links:
        from_name, from_orient, to_name, to_orient, overlap = link
        edge1 = get_edge_agv_id(get_edge_num(from_name))
        edge2 = get_edge_agv_id(get_edge_num(to_name))
        if from_orient == '-':
            edge1 = get_match_edge_id(edge1)
        if to_orient == '-':
            edge2 = get_match_edge_id(edge2)
        if edge1 != edge2:
            predecessors[edge2].append(edge1)
            successors[edge1].append(edge2)
        g.add_edge(edge1, edge2)
        if is_spades(assembler) or is_abyss(assembler):
            edge1, edge2 = get_match_edge_id(edge2), get_match_edge_id(edge1)
            if edge1 != edge2:
                predecessors[edge2].append(edge1)
                successors[edge1].append(edge2)
            g.add_edge(edge1, edge2)

    if assembler == "canu" and input_dirpath:
        dict_edges = parse_canu_unitigs_info(input_dirpath, dict_edges)
    dict_edges = construct_graph(dict_edges, predecessors, successors)
    print("Finish parsing.")
    return dict_edges

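# Illustrative usage sketch for parse_gfa (hypothetical GFA content and paths,
# not part of the pipeline):
#
#   assembly_graph.gfa:
#       S   1   *   LN:i:2000   KC:i:50000
#       S   2   *   LN:i:1500   KC:i:30000
#       L   1   +   2   -   55M
#
#   dict_edges = parse_gfa("assembly_graph.gfa", min_edge_len=100,
#                          input_dirpath=None, assembler="spades")
#   # -> forward and reverse-complement Edge objects for both segments
#   #    (coverage estimated as KC / LN) connected by a 55 bp overlap link
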
def run_quast_analysis(input_fpath, reference_fpath, output_dirpath, json_output_dirpath,
                       threads, contig_edges, dict_edges=None, is_meta=False):
    ms_out_fpath = None
    quast_output_dir = join(output_dirpath, "quast_output" if not dict_edges else "quast_edge_output")
    if not is_empty_file(input_fpath) and not is_empty_file(reference_fpath):
        ms_out_fpath = get_mis_report_fpath(quast_output_dir, input_fpath)
        ms_out_fpath = run(input_fpath, reference_fpath, ms_out_fpath, quast_output_dir, threads, is_meta)
    if not ms_out_fpath:
        if not is_empty_file(input_fpath) and not is_empty_file(reference_fpath):
            print("QUAST failed! Make sure you are using the latest version of QUAST")
        print("No information about %s mappings to the reference genome" %
              ("edge" if dict_edges else "contig"))
        with open(join(json_output_dirpath, "reference.json"), 'w') as handle:
            handle.write("chrom_lengths=" + json.dumps([]) + ";\n")
            handle.write("edgeMappingInfo=" + json.dumps([]) + ";\n")
            handle.write("chromGaps=" + json.dumps([]) + ";\n")
            handle.write("chromAligns=" + json.dumps([]) + ";\n")
        with open(join(json_output_dirpath, 'errors.json'), 'w') as handle:
            handle.write("misassembledContigs=[];\n")
        return None, None, None, dict_edges

    # search for misassemblies and store them for each edge and contig
    misassembled_seqs = defaultdict(list)
    with open(ms_out_fpath) as f:
        seq_id = ''
        for line in f:
            if line.startswith("Extensive misassembly"):
                match = re.search(align_pattern, line)
                if not match or len(match.groups()) < 4:
                    continue
                start1, end1, start2, end2 = \
                    match.group('start1'), match.group('end1'), match.group('start2'), match.group('end2')
                if dict_edges:
                    edge_id = get_edge_agv_id(get_edge_num(seq_id))
                    dict_edges[edge_id].errors.append((start1, end1, start2, end2))
                else:
                    misassembled_seqs[seq_id].append((start1, end1, start2, end2))  ## add misassembled edge
            else:
                seq_id = line.strip()

    if not dict_edges:
        with open(join(json_output_dirpath, 'errors.json'), 'w') as handle:
            handle.write("misassembledContigs='" + json.dumps(misassembled_seqs) + "';\n")
        return None, None, None, dict_edges
    else:
        parse_alignments(get_alignments_fpath(quast_output_dir, input_fpath), json_output_dirpath)
        mapping_fpath = map_edges_to_ref(input_fpath, output_dirpath, reference_fpath, threads)
        mapping_info, chrom_names, edge_by_chrom = parse_mapping_info(mapping_fpath,
                                                                      json_output_dirpath, dict_edges)
        return mapping_info, chrom_names, edge_by_chrom, dict_edges

def parse_mapping_info(mapping_fpath, json_output_dir, dict_edges):
    # assign edges to chromosomes and color them in the corresponding colors
    mapping_info = defaultdict(set)
    edge_mappings = defaultdict(lambda: defaultdict(list))
    edge_lengths = dict()
    chrom_lengths = dict()
    with open(mapping_fpath) as f:
        for line in f:
            # contig_1 257261 14 160143 - chr13 924431 196490 356991 147365 161095 60 tp:A:P cm:i:14049 s1:i:147260 s2:i:4375 dv:f:0.0066
            fs = line.split()
            edge_id = get_edge_agv_id(get_edge_num(fs[0]))
            start, end = int(fs[2]), int(fs[3])
            edge_lengths[edge_id] = int(fs[1])
            chrom, chrom_len = fs[5], int(fs[6])
            ref_start, ref_end = int(fs[7]), int(fs[8])
            chrom_lengths[chrom] = chrom_len
            edge_mappings[edge_id][chrom].append((start, end, ref_start, ref_end))

    chroms_by_edge = defaultdict(set)
    edge_by_chrom = defaultdict(set)
    chrom_names = set()
    best_aligns = defaultdict(defaultdict)
    for edge_id in edge_mappings:
        # assign an edge to a chromosome if more than 90% of the edge is aligned to it
        len_threshold = 0.9 * edge_lengths[edge_id]
        gap_threshold = min(5000, 0.05 * edge_lengths[edge_id])
        for chrom, mappings in edge_mappings[edge_id].items():
            mappings.sort(key=lambda x: (x[0], -x[1]), reverse=False)
        for chrom, mappings in edge_mappings[edge_id].items():
            aligns = []
            covered_len = 0
            last_pos = 0
            last_ref_pos = 0
            align_s, align_e = 0, 0
            # calculate covered length (do not count overlaps)
            for (start, end, ref_start, ref_end) in mappings:
                start = max(start, last_pos)
                covered_len += max(0, end - start + 1)
                last_pos = max(last_pos, end + 1)
            if covered_len >= len_threshold:
                chroms_by_edge[edge_id].add(chrom)
                chrom_names.add(chrom)
                edge_by_chrom[chrom].add(edge_id)
                mappings.sort(key=lambda x: (x[2], -x[3]), reverse=False)
                for (start, end, ref_start, ref_end) in mappings:
                    ref_start = max(ref_start, last_ref_pos)
                    last_ref_pos = max(last_ref_pos, ref_end + 1)
                    if not align_s:
                        align_s = ref_start
                    if align_e and ref_start - align_e >= gap_threshold:  # break the alignment if the gap exceeds the threshold
                        if align_e - align_s >= 500:  # keep only alignments of at least 500 bp
                            aligns.append((chrom, align_s, align_e))
                        align_s = ref_start
                    align_e = ref_end - 1
                if align_e and align_e - align_s >= 500:
                    aligns.append((chrom, align_s, align_e))
                aligns.sort(reverse=True, key=lambda x: x[2] - x[1])
                edge_alignment = chrom + ":"
                if aligns:
                    best_aligns[edge_id][chrom] = aligns[0][1]
                    for align in aligns[:3]:  # store top 3 alignments for each edge
                        edge_alignment += " %s-%s," % (format_pos(align[1]), format_pos(align[2]))
                    dict_edges[edge_id].aligns[chrom] = edge_alignment[:-1]
                    if get_match_edge_id(edge_id) in dict_edges:
                        dict_edges[get_match_edge_id(edge_id)].aligns[chrom] = edge_alignment[:-1]

    chrom_len_dict = OrderedDict((chrom, chrom_lengths[chrom])
                                 for i, chrom in enumerate(list(natural_sort(chrom_names))))

    non_alt_chroms = [c for c in chrom_names if 'alt' not in c and 'random' not in c and 'chrUn' not in c]
    chrom_order = OrderedDict((chrom, i) for i, chrom in enumerate(list(natural_sort(non_alt_chroms))))
    color_list = ['#e6194b', '#3cb44b', '#ffe119', '#1792d4', '#f58231', '#911eb4',
                  '#46f0f0', '#f032e6', '#d2f53c', '#fabebe', '#00dbb1', '#dba2ff',
                  '#aa6e28', '#83360e', '#800000', '#003bff', '#808000', '#8d73d4',
                  '#000080', '#806680', '#51205a', '#558859', '#d1a187', '#87a1d1',
                  '#87a1d1', '#afd187']

    edge_chroms = defaultdict(set)
    for edge_id, chroms in chroms_by_edge.items():
        match_edge_id = edge_id.replace('rc', 'e') if edge_id.startswith('rc') else edge_id.replace('e', 'rc')
        for chrom in chroms:
            edge_chroms[edge_id].add(chrom)
            edge_chroms[match_edge_id].add(chrom)
            if match_edge_id in dict_edges:
                edge_by_chrom[chrom].add(match_edge_id)

    is_single_chrom = len(chrom_order.keys()) == 1
    for edge_id, chroms in edge_chroms.items():
        if edge_id not in dict_edges:
            continue
        mapping_info[edge_id] = list(chroms)
        colors = set()
        for chrom in chroms:
            if chrom in chrom_order:
                if is_single_chrom:  # color an edge according to its position in the reference
                    pos = best_aligns[edge_id][chrom] if best_aligns[edge_id] else \
                        best_aligns[get_match_edge_id(edge_id)][chrom]
                    color = get_rainbow_color(pos, chrom_len_dict[chrom])
                else:
                    color = color_list[chrom_order[chrom] % len(color_list)]
            else:
                color = '#808080'
            colors.add(color)
        if len(colors) <= 5:
            dict_edges[edge_id].chrom = ':'.join(list(colors))
        else:
            dict_edges[edge_id].chrom = 'white:red:black:red:black:white'

    with open(join(json_output_dir, "reference.json"), 'a') as handle:
        handle.write("chrom_lengths=" + json.dumps(chrom_len_dict) + ";\n")
        handle.write("edgeMappingInfo=" + json.dumps(mapping_info) + ";\n")
    return mapping_info, non_alt_chroms, edge_by_chrom

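# Illustrative usage sketch for parse_mapping_info (hypothetical paths, not part
# of the pipeline; mapping_fpath is a minimap2-style PAF file such as the one
# produced by map_edges_to_ref in run_quast_analysis):
#
#   mapping_info, chrom_names, edge_by_chrom = parse_mapping_info(
#       "agv_output/edges_to_ref.paf", "agv_output/json", dict_edges)
#   # mapping_info : edge id -> list of chromosomes the edge was assigned to
#   # chrom_names  : non-alt chromosome names used for ordering and coloring
#   # edge_by_chrom: chromosome -> set of edge ids assigned to it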