def get_edges_from_gfa(gfa_fpath, output_dirpath, min_edge_len):
    """Write the edge sequences of a GFA graph to a FASTA file.

    Sequences are read from the ``S`` records of `gfa_fpath`. If the
    resulting file is empty, the FASTA file located next to the GFA (same
    name with a ``.fasta`` extension) is copied instead, with every header
    rewritten to the edge identifier produced by `get_edge_agv_id`.

    Args:
        gfa_fpath: Path to the GFA file. A falsy value makes the function
            return None immediately.
        output_dirpath: Directory that receives the output FASTA file.
        min_edge_len: Minimum sequence length to keep when extracting from
            the GFA. It is not applied to the FASTA fallback.

    Returns:
        Path of the output FASTA file, or None if `gfa_fpath` is falsy.
    """
    if not gfa_fpath:
        return None

    input_edges_fpath = join(dirname(gfa_fpath), get_filename(gfa_fpath) + ".fasta")
    edges_fpath = join(output_dirpath, basename(input_edges_fpath))

    if not is_empty_file(gfa_fpath) and not can_reuse(edges_fpath, files_to_check=[gfa_fpath]):
        print("Extracting edge sequences from " + gfa_fpath + "...")
        with open(edges_fpath, "w") as out, open(gfa_fpath) as f:
            for line in f:
                if not line.startswith('S'):
                    continue
                fs = line.strip().split()
                if len(fs) < 3:
                    # Truncated segment record: there is no sequence column
                    # to read, so skip it instead of raising IndexError.
                    continue
                seq_name = fs[1]
                # The sequence is looked up in the third column first and,
                # failing that, in the fourth one.
                seq = None
                if is_acgt_seq(fs[2]):
                    seq = fs[2]
                elif len(fs) >= 4 and is_acgt_seq(fs[3]):
                    seq = fs[3]
                if seq and len(seq) >= min_edge_len:
                    out.write(">%s\n" % get_edge_agv_id(get_edge_num(seq_name)))
                    out.write(seq)
                    out.write("\n")

    # Fallback: nothing was extracted from the GFA, but a FASTA file with
    # the same base name exists next to it -- copy it with renamed headers.
    if is_empty_file(edges_fpath) and not is_empty_file(input_edges_fpath):
        with open(edges_fpath, "w") as out, open(input_edges_fpath) as f:
            for line in f:
                if line.startswith('>'):
                    # Keep only the first whitespace-delimited token of the
                    # header, without the leading '>'.
                    seq_name = line.strip().split()[0][1:]
                    out.write(">%s\n" % get_edge_agv_id(get_edge_num(seq_name)))
                else:
                    out.write(line)
    return edges_fpath
def map_edges_to_ref(input_fpath, output_dirpath, reference_fpath, threads):
    """Align edge sequences to a reference with minimap2.

    The alignment is written to ``mapping.paf`` inside `output_dirpath` and
    minimap2's stderr goes to ``minimap.log`` in the same directory. Nothing
    is run when no reference is given, when `can_reuse` accepts the existing
    mapping file, or when `input_fpath` is empty (a warning is printed in
    the last case).

    Args:
        input_fpath: FASTA file with the edge sequences.
        output_dirpath: Directory for ``mapping.paf`` and ``minimap.log``.
        reference_fpath: Reference FASTA; a falsy value disables alignment.
        threads: Number of threads passed to minimap2 via ``-t``.

    Returns:
        Path to ``mapping.paf``. It is returned in every case, including
        when the alignment was skipped or failed.
    """
    mapping_fpath = join(output_dirpath, "mapping.paf")
    if not reference_fpath:
        return mapping_fpath
    if can_reuse(mapping_fpath, files_to_check=[input_fpath, reference_fpath]):
        return mapping_fpath
    if is_empty_file(input_fpath):
        print(
            "Warning! File with edge sequences was not found, failed aligning edges to the reference"
        )
        return mapping_fpath

    print("Aligning graph edges to the reference...")
    cmdline = [
        "minimap2", "-x", "asm20", "--score-N", "0", "-E", "1,0",
        "-N", "200", "-p", "0.5", "-f", "200", "-t", str(threads),
        reference_fpath, input_fpath,
    ]
    log_fpath = join(output_dirpath, "minimap.log")
    # Context managers guarantee both output files are closed once minimap2
    # exits, instead of leaking the handles until garbage collection.
    with open(mapping_fpath, "w") as paf_out, open(log_fpath, "w") as log_out:
        return_code = subprocess.call(cmdline, stdout=paf_out, stderr=log_out)
    if return_code != 0 or is_empty_file(mapping_fpath):
        print("Warning! Minimap2 failed aligning edges to the reference")
    return mapping_fpath
def run(input_fpath, reference_fpath, out_fpath, output_dirpath, threads, is_meta):
    """Run QUAST on `input_fpath` against a reference, unless `can_reuse`
    accepts the existing `out_fpath`.

    Args:
        input_fpath: Sequences passed to QUAST as its positional input.
        reference_fpath: Reference passed via ``-r``.
        out_fpath: File expected to exist after the QUAST run.
        output_dirpath: QUAST output directory (``-o``); created if missing.
        threads: Thread count passed via ``-t``.
        is_meta: When truthy, adds ``--large`` and ``--min-identity 90``.

    Returns:
        `out_fpath` on success. None if the QUAST executable is not found,
        or if `out_fpath` is empty or rejected by `can_reuse` afterwards.
    """
    if not exists(output_dirpath):
        os.makedirs(output_dirpath)

    if not can_reuse(out_fpath, files_to_check=[input_fpath, reference_fpath]):
        quast_exec_path = get_path_to_program("quast.py")
        if not quast_exec_path:
            print("QUAST is not found!")
            return None

        cmdline = [quast_exec_path, "--fast", "--agb", input_fpath, "-r", reference_fpath,
                   "-t", str(threads), "-o", output_dirpath, "--min-contig", "0"]
        # ``--large`` is enabled for inputs above 10 MiB or in meta mode.
        if getsize(input_fpath) > 10 * 1024 * 1024 or is_meta:
            cmdline.append("--large")
        if is_meta:
            cmdline.extend(["--min-identity", "90"])

        # QUAST's output is discarded. A single context-managed handle is
        # used for both streams so no file descriptor is leaked.
        with open(os.devnull, "w") as devnull:
            subprocess.call(cmdline, stdout=devnull, stderr=devnull)

    if is_empty_file(out_fpath) or not can_reuse(
            out_fpath, files_to_check=[input_fpath, reference_fpath]):
        return None
    return out_fpath
def format_edges_file(input_fpath, output_dirpath):
    """Copy a FASTA file to ``edges.fasta`` with rewritten headers.

    Every header line has the text after ``>`` passed through
    `get_edge_num` and `get_edge_agv_id`; all other lines are copied
    verbatim. The copy is skipped when `can_reuse` accepts the existing
    output file.

    Args:
        input_fpath: Source FASTA file.
        output_dirpath: Directory that receives ``edges.fasta``.

    Returns:
        Path to ``edges.fasta``, or None when `input_fpath` is empty.
    """
    if is_empty_file(input_fpath):
        return None

    edges_fpath = join(output_dirpath, "edges.fasta")
    if can_reuse(edges_fpath, files_to_check=[input_fpath]):
        return edges_fpath

    with open(input_fpath) as src, open(edges_fpath, "w") as dst:
        for record_line in src:
            if not record_line.startswith('>'):
                dst.write(record_line)
                continue
            # NOTE(review): the header text handed to get_edge_num still
            # contains the trailing newline -- confirm the helper copes.
            header = record_line[1:]
            dst.write(">%s\n" % get_edge_agv_id(get_edge_num(header)))
    return edges_fpath
def parse_canu_output(input_dirpath, output_dirpath, min_edge_len):
    """Parse a Canu output directory into edge and contig structures.

    Locates the ``.unitigs.gfa`` file, extracts the edge sequences from it,
    writes a copy of the GFA into `output_dirpath` with ``bogart.edges``
    replaced by ``1.0`` on its first line, and then parses that copy
    together with the Canu assembly info.

    Args:
        input_dirpath: Canu output directory.
        output_dirpath: Directory for the generated FASTA and GFA files.
        min_edge_len: Minimum edge length, forwarded to
            `get_edges_from_gfa` and `parse_gfa`.

    Returns:
        Tuple ``(dict_edges, contig_edges, edges_fpath)``.

    Exits the process with status 1 if no GFA file is found.
    """
    raw_gfa_fpath = find_file_by_pattern(input_dirpath, ".unitigs.gfa")
    if not raw_gfa_fpath:
        print("ERROR! GFA file is not found in %s! Please check the options" % abspath(input_dirpath))
        sys.exit(1)

    edges_fpath = get_edges_from_gfa(raw_gfa_fpath, output_dirpath, min_edge_len)

    gfa_fpath = join(output_dirpath, basename(raw_gfa_fpath))
    if is_empty_file(gfa_fpath) or not can_reuse(gfa_fpath, files_to_check=[raw_gfa_fpath]):
        # Pass the arguments as a list: concatenating the path into a
        # command string and re-splitting it would break on paths that
        # contain spaces or quote characters.
        sed_cmd = ["sed", "1s/bogart.edges/1.0/", raw_gfa_fpath]
        # The context manager closes the output file once sed has finished.
        with open(gfa_fpath, 'w') as gfa_out:
            subprocess.call(sed_cmd, stdout=gfa_out)

    dict_edges = parse_gfa(gfa_fpath, min_edge_len, input_dirpath, assembler="canu")
    contig_edges = parse_canu_assembly_info(input_dirpath, dict_edges)
    return dict_edges, contig_edges, edges_fpath