def main():
    """Command-line entry point for 'ragtag.py scaffold'.

    Parses the command line, aligns the query assembly to the reference with
    the selected aligner, filters and merges the alignments, assigns each
    remaining query sequence to its best reference sequence, and writes an
    AGP file, a scaffold FASTA file and a stats file into the output directory.

    Raises:
        SystemExit: with a message (non-zero exit status) if the reference or
            query positional argument is missing.
        ValueError: if an input FASTA file does not exist, a gap size bound is
            smaller than 1, or '--aligner' is not minimap2/nucmer.
    """
    parser = argparse.ArgumentParser(
        description='Reference-guided scaffolding',
        usage="ragtag.py scaffold <reference.fa> <query.fa>")

    parser.add_argument("reference", metavar="<reference.fa>", nargs='?', default="", type=str,
                        help="reference fasta file (uncompressed or bgzipped)")
    parser.add_argument("query", metavar="<query.fa>", nargs='?', default="", type=str,
                        help="query fasta file (uncompressed or bgzipped)")

    scaf_options = parser.add_argument_group("scaffolding options")
    scaf_options.add_argument("-e", metavar="<exclude.txt>", type=str, default="",
                              help="list of reference headers to ignore [null]")
    scaf_options.add_argument("-j", metavar="<skip.txt>", type=str, default="",
                              help="list of query headers to leave unplaced [null]")
    scaf_options.add_argument("-f", metavar="INT", type=int, default=1000,
                              help="minimum unique alignment length [1000]")
    scaf_options.add_argument("--remove-small", action="store_true", default=False,
                              help="remove unique alignments shorter than -f")
    scaf_options.add_argument("-q", metavar="INT", type=int, default=10,
                              help="minimum mapq (NA for Nucmer alignments) [10]")
    scaf_options.add_argument("-d", metavar="INT", type=int, default=100000,
                              help="alignment merge distance [100000]")
    scaf_options.add_argument("-i", metavar="FLOAT", type=float, default=0.2,
                              help="minimum grouping confidence score [0.2]")
    scaf_options.add_argument("-a", metavar="FLOAT", type=float, default=0.0,
                              help="minimum location confidence score [0.0]")
    scaf_options.add_argument("-s", metavar="FLOAT", type=float, default=0.0,
                              help="minimum orientation confidence score [0.0]")
    scaf_options.add_argument("-C", action='store_true', default=False,
                              help="concatenate unplaced contigs and make 'chr0'")
    scaf_options.add_argument("-r", action='store_true', default=False,
                              help="infer gap sizes. if not, all gaps are 100 bp")
    scaf_options.add_argument("-g", metavar="INT", type=int, default=100,
                              help="minimum inferred gap size [100]")
    scaf_options.add_argument("-m", metavar="INT", type=int, default=100000,
                              help="maximum inferred gap size [100000]")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output",
                            help="output directory [./ragtag_output]")
    io_options.add_argument("-w", action='store_true', default=False,
                            help="overwrite intermediate files")
    io_options.add_argument("-u", action='store_true', default=False,
                            help="add suffix to unplaced sequence headers")
    io_options.add_argument("--debug", action='store_true', default=False,
                            help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    aln_options.add_argument("-t", metavar="INT", type=int, default=1,
                             help="number of minimap2 threads [1]")
    aln_options.add_argument("--aligner", metavar="PATH", type=str, default="minimap2",
                             help="aligner executable ('nucmer' or 'minimap2') [minimap2]")
    mm2_default = "-x asm5"
    aln_options.add_argument("--mm2-params", metavar="STR", type=str, default=mm2_default,
                             help="space delimited minimap2 parameters ['%s']" % mm2_default)
    aln_options.add_argument("--nucmer-params", metavar="STR", type=str, default="-l 100 -c 500",
                             help="space delimted nucmer parameters ['-l 100 -c 500']")

    args = parser.parse_args()
    if not args.reference or not args.query:
        parser.print_help()
        # Passing the message to sys.exit() yields a non-zero exit status, so
        # calling scripts can detect the usage error.
        sys.exit("\n** The reference and query FASTA files are required **")

    log("RagTag " + get_ragtag_version())
    log("CMD: ragtag.py scaffold " + " ".join(sys.argv[1:]))

    reference_file = os.path.abspath(args.reference)
    query_file = os.path.abspath(args.query)

    # Check that the reference/query file exists
    if not os.path.isfile(reference_file):
        raise ValueError("Could not find file: %s" % reference_file)

    if not os.path.isfile(query_file):
        raise ValueError("Could not find file: %s" % query_file)

    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    group_score_thresh = args.i
    loc_score_thresh = args.a
    orient_score_thresh = args.s
    make_chr0 = args.C
    infer_gaps = args.r
    num_threads = args.t

    # I/O options
    output_path = args.o
    # makedirs also creates missing parent directories (e.g. '-o a/b/c').
    os.makedirs(output_path, exist_ok=True)
    output_path = os.path.abspath(output_path) + "/"

    # Setup a log file for external RagTag scripts
    ragtag_log = output_path + "ragtag.scaffold.err"
    open(ragtag_log, "w").close()  # Wipe the log file

    overwrite_files = args.w
    remove_suffix = not args.u
    if remove_suffix:
        log("WARNING: Without '-u' invoked, some component/object AGP pairs might share the same ID. "
            "Some external programs/databases don't like this. "
            "To ensure valid AGP format, use '-u'.")

    # Gap options
    min_gap_size = args.g
    max_gap_size = args.m
    if min_gap_size < 1:
        raise ValueError("the minimum gap size must be positive")

    if max_gap_size < 1:
        raise ValueError("the maximum gap size must be positive")

    # Skip/exclude options: one header per line
    query_blacklist = set()
    skip_file = args.j
    if skip_file:
        skip_file = os.path.abspath(args.j)
        with open(skip_file, "r") as f:
            for line in f:
                query_blacklist.add(line.rstrip())

    ref_blacklist = set()
    exclude_file = args.e
    if exclude_file:
        exclude_file = os.path.abspath(args.e)
        with open(exclude_file, "r") as f:
            for line in f:
                ref_blacklist.add(line.rstrip())

    # Get aligner arguments. Only the executable's base name selects the aligner type.
    aligner_path = args.aligner
    aligner = aligner_path.split("/")[-1]
    if aligner not in {'minimap2', 'nucmer'}:
        raise ValueError("Must specify either 'minimap2' or 'nucmer' (PATHs allowed) with '--aligner'.")
    mm2_params = args.mm2_params
    nucmer_params = args.nucmer_params

    # Mapq filtering params (the mapq threshold is disabled for nucmer)
    min_mapq = args.q
    if aligner == "nucmer":
        min_mapq = 0

    # Add the number of mm2 threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)

    # Debugging options
    debug_mode = args.debug
    debug_non_fltrd_file = output_path + "ragtag.scaffolds.debug.unfiltered.paf"
    debug_fltrd_file = output_path + "ragtag.scaffolds.debug.filtered.paf"
    debug_merged_file = output_path + "ragtag.scaffolds.debug.merged.paf"
    debug_query_info_file = output_path + "ragtag.scaffolds.debug.query.info.txt"

    # Align the query to the reference
    log("Mapping the query genome to the reference genome")
    if aligner == "minimap2":
        al = Minimap2Aligner(reference_file, [query_file], aligner_path, mm2_params,
                             output_path + "query_against_ref", in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_file, [query_file], aligner_path, nucmer_params,
                           output_path + "query_against_ref", in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, need to convert from delta to paf
    if aligner == "nucmer":
        cmd = ["ragtag_delta2paf.py", output_path + "query_against_ref.delta"]
        run_oae(cmd, output_path + "query_against_ref.paf", ragtag_log)

    # Read and organize the alignments
    log('Reading whole genome alignments')
    # ctg_alns = dict :: key=query header, value=ContigAlignment object
    ctg_alns = read_genome_alignments(output_path + "query_against_ref.paf", query_blacklist, ref_blacklist)

    # Filter the alignments
    if debug_mode:
        # create new empty copies of debugging output files
        open(debug_non_fltrd_file, "w").close()
        open(debug_fltrd_file, "w").close()
        open(debug_merged_file, "w").close()
        open(debug_query_info_file, "w").close()

    log("Filtering and merging alignments")
    for header in ctg_alns:
        aln = ctg_alns[header]

        # Write unfiltered alignments
        if debug_mode:
            with open(debug_non_fltrd_file, "a") as f:
                f.write(str(aln))

        # Each filter may return None, meaning no alignments remain for this query.
        aln = aln.unique_anchor_filter(min_ulen, keep_small=keep_small_uniques)
        if aln is not None:
            aln = aln.filter_mapq(min_mapq)
            if aln is not None:
                # Write filtered alignments
                if debug_mode:
                    with open(debug_fltrd_file, "a") as f:
                        f.write(str(aln))
                aln = aln.merge_alns(merge_dist=merge_dist)
        ctg_alns[header] = aln

    # Remove query sequences which have no more qualifying alignments
    fltrd_ctg_alns = dict()
    for header in ctg_alns:
        aln = ctg_alns[header]
        if aln is None:
            continue

        # Write merged alignments and confidence scores
        if debug_mode:
            with open(debug_merged_file, "a") as f:
                f.write(str(aln))

            with open(debug_query_info_file, "a") as f:
                f.write("\t".join([
                    header,
                    aln.best_ref_header,
                    str(aln.grouping_confidence),
                    str(aln.location_confidence),
                    str(aln.orientation_confidence),
                ]) + "\n")

        # All three confidence scores must strictly exceed their thresholds.
        if (aln.grouping_confidence > group_score_thresh
                and aln.location_confidence > loc_score_thresh
                and aln.orientation_confidence > orient_score_thresh):
            fltrd_ctg_alns[header] = aln

    # For each reference sequence which has at least one assigned query sequence, get the list of
    # all query sequences assigned to that reference sequence.
    log("Ordering and orienting query sequences")
    mapped_ref_seqs = defaultdict(list)
    for header in fltrd_ctg_alns:
        best_ref = fltrd_ctg_alns[header].best_ref_header
        ref_start, ref_end = fltrd_ctg_alns[header].get_best_ref_pos()
        mapped_ref_seqs[best_ref].append((ref_start, ref_end, header))

    # Sort the query sequences for each reference sequence and define the padding sizes between adjacent query seqs
    g_inferred = 0
    g_small = 0
    g_large = 0
    pad_sizes = dict()
    gap_types = dict()
    for ref in mapped_ref_seqs:
        # Remove contained contigs and sort the rest
        non_contained = remove_contained(mapped_ref_seqs[ref])
        mapped_ref_seqs[ref] = sorted(non_contained)
        if infer_gaps:
            # Infer the gap sizes between adjacent query seqs
            # Use the primary alignments to infer gap sizes
            pad_sizes[ref] = []
            gap_types[ref] = []
            for j in range(1, len(mapped_ref_seqs[ref])):
                # Get info for the upstream alignment
                left_ctg = mapped_ref_seqs[ref][j - 1][2]
                _, left_ref_end = fltrd_ctg_alns[left_ctg].get_best_ref_pos()
                _, left_qdist_end = fltrd_ctg_alns[left_ctg].get_best_q_dist()

                # Get info for the downstream alignment
                right_ctg = mapped_ref_seqs[ref][j][2]
                right_ref_start, _ = fltrd_ctg_alns[right_ctg].get_best_ref_pos()
                right_qdist_start, _ = fltrd_ctg_alns[right_ctg].get_best_q_dist()

                # Get the inferred gap size
                i_gap_size = (right_ref_start - right_qdist_start) - (left_ref_end + left_qdist_end)

                # Check if the gap size is too small or too large.
                # Out-of-range gaps fall back to a 100 bp gap of type "U".
                if i_gap_size <= min_gap_size:
                    pad_sizes[ref].append(100)
                    gap_types[ref].append("U")
                    g_small += 1
                elif i_gap_size > max_gap_size:
                    pad_sizes[ref].append(100)
                    gap_types[ref].append("U")
                    g_large += 1
                else:
                    pad_sizes[ref].append(i_gap_size)
                    gap_types[ref].append("N")
                    g_inferred += 1
        else:
            # One fixed 100 bp "U" gap between each pair of adjacent query seqs
            n_gaps = len(mapped_ref_seqs[ref]) - 1
            pad_sizes[ref] = [100] * n_gaps
            gap_types[ref] = ["U"] * n_gaps

    if infer_gaps:
        log("%d inferred gap" % g_inferred)
        log("%d adjacent contig within min distance (%d) of each other" % (g_small, min_gap_size))
        log("%d inferred gaps exceed length threshold (%d)" % (g_large, max_gap_size))

    # Write the scaffolds
    log("Writing scaffolds")

    # Write the intermediate output file in AGP v2.1 format
    log("Writing: " + output_path + "ragtag.scaffolds.agp")
    write_orderings(output_path + "ragtag.scaffolds.agp", output_path + "ragtag.confidence.txt", query_file,
                    mapped_ref_seqs, fltrd_ctg_alns, pad_sizes, gap_types, make_chr0, True, not remove_suffix)

    # Build a FASTA from the AGP
    cmd = [
        "ragtag_agp2fasta.py",
        output_path + "ragtag.scaffolds.agp",
        query_file
    ]
    run_oae(cmd, output_path + "ragtag.scaffolds.fasta", ragtag_log)

    # Calculate the stats
    cmd = [
        "ragtag_stats.py",
        output_path + "ragtag.scaffolds.agp",
        output_path + "ragtag.confidence.txt"
    ]
    run_oae(cmd, output_path + "ragtag.scaffolds.stats", ragtag_log)

    log("Goodbye")
def main():
    """Command-line entry point for 'ragtag.py correct'.

    Parses the command line, aligns the query assembly to the reference,
    filters and merges the alignments, collects candidate break points for
    each query sequence, optionally validates them against read alignments
    and/or drops those that intersect GFF intervals, then writes the breaks
    as an AGP file and the broken query sequences as a FASTA file.

    Raises:
        SystemExit: with a message (non-zero exit status) if the reference or
            query positional argument is missing.
        ValueError: if an input FASTA file does not exist or an option value
            is invalid (aligner, read aligner, '--inter' with '--intra',
            missing/unknown '-T', or negative coverage thresholds).
    """
    parser = argparse.ArgumentParser(
        description='Reference-guided misassembly correction',
        usage="ragtag.py correct <reference.fa> <query.fa>")

    cor_options = parser.add_argument_group("correction options")
    cor_options.add_argument("reference", metavar="<reference.fa>", nargs='?', default="", type=str,
                             help="reference fasta file (can be uncompressed or bgzipped)")
    cor_options.add_argument("query", metavar="<query.fa>", nargs='?', default="", type=str,
                             help="query fasta file (can be uncompressed or bgzipped)")
    cor_options.add_argument("-f", metavar="INT", type=int, default=1000,
                             help="minimum unique alignment length [1000]")
    cor_options.add_argument("--remove-small", action="store_true", default=False,
                             help="remove unique alignments shorter than -f")
    cor_options.add_argument("-q", metavar="INT", type=int, default=10,
                             help="minimum mapq (NA for Nucmer alignments) [10]")
    cor_options.add_argument("-d", metavar="INT", type=int, default=100000,
                             help="alignment merge distance [100000]")
    cor_options.add_argument("-b", metavar="INT", type=int, default=5000,
                             help="minimum break distance from contig ends [5000]")
    cor_options.add_argument("-e", metavar="<exclude.txt>", type=str, default="",
                             help="list of reference headers to ignore")
    cor_options.add_argument("-j", metavar="<skip.txt>", type=str, default="",
                             help="list of query headers to leave uncorrected")
    cor_options.add_argument("--inter", action="store_true", default=False,
                             help="only break misassemblies between reference sequences")
    cor_options.add_argument("--intra", action="store_true", default=False,
                             help="only break misassemblies within reference sequences")
    cor_options.add_argument("--gff", metavar="<features.gff>", type=str, default="",
                             help="don't break sequences within gff intervals")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output",
                            help="output directory [./ragtag_output]")
    io_options.add_argument("-w", action='store_true', default=False,
                            help="overwrite intermediate files")
    io_options.add_argument("-u", action='store_true', default=False,
                            help="add suffix to unaltered sequence headers")
    io_options.add_argument("--debug", action='store_true', default=False,
                            help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    mm2_default = "-x asm5"
    aln_options.add_argument("-t", metavar="INT", type=int, default=1,
                             help="number of minimap2 threads [1]")
    aln_options.add_argument("--aligner", metavar="PATH", type=str, default="minimap2",
                             help="whole genome aligner executable ('nucmer' or 'minimap2') [minimap2]")
    aln_options.add_argument("--mm2-params", metavar="STR", type=str, default=mm2_default,
                             help="space delimited minimap2 whole genome alignment parameters ['%s']" % mm2_default)
    aln_options.add_argument("--nucmer-params", metavar="STR", type=str, default="-l 100 -c 500",
                             help="space delimted nucmer whole genome alignment parameters ['-l 100 -c 500']")

    val_options = parser.add_argument_group("validation options")
    val_options.add_argument("--read-aligner", metavar="PATH", type=str, default="minimap2",
                             help="read aligner executable (only 'minimap2' is allowed) [minimap2]")
    val_options.add_argument("-R", metavar="<reads.fasta>", type=str, default="",
                             help="validation reads. gzipped fastq or fasta allowed.")
    val_options.add_argument("-F", metavar="<reads.fofn>", type=str, default="",
                             help="same as '-R', but a list of files.")
    val_options.add_argument("-T", metavar="sr", type=str, default="",
                             help="read type. 'sr' and 'corr' accepted for short reads and error corrected long-reads, respectively.")
    val_options.add_argument("-v", metavar="INT", type=int, default=10000,
                             help="coverage validation window size [10000]")
    val_options.add_argument("--max-cov", metavar="INT", type=int, default=-1,
                             help="break sequences at regions at or above this coverage level [AUTO]")
    val_options.add_argument("--min-cov", metavar="INT", type=int, default=-1,
                             help="break sequences at regions at or below this coverage level [AUTO]")
    # Merge breakpoints within this distance after validation
    val_options.add_argument("-m", metavar="INT", type=int, default=1000,
                             help=argparse.SUPPRESS)

    args = parser.parse_args()
    if not args.reference or not args.query:
        parser.print_help()
        # Passing the message to sys.exit() yields a non-zero exit status, so
        # calling scripts can detect the usage error.
        sys.exit("\n** The reference and query FASTA files are required **")

    log("RagTag " + get_ragtag_version())
    log("CMD: " + " ".join(sys.argv))

    reference_file = os.path.abspath(args.reference)
    query_file = os.path.abspath(args.query)

    # Check that the reference/query file exists
    if not os.path.isfile(reference_file):
        raise ValueError("Could not find file: %s" % reference_file)

    if not os.path.isfile(query_file):
        raise ValueError("Could not find file: %s" % query_file)

    num_threads = args.t
    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    min_break_dist = args.m
    min_break_end_dist = args.b
    val_window_size = args.v

    # I/O options
    output_path = args.o
    # makedirs also creates missing parent directories (e.g. '-o a/b/c').
    os.makedirs(output_path, exist_ok=True)
    output_path = os.path.abspath(output_path) + "/"

    overwrite_files = args.w
    remove_suffix = not args.u
    if remove_suffix:
        log("WARNING: Without '-u' invoked, some component/object AGP pairs might share the same ID. "
            "Some external programs/databases don't like this. "
            "To ensure valid AGP format, use '-u'.")

    gff_file = args.gff
    if gff_file:
        gff_file = os.path.abspath(gff_file)

    # Skip/exclude options: one header per line
    query_blacklist = set()
    skip_file = args.j
    if skip_file:
        skip_file = os.path.abspath(args.j)
        with open(skip_file, "r") as f:
            for line in f:
                query_blacklist.add(line.rstrip())

    ref_blacklist = set()
    exclude_file = args.e
    if exclude_file:
        exclude_file = os.path.abspath(args.e)
        with open(exclude_file, "r") as f:
            for line in f:
                ref_blacklist.add(line.rstrip())

    # Get aligner arguments. Only the executable's base name selects the aligner type.
    genome_aligner_path = args.aligner
    genome_aligner = genome_aligner_path.split("/")[-1]
    if genome_aligner not in {'minimap2', 'nucmer'}:
        raise ValueError("Must specify either 'minimap2' or 'nucmer' (PATHs allowed) with '--aligner'.")
    mm2_params = args.mm2_params
    nucmer_params = args.nucmer_params

    # Mapq filtering params (the mapq threshold is disabled for nucmer)
    min_mapq = args.q
    if genome_aligner == "nucmer":
        min_mapq = 0

    # Add the number of mm2 threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)

    # Check if intra/inter breaking is desired
    break_intra = True
    break_inter = True
    only_intra = args.intra
    only_inter = args.inter
    if only_intra and only_inter:
        raise ValueError("Must speficity either '--inter' or '--intra', not both.")

    if only_intra:
        break_inter = False
    if only_inter:
        break_intra = False

    # read-alignment parameters
    val_reads = args.R
    val_reads_fofn = args.F
    val_reads_tech = args.T
    read_aligner_path = args.read_aligner
    read_aligner = read_aligner_path.split("/")[-1]
    if read_aligner != "minimap2":
        raise ValueError("Only minimap2 can be used for read alignments. got: %s" % read_aligner)

    # If the genome aligner is minimap2, we can just use that path for read alignment
    if genome_aligner == 'minimap2':
        read_aligner_path = genome_aligner_path

    # Make sure that if -R or -F, -T has been specified.
    if val_reads or val_reads_fofn:
        if not val_reads_tech:
            raise ValueError("'-T' must be provided when using -R or -F.")

    # Make a list of read sequences.
    read_files = []
    if val_reads_fofn:
        with open(val_reads_fofn, "r") as f:
            for line in f:
                read_fn = line.rstrip()
                # Skip blank lines: os.path.abspath("") would resolve to the
                # current directory and be treated as a read file.
                if read_fn:
                    read_files.append(os.path.abspath(read_fn))
    elif val_reads:
        read_files.append(os.path.abspath(val_reads))

    # Coverage thresholds. -1 is the "AUTO" default; any other negative value is rejected.
    max_cov = args.max_cov
    min_cov = args.min_cov

    if max_cov < -1:
        raise ValueError("--max-cov must be >=0")

    if min_cov < -1:
        raise ValueError("--min-cov must be >=0")

    # Debugging options
    debug_mode = args.debug
    debug_non_fltrd_file = output_path + "ragtag.correction.debug.unfiltered.paf"
    debug_fltrd_file = output_path + "ragtag.correction.debug.filtered.paf"
    debug_merged_file = output_path + "ragtag.correction.debug.merged.paf"
    debug_query_info_file = output_path + "ragtag.correction.debug.query.info.txt"

    # Align the query to the reference.
    log("Mapping the query genome to the reference genome")
    if genome_aligner == "minimap2":
        al = Minimap2Aligner(reference_file, [query_file], genome_aligner_path, mm2_params,
                             output_path + "c_query_against_ref", in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_file, [query_file], genome_aligner_path, nucmer_params,
                           output_path + "c_query_against_ref", in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, convert from delta to paf.
    if genome_aligner == "nucmer":
        cmd = ["ragtag_delta2paf.py", output_path + "c_query_against_ref.delta"]
        run_o(cmd, output_path + "c_query_against_ref.paf")

    # Read and organize the alignments.
    log('Reading whole genome alignments')
    # ctg_alns = dict :: key=query header, value=ContigAlignment object
    ctg_alns = read_genome_alignments(output_path + "c_query_against_ref.paf", query_blacklist, ref_blacklist)

    # Filter and merge the alignments.
    if debug_mode:
        # create new empty copies of debugging output files
        open(debug_non_fltrd_file, "w").close()
        open(debug_fltrd_file, "w").close()
        open(debug_merged_file, "w").close()
        open(debug_query_info_file, "w").close()

    log("Filtering and merging alignments")
    for header in ctg_alns:
        aln = ctg_alns[header]

        # Write unfiltered alignments
        if debug_mode:
            with open(debug_non_fltrd_file, "a") as f:
                f.write(str(aln))

        # Each filter may return None, meaning no alignments remain for this query.
        aln = aln.unique_anchor_filter(min_ulen, keep_small=keep_small_uniques)
        if aln is not None:
            aln = aln.filter_mapq(min_mapq)
            if aln is not None:
                # Write filtered alignments
                if debug_mode:
                    with open(debug_fltrd_file, "a") as f:
                        f.write(str(aln))
                aln = aln.merge_alns(merge_dist=merge_dist)
        ctg_alns[header] = aln

    # Get the putative breakpoints for each query sequence, if any.
    ctg_breaks = dict()
    for header in ctg_alns:
        aln = ctg_alns[header]
        if aln is None:
            continue

        # Write merged alignments and confidence scores
        if debug_mode:
            with open(debug_merged_file, "a") as f:
                f.write(str(aln))

            with open(debug_query_info_file, "a") as f:
                f.write("\t".join([
                    header,
                    aln.best_ref_header,
                    str(aln.grouping_confidence),
                    str(aln.location_confidence),
                    str(aln.orientation_confidence),
                ]) + "\n")

        breaks = []
        intra_breaks, inter_breaks = aln.get_break_candidates(min_dist=min_break_end_dist)
        if break_intra:
            breaks = breaks + intra_breaks
        if break_inter:
            breaks = breaks + inter_breaks
        if breaks:
            ctg_breaks[header] = breaks

    # If desired, validate the putative breakpoints by observing read coverage.
    if read_files:
        log("Validating putative query breakpoints via read alignment.")
        log("Aligning reads to query sequences.")
        if not os.path.isfile(output_path + "c_reads_against_query.s.bam"):
            if val_reads_tech == "sr":
                al = Minimap2SAMAligner(query_file, read_files, read_aligner_path,
                                        "-ax sr -t " + str(num_threads),
                                        output_path + "c_reads_against_query", in_overwrite=overwrite_files)
            elif val_reads_tech == "corr":
                al = Minimap2SAMAligner(query_file, read_files, read_aligner_path,
                                        "-ax asm5 -t " + str(num_threads),
                                        output_path + "c_reads_against_query", in_overwrite=overwrite_files)
            else:
                raise ValueError("'-T' must be either 'sr' or 'corr'.")
            al.run_aligner()
        else:
            log("Retaining pre-existing read alignments: " + output_path + "c_reads_against_query.s.bam")

        # Compress, sort and index the alignments.
        log("Compressing, sorting, and indexing read alignments")
        run_samtools(output_path, num_threads, overwrite_files)

        # Validate the breakpoints
        log("Validating putative query breakpoints")

        # Give at least 10k/1k from ctg ends for coverage to accumulate for corr and sr, respectively.
        val_min_break_end_dist = min_break_end_dist
        if val_reads_tech == "corr":
            val_min_break_end_dist = max(10000, min_break_end_dist)
        if val_reads_tech == "sr":
            val_min_break_end_dist = max(1000, min_break_end_dist)

        # Validate the breakpoints
        ctg_breaks = validate_breaks(ctg_breaks, output_path, num_threads, overwrite_files,
                                     val_min_break_end_dist, max_cov, min_cov,
                                     window_size=val_window_size, clean_dist=min_break_dist, debug=debug_mode)

    # Check if we need to avoid gff intervals
    if gff_file:
        log("Avoiding breaks within GFF intervals")
        it = make_gff_interval_tree(gff_file)
        non_gff_breaks = dict()
        for ctg in ctg_breaks:
            new_breaks = []
            for pos in ctg_breaks[ctg]:
                if it[ctg][pos]:
                    log("Avoiding breaking %s at %d. This point intersects a feature in the gff file." % (ctg, pos))
                else:
                    new_breaks.append(pos)
            if new_breaks:
                non_gff_breaks[ctg] = new_breaks
        ctg_breaks = non_gff_breaks

    # Write the summary of query sequence breaks in AGP format
    agp_file = output_path + "ragtag.correction.agp"
    write_breaks(agp_file, query_file, ctg_breaks, overwrite_files, remove_suffix)

    # Write the scaffolds.
    log("Writing broken contigs")
    qf_name = query_file.split("/")[-1]
    # Strip the last extension. rfind() returns -1 when there is no '.', which
    # as a slice bound would silently drop the final character of the name.
    dot_idx = qf_name.rfind(".")
    qf_pref = qf_name[:dot_idx] if dot_idx != -1 else qf_name
    cmd = ["ragtag_break_query.py", agp_file, query_file]
    run_o(cmd, output_path + qf_pref + ".corrected.fasta")

    log("Goodbye")
def main(): description = "Homology-based assembly patching: Make continuous joins and fill gaps " \ "in 'target.fa' using sequences from 'query.fa'" parser = argparse.ArgumentParser(description=description, usage="ragtag.py patch <target.fa> <query.fa>") parser.add_argument("reference", metavar="<target.fa>", nargs='?', default="", type=str, help="target fasta file (uncompressed or bgzipped)") parser.add_argument("query", metavar="<query.fa>", nargs='?', default="", type=str, help="query fasta file (uncompressed or bgzipped)") patch_options = parser.add_argument_group("patching") patch_options.add_argument("-e", metavar="<exclude.txt>", type=str, default="", help="list of target sequences to ignore [null]") patch_options.add_argument("-j", metavar="<skip.txt>", type=str, default="", help="list of query sequences to ignore [null]") patch_options.add_argument("-f", metavar="INT", type=int, default=1000, help="minimum unique alignment length [1000]") patch_options.add_argument("--remove-small", action="store_true", default=False, help="remove unique alignments shorter than '-f'") patch_options.add_argument("-q", metavar="INT", type=int, default=10, help="minimum mapq (NA for Nucmer alignments) [10]") patch_options.add_argument("-d", metavar="INT", type=int, default=100000, help="maximum alignment merge distance [100000]") patch_options.add_argument("-s", metavar="INT", type=int, default=50000, help="minimum merged alignment length [50000]") patch_options.add_argument("-i", metavar="FLOAT", type=float, default=0.05, help="maximum merged alignment distance from sequence terminus. fraction of the sequence length if < 1 [0.05]") patch_options.add_argument("--fill-only", action="store_true", default=False, help="only fill existing target gaps. do not join target sequences") patch_options.add_argument("--join-only", action="store_true", default=False, help="only join and patch target sequences. 
do not fill existing gaps") io_options = parser.add_argument_group("input/output options") io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output", help="output directory [./ragtag_output]") io_options.add_argument("-w", action='store_true', default=False, help="overwrite intermediate files") io_options.add_argument("-u", action='store_true', default=False, help="add suffix to unplaced sequence headers") io_options.add_argument("--debug", action='store_true', default=False, help=argparse.SUPPRESS) aln_options = parser.add_argument_group("mapping options") aln_options.add_argument("-t", metavar="INT", type=int, default=1, help="number of minimap2/unimap threads [1]") aln_options.add_argument("--aligner", metavar="PATH", type=str, default="nucmer", help="aligner executable ('nucmer' (recommended), 'unimap' or 'minimap2') [nucmer]") mm2_default = "-x asm5" aln_options.add_argument("--mm2-params", metavar="STR", type=str, default=mm2_default, help="space delimited minimap2 parameters (overrides '-t') ['%s']" % mm2_default) aln_options.add_argument("--unimap-params", metavar="STR", type=str, default=mm2_default, help="space delimited unimap parameters (overrides '-t') ['%s']" % mm2_default) aln_options.add_argument("--nucmer-params", metavar="STR", type=str, default="--maxmatch -l 100 -c 500", help="space delimted nucmer parameters ['--maxmatch -l 100 -c 500']") args = parser.parse_args() if not args.reference or not args.query: parser.print_help() sys.exit("\n** The target and query FASTA files are required **") log("VERSION", "RagTag " + get_ragtag_version()) log("WARNING", "This is a beta version of `ragtag patch`") log("CMD", "ragtag.py patch " + " ".join(sys.argv[1:])) reference_fn = os.path.abspath(args.reference) query_fn = os.path.abspath(args.query) # Check that the reference/query file exists if not os.path.isfile(reference_fn): raise FileNotFoundError("Could not find file: %s" % reference_fn) if not os.path.isfile(query_fn): raise 
FileNotFoundError("Could not find file: %s" % query_fn) # Alignment processing parameters min_ulen = args.f keep_small_uniques = not args.remove_small merge_dist = args.d num_threads = args.t aligner_path = args.aligner aligner = aligner_path.split("/")[-1] if aligner.split("/")[-1] not in {'minimap2', 'unimap', 'nucmer'}: raise ValueError("Must specify either 'minimap2', 'unimap', or 'nucmer' (PATHs allowed) with '--aligner'.") mm2_params = args.mm2_params unimap_params = args.unimap_params nucmer_params = args.nucmer_params # Mapq filtering parameters min_mapq = args.q if aligner == "nucmer": min_mapq = 0 # Add the number of mm2/unimap threads if the mm2 params haven't been overridden. if mm2_params == mm2_default: mm2_params += " -t " + str(num_threads) if unimap_params == mm2_default: unimap_params += " -t " + str(num_threads) # Set reference/query sequences to ignore ref_blacklist = set() exclude_file = args.e if exclude_file: exclude_file = os.path.abspath(args.e) with open(exclude_file, "r") as f: for line in f: ref_blacklist.add(line.rstrip()) query_blacklist = set() skip_file = args.j if skip_file: skip_file = os.path.abspath(skip_file) with open(skip_file, "r") as f: for line in f: query_blacklist.add(line.rstrip()) # Supporting alignment parameters min_sup_aln_len = args.s max_term_dist = args.i if max_term_dist <= 0: raise ValueError("-i must be a positive nonzero number.") # Task options fill_only = args.fill_only join_only = args.join_only if fill_only and join_only: raise ValueError("'--fill-only' and '--join-only' cannot be used together") # I/O parameters add_suffix = args.u if not add_suffix: log("WARNING", "Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. 
To ensure valid AGP format, use '-u'.") overwrite_files = args.w output_path = args.o if not os.path.isdir(output_path): os.mkdir(output_path) output_path = os.path.abspath(output_path) + "/" file_prefix = "ragtag.patch" # Setup a log file for external RagTag scripts ragtag_log = output_path + file_prefix + ".err" open(ragtag_log, "w").close() # Wipe the log file # Debugging options debug_mode = args.debug # Break the reference assembly at gaps cmd = [ "ragtag_splitasm.py", "-o", output_path + file_prefix + ".ctg.agp", reference_fn ] reference_ctg_fn = output_path + file_prefix + ".ctg.fasta" if os.path.isfile(reference_ctg_fn): if overwrite_files: log("INFO", "Overwriting pre-existing file: " + reference_ctg_fn) run_oae(cmd, reference_ctg_fn, ragtag_log) else: log("INFO", "Retaining pre-existing file: " + reference_ctg_fn) else: run_oae(cmd, reference_ctg_fn, ragtag_log) # Rename the query sequences cmd = [ "ragtag_rename.py", query_fn, "-p", "qseq", "-o", output_path + file_prefix + ".rename.agp", ] query_rename_fn = output_path + file_prefix + ".rename.fasta" if os.path.isfile(query_rename_fn): if overwrite_files: log("INFO", "Overwriting pre-existing file: " + query_rename_fn) run_oae(cmd, query_rename_fn, ragtag_log) else: log("INFO", "Retaining pre-existing file: " + query_rename_fn) else: run_oae(cmd, query_rename_fn, ragtag_log) # Combine the reference contigs and query sequences to make a components fasta file components_fn = output_path + file_prefix + ".comps.fasta" if os.path.isfile(components_fn): if overwrite_files: log("INFO", "Overwriting pre-existing file: " + components_fn) write_comps = True else: log("INFO", "Retaining pre-existing file: " + components_fn) write_comps = False else: write_comps = True if write_comps: log("INFO", "Writing: " + components_fn) ref_fai = pysam.FastaFile(reference_ctg_fn) query_fai = pysam.FastaFile(query_rename_fn) with open(components_fn, "w") as f: for ref in ref_fai.references: f.write(">" + ref + "\n") 
f.write(ref_fai.fetch(ref) + "\n") for query in query_fai.references: f.write(">" + query + "\n") f.write(query_fai.fetch(query) + "\n") # Map the query assembly to the reference contigs log("INFO", "Mapping the query genome to the target genome") if aligner == "minimap2": al = Minimap2Aligner(reference_ctg_fn, [query_rename_fn], aligner_path, mm2_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files) elif aligner == "unimap": al = UnimapAligner(reference_ctg_fn, [query_rename_fn], aligner_path, unimap_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files) else: al = NucmerAligner(reference_ctg_fn, [query_rename_fn], aligner_path, nucmer_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files) al.run_aligner() # If alignments are from Nucmer, need to convert from delta to paf if aligner == "nucmer": cmd = ["ragtag_delta2paf.py", output_path + file_prefix + ".asm.delta"] run_oae(cmd, output_path + file_prefix + ".asm.paf", ragtag_log) # Read and organize the alignments log("INFO", "Reading whole genome alignments") # ctg_alns: query header -> ContigAlignment object ctg_alns = read_genome_alignments(output_path + file_prefix + ".asm.paf", query_blacklist, ref_blacklist) # Check if any alignments are left if not ctg_alns: raise RuntimeError("There are no alignments. 
Check '{}'.".format(output_path + file_prefix + ".asm.paf")) # Filter the alignments unfiltered_strings, filtered_strings, merged_strings, useful_strings = [], [], [], [] log("INFO", "Filtering and merging alignments") fltrd_ctg_alns = dict() for i in ctg_alns: # Unique anchor filtering unfiltered_strings.append(str(ctg_alns[i])) ctg_alns[i] = ctg_alns[i].unique_anchor_filter(min_ulen, keep_small=keep_small_uniques) # mapq filtering if ctg_alns[i] is not None: ctg_alns[i] = ctg_alns[i].filter_mapq(min_mapq) if ctg_alns[i] is not None: filtered_strings.append(str(ctg_alns[i])) # alignment merging ctg_alns[i] = ctg_alns[i].merge_alns(merge_dist=merge_dist, careful_merge=True) if ctg_alns[i] is not None: merged_strings.append(str(ctg_alns[i])) # Length filtering ctg_alns[i] = ctg_alns[i].filter_lengths(min_sup_aln_len) if ctg_alns[i] is not None: # terminal filtering ctg_alns[i] = ctg_alns[i].keep_terminals(max_term_dist) # Save the remaining useful alignments if ctg_alns[i] is not None and ctg_alns[i].num_refs > 1 and not ctg_alns[i].has_internal_ref_cuttings(max_term_dist): useful_strings.append(str(ctg_alns[i])) fltrd_ctg_alns[i] = ctg_alns[i] # Write debugging files debug_non_fltrd_file = output_path + file_prefix + ".debug.unfiltered.paf" debug_fltrd_file = output_path + file_prefix + ".debug.filtered.paf" debug_merged_file = output_path + file_prefix + ".debug.merged.paf" debug_useful_file = output_path + file_prefix + ".debug.useful.paf" if debug_mode: with open(debug_non_fltrd_file, "w") as f: f.write("".join(unfiltered_strings)) with open(debug_fltrd_file, "w") as f: f.write("".join(filtered_strings)) with open(debug_merged_file, "w") as f: f.write("".join(merged_strings)) with open(debug_useful_file, "w") as f: f.write("".join(useful_strings)) # Make a Scaffold Graph encoding known reference contigs adjacencies log("INFO", "Building a scaffold graph from the contig AGP file") agp_multi_sg = AGPMultiScaffoldGraph(reference_ctg_fn) 
agp_multi_sg.add_agps([output_path + file_prefix + ".ctg.agp"]) agp_sg = agp_multi_sg.merge() # As a hack, go through the AGP sg and make the required directed scaffold graph agp_psg = PatchScaffoldGraph(components_fn) for u, v in agp_sg.edges: aln = Alignment( u, v, "", agp_sg[u][v]["gap_size"][0], 0, agp_sg[u][v]["gap_size"][0], 0, is_gap=True ) agp_psg.add_edge(u, v, aln) # Make a second directed scaffold graph from the alignments log("INFO", "Building a scaffold graph from the target/query mappings") aln_psg = build_aln_scaffold_graph(fltrd_ctg_alns, components_fn, max_term_dist) # Add edges for unfilled gaps for u, v in agp_psg.edges: if not aln_psg.has_edge(u, v): aln_psg.add_edge(u, v, agp_psg[u][v]["alignment"]) # Remove known false edges for u, v in agp_psg.edges: for neighbor in list(aln_psg.neighbors(u)): if neighbor != v: aln_psg.remove_edge(u, neighbor) aln_psg.remove_edge(neighbor, u) for neighbor in list(aln_psg.neighbors(v)): if neighbor != u: aln_psg.remove_edge(neighbor, v) aln_psg.remove_edge(v, neighbor) # Adjust the graph depending on if only fills or joins are requested if fill_only: psg = PatchScaffoldGraph(components_fn) for u, v in agp_psg.edges: psg.add_edge(u, v, aln_psg[u][v]["alignment"]) psg.add_edge(v, u, aln_psg[v][u]["alignment"]) aln_psg = psg if join_only: for u, v in agp_psg.edges: aln_psg[u][v]["alignment"] = agp_psg[u][v]["alignment"] aln_psg[v][u]["alignment"] = agp_psg[v][u]["alignment"] if debug_mode: aln_psg.write_gml(output_path + file_prefix + ".debug.sg.gml") # Compute a matching solution for the graph log("INFO", "Computing a matching solution to the scaffold graph") match_psg = aln_psg.max_weight_matching() if debug_mode: match_psg.write_gml(output_path + file_prefix + ".debug.matching.gml") # Write the output in AGP format log("INFO", "Writing output files") match_psg.write_agp(output_path + file_prefix + ".agp", output_path + file_prefix + ".ctg.fasta", add_suffix_to_unplaced=add_suffix) # Write the output in fasta 
format cmd = [ "ragtag_agp2fa.py", output_path + file_prefix + ".agp", components_fn ] run_oae(cmd, output_path + file_prefix + ".fasta", ragtag_log) log("INFO", "Goodbye")