def run_aligner(self):
    """ Run the aligner. """
    if all([self.params_are_valid(), self.exec_is_valid()]):
        if not self.output_exists():
            run_oe(self.compile_command(), self.out_file, self.out_log)
        else:
            if self.overwrite:
                log("Overwriting pre-existing file: " + self.out_file)
                run_oe(self.compile_command(), self.out_file, self.out_log)
            else:
                log("Retaining pre-existing file: " + self.out_file)

def run_samtools(output_path, num_threads, overwrite_files):
    """ Compress, sort and index alignments with pysam. """
    sam_file = output_path + "c_reads_against_query.sam"
    bam_file = output_path + "c_reads_against_query.bam"
    sorted_bam_file = output_path + "c_reads_against_query.s.bam"

    # Compress and sort the alignments, unless a sorted BAM exists and should be kept
    if os.path.isfile(sorted_bam_file) and not overwrite_files:
        log("Retaining pre-existing file: " + sorted_bam_file)
    else:
        if os.path.isfile(sorted_bam_file):
            log("Overwriting pre-existing file: " + sorted_bam_file)
        pysam.view("-@", str(num_threads), "-b", "-o", bam_file, sam_file, catch_stdout=False)
        pysam.sort("-@", str(num_threads), "-o", sorted_bam_file, bam_file, catch_stdout=False)

    # Index the sorted alignments
    log("Indexing read alignments")
    if os.path.isfile(sorted_bam_file + ".bai") and not overwrite_files:
        log("Retaining pre-existing file: " + sorted_bam_file + ".bai")
    else:
        if os.path.isfile(sorted_bam_file + ".bai"):
            log("Overwriting pre-existing file: " + sorted_bam_file + ".bai")
        pysam.index(sorted_bam_file, catch_stdout=False)

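# For reference, the pysam module-level calls above dispatch to samtools with
# the same arguments, so the pipeline is equivalent to the following shell
# commands (illustrative filenames):
#
#   samtools view -@ <threads> -b -o c_reads_against_query.bam c_reads_against_query.sam
#   samtools sort -@ <threads> -o c_reads_against_query.s.bam c_reads_against_query.bam
#   samtools index c_reads_against_query.s.bam
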
def params_are_valid(self):
    """
    Do a basic check to make sure the unimap parameters are valid.
    I won't check that every parameter is valid, but will check anything that can
    cause a problem for RagTag later on.
    :return: True if the parameters are valid. Raises appropriate errors otherwise
    """
    all_flags = "".join([i for i in self.params_string.split(" ") if i.startswith("-")])
    if "a" in all_flags:
        raise ValueError("Alignments must not be in SAM format (-a).")

    if "c" in all_flags:
        log("WARNING", "Computing base-alignments (-c) will slow down Unimap alignment.")

    return True

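# Illustrative note (hypothetical params string): the check above collapses
# every dash-prefixed token into one string, so single-letter flags are caught
# no matter where they appear among the parameters:
#
#   >>> params = "-k 19 -w 10 -a"
#   >>> "".join([i for i in params.split(" ") if i.startswith("-")])
#   '-k-w-a'
#   >>> "a" in "-k-w-a"
#   True
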
def make_gff_interval_tree(gff_file):
    # Dictionary storing an interval tree for each sequence header
    t = defaultdict(IntervalTree)

    # Iterate over the gff file
    with open(gff_file, "r") as f:
        for line in f:
            if not line.startswith("#"):
                fields = line.split("\t")
                h, start, end = fields[0], int(fields[3]), int(fields[4])
                start = start - 1  # make everything zero-indexed
                assert start < end

                if end - start > 100000:
                    coords = "%s:%d-%d" % (h, start + 1, end)
                    log("WARNING: large interval in this gff file (%s). This could disproportionately invalidate putative query breakpoints." % coords)

                t[h][start:end] = (start, end)

    return t

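# Minimal usage sketch (assumes a hypothetical "genes.gff"): querying the
# returned tree with a point gives the set of overlapping GFF features, which
# is how putative breakpoints are tested against gene intervals downstream.
#
#   t = make_gff_interval_tree("genes.gff")
#   if t["chr1"][1500]:  # truthy iff position 1500 overlaps a feature
#       print("skip this breakpoint")
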
def get_median_read_coverage(output_path, num_threads, overwrite_files):
    """ Given the read alignments, use samtools stats to return an approximate median coverage value. """
    log("Calculating global read coverage")
    stats_file = output_path + "c_reads_against_query.s.bam.stats"

    if os.path.isfile(stats_file) and not overwrite_files:
        log("Retaining pre-existing file: " + stats_file)
    else:
        if os.path.isfile(stats_file):
            log("Overwriting pre-existing file: " + stats_file)
        st = pysam.stats("-@", str(num_threads), output_path + "c_reads_against_query.s.bam")
        with open(stats_file, "w") as f:
            f.write(st)

    # Get the coverage histogram (for 1 to 1k)
    covs = []
    with open(stats_file) as f:
        for line in f:
            if line.startswith("COV"):
                covs.append(int(line.split("\t")[3]))

    # Get the median from the histogram
    covs = np.asarray(covs, dtype=np.int32)

    # Remove the last value, which is a catch-all for coverages > 1k
    covs = covs[:-1]
    mid = sum(covs) // 2
    cs = 0
    for i in range(len(covs)):
        cs += covs[i]
        if cs >= mid:
            return i

    raise ValueError("Unable to calculate read coverage. Check SAM/BAM files and stats file.")

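# Worked example for the histogram median above (illustrative counts): if
# covs == [10, 30, 20] after trimming the catch-all bin, then
# mid == 60 // 2 == 30 and the cumulative sum first reaches 30 at index 1,
# so 1 is returned as the approximate median coverage bin.
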
def main():
    parser = argparse.ArgumentParser(description="Update gff intervals given a RagTag AGP file", usage="ragtag.py updategff [-c] <genes.gff> <ragtag.agp>")
    parser.add_argument("gff", nargs='?', default="", metavar="<genes.gff>", type=str, help="gff file")
    parser.add_argument("agp", nargs='?', default="", metavar="<ragtag.*.agp>", type=str, help="agp file")
    parser.add_argument("-c", action="store_true", default=False, help="update for misassembly correction (ragtag.correction.agp)")

    args = parser.parse_args()

    if not args.gff or not args.agp:
        parser.print_help()
        sys.exit()

    log("RagTag " + get_ragtag_version())
    log("CMD: " + " ".join(sys.argv))

    gff_file = os.path.abspath(args.gff)
    agp_file = os.path.abspath(args.agp)
    is_sub = args.c

    if is_sub:
        sub_update(gff_file, agp_file)
    else:
        sup_update(gff_file, agp_file)

def main():
    parser = argparse.ArgumentParser(description="Check AGP v2.1 files for validity.", usage="ragtag.py agpcheck <asm1.agp> [<asm2.agp> ... <asmN.agp>]")
    parser.add_argument("agp", metavar="<asm1.agp> [<asm2.agp> ... <asmN.agp>]", nargs='+', default=[], type=str, help="AGP v2.1 files")

    DISCLAIMER = """
DISCLAIMER:
This utility performs most (but not all) checks necessary to validate an AGP v2.1 file:
https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/

Please additionally use the NCBI AGP validator for robust validation:
https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Validation/
"""

    args = parser.parse_args()
    print(DISCLAIMER)
    agp_file_list = [os.path.abspath(i) for i in args.agp]

    for agp_file in agp_file_list:
        print()
        log("INFO", "Checking {} ...".format(agp_file))
        agp = AGPFile(agp_file, mode="r")
        for _ in agp.iterate_lines():
            pass
        log("INFO", "Check for {} is complete with no errors.".format(agp_file))

def write_breaks(out_file, query_file, ctg_breaks, overwrite, remove_suffix):
    """ Write the intermediate file for contig breaks in AGP v2.1 format. """
    # Check if the output file already exists
    if os.path.isfile(out_file):
        if not overwrite:
            log("Retaining pre-existing file: " + out_file)
            return
        else:
            log("Overwriting pre-existing file: " + out_file)

    fai = pysam.FastaFile(query_file)
    all_q_seqs = sorted(fai.references)

    agp = AGPFile(out_file, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    for q in all_q_seqs:
        # Check if this sequence was broken during misassembly correction
        if q not in ctg_breaks:
            # Add suffix to query header, unless otherwise requested
            unchanged_comp_header = q
            if not remove_suffix:
                unchanged_comp_header = q + ":0" + "-" + str(fai.get_reference_length(q)) + "(+)"

            agp.add_seq_line(q, "1", str(fai.get_reference_length(q)), "1", "W", unchanged_comp_header, "1", str(fai.get_reference_length(q)), "+")
        else:
            # This query sequence was broken
            pid = 1
            sorted_breaks = sorted(ctg_breaks[q])
            start = 0
            for i in sorted_breaks:
                agp.add_seq_line(q, str(start + 1), str(i), str(pid), "W", q + ":" + str(start) + "-" + str(i) + "(+)", "1", str(i - start), "+")
                start = i
                pid += 1

            # Add one line for the last interval
            agp.add_seq_line(q, str(start + 1), str(fai.get_reference_length(q)), str(pid), "W", q + ":" + str(start) + "-" + str(fai.get_reference_length(q)) + "(+)", "1", str(fai.get_reference_length(q) - start), "+")

    log("Writing: " + out_file)
    agp.write()
    fai.close()

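# Illustrative example (hypothetical contig): a 1,000 bp query "ctg1" with a
# single break at position 400 yields two AGP "W" lines from the loop above:
#
#   ctg1  1    400   1  W  ctg1:0-400(+)     1  400  +
#   ctg1  401  1000  2  W  ctg1:400-1000(+)  1  600  +
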
def main():
    parser = argparse.ArgumentParser(description='Reference-guided misassembly correction', usage="ragtag.py correct <reference.fa> <query.fa>")

    cor_options = parser.add_argument_group("correction options")
    cor_options.add_argument("reference", metavar="<reference.fa>", nargs='?', default="", type=str, help="reference fasta file (can be uncompressed or bgzipped)")
    cor_options.add_argument("query", metavar="<query.fa>", nargs='?', default="", type=str, help="query fasta file (can be uncompressed or bgzipped)")
    cor_options.add_argument("-f", metavar="INT", type=int, default=1000, help="minimum unique alignment length [1000]")
    cor_options.add_argument("--remove-small", action="store_true", default=False, help="remove unique alignments shorter than -f")
    cor_options.add_argument("-q", metavar="INT", type=int, default=10, help="minimum mapq (NA for Nucmer alignments) [10]")
    cor_options.add_argument("-d", metavar="INT", type=int, default=100000, help="alignment merge distance [100000]")
    cor_options.add_argument("-b", metavar="INT", type=int, default=5000, help="minimum break distance from contig ends [5000]")
    cor_options.add_argument("-e", metavar="<exclude.txt>", type=str, default="", help="list of reference headers to ignore")
    cor_options.add_argument("-j", metavar="<skip.txt>", type=str, default="", help="list of query headers to leave uncorrected")
    cor_options.add_argument("--inter", action="store_true", default=False, help="only break misassemblies between reference sequences")
    cor_options.add_argument("--intra", action="store_true", default=False, help="only break misassemblies within reference sequences")
    cor_options.add_argument("--gff", metavar="<features.gff>", type=str, default="", help="don't break sequences within gff intervals")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output", help="output directory [./ragtag_output]")
    io_options.add_argument("-w", action='store_true', default=False, help="overwrite intermediate files")
    io_options.add_argument("-u", action='store_true', default=False, help="add suffix to unaltered sequence headers")
    io_options.add_argument("--debug", action='store_true', default=False, help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    mm2_default = "-x asm5"
    aln_options.add_argument("-t", metavar="INT", type=int, default=1, help="number of minimap2 threads [1]")
    aln_options.add_argument("--aligner", metavar="PATH", type=str, default="minimap2", help="whole genome aligner executable ('nucmer' or 'minimap2') [minimap2]")
    aln_options.add_argument("--mm2-params", metavar="STR", type=str, default=mm2_default, help="space delimited minimap2 whole genome alignment parameters ['%s']" % mm2_default)
    aln_options.add_argument("--nucmer-params", metavar="STR", type=str, default="-l 100 -c 500", help="space delimited nucmer whole genome alignment parameters ['-l 100 -c 500']")

    val_options = parser.add_argument_group("validation options")
    val_options.add_argument("--read-aligner", metavar="PATH", type=str, default="minimap2", help="read aligner executable (only 'minimap2' is allowed) [minimap2]")
    val_options.add_argument("-R", metavar="<reads.fasta>", type=str, default="", help="validation reads. gzipped fastq or fasta allowed.")
    val_options.add_argument("-F", metavar="<reads.fofn>", type=str, default="", help="same as '-R', but a list of files.")
    val_options.add_argument("-T", metavar="sr", type=str, default="", help="read type. 'sr' and 'corr' accepted for short reads and error corrected long-reads, respectively.")
    val_options.add_argument("-v", metavar="INT", type=int, default=10000, help="coverage validation window size [10000]")
    val_options.add_argument("--max-cov", metavar="INT", type=int, default=-1, help="break sequences at regions at or above this coverage level [AUTO]")
    val_options.add_argument("--min-cov", metavar="INT", type=int, default=-1, help="break sequences at regions at or below this coverage level [AUTO]")
    val_options.add_argument("-m", metavar="INT", type=int, default=1000, help=argparse.SUPPRESS)  # Merge breakpoints within this distance after validation

    args = parser.parse_args()

    if not args.reference or not args.query:
        parser.print_help()
        sys.exit()

    log("RagTag " + get_ragtag_version())
    log("CMD: " + " ".join(sys.argv))

    reference_file = os.path.abspath(args.reference)
    query_file = os.path.abspath(args.query)

    # Check that the reference/query file exists
    if not os.path.isfile(reference_file):
        raise ValueError("Could not find file: %s" % reference_file)
    if not os.path.isfile(query_file):
        raise ValueError("Could not find file: %s" % query_file)

    num_threads = args.t
    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    min_break_dist = args.m
    min_break_end_dist = args.b
    val_window_size = args.v

    # I/O options
    output_path = args.o
    if not os.path.isdir(output_path):
        os.mkdir(output_path)
    output_path = os.path.abspath(output_path) + "/"
    overwrite_files = args.w
    remove_suffix = not args.u
    if remove_suffix:
        log("WARNING: Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'.")

    gff_file = args.gff
    if gff_file:
        gff_file = os.path.abspath(gff_file)

    # Skip/exclude options
    query_blacklist = set()
    skip_file = args.j
    if skip_file:
        skip_file = os.path.abspath(args.j)
        with open(skip_file, "r") as f:
            for line in f:
                query_blacklist.add(line.rstrip())

    ref_blacklist = set()
    exclude_file = args.e
    if exclude_file:
        exclude_file = os.path.abspath(args.e)
        with open(exclude_file, "r") as f:
            for line in f:
                ref_blacklist.add(line.rstrip())

    # Get aligner arguments
    genome_aligner_path = args.aligner
    genome_aligner = genome_aligner_path.split("/")[-1]
    if genome_aligner not in {'minimap2', 'nucmer'}:
        raise ValueError("Must specify either 'minimap2' or 'nucmer' (PATHs allowed) with '--aligner'.")

    mm2_params = args.mm2_params
    nucmer_params = args.nucmer_params

    # Mapq filtering params
    min_mapq = args.q
    if genome_aligner == "nucmer":
        min_mapq = 0

    # Add the number of mm2 threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)

    # Check if intra/inter breaking is desired
    break_intra = True
    break_inter = True
    only_intra = args.intra
    only_inter = args.inter
    if only_intra and only_inter:
        raise ValueError("Must specify either '--inter' or '--intra', not both.")

    if only_intra:
        break_inter = False
    if only_inter:
        break_intra = False

    # Read-alignment parameters
    val_reads = args.R
    val_reads_fofn = args.F
    val_reads_tech = args.T
    read_aligner_path = args.read_aligner
    read_aligner = read_aligner_path.split("/")[-1]
    if read_aligner != "minimap2":
        raise ValueError("Only minimap2 can be used for read alignments. Got: %s" % read_aligner)

    # If the genome aligner is minimap2, we can just use that path for read alignment
    if genome_aligner == 'minimap2':
        read_aligner_path = genome_aligner_path

    # Make sure that if -R or -F, -T has been specified.
    if val_reads or val_reads_fofn:
        if not val_reads_tech:
            raise ValueError("'-T' must be provided when using -R or -F.")

    # Make a list of read sequences.
    read_files = []
    if val_reads_fofn:
        with open(val_reads_fofn, "r") as f:
            for line in f:
                read_files.append(os.path.abspath(line.rstrip()))
    elif val_reads:
        read_files.append(os.path.abspath(val_reads))

    # Coverage thresholds
    max_cov = args.max_cov
    min_cov = args.min_cov
    if max_cov < 0:
        if max_cov != -1:
            raise ValueError("--max-cov must be >=0")
    if min_cov < 0:
        if min_cov != -1:
            raise ValueError("--min-cov must be >=0")

    # Debugging options
    debug_mode = args.debug
    debug_non_fltrd_file = output_path + "ragtag.correction.debug.unfiltered.paf"
    debug_fltrd_file = output_path + "ragtag.correction.debug.filtered.paf"
    debug_merged_file = output_path + "ragtag.correction.debug.merged.paf"
    debug_query_info_file = output_path + "ragtag.correction.debug.query.info.txt"

    # Align the query to the reference.
    log("Mapping the query genome to the reference genome")
    if genome_aligner == "minimap2":
        al = Minimap2Aligner(reference_file, [query_file], genome_aligner_path, mm2_params, output_path + "c_query_against_ref", in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_file, [query_file], genome_aligner_path, nucmer_params, output_path + "c_query_against_ref", in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, convert from delta to paf.
    if genome_aligner == "nucmer":
        cmd = ["ragtag_delta2paf.py", output_path + "c_query_against_ref.delta"]
        run_o(cmd, output_path + "c_query_against_ref.paf")

    # Read and organize the alignments.
    log('Reading whole genome alignments')
    # ctg_alns = dict :: key=query header, value=ContigAlignment object
    ctg_alns = read_genome_alignments(output_path + "c_query_against_ref.paf", query_blacklist, ref_blacklist)

    # Filter and merge the alignments.
    if debug_mode:
        # create new empty copies of debugging output files
        open(debug_non_fltrd_file, "w").close()
        open(debug_fltrd_file, "w").close()
        open(debug_merged_file, "w").close()
        open(debug_query_info_file, "w").close()

    log("Filtering and merging alignments")
    for i in ctg_alns:
        # Write unfiltered alignments
        if debug_mode:
            with open(debug_non_fltrd_file, "a") as f:
                f.write(str(ctg_alns[i]))

        ctg_alns[i] = ctg_alns[i].unique_anchor_filter(min_ulen, keep_small=keep_small_uniques)
        if ctg_alns[i] is not None:
            ctg_alns[i] = ctg_alns[i].filter_mapq(min_mapq)
            if ctg_alns[i] is not None:
                # Write filtered alignments
                if debug_mode:
                    with open(debug_fltrd_file, "a") as f:
                        f.write(str(ctg_alns[i]))
                ctg_alns[i] = ctg_alns[i].merge_alns(merge_dist=merge_dist)

    # Get the putative breakpoints for each query sequence, if any.
    ctg_breaks = dict()
    for i in ctg_alns:
        if ctg_alns[i] is not None:
            # Write merged alignments and confidence scores
            if debug_mode:
                with open(debug_merged_file, "a") as f:
                    f.write(str(ctg_alns[i]))
                with open(debug_query_info_file, "a") as f:
                    f.write("\t".join([
                        i,
                        ctg_alns[i].best_ref_header,
                        str(ctg_alns[i].grouping_confidence),
                        str(ctg_alns[i].location_confidence),
                        str(ctg_alns[i].orientation_confidence),
                    ]) + "\n")

            breaks = []
            intra_breaks, inter_breaks = ctg_alns[i].get_break_candidates(min_dist=min_break_end_dist)
            if break_intra:
                breaks = breaks + intra_breaks
            if break_inter:
                breaks = breaks + inter_breaks
            if breaks:
                ctg_breaks[i] = breaks

    # If desired, validate the putative breakpoints by observing read coverage.
    if read_files:
        log("Validating putative query breakpoints via read alignment.")
        log("Aligning reads to query sequences.")
        if not os.path.isfile(output_path + "c_reads_against_query.s.bam"):
            if val_reads_tech == "sr":
                al = Minimap2SAMAligner(query_file, read_files, read_aligner_path, "-ax sr -t " + str(num_threads), output_path + "c_reads_against_query", in_overwrite=overwrite_files)
            elif val_reads_tech == "corr":
                al = Minimap2SAMAligner(query_file, read_files, read_aligner_path, "-ax asm5 -t " + str(num_threads), output_path + "c_reads_against_query", in_overwrite=overwrite_files)
            else:
                raise ValueError("'-T' must be either 'sr' or 'corr'.")
            al.run_aligner()
        else:
            log("Retaining pre-existing read alignments: " + output_path + "c_reads_against_query.s.bam")

        # Compress, sort and index the alignments.
        log("Compressing, sorting, and indexing read alignments")
        run_samtools(output_path, num_threads, overwrite_files)

        # Validate the breakpoints
        log("Validating putative query breakpoints")

        # Give at least 10k/1k from ctg ends for coverage to accumulate for corr and sr, respectively.
        val_min_break_end_dist = min_break_end_dist
        if val_reads_tech == "corr":
            val_min_break_end_dist = max(10000, min_break_end_dist)
        if val_reads_tech == "sr":
            val_min_break_end_dist = max(1000, min_break_end_dist)

        # Validate the breakpoints
        ctg_breaks = validate_breaks(ctg_breaks, output_path, num_threads, overwrite_files, val_min_break_end_dist, max_cov, min_cov, window_size=val_window_size, clean_dist=min_break_dist, debug=debug_mode)

    # Check if we need to avoid gff intervals
    if gff_file:
        log("Avoiding breaks within GFF intervals")
        it = make_gff_interval_tree(gff_file)
        non_gff_breaks = dict()
        for ctg in ctg_breaks:
            new_breaks = []
            for i in ctg_breaks[ctg]:
                if it[ctg][i]:
                    log("Avoiding breaking %s at %d. This point intersects a feature in the gff file." % (ctg, i))
                else:
                    new_breaks.append(i)
            if new_breaks:
                non_gff_breaks[ctg] = new_breaks
        ctg_breaks = non_gff_breaks

    # Write the summary of query sequence breaks in AGP format
    agp_file = output_path + "ragtag.correction.agp"
    write_breaks(agp_file, query_file, ctg_breaks, overwrite_files, remove_suffix)

    # Write the scaffolds.
    log("Writing broken contigs")
    qf_name = query_file.split("/")[-1]
    qf_pref = qf_name[:qf_name.rfind(".")]
    cmd = ["ragtag_break_query.py", agp_file, query_file]
    run_o(cmd, output_path + qf_pref + ".corrected.fasta")

    log("Goodbye")

def validate_breaks(ctg_breaks, output_path, num_threads, overwrite_files, min_break_end_dist, max_cutoff, min_cutoff, window_size=10000, num_devs=3, clean_dist=1000, debug=False):
    """ Validate putative breakpoints by inspecting read coverage in the vicinity of each breakpoint. """
    # Get the median coverage over all bp
    glob_med = get_median_read_coverage(output_path, num_threads, overwrite_files)
    dev = round(math.sqrt(glob_med))
    if max_cutoff == -1:
        max_cutoff = glob_med + (num_devs * dev)
    if min_cutoff == -1:
        min_cutoff = max(0, (glob_med - (num_devs * dev)))

    log("The global median read coverage is %dX" % glob_med)
    log("The max and min coverage thresholds are %dX and %dX, respectively" % (max_cutoff, min_cutoff))

    # Go through each break point and query the coverage within the vicinity of the breakpoint.
    bam = pysam.AlignmentFile(output_path + "c_reads_against_query.s.bam")
    validated_ctg_breaks = dict()
    for ctg in ctg_breaks:
        val_breaks = []

        # Iterate over each breakpoint for this query sequence
        for b in ctg_breaks[ctg]:
            # Don't extend the validation window too close to the contig ends (defined by min_break_end_dist)
            min_range = max(min_break_end_dist, b - (window_size // 2))
            max_range = min((bam.get_reference_length(ctg) - min_break_end_dist), b + (window_size // 2))
            if min_range >= max_range:
                continue

            region = "%s:%d-%d" % (ctg, min_range, max_range - 1)
            depth_out = pysam.samtools.depth("-aa", "-r", region, output_path + "c_reads_against_query.s.bam")
            covs = np.asarray([i.split("\t")[2] for i in depth_out.rstrip().split("\n")], dtype=np.int32)
            assert len(covs) == max_range - min_range

            # Given the coverage in the vicinity of the breakpoint, find the max and min coverage.
            cov_min, cov_max = np.min(covs), np.max(covs)
            too_high = cov_max >= max_cutoff
            too_low = cov_min <= min_cutoff
            new_break = None
            status = "not validated"
            if too_low and too_high:
                val_breaks.append(np.argmin(covs) + min_range)
                new_break = np.argmin(covs) + min_range
                status = "low and high cov"
            elif too_low:
                val_breaks.append(np.argmin(covs) + min_range)
                new_break = np.argmin(covs) + min_range
                status = "low cov"
            elif too_high:
                val_breaks.append(np.argmax(covs) + min_range)
                new_break = np.argmax(covs) + min_range
                status = "high cov"

            if debug:
                log("query: %s, original break: %s, window start: %d, window end: %d, status: %s, new_break: %s, cov max: %d, cov min: %d" % (ctg, b, min_range, max_range, status, str(new_break), cov_max, cov_min))

        validated_ctg_breaks[ctg] = clean_breaks(val_breaks, clean_dist)

    return validated_ctg_breaks

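# Illustrative arithmetic for the automatic cutoffs above: with a global
# median coverage of 36X and the default num_devs = 3, dev = round(sqrt(36)) = 6,
# so max_cutoff = 36 + 3*6 = 54X and min_cutoff = max(0, 36 - 3*6) = 18X.
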
def write_orderings(out_agp_file, out_confidence_file, query_file, ordering_dict, ctg_dict, gap_dict, gap_type_dict, make_chr0, overwrite, add_suffix):
    # Check if the output file already exists
    if os.path.isfile(out_agp_file):
        if not overwrite:
            log("Retaining pre-existing file: " + out_agp_file)
            return
        else:
            log("Overwriting pre-existing file: " + out_agp_file)

    # Proceed with writing the intermediate output
    placed_seqs = set()
    all_out_cs_lines = []  # For confidence scores

    agp = AGPFile(out_agp_file, mode="w")
    agp.add_pragma()
    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

    # Go through the reference sequences in sorted order
    sorted_ref_headers = sorted(list(ordering_dict.keys()))
    for ref_header in sorted_ref_headers:
        pid = 1
        pos = 0
        new_ref_header = ref_header + "_RagTag"
        q_seqs = ordering_dict[ref_header]
        gap_seqs = gap_dict[ref_header]
        gap_types = gap_type_dict[ref_header]

        # Iterate through the query sequences for this reference header
        for i in range(len(q_seqs)):
            out_agp_line = []
            out_cs_line = []
            q = q_seqs[i][2]
            placed_seqs.add(q)
            qlen = ctg_dict[q].query_len
            strand = ctg_dict[q].orientation
            gc, lc, oc = ctg_dict[q].grouping_confidence, ctg_dict[q].location_confidence, ctg_dict[q].orientation_confidence
            out_agp_line.append(new_ref_header)
            out_agp_line.append(str(pos + 1))
            pos += qlen
            out_agp_line.append(str(pos))
            out_agp_line.append(str(pid))
            out_agp_line.append("W")
            out_agp_line.append(q)
            out_agp_line.append("1")
            out_agp_line.append(str(ctg_dict[q].query_len))
            out_agp_line.append(strand)

            # Save the confidence score info
            out_cs_line.append(q)
            out_cs_line.append(str(gc))
            out_cs_line.append(str(lc))
            out_cs_line.append(str(oc))

            agp.add_seq_line(*out_agp_line)
            all_out_cs_lines.append("\t".join(out_cs_line))
            pid += 1

            if i < len(gap_seqs):
                # Print the gap line
                out_agp_line = []
                out_agp_line.append(new_ref_header)
                out_agp_line.append(str(pos + 1))
                pos += gap_seqs[i]
                out_agp_line.append(str(pos))
                out_agp_line.append(str(pid))
                gap_type = gap_types[i]
                out_agp_line.append(gap_type)
                out_agp_line.append(str(gap_seqs[i]))
                out_agp_line.append("scaffold")
                out_agp_line.append("yes")
                out_agp_line.append("align_genus")
                pid += 1
                agp.add_gap_line(*out_agp_line)

    # Write unplaced sequences
    fai = pysam.FastaFile(query_file)
    all_seqs = set(fai.references)
    unplaced_seqs = sorted(list(all_seqs - placed_seqs))
    if unplaced_seqs:
        if make_chr0:
            pos = 0
            pid = 1
            new_ref_header = "Chr0_RagTag"
            for q in unplaced_seqs:
                out_agp_line = []
                qlen = fai.get_reference_length(q)
                out_agp_line.append(new_ref_header)
                out_agp_line.append(str(pos + 1))
                pos += qlen
                out_agp_line.append(str(pos))
                out_agp_line.append(str(pid))
                out_agp_line.append("W")
                out_agp_line.append(q)
                out_agp_line.append("1")
                out_agp_line.append(str(qlen))
                out_agp_line.append("+")
                agp.add_seq_line(*out_agp_line)
                pid += 1

                # Now for the gap, since we are making a chr0
                out_agp_line = []
                out_agp_line.append(new_ref_header)
                out_agp_line.append(str(pos + 1))
                pos += 100
                out_agp_line.append(str(pos))
                out_agp_line.append(str(pid))
                out_agp_line.append("U")
                out_agp_line.append("100")
                out_agp_line.append("contig")
                out_agp_line.append("no")
                out_agp_line.append("na")
                agp.add_gap_line(*out_agp_line)
                pid += 1

            # Remove the final unnecessary gap
            agp.pop_agp_line()
        else:
            # List the unplaced contigs individually
            for q in unplaced_seqs:
                out_agp_line = []
                qlen = fai.get_reference_length(q)
                if add_suffix:
                    out_agp_line.append(q + "_RagTag")
                else:
                    out_agp_line.append(q)
                out_agp_line.append("1")
                out_agp_line.append(str(qlen))
                out_agp_line.append("1")
                out_agp_line.append("W")
                out_agp_line.append(q)
                out_agp_line.append("1")
                out_agp_line.append(str(qlen))
                out_agp_line.append("+")
                agp.add_seq_line(*out_agp_line)

    agp.write()
    fai.close()

    # Write the confidence scores
    with open(out_confidence_file, "w") as f:
        f.write("query\tgrouping_confidence\tlocation_confidence\torientation_confidence\n")
        f.write("\n".join(all_out_cs_lines) + "\n")

def main():
    parser = argparse.ArgumentParser(description='Reference-guided scaffolding', usage="ragtag.py scaffold <reference.fa> <query.fa>")
    parser.add_argument("reference", metavar="<reference.fa>", nargs='?', default="", type=str, help="reference fasta file (uncompressed or bgzipped)")
    parser.add_argument("query", metavar="<query.fa>", nargs='?', default="", type=str, help="query fasta file (uncompressed or bgzipped)")

    scaf_options = parser.add_argument_group("scaffolding options")
    scaf_options.add_argument("-e", metavar="<exclude.txt>", type=str, default="", help="list of reference headers to ignore [null]")
    scaf_options.add_argument("-j", metavar="<skip.txt>", type=str, default="", help="list of query headers to leave unplaced [null]")
    scaf_options.add_argument("-f", metavar="INT", type=int, default=1000, help="minimum unique alignment length [1000]")
    scaf_options.add_argument("--remove-small", action="store_true", default=False, help="remove unique alignments shorter than -f")
    scaf_options.add_argument("-q", metavar="INT", type=int, default=10, help="minimum mapq (NA for Nucmer alignments) [10]")
    scaf_options.add_argument("-d", metavar="INT", type=int, default=100000, help="alignment merge distance [100000]")
    scaf_options.add_argument("-i", metavar="FLOAT", type=float, default=0.2, help="minimum grouping confidence score [0.2]")
    scaf_options.add_argument("-a", metavar="FLOAT", type=float, default=0.0, help="minimum location confidence score [0.0]")
    scaf_options.add_argument("-s", metavar="FLOAT", type=float, default=0.0, help="minimum orientation confidence score [0.0]")
    scaf_options.add_argument("-C", action='store_true', default=False, help="concatenate unplaced contigs and make 'chr0'")
    scaf_options.add_argument("-r", action='store_true', default=False, help="infer gap sizes. if not, all gaps are 100 bp")
    scaf_options.add_argument("-g", metavar="INT", type=int, default=100, help="minimum inferred gap size [100]")
    scaf_options.add_argument("-m", metavar="INT", type=int, default=100000, help="maximum inferred gap size [100000]")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output", help="output directory [./ragtag_output]")
    io_options.add_argument("-w", action='store_true', default=False, help="overwrite intermediate files")
    io_options.add_argument("-u", action='store_true', default=False, help="add suffix to unplaced sequence headers")
    io_options.add_argument("--debug", action='store_true', default=False, help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    aln_options.add_argument("-t", metavar="INT", type=int, default=1, help="number of minimap2 threads [1]")
    aln_options.add_argument("--aligner", metavar="PATH", type=str, default="minimap2", help="aligner executable ('nucmer' or 'minimap2') [minimap2]")
    mm2_default = "-x asm5"
    aln_options.add_argument("--mm2-params", metavar="STR", type=str, default=mm2_default, help="space delimited minimap2 parameters ['%s']" % mm2_default)
    aln_options.add_argument("--nucmer-params", metavar="STR", type=str, default="-l 100 -c 500", help="space delimited nucmer parameters ['-l 100 -c 500']")

    args = parser.parse_args()

    if not args.reference or not args.query:
        parser.print_help()
        print("\n** The reference and query FASTA files are required **")
        sys.exit()

    log("RagTag " + get_ragtag_version())
    log("CMD: ragtag.py scaffold " + " ".join(sys.argv[1:]))

    reference_file = os.path.abspath(args.reference)
    query_file = os.path.abspath(args.query)

    # Check that the reference/query file exists
    if not os.path.isfile(reference_file):
        raise ValueError("Could not find file: %s" % reference_file)
    if not os.path.isfile(query_file):
        raise ValueError("Could not find file: %s" % query_file)

    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    group_score_thresh = args.i
    loc_score_thresh = args.a
    orient_score_thresh = args.s
    make_chr0 = args.C
    infer_gaps = args.r
    num_threads = args.t

    # I/O options
    output_path = args.o
    if not os.path.isdir(output_path):
        os.mkdir(output_path)
    output_path = os.path.abspath(output_path) + "/"

    # Setup a log file for external RagTag scripts
    ragtag_log = output_path + "ragtag.scaffold.err"
    open(ragtag_log, "w").close()  # Wipe the log file

    overwrite_files = args.w
    remove_suffix = not args.u
    if remove_suffix:
        log("WARNING: Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'.")

    # Gap options
    min_gap_size = args.g
    max_gap_size = args.m
    if min_gap_size < 1:
        raise ValueError("the minimum gap size must be positive")
    if max_gap_size < 1:
        raise ValueError("the maximum gap size must be positive")

    # Skip/exclude options
    query_blacklist = set()
    skip_file = args.j
    if skip_file:
        skip_file = os.path.abspath(args.j)
        with open(skip_file, "r") as f:
            for line in f:
                query_blacklist.add(line.rstrip())

    ref_blacklist = set()
    exclude_file = args.e
    if exclude_file:
        exclude_file = os.path.abspath(args.e)
        with open(exclude_file, "r") as f:
            for line in f:
                ref_blacklist.add(line.rstrip())

    # Get aligner arguments
    aligner_path = args.aligner
    aligner = aligner_path.split("/")[-1]
    if aligner not in {'minimap2', 'nucmer'}:
        raise ValueError("Must specify either 'minimap2' or 'nucmer' (PATHs allowed) with '--aligner'.")

    mm2_params = args.mm2_params
    nucmer_params = args.nucmer_params

    # Mapq filtering params
    min_mapq = args.q
    if aligner == "nucmer":
        min_mapq = 0

    # Add the number of mm2 threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)

    # Debugging options
    debug_mode = args.debug
    debug_non_fltrd_file = output_path + "ragtag.scaffolds.debug.unfiltered.paf"
    debug_fltrd_file = output_path + "ragtag.scaffolds.debug.filtered.paf"
    debug_merged_file = output_path + "ragtag.scaffolds.debug.merged.paf"
    debug_query_info_file = output_path + "ragtag.scaffolds.debug.query.info.txt"

    # Align the query to the reference
    log("Mapping the query genome to the reference genome")
    if aligner == "minimap2":
        al = Minimap2Aligner(reference_file, [query_file], aligner_path, mm2_params, output_path + "query_against_ref", in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_file, [query_file], aligner_path, nucmer_params, output_path + "query_against_ref", in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, need to convert from delta to paf
    if aligner == "nucmer":
        cmd = ["ragtag_delta2paf.py", output_path + "query_against_ref.delta"]
        run_oae(cmd, output_path + "query_against_ref.paf", ragtag_log)

    # Read and organize the alignments
    log('Reading whole genome alignments')
    # ctg_alns = dict :: key=query header, value=ContigAlignment object
    ctg_alns = read_genome_alignments(output_path + "query_against_ref.paf", query_blacklist, ref_blacklist)

    # Filter the alignments
    if debug_mode:
        # create new empty copies of debugging output files
        open(debug_non_fltrd_file, "w").close()
        open(debug_fltrd_file, "w").close()
        open(debug_merged_file, "w").close()
        open(debug_query_info_file, "w").close()

    log("Filtering and merging alignments")
    for i in ctg_alns:
        # Write unfiltered alignments
        if debug_mode:
            with open(debug_non_fltrd_file, "a") as f:
                f.write(str(ctg_alns[i]))

        ctg_alns[i] = ctg_alns[i].unique_anchor_filter(min_ulen, keep_small=keep_small_uniques)
        if ctg_alns[i] is not None:
            ctg_alns[i] = ctg_alns[i].filter_mapq(min_mapq)
            if ctg_alns[i] is not None:
                # Write filtered alignments
                if debug_mode:
                    with open(debug_fltrd_file, "a") as f:
                        f.write(str(ctg_alns[i]))
                ctg_alns[i] = ctg_alns[i].merge_alns(merge_dist=merge_dist)

    # Remove query sequences which have no more qualifying alignments
    fltrd_ctg_alns = dict()
    for i in ctg_alns:
        if ctg_alns[i] is not None:
            # Write merged alignments and confidence scores
            if debug_mode:
                with open(debug_merged_file, "a") as f:
                    f.write(str(ctg_alns[i]))
                with open(debug_query_info_file, "a") as f:
                    f.write("\t".join([
                        i,
                        ctg_alns[i].best_ref_header,
                        str(ctg_alns[i].grouping_confidence),
                        str(ctg_alns[i].location_confidence),
                        str(ctg_alns[i].orientation_confidence),
                    ]) + "\n")

            if all([
                ctg_alns[i].grouping_confidence > group_score_thresh,
                ctg_alns[i].location_confidence > loc_score_thresh,
                ctg_alns[i].orientation_confidence > orient_score_thresh
            ]):
                fltrd_ctg_alns[i] = ctg_alns[i]

    # For each reference sequence which has at least one assigned query sequence, get the list of
    # all query sequences assigned to that reference sequence.
    log("Ordering and orienting query sequences")
    mapped_ref_seqs = defaultdict(list)
    for i in fltrd_ctg_alns:
        best_ref = fltrd_ctg_alns[i].best_ref_header
        ref_start, ref_end = fltrd_ctg_alns[i].get_best_ref_pos()
        mapped_ref_seqs[best_ref].append((ref_start, ref_end, i))

    # Sort the query sequences for each reference sequence and define the padding sizes between adjacent query seqs
    g_inferred = 0
    g_small = 0
    g_large = 0
    pad_sizes = dict()
    gap_types = dict()
    for i in mapped_ref_seqs:
        # Remove contained contigs and sort the rest
        non_contained = remove_contained(mapped_ref_seqs[i])
        mapped_ref_seqs[i] = sorted(non_contained)
        if infer_gaps:
            # Infer the gap sizes between adjacent query seqs
            # Use the primary alignments to infer gap sizes
            pad_sizes[i] = []
            gap_types[i] = []
            for j in range(1, len(mapped_ref_seqs[i])):
                # Get info for the upstream alignment
                left_ctg = mapped_ref_seqs[i][j - 1][2]
                left_ref_start, left_ref_end = fltrd_ctg_alns[left_ctg].get_best_ref_pos()
                left_qdist_start, left_qdist_end = fltrd_ctg_alns[left_ctg].get_best_q_dist()

                # Get info for the downstream alignment
                right_ctg = mapped_ref_seqs[i][j][2]
                right_ref_start, right_ref_end = fltrd_ctg_alns[right_ctg].get_best_ref_pos()
                right_qdist_start, right_qdist_end = fltrd_ctg_alns[right_ctg].get_best_q_dist()

                # Get the inferred gap size
                i_gap_size = (right_ref_start - right_qdist_start) - (left_ref_end + left_qdist_end)

                # Check if the gap size is too small or too large
                if i_gap_size <= min_gap_size:
                    pad_sizes[i].append(100)
                    gap_types[i].append("U")
                    g_small += 1
                elif i_gap_size > max_gap_size:
                    pad_sizes[i].append(100)
                    gap_types[i].append("U")
                    g_large += 1
                else:
                    pad_sizes[i].append(i_gap_size)
                    gap_types[i].append("N")
                    g_inferred += 1
        else:
            pad_sizes[i] = [100 for _ in range(len(mapped_ref_seqs[i]) - 1)]
            gap_types[i] = ["U" for _ in range(len(mapped_ref_seqs[i]) - 1)]

    if infer_gaps:
        log("%d inferred gaps" % g_inferred)
        log("%d adjacent contig pairs within min distance (%d) of each other" % (g_small, min_gap_size))
        log("%d inferred gaps exceed length threshold (%d)" % (g_large, max_gap_size))

    # Write the scaffolds
    log("Writing scaffolds")

    # Write the intermediate output file in AGP v2.1 format
    log("Writing: " + output_path + "ragtag.scaffolds.agp")
    write_orderings(output_path + "ragtag.scaffolds.agp", output_path + "ragtag.confidence.txt", query_file, mapped_ref_seqs, fltrd_ctg_alns, pad_sizes, gap_types, make_chr0, True, not remove_suffix)

    # Build a FASTA from the AGP
    cmd = ["ragtag_agp2fasta.py", output_path + "ragtag.scaffolds.agp", query_file]
    run_oae(cmd, output_path + "ragtag.scaffolds.fasta", ragtag_log)

    # Calculate the stats
    cmd = ["ragtag_stats.py", output_path + "ragtag.scaffolds.agp", output_path + "ragtag.confidence.txt"]
    run_oae(cmd, output_path + "ragtag.scaffolds.stats", ragtag_log)

    log("Goodbye")

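# Illustrative arithmetic for the gap inference above (hypothetical values):
# with left_ref_end = 5000, left_qdist_end = 200, right_ref_start = 7000 and
# right_qdist_start = 300, the inferred gap is
# i_gap_size = (7000 - 300) - (5000 + 200) = 1500, which falls between the
# default bounds (-g 100, -m 100000) and is therefore written as an "N" gap.
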
def main(): description = "Scaffold merging: derive a consensus scaffolding solution by reconciling distinct scaffoldings of " \ "'asm.fa'" parser = argparse.ArgumentParser(description=description, usage="ragtag.py merge <asm.fa> <scf1.agp> <scf2.agp> [...]") parser.add_argument("components", metavar="<asm.fasta>", nargs='?', default="", type=str, help="assembly fasta file (uncompressed or bgzipped)") parser.add_argument("agps", metavar="<scf1.agp> <scf2.agp> [...]", nargs='*', default=[], type=str, help="scaffolding AGP files") merge_options = parser.add_argument_group("merging options") merge_options.add_argument("-f", metavar="FILE", default="", type=str, help="CSV list of (AGP file,weight) [null]") merge_options.add_argument("-j", metavar="<skip.txt>", type=str, default="", help="list of query headers to leave unplaced [null]") merge_options.add_argument("-l", metavar="INT", default=100000, type=int, help="minimum assembly sequence length [100000]") merge_options.add_argument("-e", metavar="FLOAT", default=0.0, type=float, help="minimum edge weight. NA if using Hi-C [0.0]") merge_options.add_argument("--gap-func", metavar="STR", default="min", type=str, help="function for merging gap lengths {'min', 'max', or 'mean'} [min]") io_options = parser.add_argument_group("input/output options") io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output", help="output directory [./ragtag_output]") io_options.add_argument("-w", action='store_true', default=False, help="overwrite intermediate files") io_options.add_argument("-u", action='store_true', default=False, help="add suffix to unplaced sequence headers") io_options.add_argument("--debug", action='store_true', default=False, help=argparse.SUPPRESS) hic_options = parser.add_argument_group("Hi-C options") hic_options.add_argument("-b", metavar="FILE", default="", type=str, help="Hi-C alignments in BAM format, sorted by read name [null]") hic_options.add_argument("-r", metavar="STR", default="GATC", type=str, help="CSV list of restriction enzymes/sites or 'DNase' [GATC]") hic_options.add_argument("-p", metavar="FLOAT", default=1.0, type=float, help="portion of the sequence termini to consider for links [1.0]") hic_options.add_argument("--list-enzymes", action='store_true', default=False, help="list all available restriction enzymes/sites") args = parser.parse_args() # Print a restriction enzyme help message if requested if args.list_enzymes: RestrictionEnzymes.get_info() sys.exit(0) if not args.components: parser.print_help() sys.exit("\n** The assembly FASTA file is required **") if not args.agps and not args.f: parser.print_help() sys.exit("\n** At least two AGP files are required **") log("VERSION", "RagTag " + get_ragtag_version()) log("WARNING", "This is a beta version of `ragtag merge`") log("CMD", "ragtag.py merge " + " ".join(sys.argv[1:])) # Check that the components FASTA file exists comp_fname = args.components if not os.path.isfile(comp_fname): raise ValueError("Could not find file: %s" % comp_fname) # Optional arguments agp_fofn = args.f hic_bam_fname = args.b re_string = args.r portion = args.p # Set the minimum component sequence length min_comp_len = args.l if min_comp_len < 0: min_comp_len = 0 # Set the minimum edge weight min_edge_weight = args.e if min_edge_weight < 0: min_edge_weight = 0 # Set the gap merging function options gap_func = args.gap_func.upper() if gap_func not in {"MIN", "MAX", "MEAN"}: raise ValueError("Gap merging function must be either 'min', 'max', or 'mean'. 
Got: {}".format(args.gap_func)) # Debugging options debug_mode = args.debug # I/O options output_path = args.o if not os.path.isdir(output_path): os.mkdir(output_path) output_path = os.path.abspath(output_path) + "/" file_prefix = "ragtag.merge" overwrite_files = args.w add_suffix = args.u if not add_suffix: log("WARNING", "Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'.") # get the set of contigs to skip comp_exclusion_set = set() skip_fname = args.j if skip_fname: skip_fname = os.path.abspath(skip_fname) with open(skip_fname, "r") as f: for line in f: comp_exclusion_set.add(line.rstrip().split()[0]) # Setup a file for general logging merge_log = output_path + file_prefix + ".err" open(merge_log, "w").close() # Wipe the log file # Process the AGP files agp_list = [os.path.abspath(i) for i in args.agps] weight_list = [1 for _ in range(len(agp_list))] # Check for file of AGPs and weights if agp_fofn: agp_list, weight_list = [], [] with open(agp_fofn, "r") as f: for line in f: fields = line.rstrip().split(",") agp_list.append(fields[0]) weight_list.append(float(fields[1])) if len(agp_list) < 2: raise ValueError("At least two AGP files are required for merging") # Build the graph and filter nodes by sequence length log("INFO", "Building the scaffold graph from the AGP files") agp_multi_sg = AGPMultiScaffoldGraph(comp_fname) agp_multi_sg.add_agps(agp_list, in_weights=weight_list, exclusion_set=comp_exclusion_set) if min_comp_len: agp_multi_sg.filter_by_seq_len(min_comp_len) if debug_mode: nx.readwrite.gml.write_gml(agp_multi_sg.graph, output_path + "ragtag.merge.msg.gml") # Merge the SAG log("INFO", "Merging the scaffold graph") agp_sg = agp_multi_sg.merge() # Check if we are using Hi-C links to weight the graph. if hic_bam_fname: log("INFO", "Weighting the scaffold graph with Hi-C links") if not comp_fname or not re_string: raise RuntimeError("Hi-C requires alignments (-b) assembly sequences (-a) and restriction sites (-r)") cmd = [ "ragtag_create_links.py", "-a", comp_fname, "-b", hic_bam_fname, "-r", re_string, "-p", str(portion) ] out_links_fname = output_path + file_prefix + ".links" if os.path.isfile(out_links_fname): if not overwrite_files: log("INFO", "Retaining pre-existing file: " + out_links_fname) else: run_oae(cmd, out_links_fname, merge_log) else: run_oae(cmd, out_links_fname, merge_log) hic_sg = build_hic_graph(out_links_fname, comp_fname) agp_sg = agp_sg.steal_weights_from(hic_sg) # Filter by edge weight if min_edge_weight and not hic_bam_fname: agp_sg.filter_by_weight(min_edge_weight) if debug_mode: agp_sg.connect_and_write_gml(output_path + file_prefix + ".sg.gml") # Compute a solution to the ScaffoldGraph log("INFO", "Computing a scaffolding solution") cover_graph = get_maximal_matching(agp_sg) if debug_mode: tmp_cover_graph = nx.Graph() for u, v in cover_graph.edges: tmp_cover_graph.add_edge(u, v) nx.readwrite.gml.write_gml(tmp_cover_graph, output_path + file_prefix + ".covergraph.gml") # Write the scaffolding output to an AGP file log("INFO", "Writing results") write_agp_solution(cover_graph, agp_sg, output_path + file_prefix + ".agp", gap_func=gap_func, add_suffix_to_unplaced=add_suffix) # Generate a FASTA file corresponding to the AGP cmd = [ "ragtag_agp2fa.py", output_path + file_prefix + ".agp", comp_fname ] run_oae(cmd, output_path + file_prefix + ".fasta", merge_log) log("INFO", "Goodbye")
def main(): description = """ """ parser = argparse.ArgumentParser( description="Quantify links from a Hi-C BAM file.", usage= "ragtag_create_links.py -c components.fasta -b <hic.bam> -r <RE_site>") parser.add_argument("-a", metavar="FILE", default="", type=str, help="assembly fasta file [null]") parser.add_argument( "-b", metavar="FILE", default="", type=str, help="Hi-C alignments in BAM format, sorted by read name [null]") parser.add_argument("-r", metavar="STR", default="GATC", type=str, help="CSV list of restriction sites or 'DNase' [GATC]") parser.add_argument( "-p", metavar="FLOAT", default=1.0, type=float, help="portion of the sequence termini to consider for links [1.0]") parser.add_argument("--list-enzymes", action='store_true', default=False, help="list all available restriction enzymes/sites") args = parser.parse_args() # Print a restriction enzyme help message if requested if args.list_enzymes: RestrictionEnzymes.get_info() sys.exit() # Continue with normal functionality if no restriction enzyme help message is requested if not args.a or not args.b or not args.r: parser.print_help() sys.exit() # Set the terminus portion portion = args.p if not 1 >= portion > 0: raise ValueError( "portion must be between 0 (exclusive) and 1 (inclusive)") asm_file = os.path.abspath(args.a) bam_file = os.path.abspath(args.b) dnase_mode = False re_string = args.r.upper() if "DNASE" in re_string: dnase_mode = True log("Running in DNase mode.") re_set = set() if not dnase_mode: re_set = set(filter(None, args.r.split(","))) if not re_set: raise ValueError( "At least one restriction enzyme/site is needed (-r) if not using 'DNase'." ) # Store the sequence lengths asm_lens = dict() fai = pysam.FastaFile(asm_file) for ref in fai.references: asm_lens[ref] = fai.get_reference_length(ref) fai.close() # Get the left and right cutoff positions for each sequence l_cutoffs = dict() r_cutoffs = dict() for ref in asm_lens: l = asm_lens[ref] // 2 r = asm_lens[ref] - l l_cutoffs[ref] = round(l * portion) r_cutoffs[ref] = asm_lens[ref] - round(r * portion) # Get the raw Hi-C links log("Computing raw Hi-C links from: {}".format(bam_file)) raw_links = count_links(bam_file, l_cutoffs, r_cutoffs) # Normalize the Hi-C links l_norm_factors = l_cutoffs r_norm_factors = r_cutoffs # Normalize by the number of restriction sites if not using DNase if not dnase_mode: l_norm_factors = dict() r_norm_factors = dict() # Set the restriction enzymes RE = RestrictionEnzymes(re_set) log("Using the following restriction sites:\n{}".format(str(RE))) log("Counting restriction sites") rfm = RestrictionFragmentMap(asm_file, RE) # Get the number of sites for each contig terminus (l/b and r/e) for ref in l_cutoffs: l_norm_factors[ref] = rfm.count_sites_lte(ref, l_cutoffs[ref]) r_norm_factors[ref] = rfm.count_sites_gt(ref, r_cutoffs[ref]) log("Normalizing raw Hi-C links") norm_links = normalize_links(raw_links, l_norm_factors, r_norm_factors) write_links(raw_links, norm_links)
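# Illustrative arithmetic for the terminus cutoffs above: for a hypothetical
# 1,000 bp sequence with portion = 0.5, l = 500 and r = 500, so
# l_cutoffs[ref] = round(500 * 0.5) = 250 and
# r_cutoffs[ref] = 1000 - round(500 * 0.5) = 750. Links are then counted
# within the first 250 bp and the last 250 bp of the sequence.
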
def main(): description = "Homology-based assembly patching: Make continuous joins and fill gaps " \ "in 'target.fa' using sequences from 'query.fa'" parser = argparse.ArgumentParser(description=description, usage="ragtag.py patch <target.fa> <query.fa>") parser.add_argument("reference", metavar="<target.fa>", nargs='?', default="", type=str, help="target fasta file (uncompressed or bgzipped)") parser.add_argument("query", metavar="<query.fa>", nargs='?', default="", type=str, help="query fasta file (uncompressed or bgzipped)") patch_options = parser.add_argument_group("patching") patch_options.add_argument("-e", metavar="<exclude.txt>", type=str, default="", help="list of target sequences to ignore [null]") patch_options.add_argument("-j", metavar="<skip.txt>", type=str, default="", help="list of query sequences to ignore [null]") patch_options.add_argument("-f", metavar="INT", type=int, default=1000, help="minimum unique alignment length [1000]") patch_options.add_argument("--remove-small", action="store_true", default=False, help="remove unique alignments shorter than '-f'") patch_options.add_argument("-q", metavar="INT", type=int, default=10, help="minimum mapq (NA for Nucmer alignments) [10]") patch_options.add_argument("-d", metavar="INT", type=int, default=100000, help="maximum alignment merge distance [100000]") patch_options.add_argument("-s", metavar="INT", type=int, default=50000, help="minimum merged alignment length [50000]") patch_options.add_argument("-i", metavar="FLOAT", type=float, default=0.05, help="maximum merged alignment distance from sequence terminus. fraction of the sequence length if < 1 [0.05]") patch_options.add_argument("--fill-only", action="store_true", default=False, help="only fill existing target gaps. do not join target sequences") patch_options.add_argument("--join-only", action="store_true", default=False, help="only join and patch target sequences. 
do not fill existing gaps") io_options = parser.add_argument_group("input/output options") io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output", help="output directory [./ragtag_output]") io_options.add_argument("-w", action='store_true', default=False, help="overwrite intermediate files") io_options.add_argument("-u", action='store_true', default=False, help="add suffix to unplaced sequence headers") io_options.add_argument("--debug", action='store_true', default=False, help=argparse.SUPPRESS) aln_options = parser.add_argument_group("mapping options") aln_options.add_argument("-t", metavar="INT", type=int, default=1, help="number of minimap2/unimap threads [1]") aln_options.add_argument("--aligner", metavar="PATH", type=str, default="nucmer", help="aligner executable ('nucmer' (recommended), 'unimap' or 'minimap2') [nucmer]") mm2_default = "-x asm5" aln_options.add_argument("--mm2-params", metavar="STR", type=str, default=mm2_default, help="space delimited minimap2 parameters (overrides '-t') ['%s']" % mm2_default) aln_options.add_argument("--unimap-params", metavar="STR", type=str, default=mm2_default, help="space delimited unimap parameters (overrides '-t') ['%s']" % mm2_default) aln_options.add_argument("--nucmer-params", metavar="STR", type=str, default="--maxmatch -l 100 -c 500", help="space delimted nucmer parameters ['--maxmatch -l 100 -c 500']") args = parser.parse_args() if not args.reference or not args.query: parser.print_help() sys.exit("\n** The target and query FASTA files are required **") log("VERSION", "RagTag " + get_ragtag_version()) log("WARNING", "This is a beta version of `ragtag patch`") log("CMD", "ragtag.py patch " + " ".join(sys.argv[1:])) reference_fn = os.path.abspath(args.reference) query_fn = os.path.abspath(args.query) # Check that the reference/query file exists if not os.path.isfile(reference_fn): raise FileNotFoundError("Could not find file: %s" % reference_fn) if not os.path.isfile(query_fn): raise FileNotFoundError("Could not find file: %s" % query_fn) # Alignment processing parameters min_ulen = args.f keep_small_uniques = not args.remove_small merge_dist = args.d num_threads = args.t aligner_path = args.aligner aligner = aligner_path.split("/")[-1] if aligner.split("/")[-1] not in {'minimap2', 'unimap', 'nucmer'}: raise ValueError("Must specify either 'minimap2', 'unimap', or 'nucmer' (PATHs allowed) with '--aligner'.") mm2_params = args.mm2_params unimap_params = args.unimap_params nucmer_params = args.nucmer_params # Mapq filtering parameters min_mapq = args.q if aligner == "nucmer": min_mapq = 0 # Add the number of mm2/unimap threads if the mm2 params haven't been overridden. 
if mm2_params == mm2_default: mm2_params += " -t " + str(num_threads) if unimap_params == mm2_default: unimap_params += " -t " + str(num_threads) # Set reference/query sequences to ignore ref_blacklist = set() exclude_file = args.e if exclude_file: exclude_file = os.path.abspath(args.e) with open(exclude_file, "r") as f: for line in f: ref_blacklist.add(line.rstrip()) query_blacklist = set() skip_file = args.j if skip_file: skip_file = os.path.abspath(skip_file) with open(skip_file, "r") as f: for line in f: query_blacklist.add(line.rstrip()) # Supporting alignment parameters min_sup_aln_len = args.s max_term_dist = args.i if max_term_dist <= 0: raise ValueError("-i must be a positive nonzero number.") # Task options fill_only = args.fill_only join_only = args.join_only if fill_only and join_only: raise ValueError("'--fill-only' and '--join-only' cannot be used together") # I/O parameters add_suffix = args.u if not add_suffix: log("WARNING", "Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'.") overwrite_files = args.w output_path = args.o if not os.path.isdir(output_path): os.mkdir(output_path) output_path = os.path.abspath(output_path) + "/" file_prefix = "ragtag.patch" # Setup a log file for external RagTag scripts ragtag_log = output_path + file_prefix + ".err" open(ragtag_log, "w").close() # Wipe the log file # Debugging options debug_mode = args.debug # Break the reference assembly at gaps cmd = [ "ragtag_splitasm.py", "-o", output_path + file_prefix + ".ctg.agp", reference_fn ] reference_ctg_fn = output_path + file_prefix + ".ctg.fasta" if os.path.isfile(reference_ctg_fn): if overwrite_files: log("INFO", "Overwriting pre-existing file: " + reference_ctg_fn) run_oae(cmd, reference_ctg_fn, ragtag_log) else: log("INFO", "Retaining pre-existing file: " + reference_ctg_fn) else: run_oae(cmd, reference_ctg_fn, ragtag_log) # Rename the query sequences cmd = [ "ragtag_rename.py", query_fn, "-p", "qseq", "-o", output_path + file_prefix + ".rename.agp", ] query_rename_fn = output_path + file_prefix + ".rename.fasta" if os.path.isfile(query_rename_fn): if overwrite_files: log("INFO", "Overwriting pre-existing file: " + query_rename_fn) run_oae(cmd, query_rename_fn, ragtag_log) else: log("INFO", "Retaining pre-existing file: " + query_rename_fn) else: run_oae(cmd, query_rename_fn, ragtag_log) # Combine the reference contigs and query sequences to make a components fasta file components_fn = output_path + file_prefix + ".comps.fasta" if os.path.isfile(components_fn): if overwrite_files: log("INFO", "Overwriting pre-existing file: " + components_fn) write_comps = True else: log("INFO", "Retaining pre-existing file: " + components_fn) write_comps = False else: write_comps = True if write_comps: log("INFO", "Writing: " + components_fn) ref_fai = pysam.FastaFile(reference_ctg_fn) query_fai = pysam.FastaFile(query_rename_fn) with open(components_fn, "w") as f: for ref in ref_fai.references: f.write(">" + ref + "\n") f.write(ref_fai.fetch(ref) + "\n") for query in query_fai.references: f.write(">" + query + "\n") f.write(query_fai.fetch(query) + "\n") # Map the query assembly to the reference contigs log("INFO", "Mapping the query genome to the target genome") if aligner == "minimap2": al = Minimap2Aligner(reference_ctg_fn, [query_rename_fn], aligner_path, mm2_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files) elif aligner == "unimap": al = 
UnimapAligner(reference_ctg_fn, [query_rename_fn], aligner_path, unimap_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files) else: al = NucmerAligner(reference_ctg_fn, [query_rename_fn], aligner_path, nucmer_params, output_path + file_prefix + ".asm", in_overwrite=overwrite_files) al.run_aligner() # If alignments are from Nucmer, need to convert from delta to paf if aligner == "nucmer": cmd = ["ragtag_delta2paf.py", output_path + file_prefix + ".asm.delta"] run_oae(cmd, output_path + file_prefix + ".asm.paf", ragtag_log) # Read and organize the alignments log("INFO", "Reading whole genome alignments") # ctg_alns: query header -> ContigAlignment object ctg_alns = read_genome_alignments(output_path + file_prefix + ".asm.paf", query_blacklist, ref_blacklist) # Check if any alignments are left if not ctg_alns: raise RuntimeError("There are no alignments. Check '{}'.".format(output_path + file_prefix + ".asm.paf")) # Filter the alignments unfiltered_strings, filtered_strings, merged_strings, useful_strings = [], [], [], [] log("INFO", "Filtering and merging alignments") fltrd_ctg_alns = dict() for i in ctg_alns: # Unique anchor filtering unfiltered_strings.append(str(ctg_alns[i])) ctg_alns[i] = ctg_alns[i].unique_anchor_filter(min_ulen, keep_small=keep_small_uniques) # mapq filtering if ctg_alns[i] is not None: ctg_alns[i] = ctg_alns[i].filter_mapq(min_mapq) if ctg_alns[i] is not None: filtered_strings.append(str(ctg_alns[i])) # alignment merging ctg_alns[i] = ctg_alns[i].merge_alns(merge_dist=merge_dist, careful_merge=True) if ctg_alns[i] is not None: merged_strings.append(str(ctg_alns[i])) # Length filtering ctg_alns[i] = ctg_alns[i].filter_lengths(min_sup_aln_len) if ctg_alns[i] is not None: # terminal filtering ctg_alns[i] = ctg_alns[i].keep_terminals(max_term_dist) # Save the remaining useful alignments if ctg_alns[i] is not None and ctg_alns[i].num_refs > 1 and not ctg_alns[i].has_internal_ref_cuttings(max_term_dist): useful_strings.append(str(ctg_alns[i])) fltrd_ctg_alns[i] = ctg_alns[i] # Write debugging files debug_non_fltrd_file = output_path + file_prefix + ".debug.unfiltered.paf" debug_fltrd_file = output_path + file_prefix + ".debug.filtered.paf" debug_merged_file = output_path + file_prefix + ".debug.merged.paf" debug_useful_file = output_path + file_prefix + ".debug.useful.paf" if debug_mode: with open(debug_non_fltrd_file, "w") as f: f.write("".join(unfiltered_strings)) with open(debug_fltrd_file, "w") as f: f.write("".join(filtered_strings)) with open(debug_merged_file, "w") as f: f.write("".join(merged_strings)) with open(debug_useful_file, "w") as f: f.write("".join(useful_strings)) # Make a Scaffold Graph encoding known reference contigs adjacencies log("INFO", "Building a scaffold graph from the contig AGP file") agp_multi_sg = AGPMultiScaffoldGraph(reference_ctg_fn) agp_multi_sg.add_agps([output_path + file_prefix + ".ctg.agp"]) agp_sg = agp_multi_sg.merge() # As a hack, go through the AGP sg and make the required directed scaffold graph agp_psg = PatchScaffoldGraph(components_fn) for u, v in agp_sg.edges: aln = Alignment( u, v, "", agp_sg[u][v]["gap_size"][0], 0, agp_sg[u][v]["gap_size"][0], 0, is_gap=True ) agp_psg.add_edge(u, v, aln) # Make a second directed scaffold graph from the alignments log("INFO", "Building a scaffold graph from the target/query mappings") aln_psg = build_aln_scaffold_graph(fltrd_ctg_alns, components_fn, max_term_dist) # Add edges for unfilled gaps for u, v in agp_psg.edges: if not aln_psg.has_edge(u, v): aln_psg.add_edge(u, 
v, agp_psg[u][v]["alignment"]) # Remove known false edges for u, v in agp_psg.edges: for neighbor in list(aln_psg.neighbors(u)): if neighbor != v: aln_psg.remove_edge(u, neighbor) aln_psg.remove_edge(neighbor, u) for neighbor in list(aln_psg.neighbors(v)): if neighbor != u: aln_psg.remove_edge(neighbor, v) aln_psg.remove_edge(v, neighbor) # Adjust the graph depending on if only fills or joins are requested if fill_only: psg = PatchScaffoldGraph(components_fn) for u, v in agp_psg.edges: psg.add_edge(u, v, aln_psg[u][v]["alignment"]) psg.add_edge(v, u, aln_psg[v][u]["alignment"]) aln_psg = psg if join_only: for u, v in agp_psg.edges: aln_psg[u][v]["alignment"] = agp_psg[u][v]["alignment"] aln_psg[v][u]["alignment"] = agp_psg[v][u]["alignment"] if debug_mode: aln_psg.write_gml(output_path + file_prefix + ".debug.sg.gml") # Compute a matching solution for the graph log("INFO", "Computing a matching solution to the scaffold graph") match_psg = aln_psg.max_weight_matching() if debug_mode: match_psg.write_gml(output_path + file_prefix + ".debug.matching.gml") # Write the output in AGP format log("INFO", "Writing output files") match_psg.write_agp(output_path + file_prefix + ".agp", output_path + file_prefix + ".ctg.fasta", add_suffix_to_unplaced=add_suffix) # Write the output in fasta format cmd = [ "ragtag_agp2fa.py", output_path + file_prefix + ".agp", components_fn ] run_oae(cmd, output_path + file_prefix + ".fasta", ragtag_log) log("INFO", "Goodbye")