Пример #1
0
def main():
    """Run the 'ragtag.py scaffold' command-line workflow.

    Parses the command line, aligns the query assembly to the reference with
    minimap2 or nucmer, filters and merges the alignments, orders and orients
    the query sequences along the reference, and writes an AGP file, a FASTA
    file and a stats file into the output directory.

    Raises:
        SystemExit: with a non-zero status when the reference or query FASTA
            file is not given on the command line.
        ValueError: when an input FASTA file does not exist, a gap size is
            not positive, or '--aligner' is neither 'minimap2' nor 'nucmer'.
    """
    parser = argparse.ArgumentParser(
        description='Reference-guided scaffolding',
        usage="ragtag.py scaffold <reference.fa> <query.fa>")

    parser.add_argument("reference", metavar="<reference.fa>", nargs='?', default="", type=str,
                        help="reference fasta file (uncompressed or bgzipped)")
    parser.add_argument("query", metavar="<query.fa>", nargs='?', default="", type=str,
                        help="query fasta file (uncompressed or bgzipped)")

    scaf_options = parser.add_argument_group("scaffolding options")
    scaf_options.add_argument("-e", metavar="<exclude.txt>", type=str, default="",
                              help="list of reference headers to ignore [null]")
    scaf_options.add_argument("-j", metavar="<skip.txt>", type=str, default="",
                              help="list of query headers to leave unplaced [null]")
    scaf_options.add_argument("-f", metavar="INT", type=int, default=1000,
                              help="minimum unique alignment length [1000]")
    scaf_options.add_argument("--remove-small", action="store_true", default=False,
                              help="remove unique alignments shorter than -f")
    scaf_options.add_argument("-q", metavar="INT", type=int, default=10,
                              help="minimum mapq (NA for Nucmer alignments) [10]")
    scaf_options.add_argument("-d", metavar="INT", type=int, default=100000,
                              help="alignment merge distance [100000]")
    scaf_options.add_argument("-i", metavar="FLOAT", type=float, default=0.2,
                              help="minimum grouping confidence score [0.2]")
    scaf_options.add_argument("-a", metavar="FLOAT", type=float, default=0.0,
                              help="minimum location confidence score [0.0]")
    scaf_options.add_argument("-s", metavar="FLOAT", type=float, default=0.0,
                              help="minimum orientation confidence score [0.0]")
    scaf_options.add_argument("-C", action='store_true', default=False,
                              help="concatenate unplaced contigs and make 'chr0'")
    scaf_options.add_argument("-r", action='store_true', default=False,
                              help="infer gap sizes. if not, all gaps are 100 bp")
    scaf_options.add_argument("-g", metavar="INT", type=int, default=100,
                              help="minimum inferred gap size [100]")
    scaf_options.add_argument("-m", metavar="INT", type=int, default=100000,
                              help="maximum inferred gap size [100000]")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output",
                            help="output directory [./ragtag_output]")
    io_options.add_argument("-w", action='store_true', default=False,
                            help="overwrite intermediate files")
    io_options.add_argument("-u", action='store_true', default=False,
                            help="add suffix to unplaced sequence headers")
    io_options.add_argument("--debug", action='store_true', default=False,
                            help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    aln_options.add_argument("-t", metavar="INT", type=int, default=1,
                             help="number of minimap2 threads [1]")
    aln_options.add_argument("--aligner", metavar="PATH", type=str, default="minimap2",
                             help="aligner executable ('nucmer' or 'minimap2') [minimap2]")
    mm2_default = "-x asm5"
    aln_options.add_argument("--mm2-params", metavar="STR", type=str, default=mm2_default,
                             help="space delimited minimap2 parameters ['%s']" % mm2_default)
    aln_options.add_argument("--nucmer-params", metavar="STR", type=str, default="-l 100 -c 500",
                             help="space delimited nucmer parameters ['-l 100 -c 500']")

    args = parser.parse_args()
    if not args.reference or not args.query:
        parser.print_help()
        # A message passed to sys.exit() is written to stderr and the process
        # exits with status 1, so callers can detect the usage error.
        sys.exit("\n** The reference and query FASTA files are required **")

    log("RagTag " + get_ragtag_version())
    log("CMD: ragtag.py scaffold " + " ".join(sys.argv[1:]))

    reference_file = os.path.abspath(args.reference)
    query_file = os.path.abspath(args.query)

    # Check that the reference/query file exists
    if not os.path.isfile(reference_file):
        raise ValueError("Could not find file: %s" % reference_file)

    if not os.path.isfile(query_file):
        raise ValueError("Could not find file: %s" % query_file)

    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    group_score_thresh = args.i
    loc_score_thresh = args.a
    orient_score_thresh = args.s
    make_chr0 = args.C
    infer_gaps = args.r
    num_threads = args.t

    # I/O options
    # makedirs also creates missing parent directories and tolerates the
    # directory already existing.
    os.makedirs(args.o, exist_ok=True)
    output_path = os.path.abspath(args.o) + "/"

    # Setup a log file for external RagTag scripts
    ragtag_log = output_path + "ragtag.scaffold.err"
    open(ragtag_log, "w").close()  # Wipe the log file

    overwrite_files = args.w
    remove_suffix = not args.u
    if remove_suffix:
        log("WARNING: Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'.")

    # Gap options
    min_gap_size = args.g
    max_gap_size = args.m
    if min_gap_size < 1:
        raise ValueError("the minimum gap size must be positive")

    if max_gap_size < 1:
        raise ValueError("the maximum gap size must be positive")

    # Skip/exclude options: one header per line
    query_blacklist = set()
    skip_file = args.j
    if skip_file:
        skip_file = os.path.abspath(skip_file)
        with open(skip_file, "r") as f:
            for line in f:
                query_blacklist.add(line.rstrip())

    ref_blacklist = set()
    exclude_file = args.e
    if exclude_file:
        exclude_file = os.path.abspath(exclude_file)
        with open(exclude_file, "r") as f:
            for line in f:
                ref_blacklist.add(line.rstrip())

    # Get aligner arguments. Only the executable's base name decides which
    # aligner is used, so a full path to the executable is allowed.
    aligner_path = args.aligner
    aligner = aligner_path.split("/")[-1]
    if aligner not in {'minimap2', 'nucmer'}:
        raise ValueError(
            "Must specify either 'minimap2' or 'nucmer' (PATHs allowed) with '--aligner'."
        )

    mm2_params = args.mm2_params
    nucmer_params = args.nucmer_params

    # Mapq filtering params (no mapq filtering for nucmer alignments)
    min_mapq = args.q
    if aligner == "nucmer":
        min_mapq = 0

    # Add the number of mm2 threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)

    # Debugging options
    debug_mode = args.debug
    debug_non_fltrd_file = output_path + "ragtag.scaffolds.debug.unfiltered.paf"
    debug_fltrd_file = output_path + "ragtag.scaffolds.debug.filtered.paf"
    debug_merged_file = output_path + "ragtag.scaffolds.debug.merged.paf"
    debug_query_info_file = output_path + "ragtag.scaffolds.debug.query.info.txt"

    # Align the query to the reference
    log("Mapping the query genome to the reference genome")
    if aligner == "minimap2":
        al = Minimap2Aligner(reference_file, [query_file], aligner_path, mm2_params,
                             output_path + "query_against_ref",
                             in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_file, [query_file], aligner_path, nucmer_params,
                           output_path + "query_against_ref",
                           in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, need to convert from delta to paf
    if aligner == "nucmer":
        cmd = ["ragtag_delta2paf.py", output_path + "query_against_ref.delta"]
        run_oae(cmd, output_path + "query_against_ref.paf", ragtag_log)

    # Read and organize the alignments
    log('Reading whole genome alignments')
    # ctg_alns = dict :: key=query header, value=ContigAlignment object
    ctg_alns = read_genome_alignments(output_path + "query_against_ref.paf",
                                      query_blacklist, ref_blacklist)

    # Filter the alignments
    if debug_mode:
        # create new empty copies of debugging output files
        open(debug_non_fltrd_file, "w").close()
        open(debug_fltrd_file, "w").close()
        open(debug_merged_file, "w").close()
        open(debug_query_info_file, "w").close()

    log("Filtering and merging alignments")
    for q_header in ctg_alns:

        # Write unfiltered alignments
        if debug_mode:
            with open(debug_non_fltrd_file, "a") as f:
                f.write(str(ctg_alns[q_header]))

        # Each filter may return None; the entry is then left as None and
        # dropped in the next pass.
        ctg_alns[q_header] = ctg_alns[q_header].unique_anchor_filter(
            min_ulen, keep_small=keep_small_uniques)
        if ctg_alns[q_header] is None:
            continue

        ctg_alns[q_header] = ctg_alns[q_header].filter_mapq(min_mapq)
        if ctg_alns[q_header] is None:
            continue

        # Write filtered alignments
        if debug_mode:
            with open(debug_fltrd_file, "a") as f:
                f.write(str(ctg_alns[q_header]))

        ctg_alns[q_header] = ctg_alns[q_header].merge_alns(merge_dist=merge_dist)

    # Remove query sequences which have no more qualifying alignments
    fltrd_ctg_alns = dict()
    for q_header in ctg_alns:
        ctg_aln = ctg_alns[q_header]
        if ctg_aln is None:
            continue

        # Write merged alignments and confidence scores
        if debug_mode:
            with open(debug_merged_file, "a") as f:
                f.write(str(ctg_aln))

            with open(debug_query_info_file, "a") as f:
                f.write("\t".join([
                    q_header,
                    ctg_aln.best_ref_header,
                    str(ctg_aln.grouping_confidence),
                    str(ctg_aln.location_confidence),
                    str(ctg_aln.orientation_confidence),
                ]) + "\n")

        # All three confidence scores must strictly exceed their thresholds
        if all([
                ctg_aln.grouping_confidence > group_score_thresh,
                ctg_aln.location_confidence > loc_score_thresh,
                ctg_aln.orientation_confidence > orient_score_thresh
        ]):
            fltrd_ctg_alns[q_header] = ctg_aln

    # For each reference sequence which has at least one assigned query sequence, get the list of
    # all query sequences assigned to that reference sequence.
    log("Ordering and orienting query sequences")
    mapped_ref_seqs = defaultdict(list)
    for q_header in fltrd_ctg_alns:
        best_ref = fltrd_ctg_alns[q_header].best_ref_header
        ref_start, ref_end = fltrd_ctg_alns[q_header].get_best_ref_pos()
        mapped_ref_seqs[best_ref].append((ref_start, ref_end, q_header))

    # Sort the query sequences for each reference sequence and define the padding sizes between adjacent query seqs
    g_inferred = 0
    g_small = 0
    g_large = 0
    pad_sizes = dict()
    gap_types = dict()
    for ref_header in mapped_ref_seqs:
        # Remove contained contigs and sort the rest
        non_contained = remove_contained(mapped_ref_seqs[ref_header])
        mapped_ref_seqs[ref_header] = sorted(non_contained)
        placed = mapped_ref_seqs[ref_header]

        if not infer_gaps:
            # Every gap between adjacent query sequences gets the fixed 100 bp size
            n_gaps = len(placed) - 1
            pad_sizes[ref_header] = [100] * n_gaps
            gap_types[ref_header] = ["U"] * n_gaps
            continue

        # Infer the gap sizes between adjacent query seqs
        # Use the primary alignments to infer gap sizes
        pad_sizes[ref_header] = []
        gap_types[ref_header] = []
        for j in range(1, len(placed)):
            # Get info for the upstream alignment
            left_ctg = placed[j - 1][2]
            _, left_ref_end = fltrd_ctg_alns[left_ctg].get_best_ref_pos()
            _, left_qdist_end = fltrd_ctg_alns[left_ctg].get_best_q_dist()

            # Get info for the downstream alignment
            right_ctg = placed[j][2]
            right_ref_start, _ = fltrd_ctg_alns[right_ctg].get_best_ref_pos()
            right_qdist_start, _ = fltrd_ctg_alns[right_ctg].get_best_q_dist()

            # Get the inferred gap size
            i_gap_size = (right_ref_start - right_qdist_start) - (
                left_ref_end + left_qdist_end)

            # Check if the gap size is too small or too large; in both
            # cases fall back to a 100 bp gap of type "U"
            if i_gap_size <= min_gap_size:
                pad_sizes[ref_header].append(100)
                gap_types[ref_header].append("U")
                g_small += 1
            elif i_gap_size > max_gap_size:
                pad_sizes[ref_header].append(100)
                gap_types[ref_header].append("U")
                g_large += 1
            else:
                pad_sizes[ref_header].append(i_gap_size)
                gap_types[ref_header].append("N")
                g_inferred += 1

    if infer_gaps:
        log("%d inferred gap" % g_inferred)
        log("%d adjacent contig within min distance (%d) of each other" %
            (g_small, min_gap_size))
        log("%d inferred gaps exceed length threshold (%d)" %
            (g_large, max_gap_size))

    # Write the scaffolds
    log("Writing scaffolds")

    # Write the intermediate output file in AGP v2.1 format
    log("Writing: " + output_path + "ragtag.scaffolds.agp")
    write_orderings(output_path + "ragtag.scaffolds.agp",
                    output_path + "ragtag.confidence.txt", query_file,
                    mapped_ref_seqs, fltrd_ctg_alns, pad_sizes, gap_types,
                    make_chr0, True, not remove_suffix)

    # Build a FASTA from the AGP
    cmd = [
        "ragtag_agp2fasta.py", output_path + "ragtag.scaffolds.agp", query_file
    ]
    run_oae(cmd, output_path + "ragtag.scaffolds.fasta", ragtag_log)

    # Calculate the stats
    cmd = [
        "ragtag_stats.py", output_path + "ragtag.scaffolds.agp",
        output_path + "ragtag.confidence.txt"
    ]
    run_oae(cmd, output_path + "ragtag.scaffolds.stats", ragtag_log)

    log("Goodbye")
Пример #2
0
def main():
    """Run the 'ragtag.py correct' command-line workflow.

    Parses the command line, aligns the query assembly to the reference with
    minimap2 or nucmer, filters and merges the alignments, collects putative
    misassembly breakpoints per query sequence, optionally validates them
    against read alignments and GFF intervals, then writes the breaks as an
    AGP file and the broken query sequences as a FASTA file.

    Raises:
        SystemExit: with a non-zero status when the reference or query FASTA
            file is not given on the command line.
        ValueError: when an input FASTA file does not exist, or when an
            aligner, '--inter'/'--intra', '-T' or coverage option is invalid.
    """
    parser = argparse.ArgumentParser(
        description='Reference-guided misassembly correction',
        usage="ragtag.py correct <reference.fa> <query.fa>")

    cor_options = parser.add_argument_group("correction options")
    cor_options.add_argument("reference", metavar="<reference.fa>", nargs='?', default="", type=str,
                             help="reference fasta file (can be uncompressed or bgzipped)")
    cor_options.add_argument("query", metavar="<query.fa>", nargs='?', default="", type=str,
                             help="query fasta file (can be uncompressed or bgzipped)")
    cor_options.add_argument("-f", metavar="INT", type=int, default=1000,
                             help="minimum unique alignment length [1000]")
    cor_options.add_argument("--remove-small", action="store_true", default=False,
                             help="remove unique alignments shorter than -f")
    cor_options.add_argument("-q", metavar="INT", type=int, default=10,
                             help="minimum mapq (NA for Nucmer alignments) [10]")
    cor_options.add_argument("-d", metavar="INT", type=int, default=100000,
                             help="alignment merge distance [100000]")
    cor_options.add_argument("-b", metavar="INT", type=int, default=5000,
                             help="minimum break distance from contig ends [5000]")
    cor_options.add_argument("-e", metavar="<exclude.txt>", type=str, default="",
                             help="list of reference headers to ignore")
    cor_options.add_argument("-j", metavar="<skip.txt>", type=str, default="",
                             help="list of query headers to leave uncorrected")
    cor_options.add_argument("--inter", action="store_true", default=False,
                             help="only break misassemblies between reference sequences")
    cor_options.add_argument("--intra", action="store_true", default=False,
                             help="only break misassemblies within reference sequences")
    cor_options.add_argument("--gff", metavar="<features.gff>", type=str, default="",
                             help="don't break sequences within gff intervals")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output",
                            help="output directory [./ragtag_output]")
    io_options.add_argument("-w", action='store_true', default=False,
                            help="overwrite intermediate files")
    io_options.add_argument("-u", action='store_true', default=False,
                            help="add suffix to unaltered sequence headers")
    io_options.add_argument("--debug", action='store_true', default=False,
                            help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    mm2_default = "-x asm5"
    aln_options.add_argument("-t", metavar="INT", type=int, default=1,
                             help="number of minimap2 threads [1]")
    aln_options.add_argument("--aligner", metavar="PATH", type=str, default="minimap2",
                             help="whole genome aligner executable ('nucmer' or 'minimap2') [minimap2]")
    aln_options.add_argument("--mm2-params", metavar="STR", type=str, default=mm2_default,
                             help="space delimited minimap2 whole genome alignment parameters ['%s']"
                             % mm2_default)
    aln_options.add_argument("--nucmer-params", metavar="STR", type=str, default="-l 100 -c 500",
                             help="space delimited nucmer whole genome alignment parameters ['-l 100 -c 500']")

    val_options = parser.add_argument_group("validation options")
    val_options.add_argument("--read-aligner", metavar="PATH", type=str, default="minimap2",
                             help="read aligner executable (only 'minimap2' is allowed) [minimap2]")
    val_options.add_argument("-R", metavar="<reads.fasta>", type=str, default="",
                             help="validation reads. gzipped fastq or fasta allowed.")
    val_options.add_argument("-F", metavar="<reads.fofn>", type=str, default="",
                             help="same as '-R', but a list of files.")
    val_options.add_argument("-T", metavar="sr", type=str, default="",
                             help="read type. 'sr' and 'corr' accepted for short reads and error corrected long-reads, respectively.")
    val_options.add_argument("-v", metavar="INT", type=int, default=10000,
                             help="coverage validation window size [10000]")
    val_options.add_argument("--max-cov", metavar="INT", type=int, default=-1,
                             help="break sequences at regions at or above this coverage level [AUTO]")
    val_options.add_argument("--min-cov", metavar="INT", type=int, default=-1,
                             help="break sequences at regions at or below this coverage level [AUTO]")
    # Merge breakpoints within this distance after validation
    val_options.add_argument("-m", metavar="INT", type=int, default=1000, help=argparse.SUPPRESS)

    args = parser.parse_args()

    if not args.reference or not args.query:
        parser.print_help()
        # A message passed to sys.exit() is written to stderr and the process
        # exits with status 1, so callers can detect the usage error.
        sys.exit("\n** The reference and query FASTA files are required **")

    log("RagTag " + get_ragtag_version())
    log("CMD: " + " ".join(sys.argv))

    reference_file = os.path.abspath(args.reference)
    query_file = os.path.abspath(args.query)

    # Check that the reference/query file exists
    if not os.path.isfile(reference_file):
        raise ValueError("Could not find file: %s" % reference_file)

    if not os.path.isfile(query_file):
        raise ValueError("Could not find file: %s" % query_file)

    num_threads = args.t
    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    min_break_dist = args.m
    min_break_end_dist = args.b
    val_window_size = args.v

    # I/O options
    # makedirs also creates missing parent directories and tolerates the
    # directory already existing.
    os.makedirs(args.o, exist_ok=True)
    output_path = os.path.abspath(args.o) + "/"

    overwrite_files = args.w
    remove_suffix = not args.u
    if remove_suffix:
        log("WARNING: Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'.")

    gff_file = args.gff
    if gff_file:
        gff_file = os.path.abspath(gff_file)

    # Skip/exclude options: one header per line
    query_blacklist = set()
    skip_file = args.j
    if skip_file:
        skip_file = os.path.abspath(skip_file)
        with open(skip_file, "r") as f:
            for line in f:
                query_blacklist.add(line.rstrip())

    ref_blacklist = set()
    exclude_file = args.e
    if exclude_file:
        exclude_file = os.path.abspath(exclude_file)
        with open(exclude_file, "r") as f:
            for line in f:
                ref_blacklist.add(line.rstrip())

    # Get aligner arguments. Only the executable's base name decides which
    # aligner is used, so a full path to the executable is allowed.
    genome_aligner_path = args.aligner
    genome_aligner = genome_aligner_path.split("/")[-1]
    if genome_aligner not in {'minimap2', 'nucmer'}:
        raise ValueError(
            "Must specify either 'minimap2' or 'nucmer' (PATHs allowed) with '--aligner'."
        )

    mm2_params = args.mm2_params
    nucmer_params = args.nucmer_params

    # Mapq filtering params (no mapq filtering for nucmer alignments)
    min_mapq = args.q
    if genome_aligner == "nucmer":
        min_mapq = 0

    # Add the number of mm2 threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)

    # Check if intra/inter breaking is desired
    only_intra = args.intra
    only_inter = args.inter
    if only_intra and only_inter:
        raise ValueError(
            "Must specify either '--inter' or '--intra', not both.")

    break_inter = not only_intra
    break_intra = not only_inter

    # read-alignment parameters
    val_reads = args.R
    val_reads_fofn = args.F
    val_reads_tech = args.T
    read_aligner_path = args.read_aligner
    read_aligner = read_aligner_path.split("/")[-1]
    if read_aligner != "minimap2":
        raise ValueError(
            "Only minimap2 can be used for read alignments. got: %s" %
            read_aligner)

    # If the genome aligner is minimap2, we can just use that path for read alignment
    if genome_aligner == 'minimap2':
        read_aligner_path = genome_aligner_path

    # Make sure that if -R or -F, -T has been specified.
    if val_reads or val_reads_fofn:
        if not val_reads_tech:
            raise ValueError("'-T' must be provided when using -R or -F.")
        # Reject an unsupported read type now instead of after the
        # whole-genome alignment has already been computed.
        if val_reads_tech not in {"sr", "corr"}:
            raise ValueError("'-T' must be either 'sr' or 'corr'.")

    # Make a list of read sequences.
    read_files = []
    if val_reads_fofn:
        with open(val_reads_fofn, "r") as f:
            for line in f:
                read_path = line.rstrip()
                # Skip blank lines: abspath("") would resolve to the current
                # working directory and be passed on as a reads file.
                if read_path:
                    read_files.append(os.path.abspath(read_path))
    elif val_reads:
        read_files.append(os.path.abspath(val_reads))

    # Coverage thresholds: -1 is the "AUTO" default, any other negative
    # value is rejected.
    max_cov = args.max_cov
    min_cov = args.min_cov

    if max_cov < -1:
        raise ValueError("--max-cov must be >=0")

    if min_cov < -1:
        raise ValueError("--min-cov must be >=0")

    # Debugging options
    debug_mode = args.debug
    debug_non_fltrd_file = output_path + "ragtag.correction.debug.unfiltered.paf"
    debug_fltrd_file = output_path + "ragtag.correction.debug.filtered.paf"
    debug_merged_file = output_path + "ragtag.correction.debug.merged.paf"
    debug_query_info_file = output_path + "ragtag.correction.debug.query.info.txt"

    # Align the query to the reference.
    log("Mapping the query genome to the reference genome")
    if genome_aligner == "minimap2":
        al = Minimap2Aligner(reference_file, [query_file], genome_aligner_path, mm2_params,
                             output_path + "c_query_against_ref",
                             in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_file, [query_file], genome_aligner_path, nucmer_params,
                           output_path + "c_query_against_ref",
                           in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, convert from delta to paf.
    if genome_aligner == "nucmer":
        cmd = [
            "ragtag_delta2paf.py", output_path + "c_query_against_ref.delta"
        ]
        run_o(cmd, output_path + "c_query_against_ref.paf")

    # Read and organize the alignments.
    log('Reading whole genome alignments')
    # ctg_alns = dict :: key=query header, value=ContigAlignment object
    ctg_alns = read_genome_alignments(output_path + "c_query_against_ref.paf",
                                      query_blacklist, ref_blacklist)

    # Filter and merge the alignments.
    if debug_mode:
        # create new empty copies of debugging output files
        open(debug_non_fltrd_file, "w").close()
        open(debug_fltrd_file, "w").close()
        open(debug_merged_file, "w").close()
        open(debug_query_info_file, "w").close()

    log("Filtering and merging alignments")
    for q_header in ctg_alns:

        # Write unfiltered alignments
        if debug_mode:
            with open(debug_non_fltrd_file, "a") as f:
                f.write(str(ctg_alns[q_header]))

        # Each filter may return None; the entry is then left as None and
        # skipped when collecting breakpoints.
        ctg_alns[q_header] = ctg_alns[q_header].unique_anchor_filter(
            min_ulen, keep_small=keep_small_uniques)
        if ctg_alns[q_header] is None:
            continue

        ctg_alns[q_header] = ctg_alns[q_header].filter_mapq(min_mapq)
        if ctg_alns[q_header] is None:
            continue

        # Write filtered alignments
        if debug_mode:
            with open(debug_fltrd_file, "a") as f:
                f.write(str(ctg_alns[q_header]))

        ctg_alns[q_header] = ctg_alns[q_header].merge_alns(merge_dist=merge_dist)

    # Get the putative breakpoints for each query sequence, if any.
    ctg_breaks = dict()
    for q_header in ctg_alns:
        ctg_aln = ctg_alns[q_header]
        if ctg_aln is None:
            continue

        # Write merged alignments and confidence scores
        if debug_mode:
            with open(debug_merged_file, "a") as f:
                f.write(str(ctg_aln))

            with open(debug_query_info_file, "a") as f:
                f.write("\t".join([
                    q_header,
                    ctg_aln.best_ref_header,
                    str(ctg_aln.grouping_confidence),
                    str(ctg_aln.location_confidence),
                    str(ctg_aln.orientation_confidence),
                ]) + "\n")

        breaks = []
        intra_breaks, inter_breaks = ctg_aln.get_break_candidates(
            min_dist=min_break_end_dist)
        if break_intra:
            breaks = breaks + intra_breaks
        if break_inter:
            breaks = breaks + inter_breaks
        if breaks:
            ctg_breaks[q_header] = breaks

    # If desired, validate the putative breakpoints by observing read coverage.
    if read_files:
        log("Validating putative query breakpoints via read alignment.")
        log("Aligning reads to query sequences.")
        if not os.path.isfile(output_path + "c_reads_against_query.s.bam"):
            # '-T' was validated above, so it is either 'sr' or 'corr' here.
            if val_reads_tech == "sr":
                read_mm2_params = "-ax sr -t " + str(num_threads)
            else:
                read_mm2_params = "-ax asm5 -t " + str(num_threads)
            al = Minimap2SAMAligner(query_file, read_files, read_aligner_path,
                                    read_mm2_params,
                                    output_path + "c_reads_against_query",
                                    in_overwrite=overwrite_files)
            al.run_aligner()
        else:
            log("Retaining pre-existing read alignments: " + output_path +
                "c_reads_against_query.s.bam")

        # Compress, sort and index the alignments.
        log("Compressing, sorting, and indexing read alignments")
        run_samtools(output_path, num_threads, overwrite_files)

        # Validate the breakpoints
        log("Validating putative query breakpoints")

        # Give at least 10k/1k from ctg ends for coverage to accumulate for corr and sr, respectively.
        val_min_break_end_dist = min_break_end_dist
        if val_reads_tech == "corr":
            val_min_break_end_dist = max(10000, min_break_end_dist)
        if val_reads_tech == "sr":
            val_min_break_end_dist = max(1000, min_break_end_dist)

        # Validate the breakpoints
        ctg_breaks = validate_breaks(ctg_breaks,
                                     output_path,
                                     num_threads,
                                     overwrite_files,
                                     val_min_break_end_dist,
                                     max_cov,
                                     min_cov,
                                     window_size=val_window_size,
                                     clean_dist=min_break_dist,
                                     debug=debug_mode)

    # Check if we need to avoid gff intervals
    if gff_file:
        log("Avoiding breaks within GFF intervals")
        it = make_gff_interval_tree(gff_file)
        non_gff_breaks = dict()
        for ctg in ctg_breaks:
            new_breaks = []
            for break_pos in ctg_breaks[ctg]:
                if it[ctg][break_pos]:
                    log("Avoiding breaking %s at %d. This point intersects a feature in the gff file."
                        % (ctg, break_pos))
                else:
                    new_breaks.append(break_pos)
            if new_breaks:
                non_gff_breaks[ctg] = new_breaks
        ctg_breaks = non_gff_breaks

    # Write the summary of query sequence breaks in AGP format
    agp_file = output_path + "ragtag.correction.agp"
    write_breaks(agp_file, query_file, ctg_breaks, overwrite_files,
                 remove_suffix)

    # Write the scaffolds.
    log("Writing broken contigs")
    qf_name = query_file.split("/")[-1]
    # Strip the last extension. rfind() returns -1 when there is no '.',
    # and slicing with -1 would silently drop the final character.
    dot_idx = qf_name.rfind(".")
    qf_pref = qf_name[:dot_idx] if dot_idx != -1 else qf_name
    cmd = ["ragtag_break_query.py", agp_file, query_file]
    run_o(cmd, output_path + qf_pref + ".corrected.fasta")

    log("Goodbye")
Пример #3
0
def _patch_read_header_set(list_fn):
    """Return the set of sequence headers listed one per line in 'list_fn'.

    Only trailing whitespace (including the newline) is stripped from each line.
    """
    with open(os.path.abspath(list_fn), "r") as f:
        return {line.rstrip() for line in f}


def _patch_run_unless_retained(cmd, out_fn, log_fn, overwrite):
    """Call run_oae(cmd, out_fn, log_fn) unless 'out_fn' exists and must be kept.

    A pre-existing 'out_fn' is only regenerated when 'overwrite' is True. Either way,
    the decision is reported through log().
    """
    if os.path.isfile(out_fn):
        if not overwrite:
            log("INFO", "Retaining pre-existing file: " + out_fn)
            return
        log("INFO", "Overwriting pre-existing file: " + out_fn)
    run_oae(cmd, out_fn, log_fn)


def _patch_write_components(components_fn, fasta_fns):
    """Concatenate every record of the indexed fasta files 'fasta_fns' into 'components_fn'.

    All inputs are opened before the output file is created, so a missing or broken
    input never leaves a partial components file behind. Every pysam handle is closed
    when done, even if writing fails.
    """
    handles = []
    try:
        for fasta_fn in fasta_fns:
            handles.append(pysam.FastaFile(fasta_fn))

        with open(components_fn, "w") as f:
            for fai in handles:
                for header in fai.references:
                    f.write(">" + header + "\n")
                    f.write(fai.fetch(header) + "\n")
    finally:
        for fai in handles:
            fai.close()


def main():
    """Command-line entry point for 'ragtag.py patch'.

    Steps, in order:
      1. Parse and validate the command-line arguments.
      2. Split the target assembly at gaps and rename the query sequences (external scripts).
      3. Write a components fasta file holding the target contigs and the renamed queries.
      4. Align the renamed query to the target contigs and filter/merge the alignments.
      5. Build a scaffold graph from the contig AGP file and another from the alignments,
         combine them, and compute a matching.
      6. Write the result in AGP and fasta format.

    Raises:
        SystemExit: if the target or query fasta file is not given.
        FileNotFoundError: if the target or query fasta file does not exist.
        ValueError: for an unsupported aligner, a non-positive '-i', or when both
            '--fill-only' and '--join-only' are set.
        RuntimeError: if no alignments are read from the PAF file.
    """
    description = "Homology-based assembly patching: Make continuous joins and fill gaps " \
                  "in 'target.fa' using sequences from 'query.fa'"

    parser = argparse.ArgumentParser(description=description, usage="ragtag.py patch <target.fa> <query.fa>")

    parser.add_argument("reference", metavar="<target.fa>", nargs='?', default="", type=str, help="target fasta file (uncompressed or bgzipped)")
    parser.add_argument("query", metavar="<query.fa>", nargs='?', default="", type=str, help="query fasta file (uncompressed or bgzipped)")

    patch_options = parser.add_argument_group("patching")
    patch_options.add_argument("-e", metavar="<exclude.txt>", type=str, default="", help="list of target sequences to ignore [null]")
    patch_options.add_argument("-j", metavar="<skip.txt>", type=str, default="", help="list of query sequences to ignore [null]")
    patch_options.add_argument("-f", metavar="INT", type=int, default=1000, help="minimum unique alignment length [1000]")
    patch_options.add_argument("--remove-small", action="store_true", default=False, help="remove unique alignments shorter than '-f'")
    patch_options.add_argument("-q", metavar="INT", type=int, default=10, help="minimum mapq (NA for Nucmer alignments) [10]")
    patch_options.add_argument("-d", metavar="INT", type=int, default=100000, help="maximum alignment merge distance [100000]")
    patch_options.add_argument("-s", metavar="INT", type=int, default=50000, help="minimum merged alignment length [50000]")
    patch_options.add_argument("-i", metavar="FLOAT", type=float, default=0.05, help="maximum merged alignment distance from sequence terminus. fraction of the sequence length if < 1 [0.05]")
    patch_options.add_argument("--fill-only", action="store_true", default=False, help="only fill existing target gaps. do not join target sequences")
    patch_options.add_argument("--join-only", action="store_true", default=False, help="only join and patch target sequences. do not fill existing gaps")

    io_options = parser.add_argument_group("input/output options")
    io_options.add_argument("-o", metavar="PATH", type=str, default="ragtag_output", help="output directory [./ragtag_output]")
    io_options.add_argument("-w", action='store_true', default=False, help="overwrite intermediate files")
    io_options.add_argument("-u", action='store_true', default=False, help="add suffix to unplaced sequence headers")
    io_options.add_argument("--debug", action='store_true', default=False, help=argparse.SUPPRESS)

    aln_options = parser.add_argument_group("mapping options")
    aln_options.add_argument("-t", metavar="INT", type=int, default=1, help="number of minimap2/unimap threads [1]")
    aln_options.add_argument("--aligner", metavar="PATH", type=str, default="nucmer", help="aligner executable ('nucmer' (recommended), 'unimap' or 'minimap2') [nucmer]")
    mm2_default = "-x asm5"
    aln_options.add_argument("--mm2-params", metavar="STR", type=str, default=mm2_default, help="space delimited minimap2 parameters (overrides '-t') ['%s']" % mm2_default)
    aln_options.add_argument("--unimap-params", metavar="STR", type=str, default=mm2_default, help="space delimited unimap parameters (overrides '-t') ['%s']" % mm2_default)
    aln_options.add_argument("--nucmer-params", metavar="STR", type=str, default="--maxmatch -l 100 -c 500", help="space delimited nucmer parameters ['--maxmatch -l 100 -c 500']")

    args = parser.parse_args()
    if not args.reference or not args.query:
        parser.print_help()
        sys.exit("\n** The target and query FASTA files are required **")

    log("VERSION", "RagTag " + get_ragtag_version())
    log("WARNING", "This is a beta version of `ragtag patch`")
    log("CMD", "ragtag.py patch " + " ".join(sys.argv[1:]))

    reference_fn = os.path.abspath(args.reference)
    query_fn = os.path.abspath(args.query)

    # Check that the reference/query files exist
    for fasta_fn in (reference_fn, query_fn):
        if not os.path.isfile(fasta_fn):
            raise FileNotFoundError("Could not find file: %s" % fasta_fn)

    # Alignment processing parameters
    min_ulen = args.f
    keep_small_uniques = not args.remove_small
    merge_dist = args.d
    num_threads = args.t

    # '--aligner' may be a path; only its last component selects the aligner type
    aligner_path = args.aligner
    aligner = aligner_path.split("/")[-1]
    if aligner not in {'minimap2', 'unimap', 'nucmer'}:
        raise ValueError("Must specify either 'minimap2', 'unimap', or 'nucmer' (PATHs allowed) with '--aligner'.")

    mm2_params = args.mm2_params
    unimap_params = args.unimap_params
    nucmer_params = args.nucmer_params

    # Mapq filtering parameters ('-q' is not applied to Nucmer alignments)
    min_mapq = 0 if aligner == "nucmer" else args.q

    # Add the number of mm2/unimap threads if the mm2 params haven't been overridden.
    if mm2_params == mm2_default:
        mm2_params += " -t " + str(num_threads)
    if unimap_params == mm2_default:
        unimap_params += " -t " + str(num_threads)

    # Set reference/query sequences to ignore
    ref_blacklist = _patch_read_header_set(args.e) if args.e else set()
    query_blacklist = _patch_read_header_set(args.j) if args.j else set()

    # Supporting alignment parameters
    min_sup_aln_len = args.s
    max_term_dist = args.i
    if max_term_dist <= 0:
        raise ValueError("-i must be a positive nonzero number.")

    # Task options
    fill_only = args.fill_only
    join_only = args.join_only
    if fill_only and join_only:
        raise ValueError("'--fill-only' and '--join-only' cannot be used together")

    # I/O parameters
    add_suffix = args.u
    if not add_suffix:
        log("WARNING", "Without '-u' invoked, some component/object AGP pairs might share the same ID. Some external programs/databases don't like this. To ensure valid AGP format, use '-u'.")

    overwrite_files = args.w
    output_path = args.o
    # makedirs (rather than mkdir) so that a nested output directory can be created
    os.makedirs(output_path, exist_ok=True)
    output_path = os.path.abspath(output_path) + "/"
    file_prefix = "ragtag.patch"
    out_prefix = output_path + file_prefix

    # Setup a log file for external RagTag scripts
    ragtag_log = out_prefix + ".err"
    open(ragtag_log, "w").close()  # Wipe the log file

    # Debugging options
    debug_mode = args.debug

    # Break the reference assembly at gaps
    ctg_agp_fn = out_prefix + ".ctg.agp"
    reference_ctg_fn = out_prefix + ".ctg.fasta"
    cmd = [
        "ragtag_splitasm.py",
        "-o",
        ctg_agp_fn,
        reference_fn
    ]
    _patch_run_unless_retained(cmd, reference_ctg_fn, ragtag_log, overwrite_files)

    # Rename the query sequences
    query_rename_fn = out_prefix + ".rename.fasta"
    cmd = [
        "ragtag_rename.py",
        query_fn,
        "-p",
        "qseq",
        "-o",
        out_prefix + ".rename.agp",
    ]
    _patch_run_unless_retained(cmd, query_rename_fn, ragtag_log, overwrite_files)

    # Combine the reference contigs and query sequences to make a components fasta file
    components_fn = out_prefix + ".comps.fasta"
    write_comps = True
    if os.path.isfile(components_fn):
        if overwrite_files:
            log("INFO", "Overwriting pre-existing file: " + components_fn)
        else:
            log("INFO", "Retaining pre-existing file: " + components_fn)
            write_comps = False

    if write_comps:
        log("INFO", "Writing: " + components_fn)
        _patch_write_components(components_fn, [reference_ctg_fn, query_rename_fn])

    # Map the query assembly to the reference contigs
    log("INFO", "Mapping the query genome to the target genome")
    asm_prefix = out_prefix + ".asm"
    if aligner == "minimap2":
        al = Minimap2Aligner(reference_ctg_fn, [query_rename_fn], aligner_path, mm2_params, asm_prefix, in_overwrite=overwrite_files)
    elif aligner == "unimap":
        al = UnimapAligner(reference_ctg_fn, [query_rename_fn], aligner_path, unimap_params, asm_prefix, in_overwrite=overwrite_files)
    else:
        al = NucmerAligner(reference_ctg_fn, [query_rename_fn], aligner_path, nucmer_params, asm_prefix, in_overwrite=overwrite_files)
    al.run_aligner()

    # If alignments are from Nucmer, need to convert from delta to paf
    paf_fn = asm_prefix + ".paf"
    if aligner == "nucmer":
        cmd = ["ragtag_delta2paf.py", asm_prefix + ".delta"]
        run_oae(cmd, paf_fn, ragtag_log)

    # Read and organize the alignments
    log("INFO", "Reading whole genome alignments")
    # ctg_alns: query header -> ContigAlignment object
    ctg_alns = read_genome_alignments(paf_fn, query_blacklist, ref_blacklist)

    # Check if any alignments are left
    if not ctg_alns:
        raise RuntimeError("There are no alignments. Check '{}'.".format(paf_fn))

    # Filter the alignments. Each filter returns None when nothing is left for that query,
    # in which case the query is dropped and the remaining filters are skipped.
    unfiltered_strings, filtered_strings, merged_strings, useful_strings = [], [], [], []
    log("INFO", "Filtering and merging alignments")
    fltrd_ctg_alns = dict()
    for header, aln in ctg_alns.items():
        unfiltered_strings.append(str(aln))

        # Unique anchor filtering
        aln = aln.unique_anchor_filter(min_ulen, keep_small=keep_small_uniques)
        if aln is None:
            continue

        # mapq filtering
        aln = aln.filter_mapq(min_mapq)
        if aln is None:
            continue
        filtered_strings.append(str(aln))

        # alignment merging
        aln = aln.merge_alns(merge_dist=merge_dist, careful_merge=True)
        if aln is None:
            continue
        merged_strings.append(str(aln))

        # Length filtering
        aln = aln.filter_lengths(min_sup_aln_len)
        if aln is None:
            continue

        # terminal filtering
        aln = aln.keep_terminals(max_term_dist)
        if aln is None:
            continue

        # Save the remaining useful alignments: only queries aligning to more than one
        # reference sequence, without internal reference cuttings, are kept.
        if aln.num_refs > 1 and not aln.has_internal_ref_cuttings(max_term_dist):
            useful_strings.append(str(aln))
            fltrd_ctg_alns[header] = aln

    # Write debugging files
    if debug_mode:
        debug_records = (
            (".debug.unfiltered.paf", unfiltered_strings),
            (".debug.filtered.paf", filtered_strings),
            (".debug.merged.paf", merged_strings),
            (".debug.useful.paf", useful_strings),
        )
        for suffix, records in debug_records:
            with open(out_prefix + suffix, "w") as f:
                f.write("".join(records))

    # Make a Scaffold Graph encoding known reference contigs adjacencies
    log("INFO", "Building a scaffold graph from the contig AGP file")
    agp_multi_sg = AGPMultiScaffoldGraph(reference_ctg_fn)
    agp_multi_sg.add_agps([ctg_agp_fn])
    agp_sg = agp_multi_sg.merge()

    # As a hack, go through the AGP sg and make the required directed scaffold graph
    agp_psg = PatchScaffoldGraph(components_fn)
    for u, v in agp_sg.edges:
        gap_size = agp_sg[u][v]["gap_size"][0]
        gap_aln = Alignment(
            u,
            v,
            "",
            gap_size,
            0,
            gap_size,
            0,
            is_gap=True
        )
        agp_psg.add_edge(u, v, gap_aln)

    # Make a second directed scaffold graph from the alignments
    log("INFO", "Building a scaffold graph from the target/query mappings")
    aln_psg = build_aln_scaffold_graph(fltrd_ctg_alns, components_fn, max_term_dist)

    # Add edges for unfilled gaps
    for u, v in agp_psg.edges:
        if not aln_psg.has_edge(u, v):
            aln_psg.add_edge(u, v, agp_psg[u][v]["alignment"])

    # Remove known false edges: nodes adjacent in the AGP graph keep only each other as neighbors.
    # The neighbor lists are copied because edges are removed while iterating.
    for u, v in agp_psg.edges:
        for neighbor in list(aln_psg.neighbors(u)):
            if neighbor != v:
                aln_psg.remove_edge(u, neighbor)
                aln_psg.remove_edge(neighbor, u)

        for neighbor in list(aln_psg.neighbors(v)):
            if neighbor != u:
                aln_psg.remove_edge(neighbor, v)
                aln_psg.remove_edge(v, neighbor)

    # Adjust the graph depending on if only fills or joins are requested
    if fill_only:
        # Keep only the edges that correspond to pre-existing gaps
        psg = PatchScaffoldGraph(components_fn)
        for u, v in agp_psg.edges:
            psg.add_edge(u, v, aln_psg[u][v]["alignment"])
            psg.add_edge(v, u, aln_psg[v][u]["alignment"])
        aln_psg = psg

    if join_only:
        # Restore the original gap for every pre-existing gap edge
        for u, v in agp_psg.edges:
            aln_psg[u][v]["alignment"] = agp_psg[u][v]["alignment"]
            aln_psg[v][u]["alignment"] = agp_psg[v][u]["alignment"]

    if debug_mode:
        aln_psg.write_gml(out_prefix + ".debug.sg.gml")

    # Compute a matching solution for the graph
    log("INFO", "Computing a matching solution to the scaffold graph")
    match_psg = aln_psg.max_weight_matching()

    if debug_mode:
        match_psg.write_gml(out_prefix + ".debug.matching.gml")

    # Write the output in AGP format
    log("INFO", "Writing output files")
    patch_agp_fn = out_prefix + ".agp"
    match_psg.write_agp(patch_agp_fn, reference_ctg_fn, add_suffix_to_unplaced=add_suffix)

    # Write the output in fasta format
    cmd = [
        "ragtag_agp2fa.py",
        patch_agp_fn,
        components_fn
    ]
    run_oae(cmd, out_prefix + ".fasta", ragtag_log)

    log("INFO", "Goodbye")