예제 #1
0
def test_run_racon():
    """Check racon polishing output and handling of intermediate files.

    Runs ``racon.run_racon`` twice: first in debug mode, where the
    intermediate files must be left on disk, and then again on the already
    polished sequence without debug mode, where the sequence must be
    unchanged and the intermediate files must have been removed.
    """
    # This has a SNP and two indels to fix. Also one position where
    # about 2/3 of the reads say A and the rest say T. Expect this to get
    # corrected to T.
    fa_to_polish = os.path.join(data_dir, "run_racon.to_polish.fa")
    seq_to_polish = utils.load_single_seq_fasta(fa_to_polish)
    reads = os.path.join(data_dir, "run_racon.reads.fa")
    pre_out = "tmp.run_racon"
    utils.rm_rf(f"{pre_out}.sam")
    utils.rm_rf(f"{pre_out}.to_polish.fa")
    polished1 = racon.run_racon(seq_to_polish, reads, pre_out, debug=True)
    # Polishing must have changed the input. Compare with the input sequence
    # itself: comparing with the FASTA file path would always be "not equal".
    assert polished1 != seq_to_polish.seq
    assert (
        polished1 ==
        "CGTTAATCCTAGGGCAGTTAAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGAAATCTGCAACAAGCCCGGT"
    )
    # we used debug mode, so intermediate files should be left on disk
    assert os.path.exists(f"{pre_out}.sam")
    assert os.path.exists(f"{pre_out}.to_polish.fa")
    os.unlink(f"{pre_out}.sam")
    os.unlink(f"{pre_out}.to_polish.fa")
    # Another round of polishing shouldn't do anything
    polished2 = racon.run_racon(polished1, reads, pre_out, debug=False)
    assert polished1 == polished2
    # we didn't use debug mode so intermediate files should be deleted
    assert not os.path.exists(f"{pre_out}.sam")
    assert not os.path.exists(f"{pre_out}.to_polish.fa")
예제 #2
0
def test_run_racon_iterations():
    """Check that iterative racon polishing stops once nothing changes."""
    # Small artificial data for this is hard to make, so the test_run_racon
    # data is reused. Only the first racon run corrects anything, which means
    # polishing should stop after 2 of the 3 allowed iterations.
    input_seq = utils.load_single_seq_fasta(
        os.path.join(data_dir, "run_racon.to_polish.fa"))
    reads = os.path.join(data_dir, "run_racon.reads.fa")
    outdir = "tmp.run_racon_iterations"
    utils.rm_rf(outdir)

    got_polished = racon.run_racon_iterations(
        input_seq, reads, outdir, max_iterations=3, debug=True)

    # Debug mode: each of the two iterations leaves three files behind
    expected_files = [
        os.path.join(outdir, f"racon.{iteration}.{suffix}")
        for iteration in (0, 1)
        for suffix in ("sam", "polished.fa", "to_polish.fa")
    ]
    for path in expected_files:
        assert os.path.exists(path)
    assert len(os.listdir(outdir)) == 6

    expected_seq = "CGTTAATCCTAGGGCAGTTAAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGAAATCTGCAACAAGCCCGGT"
    assert got_polished == expected_seq
    utils.rm_rf(outdir)
예제 #3
0
def test_mask_low_coverage():
    """Check masking of low-depth positions with and without debug files."""
    outprefix = "tmp.mask_low_coverage"
    debug_files = [f"{outprefix}.{ext}" for ext in ("fa", "sam", "bam")]
    for path in debug_files:
        utils.rm_rf(path)

    reads_file = os.path.join(data_dir, "mask_low_coverage.reads.fa")
    ref_seq = "CGTTAATCCTAGGGCAGTTAAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGAAATCTGCAACAAGCCCGGT"
    expect_min_depth_4 = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGAAATCTGCAACAAGCCCGGT"
    expect_min_depth_1 = "NNNNNNNNNNNNNNNNGTTAAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGAAATCTGCAACAAGCCCGGT"

    # Debug run: intermediate files must be kept
    masked = utils.mask_low_coverage(
        ref_seq, reads_file, outprefix, min_depth=4, debug=True)
    assert masked == expect_min_depth_4
    for path in debug_files:
        assert os.path.exists(path)
        os.unlink(path)

    # Non-debug run with a lower depth cutoff: no files may be left behind
    masked = utils.mask_low_coverage(
        ref_seq, reads_file, outprefix, min_depth=1, debug=False)
    for path in debug_files:
        assert not os.path.exists(path)
    assert masked == expect_min_depth_1
예제 #4
0
def test_run_racon_iterations_bad_data():
    """Iterative polishing with the bad reads file must return None."""
    outdir = "tmp.run_racon_iterations"
    input_seq = utils.load_single_seq_fasta(
        os.path.join(data_dir, "run_racon.to_polish.fa"))
    bad_reads = os.path.join(data_dir, "run_racon_bad_reads.fa")
    utils.rm_rf(outdir)
    result = racon.run_racon_iterations(
        input_seq, bad_reads, outdir, max_iterations=3, debug=True)
    assert result is None
    utils.rm_rf(outdir)
예제 #5
0
def run(options):
    """Validate the command line options and run the assembly pipeline.

    Args:
        options: object holding the parsed command line options as
            attributes (the attributes read here are ``bam``,
            ``reads_to_map``, ``reads_per_amp_dir``, ``mates_to_map``,
            ``force``, ``outdir``, ``amplicons_to_fail_file`` plus every
            option forwarded to the pipeline).

    Returns:
        Whatever ``assemble.run_assembly_pipeline`` returns.

    Raises:
        Exception: if a required program is missing from $PATH, if not
            exactly one read source was given, or if ``mates_to_map`` was
            given without ``reads_to_map``.
    """
    if not utils.look_for_required_binaries_in_path():
        raise Exception(
            "At least one required program was not found in $PATH. Cannot continue"
        )

    # Exactly one of the three ways of supplying reads must be used
    read_sources = (
        options.bam,
        options.reads_to_map,
        options.reads_per_amp_dir,
    )
    number_given = sum(1 for source in read_sources if source is not None)
    if number_given != 1:
        raise Exception(
            "Must provide exactly one of: --bam, --reads_to_map, --reads_per_amp_dir"
        )

    if options.mates_to_map is not None and options.reads_to_map is None:
        raise Exception(
            "--mates_to_map was used, but --reads_to_map was not. --reads_to_map is required by --mates_to_map"
        )

    if options.force:
        utils.rm_rf(options.outdir)

    # Optional file listing one amplicon name per line
    amplicons_to_fail = None
    if options.amplicons_to_fail_file is not None:
        with open(options.amplicons_to_fail_file) as f:
            amplicons_to_fail = {line.rstrip() for line in f}

    pipeline_options = {
        "sorted_bam": options.bam,
        "reads_per_amp_dir": options.reads_per_amp_dir,
        "reads_fastaq": options.reads_to_map,
        "mates_fastaq": options.mates_to_map,
        "minimap_opts": options.minimap_opts,
        "min_mean_coverage": options.min_mean_coverage,
        "target_coverage": options.target_coverage,
        "read_end_trim": options.read_end_trim,
        "read_map_tolerance": options.read_map_tolerance,
        "min_read_length": options.min_read_length,
        "racon_iterations": options.racon_iterations,
        "min_depth_for_not_N": options.min_depth_for_not_N,
        "min_amp_overlap_len": options.min_amp_overlap_len,
        "contig_map_end_allowance": options.contig_map_end_allowance,
        "amplicons_to_fail": amplicons_to_fail,
        "wgs": options.wgs,
        "debug": options.debug,
        "command_line_args": options,
    }
    return assemble.run_assembly_pipeline(
        options.ref_fasta,
        options.amplicons_json,
        options.outdir,
        **pipeline_options,
    )
예제 #6
0
def test_run_racon_bad_data():
    """A single racon run with the bad reads file must return None."""
    pre_out = "tmp.run_racon"
    intermediate_files = (f"{pre_out}.sam", f"{pre_out}.to_polish.fa")
    input_seq = utils.load_single_seq_fasta(
        os.path.join(data_dir, "run_racon.to_polish.fa"))
    bad_reads = os.path.join(data_dir, "run_racon_bad_reads.fa")

    for path in intermediate_files:
        utils.rm_rf(path)
    polished = racon.run_racon(input_seq, bad_reads, pre_out, debug=True)
    # Tidy up before asserting, so a failure does not leave files behind
    for path in intermediate_files:
        utils.rm_rf(path)
    assert polished is None
예제 #7
0
def test_get_reads_for_polishing():
    """Check read extraction from a BAM file for one amplicon.

    First case: reads are found, so the read counts, the mean coverage and
    the contents of the output reads file are checked. Second case: no reads
    are found, so all returned values must be zero and no output file may be
    written.
    """
    reads_bam = os.path.join(data_dir, "get_reads_for_polishing.bam")
    reads_out = "tmp.get_reads_for_polishing.reads.fa"
    utils.rm_rf(reads_out)
    bam = pysam.AlignmentFile(reads_bam, "rb")
    # Close the BAM handle even when an assertion fails
    try:
        amplicon = amplicons.Amplicon("amp1", 59, 419, 1, 1)

        got_reads, got_used, got_cov = amplicon.get_reads_for_polishing(
            "ref1",
            bam,
            reads_out,
            min_coverage=1,
            trim_ends=5,
            tolerance=1,
            min_output_length=300,
            target_depth=3,
        )
        assert got_reads == 6
        assert got_used == 4
        # The second positional argument of pytest.approx is the *relative*
        # tolerance, so approx(4.49, 4.50) accepted anything within 450% of
        # 4.49. Use an explicit absolute tolerance instead.
        assert got_cov == pytest.approx(4.49, abs=0.01)
        expect_reads = os.path.join(data_dir,
                                    "get_reads_for_polishing.expect.fa")
        assert filecmp.cmp(reads_out, expect_reads, shallow=False)
        os.unlink(reads_out)

        amplicon = amplicons.Amplicon("amp1", 50, 100, 1, 1)
        got_reads, got_used, got_cov = amplicon.get_reads_for_polishing(
            "ref2",
            bam,
            reads_out,
            min_coverage=1,
            trim_ends=5,
            tolerance=1,
            min_output_length=30,
            target_depth=1,
        )
        assert got_reads == 0
        assert got_used == 0
        assert got_cov == 0
        assert not os.path.exists(reads_out)
    finally:
        bam.close()
예제 #8
0
def test_load_and_check_reads_amp_dir():
    """Check validation of a reads-per-amplicon directory and its manifest."""

    def named_amplicon(name):
        # Mock treats "name" specially in its constructor, so set it afterwards
        amplicon = mock.Mock()
        amplicon.name = name
        return amplicon

    tmp_dir = "tmp.load_and_check_reads_amp_dir"
    utils.rm_rf(tmp_dir)
    os.mkdir(tmp_dir)

    manifest = {
        "a1": "a1.fasta",
        "a2": "a2.fasta",
    }
    with open(os.path.join(tmp_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f)

    amp1 = named_amplicon("a1")
    amp2 = named_amplicon("a2")
    amp3 = named_amplicon("a3")

    # The reads files named in the manifest do not exist yet, so every one of
    # these calls is expected to raise
    for amplicon_collection in (set(), [amp1], [amp1, amp2]):
        with pytest.raises(Exception):
            assemble.load_and_check_reads_amp_dir(tmp_dir, amplicon_collection)

    # Create an empty file for each manifest entry
    for filename in manifest.values():
        open(os.path.join(tmp_dir, filename), "w").close()

    expected = {}
    for amplicon_name, filename in manifest.items():
        expected[amplicon_name] = os.path.join(tmp_dir, filename)
    got = assemble.load_and_check_reads_amp_dir(tmp_dir, [amp1, amp2])
    assert got == expected

    # a3 has no entry in the manifest
    with pytest.raises(Exception):
        assemble.load_and_check_reads_amp_dir(tmp_dir, [amp1, amp2, amp3])

    utils.rm_rf(tmp_dir)
예제 #9
0
def test_consensus_contigs_to_consensus():
    """Check how consensus contigs are combined into one consensus sequence."""
    ref_fasta = os.path.join(data_dir, "consensus_contigs_to_consensus.fa")
    outprefix = "tmp.consensus_contigs_to_consensus"

    def make_consensus(contig_list):
        return amplicon_overlapper.consensus_contigs_to_consensus(
            contig_list, ref_fasta, outprefix)

    def remove_output_files():
        utils.rm_rf(f"{outprefix}.*")

    remove_output_files()

    # No contigs at all means no consensus
    assert make_consensus(None) is None
    assert make_consensus([]) is None

    # contig is in ref from 1-120
    contig1 = "GGGTCCTCGGCCTACGACTATATCGCATGGCACGGTGCGGCTGTAGGGACACAAGATAATGTTCCGAGCAATTACGCACTTATTTGGTTCAGGAATCAGACTTCCGGTTTCGAACTTTCG"
    # contig2 is in ref from 181-300
    contig2 = "CTATTTGCACCGTTGTAAATGCGCAGTTTGAGCTGTTGTTTCGCGGCACCGTAAGAAAAAAGATGTACTGCCGAACTCGGGGCGTAGTGAGGGGTTCATAGCGAGAAACGTCTTGTACGC"

    # A single contig comes back unchanged
    consensus = make_consensus([contig1])
    assert consensus == contig1
    remove_output_files()

    # Two contigs are joined with Ns filling the gap between them
    consensus = make_consensus([contig1, contig2])
    assert consensus == contig1 + "N" * 60 + contig2
    remove_output_files()

    # contigs in wrong order, should result in aborted assembly
    consensus = make_consensus([contig2, contig1])
    assert consensus is None
    remove_output_files()

    # Add one short contig that should get removed because won't map well
    # enough to the ref
    consensus = make_consensus([contig1, contig2[:40]])
    assert consensus == contig1
    remove_output_files()
예제 #10
0
def test_assemble():
    """Run the assemble task end to end, with and without a failed amplicon."""
    data_dir = os.path.join(data_root, "assemble")
    outdir = "tmp.test_task_assemble"
    utils.rm_rf(outdir)

    option_values = {
        "bam": None,
        "ref_fasta": os.path.join(data_dir, "run_assembly_pipeline.ref.fa"),
        "amplicons_json": os.path.join(
            data_dir, "run_assembly_pipeline.amplicons.json"),
        "outdir": outdir,
        "reads_to_map": os.path.join(data_dir,
                                     "run_assembly_pipeline.reads.fa"),
        "reads_per_amp_dir": None,
        "mates_to_map": None,
        "minimap_opts": "-t 1 -x map-ont",
        "min_mean_coverage": 5,
        "target_coverage": 500,
        "read_end_trim": 1,
        "read_map_tolerance": 20,
        "min_read_length": 200,
        "racon_iterations": 3,
        "min_depth_for_not_N": 1,
        "min_amp_overlap_len": 20,
        "contig_map_end_allowance": 20,
        "amplicons_to_fail_file": None,
        "wgs": False,
        "debug": True,
    }
    options = mock.Mock()
    for attribute, value in option_values.items():
        setattr(options, attribute, value)

    expect_seq = utils.load_single_seq_fasta(
        os.path.join(data_dir, "run_assembly_pipeline.expect.fa"))
    final_assembly_fa = os.path.join(outdir, "consensus.final_assembly.fa")
    run_info_json = os.path.join(options.outdir, "run_info.json")

    got = tasks.assemble.run(options)
    # expected fasta is the fasta used to generate the reads. But the amplicons
    # don't cover the whole genome, so we expect to miss the ends
    assert got == expect_seq[11:988]
    assert got == utils.load_single_seq_fasta(final_assembly_fa).seq
    assert os.path.exists(run_info_json)
    utils.rm_rf(outdir)

    # Test the option amplicons_to_fail_file
    options.amplicons_to_fail_file = "tmp.amplicons_to_fail.txt"
    with open(options.amplicons_to_fail_file, "w") as f:
        print("a1", file=f)
    got = tasks.assemble.run(options)
    assert got == expect_seq[356:988]
    assert got == utils.load_single_seq_fasta(final_assembly_fa).seq
    assert os.path.exists(run_info_json)
    utils.rm_rf(outdir)
    os.unlink(options.amplicons_to_fail_file)
예제 #11
0
def test_polish():
    """Check Amplicon.polish with reads from a BAM file and from a FASTA file.

    Three runs: reads sliced from the BAM, reads given as a file, and reads
    from the BAM with a depth cutoff high enough to fail the amplicon. Each
    run uses its own Amplicon object so that the success flags asserted
    after a run were set by that run.
    """
    ref_fasta = os.path.join(data_dir, "polish.ref.fa")
    ref_genome = utils.load_single_seq_fasta(ref_fasta)
    reads_bam = os.path.join(data_dir, "polish.bam")
    outdir = "tmp.polish.out"
    utils.rm_rf(outdir)
    bam = pysam.AlignmentFile(reads_bam, "rb")
    # Close the BAM handle even when an assertion fails
    try:
        amplicon = amplicons.Amplicon("amplicon1", 60, 259, 1, 1)
        amplicon.polish(
            ref_genome,
            outdir,
            bam_to_slice_reads=bam,
            min_mean_coverage=3,
            racon_iterations=3,
            min_depth_for_not_N=3,
            min_read_length=100,
            max_polished_N_prop=0.5,
            debug=True,
        )
        assert (
            amplicon.masked_seq ==
            "NNNNNNNNNNNNNNNNNNNNAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGNNNNNNNNNNNNNNNNNNNNN"
        )
        assert amplicon.assemble_success
        assert amplicon.polish_data["Polish success"]
        utils.rm_rf(outdir)

        # Same again, but this time use the fasta of reads instead of the BAM
        # file. Plus, this is giving untrimmed reads, so we get less masking.
        # In the previous run 20bp trimmed off all the reads.
        # Start from a fresh amplicon: reusing the previous object would
        # leave its success flags set from the first run, so the asserts
        # below could pass even if this run failed.
        amplicon = amplicons.Amplicon("amplicon1", 60, 259, 1, 1)
        reads_file = os.path.join(data_dir, "polish.reads.fa")
        amplicon.polish(
            ref_genome,
            outdir,
            reads_file=reads_file,
            min_mean_coverage=3,
            racon_iterations=3,
            min_depth_for_not_N=3,
            min_read_length=100,
            max_polished_N_prop=0.5,
            debug=True,
        )
        assert (
            amplicon.masked_seq ==
            "CGTTAATCCTAGGGCAGTTAAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGAAATCTGCAACAAGCCCGGT"
        )
        assert amplicon.assemble_success
        assert amplicon.polish_data["Polish success"]
        utils.rm_rf(outdir)

        # The reads are such that there's a dip in coverage in the middle of
        # the amplicon. Setting min_depth_for_not_N higher makes this region
        # get masked, and then the amplicon should get failed
        amplicon = amplicons.Amplicon("amplicon1", 60, 259, 1, 1)
        amplicon.polish(
            ref_genome,
            outdir,
            bam_to_slice_reads=bam,
            min_mean_coverage=3,
            racon_iterations=3,
            min_depth_for_not_N=18,
            min_read_length=50,
            max_polished_N_prop=0.1,
            debug=True,
        )
        assert (
            amplicon.masked_seq ==
            "NNNNNNNNNNNNNNNNNNNNAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGNNNNNNNNNNNNNNNNNNNNN"
        )
        assert not amplicon.assemble_success
        assert not amplicon.polish_data["Polish success"]
        utils.rm_rf(outdir)
    finally:
        bam.close()
예제 #12
0
    def polish(
        self,
        ref_genome,
        outdir,
        bam_to_slice_reads=None,
        reads_file=None,
        min_mean_coverage=25,
        target_coverage=500,
        read_end_trim=20,
        read_map_tolerance=20,
        min_read_length=200,
        racon_iterations=3,
        min_depth_for_not_N=5,
        max_polished_N_prop=0.1,
        wgs=False,
        debug=False,
        minimap_opts=None,
    ):
        """Polish this amplicon with racon, then mask low coverage positions.

        Reads come either from ``reads_file`` or, when that is None, are
        extracted from ``bam_to_slice_reads`` into ``<outdir>/reads.fa``.
        Results are stored on the object: ``polished_seq``, ``masked_seq``,
        ``polish_data`` (``"Comments"`` and ``"Polish success"``) and
        ``assemble_success``. On any failure a comment is appended to
        ``polish_data["Comments"]`` and the success flags are left untouched.

        Args:
            ref_genome: reference sequence; must have an ``id`` attribute and
                support slicing.
            outdir: directory to create and work in. Must not already exist.
            bam_to_slice_reads: BAM handle passed to
                ``get_reads_for_polishing``; ignored if ``reads_file`` is given.
            reads_file: file of reads to use instead of slicing the BAM.
            min_mean_coverage: amplicon is failed if the coverage reported by
                ``get_reads_for_polishing`` is below this (BAM input only).
            target_coverage, read_end_trim, read_map_tolerance,
            min_read_length, wgs: passed to ``get_reads_for_polishing``.
            racon_iterations: maximum number of racon iterations.
            min_depth_for_not_N: minimum depth passed to
                ``utils.mask_low_coverage``.
            max_polished_N_prop: maximum allowed proportion of Ns in the
                masked sequence, not counting Ns at the start and end.
            debug: if True, keep ``outdir``; otherwise it is deleted before
                returning, including on early failure or on an exception.
            minimap_opts: passed to ``racon.run_racon_iterations``.

        Raises:
            OSError: from ``os.mkdir`` if ``outdir`` cannot be created.
        """
        os.mkdir(outdir)
        # try/finally so that the working directory is also removed on the
        # early "failed amplicon" returns, not only at the end of a full run
        try:
            if reads_file is None:
                if bam_to_slice_reads is None:
                    self.polish_data["Comments"].append(
                        "No reads provided. Calling this amplicon failed"
                    )
                    return

                reads_file = os.path.join(outdir, "reads.fa")
                total_reads, used_reads, coverage = self.get_reads_for_polishing(
                    ref_genome.id,
                    bam_to_slice_reads,
                    reads_file,
                    min_coverage=min_mean_coverage,
                    trim_ends=read_end_trim,
                    tolerance=read_map_tolerance,
                    min_output_length=min_read_length,
                    target_depth=target_coverage,
                    wgs=wgs,
                )
                logging.debug(
                    f"Extracted {total_reads} reads for amplicon {self.name}. Using {used_reads} for polishing, at mean depth of {coverage}"
                )
                if coverage < min_mean_coverage:
                    logging.warning(
                        f"Mean coverage for amplicon {self.name} is too low: {coverage}. Considering this a failed amplicon"
                    )
                    self.polish_data["Comments"].append(
                        f"Coverage {coverage} too low"
                    )
                    return
            else:
                # This message was missing its f prefix, so the placeholders
                # were logged literally
                logging.debug(
                    f"Using user-supplied reads {reads_file} for amplicon {self.name}"
                )
                self.polish_data["Comments"].append(
                    f"Read stats not calculated because user supplied file {reads_file}"
                )

            # self.end is inclusive, hence the +1
            amplicon_seq = ref_genome[self.start : self.end + 1]
            racon_dir = os.path.join(outdir, "Racon")
            self.polished_seq = racon.run_racon_iterations(
                amplicon_seq,
                reads_file,
                racon_dir,
                debug=debug,
                max_iterations=racon_iterations,
                minimap_opts=minimap_opts,
            )
            logging.debug(f"polished_seq: {self.polished_seq}")
            if self.polished_seq is None:
                self.polish_data["Comments"].append(
                    "No sequenced returned from racon"
                )
                return

            mask_outprefix = os.path.join(outdir, "masked")
            self.masked_seq = utils.mask_low_coverage(
                self.polished_seq,
                reads_file,
                mask_outprefix,
                min_depth=min_depth_for_not_N,
                debug=debug,
            )
            logging.debug(f"masked: {self.masked_seq}")

            # Ns at the start/end are not counted when deciding whether too
            # much of the sequence was masked.
            # NOTE(review): a fully masked sequence strips to an empty string
            # and is counted as 0% masked, so it passes - confirm intended.
            masked_strip_ns = self.masked_seq.strip("N")
            if len(masked_strip_ns) == 0:
                proportion_masked = 0
            else:
                proportion_masked = round(
                    masked_strip_ns.count("N") / len(masked_strip_ns), 2
                )
            if proportion_masked > max_polished_N_prop:
                percent_N = 100 * proportion_masked
                self.polish_data["Comments"].append(
                    f"Too many Ns ({percent_N}%) after masking polished sequence (not including Ns at the start/end)"
                )
            else:
                self.polish_data["Polish success"] = True
                self.assemble_success = True
        finally:
            if not debug:
                utils.rm_rf(outdir)
예제 #13
0
def polish_each_amplicon(
    ref_genome,
    amplicons,
    outdir,
    bam_to_slice_reads=None,
    amplicon_to_reads_file=None,
    min_mean_coverage=25,
    target_coverage=500,
    read_end_trim=20,
    read_map_tolerance=20,
    min_read_length=200,
    racon_iterations=3,
    min_depth_for_not_N=5,
    amplicons_to_fail=None,
    wgs=False,
    debug=False,
    minimap_opts=None,
):
    """Polish every amplicon in turn, each in its own numbered directory.

    Amplicons whose name is in ``amplicons_to_fail`` are not polished;
    ``force_polish_fail()`` is called on them instead. Every other amplicon
    is polished in ``<outdir>/<n>``, where n is its 1-based position in
    ``amplicons``. That directory is removed afterwards unless ``debug`` is
    True.

    Args:
        ref_genome: reference sequence, passed to each ``amplicon.polish``.
        amplicons: sized sequence of amplicon objects.
        outdir: directory under which the per-amplicon directories are made.
        bam_to_slice_reads: BAM handle passed to each ``amplicon.polish``.
        amplicon_to_reads_file: optional dict of amplicon name -> reads file.
            Amplicons with no entry get ``reads_file=None``.
        amplicons_to_fail: optional collection of amplicon names to fail.
        debug: keep the per-amplicon directories if True.
        Remaining arguments are passed unchanged to ``amplicon.polish``.
    """
    reads_lookup = {} if amplicon_to_reads_file is None else amplicon_to_reads_file
    names_to_fail = set() if amplicons_to_fail is None else amplicons_to_fail

    # Settings that are the same for every amplicon
    shared_polish_opts = {
        "bam_to_slice_reads": bam_to_slice_reads,
        "min_mean_coverage": min_mean_coverage,
        "target_coverage": target_coverage,
        "read_end_trim": read_end_trim,
        "read_map_tolerance": read_map_tolerance,
        "min_read_length": min_read_length,
        "racon_iterations": racon_iterations,
        "min_depth_for_not_N": min_depth_for_not_N,
        "wgs": wgs,
        "debug": debug,
        "minimap_opts": minimap_opts,
    }

    for number, amplicon in enumerate(amplicons, start=1):
        counter = f"({number}/{len(amplicons)})"
        logging.debug(f"Start processing amplicon {amplicon.name} {counter}")

        if amplicon.name in names_to_fail:
            logging.debug(
                f"User chose to fail amplicon {amplicon.name}. Moving on")
            amplicon.force_polish_fail()
            continue

        logging.debug(
            f"Extracting reads and polishing amplicon {amplicon.name}")
        amplicon_dir = os.path.join(outdir, str(number))
        amplicon.polish(
            ref_genome,
            amplicon_dir,
            reads_file=reads_lookup.get(amplicon.name),
            **shared_polish_opts,
        )
        ok = "yes" if amplicon.assemble_success else "no"
        logging.debug(
            f"Finish polishing amplicon {amplicon.name}. Success: {ok}")
        if not debug:
            utils.rm_rf(amplicon_dir)

        logging.debug(f"Finish processing amplicon {amplicon.name} {counter}")
        # Progress report after amplicon 1, 11, 21, ...
        if number % 10 == 1:
            logging.info(f"Processed {number} of {len(amplicons)} amplicons")
예제 #14
0
def run_assembly_pipeline(
    ref_fasta,
    amplicons_json,
    outdir,
    sorted_bam=None,
    reads_per_amp_dir=None,
    reads_fastaq=None,
    mates_fastaq=None,
    minimap_opts=None,
    min_mean_coverage=25,
    target_coverage=500,
    read_end_trim=20,
    read_map_tolerance=20,
    min_read_length=200,
    racon_iterations=3,
    min_depth_for_not_N=5,
    min_amp_overlap_len=20,
    contig_map_end_allowance=20,
    amplicons_to_fail=None,
    wgs=False,
    debug=False,
    command_line_args=None,
):
    """Polish every amplicon, then overlap them into one consensus sequence.

    Creates ``outdir`` and writes ``run_info.json`` into it twice: once at
    the start (with ``finished_running`` set to False) and once at the end
    with the final summary and per-amplicon data.

    Reads are taken from exactly one of three sources, checked in this
    order: ``reads_per_amp_dir`` (one reads file per amplicon),
    ``reads_fastaq`` (mapped here to ``<outdir>/map_reads.bam``, optionally
    with ``mates_fastaq``), or ``sorted_bam``.

    Args:
        ref_fasta: FASTA file of the reference genome.
        amplicons_json: JSON file describing the amplicons.
        outdir: output directory. Must not already exist.
        sorted_bam, reads_per_amp_dir, reads_fastaq, mates_fastaq: read
            sources, see above.
        minimap_opts: passed to ``map_reads`` and ``polish_each_amplicon``.
        min_mean_coverage, target_coverage, read_end_trim,
        read_map_tolerance, min_read_length, racon_iterations,
        min_depth_for_not_N, amplicons_to_fail, wgs: passed to
            ``polish_each_amplicon``.
        min_amp_overlap_len, contig_map_end_allowance: passed to
            ``amplicon_overlapper.assemble_amplicons``.
        debug: if True, polishing files are kept in
            ``<outdir>/Amplicon_polish``; otherwise a temporary directory is
            used and removed afterwards.
        command_line_args: if an ``argparse.Namespace``, its contents (except
            "func") are recorded in ``run_info.json``.

    Returns:
        The consensus returned by ``amplicon_overlapper.assemble_amplicons``,
        or None if no amplicon was polished successfully.
    """
    # Make a dict of the command line options to go in the JSON output file.
    # The tests don't use argparse (they use Mock), which means convert to dict
    # doesn't work. Don't care about that case anyway in the final output, so
    # just set to None
    if isinstance(command_line_args, argparse.Namespace):
        options_dict = {
            k: v
            for k, v in vars(command_line_args).items() if k != "func"
        }
    else:
        options_dict = None

    start_time = datetime.datetime.now()
    os.mkdir(outdir)
    json_out = os.path.join(outdir, "run_info.json")

    json_data = {
        "run_summary": {
            "total_amplicons": None,
            "successful_amplicons": None,
            "command": " ".join(sys.argv),
            "options": options_dict,
            "cwd": os.getcwd(),
            "version": viridian_version,
            "finished_running": False,
            "made_consensus": False,
            "consensus": None,
            "start_time": start_time.replace(microsecond=0).isoformat(),
            "end_time": None,
            "hostname": socket.gethostname(),
        },
        "amplicons": None,
    }
    # Written up front so that an interrupted run still leaves a JSON file,
    # with finished_running set to False
    with open(json_out, "w") as f:
        json.dump(json_data, f, indent=2, sort_keys=True)

    ref_genome = utils.load_single_seq_fasta(ref_fasta)
    logging.info(f"Loaded ref genome {ref_genome.id}")
    amplicons = amps.load_amplicons_json_file(amplicons_json)
    json_data["run_summary"]["total_amplicons"] = len(amplicons)
    logging.info(f"Loaded amplicons file {amplicons_json}")
    amplicon_to_reads_file = None

    if reads_per_amp_dir is not None:
        assert reads_fastaq is None
        assert sorted_bam is None
        amplicon_to_reads_file = load_and_check_reads_amp_dir(
            reads_per_amp_dir, amplicons)
        bam = None
    elif reads_fastaq is not None:
        assert sorted_bam is None
        assert reads_per_amp_dir is None
        logging.info("Reads in FASTA/FASTQ format provided. Mapping reads")
        sorted_bam = os.path.join(outdir, "map_reads.bam")
        map_reads(
            ref_fasta,
            reads_fastaq,
            sorted_bam,
            minimap_opts=minimap_opts,
            mates_file=mates_fastaq,
        )
        logging.info("Finished mapping reads")
        bam = pysam.AlignmentFile(sorted_bam, "rb")
    else:
        assert sorted_bam is not None
        bam = pysam.AlignmentFile(sorted_bam, "rb")

    if debug:
        polish_root_dir = os.path.join(outdir, "Amplicon_polish")
        os.mkdir(polish_root_dir)
    else:
        polish_root_dir = tempfile.mkdtemp(prefix="viridian_polish_")

    logging.info(
        f"Start polishing each amplicon. Directory: {polish_root_dir}")
    try:
        polish_each_amplicon(
            ref_genome,
            amplicons,
            polish_root_dir,
            bam_to_slice_reads=bam,
            amplicon_to_reads_file=amplicon_to_reads_file,
            min_mean_coverage=min_mean_coverage,
            target_coverage=target_coverage,
            read_end_trim=read_end_trim,
            read_map_tolerance=read_map_tolerance,
            min_read_length=min_read_length,
            racon_iterations=racon_iterations,
            min_depth_for_not_N=min_depth_for_not_N,
            amplicons_to_fail=amplicons_to_fail,
            wgs=wgs,
            debug=debug,
            minimap_opts=minimap_opts,
        )
    finally:
        # The BAM is only needed for polishing, so release the file handle
        # here. It was previously never closed.
        try:
            if bam is not None:
                bam.close()
        finally:
            if not debug:
                utils.rm_rf(polish_root_dir)

    logging.info("Finished polishing each amplicon")
    add_successful_amplicons_to_json_data(json_data, amplicons)
    if json_data["run_summary"]["successful_amplicons"] == 0:
        logging.warning("No amplicons successfully polished!")
        consensus = None
    else:
        logging.info("Start making consensus from polished amplicons")
        overlap_out = os.path.join(outdir, "consensus")
        consensus = amplicon_overlapper.assemble_amplicons(
            amplicons,
            ref_fasta,
            overlap_out,
            min_match_length=min_amp_overlap_len,
            ref_map_end_allowance=contig_map_end_allowance,
            debug=debug,
        )
    json_data["run_summary"]["consensus"] = consensus

    # Need to recalculate successful amplicons because they can get failed
    # during overlapping. If two adjacent amplicons have no overlap, then they
    # both get failed.
    add_successful_amplicons_to_json_data(json_data, amplicons)

    if consensus is None:
        logging.warning(
            "Did not make consensus sequence. Please see previous warnings")
    else:
        logging.info("Finished making consensus sequence.")
        json_data["run_summary"]["made_consensus"] = True

    add_consensus_length_N_count_to_json_data(json_data)
    json_data["amplicons"] = amps.amplicons_to_list_of_dicts(amplicons)
    json_data["run_summary"]["finished_running"] = True
    end_time = datetime.datetime.now()
    json_data["run_summary"]["end_time"] = end_time.replace(
        microsecond=0).isoformat()
    json_data["run_summary"]["run_time"] = str(end_time - start_time)
    # Overwrite the initial JSON file with the final results
    with open(json_out, "w") as f:
        json.dump(json_data, f, indent=2, sort_keys=True)
    return consensus
예제 #15
0
def test_run_assembly_pipeline():
    """Run the whole pipeline with each supported way of providing reads."""
    outdir = "tmp.run_assembly_pipeline"
    ref_fa = os.path.join(data_dir, "run_assembly_pipeline.ref.fa")
    reads_fa = os.path.join(data_dir, "run_assembly_pipeline.reads.fa")
    amplicon_json = os.path.join(data_dir,
                                 "run_assembly_pipeline.amplicons.json")
    # expected fasta is the fasta used to generate the reads. But the amplicons
    # don't cover the whole genome, so we expect to miss the ends
    expect_seq = utils.load_single_seq_fasta(
        os.path.join(data_dir, "run_assembly_pipeline.expect.fa"))

    # Options that are the same in every run below
    shared_opts = {
        "debug": True,
        "min_mean_coverage": 5,
        "min_depth_for_not_N": 1,
        "read_end_trim": 1,
    }

    def run_pipeline(**reads_opts):
        return assemble.run_assembly_pipeline(
            ref_fa, amplicon_json, outdir, **reads_opts, **shared_opts)

    def consensus_on_disk():
        fasta = utils.load_single_seq_fasta(
            os.path.join(outdir, "consensus.final_assembly.fa"))
        return fasta.seq

    utils.rm_rf(outdir)

    # Reads given as one file that needs mapping
    got = run_pipeline(reads_fastaq=reads_fa)
    assert got == expect_seq[11:988]
    assert got == consensus_on_disk()
    utils.rm_rf(outdir)

    # rerun, but using a directory of one file of reads per amplicon. This is
    # what viridian workflow will be making as input
    got = run_pipeline(reads_per_amp_dir=os.path.join(
        data_dir, "run_assembly_pipeline.reads_per_amp"))
    assert got == expect_seq[10:988]
    assert got == consensus_on_disk()
    utils.rm_rf(outdir)

    # Rerun, but test force failing the first amplicon. This time, we should
    # not have the first amplicon, and the returned sequence should start with
    # the second amplicon
    got = run_pipeline(reads_fastaq=reads_fa, amplicons_to_fail={"a1"})
    assert got == expect_seq[356:988]
    assert got == consensus_on_disk()

    # some checks of the contents of the json summary
    with open(os.path.join(outdir, "run_info.json")) as f:
        summary = json.load(f)["run_summary"]
    assert summary["made_consensus"] is True
    assert summary["amplicon_success"] == {
        "a1": False,
        "a2": True,
        "a3": True,
    }
    assert summary["successful_amplicons"] == 2
    assert summary["total_amplicons"] == 3
    assert summary["consensus_length"] == 632
    assert summary["consensus_N_count"] == 0
    utils.rm_rf(outdir)
예제 #16
0
def test_assemble_amplicons():
    """Check the consensus as amplicons are marked successful one at a time."""
    ref_fasta = os.path.join(data_dir, "assemble_amplicons.ref.fa")
    ref_seq = utils.load_single_seq_fasta(ref_fasta)
    outprefix = "tmp.assemble_amplicons"
    amplicons = [
        amps.Amplicon("a1", 20, 300, 1, 2),
        amps.Amplicon("a2", 240, 550, 3, 4),
        amps.Amplicon("a3", 500, 850, 5, 6),
        amps.Amplicon("a4", 790, 970, 7, 8),
    ]

    def assemble_all():
        return amplicon_overlapper.assemble_amplicons(
            amplicons, ref_fasta, outprefix, debug=True)

    def mark_success(index, seq):
        amplicons[index].masked_seq = seq
        amplicons[index].assemble_success = True

    def remove_output_files():
        utils.rm_rf(f"{outprefix}.*")

    remove_output_files()

    # Nothing has been assembled successfully yet
    assert assemble_all() is None
    remove_output_files()

    # Only the first amplicon
    mark_success(0, ref_seq[20:301])
    assert assemble_all() == amplicons[0].masked_seq[0:-2]
    remove_output_files()

    # First two amplicons, which overlap
    mark_success(1, ref_seq[250:545])
    assert assemble_all() == ref_seq[20:541]
    remove_output_files()

    # Amplicon index 2 is still missing, so there is a gap of Ns before the
    # last amplicon
    mark_success(3, ref_seq[790:952])
    assert assemble_all() == ref_seq[20:541] + "N" * 256 + ref_seq[797:951]
    remove_output_files()

    # putting in junk for amplicon 2 means it won't overlap amplicons 1 or 3,
    # and we should only get amplicon 0 back
    mark_success(
        2,
        "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTT",
    )
    assert assemble_all() == ref_seq[20:299]
    remove_output_files()