def test_load_amplicons_json_file():
    """Loading the JSON fixture yields the two hand-built amplicons."""
    wanted = [
        amplicons.Amplicon(*fields)
        for fields in (("name1", 42, 99, 5, 3), ("name2", 90, 150, 2, 9))
    ]
    json_path = os.path.join(data_dir, "load_amplicons_json_file.json")
    loaded = amplicons.load_amplicons_json_file(json_path)
    assert wanted == loaded
def test_assemble_amplicons():
    """Check assemble_amplicons output as amplicons gain masked sequences.

    The assembly is run repeatedly on the same four amplicons. Before each
    run one more amplicon is given a masked sequence and flagged as
    assembled, and the result is compared with slices of the reference.
    """
    ref_fasta = os.path.join(data_dir, "assemble_amplicons.ref.fa")
    ref_seq = utils.load_single_seq_fasta(ref_fasta)
    amp_fields = (
        ("a1", 20, 300, 1, 2),
        ("a2", 240, 550, 3, 4),
        ("a3", 500, 850, 5, 6),
        ("a4", 790, 970, 7, 8),
    )
    amp_list = [amps.Amplicon(*fields) for fields in amp_fields]
    outprefix = "tmp.assemble_amplicons"
    tmp_files = f"{outprefix}.*"

    def assemble():
        # Every round uses the same arguments; only the amplicon state differs.
        return amplicon_overlapper.assemble_amplicons(
            amp_list, ref_fasta, outprefix, debug=True
        )

    def mark_assembled(index, seq):
        amp_list[index].masked_seq = seq
        amp_list[index].assemble_success = True

    # No amplicon has a sequence yet.
    utils.rm_rf(tmp_files)
    assert assemble() is None
    utils.rm_rf(tmp_files)

    # Only the first amplicon is assembled.
    mark_assembled(0, ref_seq[20:301])
    assert assemble() == amp_list[0].masked_seq[0:-2]
    utils.rm_rf(tmp_files)

    # First and second amplicons are assembled.
    mark_assembled(1, ref_seq[250:545])
    assert assemble() == ref_seq[20:541]
    utils.rm_rf(tmp_files)

    # The fourth amplicon is added while the third is still missing, so the
    # expected result has a run of Ns between the two stretches.
    mark_assembled(3, ref_seq[790:952])
    assert assemble() == ref_seq[20:541] + "N" * 256 + ref_seq[797:951]
    utils.rm_rf(tmp_files)

    # Junk in amp_list[2] means it cannot overlap either neighbour, and only
    # the sequence from the first amplicon is expected back.
    mark_assembled(
        2,
        "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTT",
    )
    assert assemble() == ref_seq[20:299]
    utils.rm_rf(tmp_files)
def test_get_amplicon_overlaps():
    """get_amplicon_overlaps gives one entry per adjacent amplicon pair.

    The first amplicon has no masked sequence, so its pair is expected to be
    None. The second pair matches when 8 is passed as the second argument and
    is expected to be None when 9 is passed.
    """
    amp_list = [
        amps.Amplicon(name, start, end, 1, 1)
        for name, start, end in (("a1", 10, 100), ("a2", 10, 100), ("a3", 110, 142))
    ]
    amp_list[1].masked_seq = "AAAAAAAAAAAAAAAAAATGCTGAACAGTCCCCCCC"
    amp_list[2].masked_seq = "CCTGCTGAACGGTTGATGCATCTCATGCTGACNNAGGTGTGGCCAAAAA"

    overlaps_with_8 = amplicon_overlapper.get_amplicon_overlaps(amp_list, 8)
    assert overlaps_with_8 == [None, Match(18, 2, 8)]

    overlaps_with_9 = amplicon_overlapper.get_amplicon_overlaps(amp_list, 9)
    assert overlaps_with_9 == [None, None]
def test_use_read_for_polishing():
    """Table-driven check of Amplicon.use_read_for_polishing.

    Each case gives the mocked read's reference_start and reference_end, the
    two positional arguments that follow the read, the wgs flag, and whether
    the read is expected to be accepted.
    """
    amplicon = amplicons.Amplicon("name", 50, 100, 1, 1)
    read = mock.Mock()

    cases = (
        # Start of read is within X bp of start of amplicon
        (48, 75, 2, None, False, True),
        (48, 75, 1, None, False, False),
        # End of read is within X bp of end of amplicon
        (75, 103, 3, None, False, True),
        (75, 103, 2, None, False, False),
        # Overlapping read when wgs is True
        (1, 49, None, 10, True, False),
        (1, 60, None, 11, True, True),
        (1, 60, None, 12, True, False),
        (60, 69, None, 10, True, True),
        (60, 69, None, 11, True, False),
        (20, 110, None, 51, True, True),
        (20, 110, None, 52, True, False),
    )

    for ref_start, ref_end, arg2, arg3, wgs, accepted in cases:
        read.reference_start = ref_start
        read.reference_end = ref_end
        outcome = amplicon.use_read_for_polishing(read, arg2, arg3, wgs=wgs)
        if accepted:
            assert outcome
        else:
            assert not outcome
def test_masked_seq_centre_coord():
    """masked_seq_centre_coord is None with no sequence, else a middle index."""
    amplicon = amplicons.Amplicon("name", 0, 10, 1, 1)
    assert amplicon.masked_seq_centre_coord() is None

    # Sequences of length 3 to 6 paired with the expected centre coordinate.
    seq_and_centre = (
        ("ACT", 1),
        ("ACTG", 2),
        ("ACTGA", 2),
        ("ACTGAT", 3),
    )
    for seq, centre in seq_and_centre:
        amplicon.masked_seq = seq
        assert amplicon.masked_seq_centre_coord() == centre
def test_ref_centre_coord():
    """ref_centre_coord gives the expected value for a range of coordinates."""
    # (second Amplicon argument, third Amplicon argument, expected centre)
    cases = (
        (0, 9, 5),
        (0, 10, 5),
        (0, 11, 6),
        (0, 12, 6),
        (0, 13, 7),
        (10, 19, 15),
        (10, 20, 15),
        (10, 21, 16),
        (10, 22, 16),
        (10, 23, 17),
    )
    for start, end, centre in cases:
        amplicon = amplicons.Amplicon("name", start, end, 1, 1)
        assert amplicon.ref_centre_coord() == centre
def test_amplicons_to_consensus_contigs_2():
    """Regression test: non-overlapping middle amplicons must split contigs.

    With no amplicon sequences set the function is expected to return None.
    Once all six amplicons are flagged as assembled, with junk placed in the
    third and fourth, two separate contigs are expected.
    """
    # This hits a case not seen in the previous test. Need a combination of
    # amplicons that pass fail pass fail pass. Was a bug where a new contig was
    # not being started when we had two amplicons that didn't overlap and got
    # removed.
    # ref is 130bp of random sequence
    #                10        20        30        40        50        60        70        80        90        100       110       120
    #      0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
    ref = "GGCAACAAGCCCCGTAACCCAGCTCACCAGCGAATCACAAGTGTTAAGAGACAAAGAAGCGGCAGAACACTTTGGCTCCTATGCACAGCGCGGACCAACAGTCGATCGTTATCCTAACTCTACATACTAA"
    amplicons = [
        amps.Amplicon("amp1", 0, 30, 1, 1),
        amps.Amplicon("amp2", 20, 50, 1, 2),
        amps.Amplicon("amp3", 40, 70, 1, 1),
        amps.Amplicon("amp4", 60, 90, 1, 1),
        amps.Amplicon("amp5", 80, 110, 2, 1),
        amps.Amplicon("amp6", 100, 130, 1, 3),
    ]

    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(amplicons, 7)
    # Identity check: None is a singleton (PEP 8).
    assert got_contigs is None

    amplicons[0].masked_seq = ref[0:30]
    amplicons[0].assemble_success = True
    amplicons[1].masked_seq = ref[20:50]
    amplicons[1].assemble_success = True
    # Junk at the end of amp3 and at the start of amp4, so they do not overlap
    # each other.
    amplicons[2].masked_seq = ref[40:55] + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    amplicons[2].assemble_success = True
    amplicons[3].masked_seq = "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG" + ref[75:90]
    amplicons[3].assemble_success = True
    amplicons[4].masked_seq = ref[80:110]
    amplicons[4].assemble_success = True
    amplicons[5].masked_seq = ref[100:130]
    amplicons[5].assemble_success = True

    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(amplicons, 7)
    assert got_contigs == [ref[0:48], ref[82:129]]
def test_get_reads_for_polishing():
    """Check read selection counts, coverage and output file for one amplicon.

    The first call, against "ref1", is expected to write reads matching the
    fixture FASTA. The second, against "ref2", is expected to find no reads
    and to leave no output file behind.
    """
    reads_bam = os.path.join(data_dir, "get_reads_for_polishing.bam")
    reads_out = "tmp.get_reads_for_polishing.reads.fa"
    utils.rm_rf(reads_out)

    # Context manager so the BAM handle is closed even when an assert fails.
    with pysam.AlignmentFile(reads_bam, "rb") as bam:
        amplicon = amplicons.Amplicon("amp1", 59, 419, 1, 1)
        got_reads, got_used, got_cov = amplicon.get_reads_for_polishing(
            "ref1",
            bam,
            reads_out,
            min_coverage=1,
            trim_ends=5,
            tolerance=1,
            min_output_length=300,
            target_depth=3,
        )
        assert got_reads == 6
        assert got_used == 4
        # The second positional argument of pytest.approx is the *relative*
        # tolerance, so the old approx(4.49, 4.50) allowed a 450% deviation.
        # Use an explicit absolute tolerance around the 4.49-4.50 range.
        assert got_cov == pytest.approx(4.495, abs=0.01)
        expect_reads = os.path.join(data_dir, "get_reads_for_polishing.expect.fa")
        assert filecmp.cmp(reads_out, expect_reads, shallow=False)
        os.unlink(reads_out)

        amplicon = amplicons.Amplicon("amp1", 50, 100, 1, 1)
        got_reads, got_used, got_cov = amplicon.get_reads_for_polishing(
            "ref2",
            bam,
            reads_out,
            min_coverage=1,
            trim_ends=5,
            tolerance=1,
            min_output_length=30,
            target_depth=1,
        )
        assert got_reads == 0
        assert got_used == 0
        assert got_cov == 0
        assert not os.path.exists(reads_out)
def test_masked_overlap():
    """Check Amplicon.masked_overlap for missing, matching and short overlaps.

    The result is expected to be None while either amplicon lacks a masked
    sequence, and None again when the second argument exceeds the size of the
    match. Otherwise it should equal a Match(a, b, size) tuple.
    """
    Match = collections.namedtuple("Match", ("a", "b", "size"))
    amp1 = amplicons.Amplicon("a1", 50, 100, 1, 1)
    amp2 = amplicons.Amplicon("a2", 90, 150, 1, 1)

    # No overlap can be reported unless both amplicons have a sequence.
    assert amp1.masked_overlap(amp2, 10) is None
    amp2.masked_seq = "AAAAAAAAAAAAAAAAAATGCTGAACAGTCCCCCCC"
    assert amp1.masked_overlap(amp2, 10) is None
    amp1.masked_seq = "AAAAAAAAAAAAAAAAAATGCTGAACAGTCCCCCCC"
    amp2.masked_seq = None
    assert amp1.masked_overlap(amp2, 10) is None

    # A match of size 8 is reported for thresholds 7 and 8, but not for 9.
    amp2.masked_seq = "CCTGCTGAACGGTTGATGCATCTCATGCTGACNNAGGTGTGGCCAAAAA"
    assert amp1.masked_overlap(amp2, 7) == Match(18, 2, 8)
    assert amp1.masked_overlap(amp2, 8) == Match(18, 2, 8)
    assert amp1.masked_overlap(amp2, 9) is None

    amp1.masked_seq = "CCTGCTGAACGGTTGATGCATCTCATGCTGACNNAGGTGTGGCCAAAAA"
    amp2.masked_seq = "NNAGGTGTGGCCTTTTTTTTTTTTTTTTTTTTTTTTTT"
    assert amp1.masked_overlap(amp2, 10) == Match(34, 2, 10)
    assert amp1.masked_overlap(amp2, 11) is None

    amp1.masked_seq = "NNAGGTGTGGCCTTTTTTTTTTTTTTTTTTTTTTTTTT"
    amp2.masked_seq = "AAAAAAAAAAAAAAAAANNNNNNNNNNN"
    assert amp1.masked_overlap(amp2, 0) == Match(2, 0, 1)
    assert amp1.masked_overlap(amp2, 2) is None
def test_expected_overlap_length():
    """expected_overlap_length is 3 for 0-10 vs 8-20 and None for 0-10 vs 11-20."""
    base = amplicons.Amplicon("name", 0, 10, 1, 1)

    overlapping = amplicons.Amplicon("name", 8, 20, 1, 1)
    overlap_len = base.expected_overlap_length(overlapping)
    assert overlap_len == 3

    separate = amplicons.Amplicon("name", 11, 20, 1, 1)
    no_overlap = base.expected_overlap_length(separate)
    assert no_overlap is None
def test_polish():
    """Run Amplicon.polish on BAM and FASTA reads and check the masked result.

    Three runs are made: reads sliced from the BAM, reads from a FASTA file,
    and the BAM again with a stricter depth threshold, which is expected to
    fail the amplicon.
    """
    ref_fasta = os.path.join(data_dir, "polish.ref.fa")
    ref_genome = utils.load_single_seq_fasta(ref_fasta)
    reads_bam = os.path.join(data_dir, "polish.bam")
    outdir = "tmp.polish.out"
    utils.rm_rf(outdir)

    # Context manager so the BAM handle is closed even when an assert fails.
    with pysam.AlignmentFile(reads_bam, "rb") as bam:
        amplicon = amplicons.Amplicon("amplicon1", 60, 259, 1, 1)
        amplicon.polish(
            ref_genome,
            outdir,
            bam_to_slice_reads=bam,
            min_mean_coverage=3,
            racon_iterations=3,
            min_depth_for_not_N=3,
            min_read_length=100,
            max_polished_N_prop=0.5,
            debug=True,
        )
        assert (
            amplicon.masked_seq
            == "NNNNNNNNNNNNNNNNNNNNAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGNNNNNNNNNNNNNNNNNNNNN"
        )
        assert amplicon.assemble_success
        assert amplicon.polish_data["Polish success"]
        utils.rm_rf(outdir)

        # Same again, but this time use the fasta of reads instead of the BAM
        # file. Plus, this is giving untrimmed reads, so we get less masking.
        # In the previous run 20bp trimmed off all the reads
        reads_file = os.path.join(data_dir, "polish.reads.fa")
        amplicon.polish(
            ref_genome,
            outdir,
            reads_file=reads_file,
            min_mean_coverage=3,
            racon_iterations=3,
            min_depth_for_not_N=3,
            min_read_length=100,
            max_polished_N_prop=0.5,
            debug=True,
        )
        assert (
            amplicon.masked_seq
            == "CGTTAATCCTAGGGCAGTTAAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGAAATCTGCAACAAGCCCGGT"
        )
        assert amplicon.assemble_success
        assert amplicon.polish_data["Polish success"]
        utils.rm_rf(outdir)

        # The reads are such that there's a dip in coverage in the middle of
        # the amplicon. Setting min_depth_for_not_N higher makes this region
        # get masked, and then the amplicon should get failed
        amplicon = amplicons.Amplicon("amplicon1", 60, 259, 1, 1)
        amplicon.polish(
            ref_genome,
            outdir,
            bam_to_slice_reads=bam,
            min_mean_coverage=3,
            racon_iterations=3,
            min_depth_for_not_N=18,
            min_read_length=50,
            max_polished_N_prop=0.1,
            debug=True,
        )
        assert (
            amplicon.masked_seq
            == "NNNNNNNNNNNNNNNNNNNNAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGNNNNNNNNNNNNNNNNNNNNN"
        )
        assert not amplicon.assemble_success
        assert not amplicon.polish_data["Polish success"]
        utils.rm_rf(outdir)
def test_amplicons_to_consensus_contigs():
    """Check the contigs returned as amplicons are toggled assembled/failed.

    Three amplicons are given masked sequences one at a time, then switched
    back to failed, and the returned contig list is checked after each change.
    """
    # ref is 100bp of random sequence
    #                  10        20        30        40        50        60        70        80        90
    #        0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
    # ref = "GGCAACAAGCCCCGTAACCCAGCTCACCAGCGAATCACAAGTGTTAAGAGACAAAGAAGCGGCAGAACACTTTGGCTCCTATGCACAGCGCGGACCAACA"
    amplicons = [
        amps.Amplicon("amp1", 9, 39, 1, 2),
        amps.Amplicon("amp2", 24, 75, 3, 4),
        amps.Amplicon("amp3", 63, 99, 5, 6),
    ]

    # No amplicon has a sequence yet.
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(amplicons, 20)
    # Identity check: None is a singleton (PEP 8).
    assert got_contigs is None

    # Only the first amplicon is assembled.
    amplicons[0].masked_seq = "CCCCGTAACCGAGCTCACCAGCGAATCACAA"
    amplicons[0].assemble_success = True
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(amplicons, 10)
    assert got_contigs == [amplicons[0].masked_seq[0:-2]]

    # First and second amplicons are assembled: one merged contig expected.
    amplicons[1].masked_seq = "TCACCAGCGAATCACAAGTGTTAAGAGACAAAGAAGCGGCAGAACACTTTGG"
    amplicons[1].assemble_success = True
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(amplicons, 10)
    expect = ["CCCCGTAACCGAGCTCACCAGCGAATCACAAGTGTTAAGAGACAAAGAAGCGGCAGAACACT"]
    assert got_contigs == expect

    # All three are assembled: one contig expected.
    amplicons[2].masked_seq = "NNCAGAACACTTTGGCTCCTATGCACAGCGCGGACCAANN"
    amplicons[2].assemble_success = True
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(amplicons, 10)
    expect = [
        "CCCCGTAACCGAGCTCACCAGCGAATCACAAGTGTTAAGAGACAAAGAAGCGGCAGAACACTTTGGCTCCTATGCACAGCGCGGACCAA"
    ]
    assert got_contigs == expect

    # The middle amplicon is failed: two separate contigs expected.
    amplicons[1].masked_seq = None
    amplicons[1].assemble_success = False
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(amplicons, 10)
    expect = [
        amplicons[0].masked_seq[0:-2],
        amplicons[2].masked_seq[5:].rstrip("N"),
    ]
    assert got_contigs == expect

    # The first amplicon is failed too: only the last contig remains.
    amplicons[0].masked_seq = None
    amplicons[0].assemble_success = False
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(amplicons, 10)
    expect = [amplicons[2].masked_seq[5:].rstrip("N")]
    assert got_contigs == expect

    # The first two have sequences again but are flagged as failed, so only
    # the third amplicon is expected to contribute.
    amplicons[0].masked_seq = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    amplicons[1].masked_seq = "TCACCAGCGAATCACAAGTGTTAAGAGACAAAGAAGCGGCAGAACACTTTGG"
    amplicons[2].masked_seq = "NNNNNNACACTTTGGCTCCTATGCACAGCGCGGANNNNNN"
    amplicons[0].assemble_success = False
    amplicons[1].assemble_success = False
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(amplicons, 10)
    expect = [amplicons[2].masked_seq[5:].strip("N")]
    assert got_contigs == expect
"start": 350, "end": 799, "left_primer_end": 355, "right_primer_start": 790, }, "a3": { "start": 740, "end": 989, "left_primer_end": 745, "right_primer_start": 980, }, } } amplicons = [ amps.Amplicon("a1", 10, 399, 0, 10), amps.Amplicon("a2", 350, 799, 6, 10), amps.Amplicon("a3", 740, 989, 6, 10), ] amp_seqs = ["".join(ref_for_amplicons[x.start:x.end + 1]) for x in amplicons] amp_to_seq_files_dir = "run_assembly_pipeline.reads_per_amp" subprocess.check_output(f"rm -rf {amp_to_seq_files_dir}", shell=True) os.mkdir(amp_to_seq_files_dir) amp_to_seq_files = {f"a{i}": f"reads.{i}.fa" for i in (1, 2, 3)} with open(os.path.join(amp_to_seq_files_dir, "manifest.json"), "w") as f: json.dump(amp_to_seq_files, f) with open("run_assembly_pipeline.reads.fa", "w") as f_all: for i, seq in enumerate(amp_seqs): amp_name = amplicons[i].name