def test_plot_genome_coverage_minimap():
    """ Minimap2 hits are drawn as arrows in place of BLAST results.

    The second hit has ref_start > ref_end, and the expected summary shows
    it as a left-pointing arrow.
    """
    coverage_csv = StringIO("""\
contig,coordinates,query_nuc_pos,refseq_nuc_pos,ins,dels,coverage
1-HCV-1a,HCV-1a,1,8001,0,0,5
1-HCV-1a,HCV-1a,2,8002,0,0,5
1-HCV-1a,HCV-1a,3,8003,0,0,7
1-HCV-1a,HCV-1a,4,8004,0,0,5
1-HCV-1a,HCV-1a,5,8005,0,0,5
1-HCV-1a,HCV-1a,6,8006,0,0,5
""")
    hits_csv = StringIO("""\
contig,ref_name,score,match,pident,start,end,ref_start,ref_end
1-HCV-1a,HCV-1a,40,0.33,100,5,6,7006,7005
1-HCV-1a,HCV-1a,50,0.5,100,1,3,8001,8003
""")
    expected_summary = """\
5'[1-341], C[342-914], E1[915-1490], E2[1491-2579], p7[2580-2768], \
NS2[2769-3419], NS3[3420-5312], NS4b[5475-6257], NS4a[5313-5474], \
NS5a[6258-7601], NS5b[7602-9374], 3'[9375-9646]
7005<-1.2--7006, 8001--1.1->8003
8001--1.1->8003, 8005--1.2->8006
Coverage 5x2, 7, 5x3
[8001-8006], 1-HCV-1a - depth 7(1-9646)
"""

    plot = build_coverage_figure(coverage_csv, hits_csv)

    assert summarize_figure(plot) == expected_summary
def test_plot_genome_coverage_blast_collision():
    """ Two hits on the same reference end at the same contig position.

    The hit against HCV-1g does not appear in the expected summary.
    """
    coverage_csv = StringIO("""\
contig,coordinates,query_nuc_pos,refseq_nuc_pos,ins,dels,coverage
1-HCV-1a,HCV-1a,1,8001,0,0,5
1-HCV-1a,HCV-1a,2,8002,0,0,5
1-HCV-1a,HCV-1a,3,8003,0,0,7
1-HCV-1a,HCV-1a,4,8004,0,0,5
1-HCV-1a,HCV-1a,5,8005,0,0,5
1-HCV-1a,HCV-1a,6,8006,0,0,5
""")
    hits_csv = StringIO("""\
contig_num,ref_name,score,match,pident,start,end,ref_start,ref_end
1,HCV-1g,30,0.33,90,1,2,5001,5002
1,HCV-1a,40,0.33,100,3,6,7003,7006
1,HCV-1a,50,0.5,100,1,6,8001,8006
""")
    expected_summary = """\
5'[1-341], C[342-914], E1[915-1490], E2[1491-2579], p7[2580-2768], \
NS2[2769-3419], NS3[3420-5312], NS4b[5475-6257], NS4a[5313-5474], \
NS5a[6258-7601], NS5b[7602-9374], 3'[9375-9646]
7003--1.2->7006, 8001--1.1->8006
8001--1.1->8006, 8003--1.2->8006
Coverage 5x2, 7, 5x3
[8001-8006], 1-HCV-1a - depth 7(1-9646)
"""

    plot = build_coverage_figure(coverage_csv, hits_csv)

    assert summarize_figure(plot) == expected_summary
def test_plot_genome_coverage_blast_aligns_refs():
    """ Hit against the contig's seed is drawn in coordinate-reference positions.

    The hit reports ref positions 1653-1662 on the G seed, but the expected
    arrows span 2261-2270, matching the refseq_nuc_pos column.
    """
    coverage_csv = StringIO("""\
contig,coordinates,query_nuc_pos,refseq_nuc_pos,ins,dels,coverage
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,1,2261,0,0,5
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,2,2262,0,0,5
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,3,2263,0,0,5
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,4,2264,0,0,5
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,5,2265,0,0,5
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,6,2266,0,0,5
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,7,2267,0,0,5
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,8,2268,0,0,5
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,9,2269,0,0,5
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,10,2270,0,0,5
""")
    hits_csv = StringIO("""\
contig_num,ref_name,score,match,pident,start,end,ref_start,ref_end
1,HIV1-G-CM-KP718923-seed,300,1,90,1,10,1653,1662
""")
    expected_summary = """\
5' LTR[1-634], gag[789-2289], vif[5040-5616], tat[8379-8469], nef[8796-9414]
tat[5831-6045], vpu[6061-6307], rev[8379-8653], 3' LTR[9086-9719]
pol[2085-5096], vpr[5558-5847], rev[5970-6045], env[6225-8795]
PR[2252-2549], RT[2549-3869], INT[4229-5093], V3[7109-7217], GP41[7757-8792]
2261--1.1->2270
2261--1.1->2270
Coverage 5x10
[2261-2270], 1-HIV1-G-CM-KP718923-seed - depth 5(1-9719)
"""

    plot = build_coverage_figure(coverage_csv, hits_csv)

    assert summarize_figure(plot) == expected_summary
def test_plot_genome_coverage_unmapped():
    """ Positions whose link column is U are reported as a yellow region. """
    coverage_csv = StringIO("""\
contig,coordinates,query_nuc_pos,refseq_nuc_pos,ins,dels,coverage,link
1-HCV-1a,HCV-1a,1,1,0,0,5,M
1-HCV-1a,HCV-1a,2,2,0,0,5,M
1-HCV-1a,HCV-1a,3,3,0,0,5,M
1-HCV-1a,HCV-1a,4,4,0,0,6,M
1-HCV-1a,HCV-1a,5,5,0,0,6,U
1-HCV-1a,HCV-1a,6,6,0,0,6,U
1-HCV-1a,HCV-1a,7,7,0,0,7,U
1-HCV-1a,HCV-1a,8,8,0,0,7,U
1-HCV-1a,HCV-1a,9,9,0,0,7,U
1-HCV-1a,HCV-1a,10,10,0,0,8,M
1-HCV-1a,HCV-1a,11,11,0,0,8,M
1-HCV-1a,HCV-1a,12,12,0,0,8,M
""")
    expected_summary = """\
5'[1-341], C[342-914], E1[915-1490], E2[1491-2579], p7[2580-2768], \
NS2[2769-3419], NS3[3420-5312], NS4b[5475-6257], NS4a[5313-5474], \
NS5a[6258-7601], NS5b[7602-9374], 3'[9375-9646]
Coverage 5x3, 6x3, 7x3, 8x3
[1-12], 1-HCV-1a - depth 8(1-9646), yellow{5-9}
"""

    plot = build_coverage_figure(coverage_csv)

    assert summarize_figure(plot) == expected_summary
def test_plot_genome_coverage_gap():
    """ A position missing from the rows splits the contig into two ranges.

    Position 3 has no row, so the expected coverage shows a zero there and
    both contig tracks are reported as [1-2], [4-6].
    """
    coverage_csv = StringIO("""\
contig,coordinates,query_nuc_pos,refseq_nuc_pos,ins,dels,coverage
1-HCV-1a,HCV-1a,1,1,0,0,5
1-HCV-1a,HCV-1a,2,2,0,0,5
1-HCV-1a,HCV-1a,4,4,0,0,6
1-HCV-1a,HCV-1a,5,5,0,0,6
1-HCV-1a,HCV-1a,6,6,0,0,6
contig-1-HCV-1a,HCV-1a,1,1,0,,
contig-1-HCV-1a,HCV-1a,2,2,0,,
contig-1-HCV-1a,HCV-1a,4,4,0,,
contig-1-HCV-1a,HCV-1a,5,5,0,,
contig-1-HCV-1a,HCV-1a,6,6,0,,
""")
    expected_summary = """\
5'[1-341], C[342-914], E1[915-1490], E2[1491-2579], p7[2580-2768], \
NS2[2769-3419], NS3[3420-5312], NS4b[5475-6257], NS4a[5313-5474], \
NS5a[6258-7601], NS5b[7602-9374], 3'[9375-9646]
Coverage 5x2, 0, 6x3
[1-2], [4-6], 1-HCV-1a - depth 6(1-9646)
[1-2], [4-6], contig-1-HCV-1a(1-9646)
"""

    plot = build_coverage_figure(coverage_csv)

    assert expected_summary == summarize_figure(plot)
def test_plot_genome_coverage_insertion():
    """ Rows with a blank refseq_nuc_pos are reported as a lightgreen region.

    Query positions 4-6 have no reference position; their coverage (6) is
    absent from the expected coverage summary.
    """
    coverage_csv = StringIO("""\
contig,coordinates,query_nuc_pos,refseq_nuc_pos,ins,dels,coverage
1-HCV-1a,HCV-1a,1,1,0,0,5
1-HCV-1a,HCV-1a,2,2,0,0,5
1-HCV-1a,HCV-1a,3,3,0,0,5
1-HCV-1a,HCV-1a,4,,0,0,6
1-HCV-1a,HCV-1a,5,,0,0,6
1-HCV-1a,HCV-1a,6,,0,0,6
1-HCV-1a,HCV-1a,7,4,0,0,7
1-HCV-1a,HCV-1a,8,5,0,0,7
1-HCV-1a,HCV-1a,9,6,0,0,7
1-HCV-1a,HCV-1a,10,7,0,0,8
1-HCV-1a,HCV-1a,11,8,0,0,8
1-HCV-1a,HCV-1a,12,9,0,0,8
""")
    expected_summary = """\
5'[1-341], C[342-914], E1[915-1490], E2[1491-2579], p7[2580-2768], \
NS2[2769-3419], NS3[3420-5312], NS4b[5475-6257], NS4a[5313-5474], \
NS5a[6258-7601], NS5b[7602-9374], 3'[9375-9646]
Coverage 5x3, 7x3, 8x3
[1-9], 1-HCV-1a - depth 8(1-9646), lightgreen{4-6}
"""

    plot = build_coverage_figure(coverage_csv)

    assert expected_summary == summarize_figure(plot)
def test_plot_genome_coverage_two_contigs():
    """ Several contigs in one file, including one with no coordinates.

    In the expected summary, the deepest contig (3-HCV-1a) comes first and
    the partial contig is listed under a Partial Blast Results banner.
    """
    coverage_csv = StringIO("""\
contig,coordinates,query_nuc_pos,refseq_nuc_pos,dels,coverage
1-HCV-1a,HCV-1a,1,1,0,5
1-HCV-1a,HCV-1a,2,2,0,5
1-HCV-1a,HCV-1a,3,3,0,7
1-HCV-1a,HCV-1a,4,4,0,5
1-HCV-1a,HCV-1a,5,5,0,5
1-HCV-1a,HCV-1a,6,6,0,5
contig-1-HCV-1a,HCV-1a,1,1,,
contig-1-HCV-1a,HCV-1a,2,2,,
contig-1-HCV-1a,HCV-1a,3,3,,
contig-1-HCV-1a,HCV-1a,4,4,,
contig-1-HCV-1a,HCV-1a,5,5,,
contig-1-HCV-1a,HCV-1a,6,6,,
2-HCV-1b-partial,,1,,0,5
2-HCV-1b-partial,,2,,0,5
2-HCV-1b-partial,,3,,0,29
2-HCV-1b-partial,,4,,0,5
2-HCV-1b-partial,,5,,0,5
2-HCV-1b-partial,,6,,0,5
3-HCV-1a,HCV-1a,101,101,0,15
3-HCV-1a,HCV-1a,102,102,0,15
3-HCV-1a,HCV-1a,103,103,0,17
3-HCV-1a,HCV-1a,104,104,0,15
3-HCV-1a,HCV-1a,105,105,0,15
3-HCV-1a,HCV-1a,106,106,0,15
contig-3-HCV-1a,HCV-1a,1,101,,
contig-3-HCV-1a,HCV-1a,2,102,,
contig-3-HCV-1a,HCV-1a,3,103,,
contig-3-HCV-1a,HCV-1a,4,104,,
contig-3-HCV-1a,HCV-1a,5,105,,
contig-3-HCV-1a,HCV-1a,6,106,,
""")
    expected_summary = """\
5'[1-341], C[342-914], E1[915-1490], E2[1491-2579], p7[2580-2768], \
NS2[2769-3419], NS3[3420-5312], NS4b[5475-6257], NS4a[5313-5474], \
NS5a[6258-7601], NS5b[7602-9374], 3'[9375-9646]
Coverage 15x2, 17, 15x3
[101-106], 3-HCV-1a - depth 17(1-9646)
[101-106], contig-3-HCV-1a(1-9646)
Coverage 5x2, 7, 5x3
[1-6], 1-HCV-1a - depth 7(1-9646)
[1-6], contig-1-HCV-1a(1-9646)
[1-500], [1001-1500], [2001-2500], [3001-3500], [4001-4500], \
[5001-5500], [6001-6500], [7001-7500], [8001-8500], [9001-9500], \
Partial Blast Results(1-9646)
Coverage 5x2, 29, 5x3
[1-6], 2-HCV-1b-partial - depth 29(1-9646)
"""

    plot = build_coverage_figure(coverage_csv)

    assert summarize_figure(plot) == expected_summary
def test_plot_genome_coverage_empty():
    """ A header-only coverage file produces a "No contigs found." figure. """
    coverage_csv = StringIO("""\
contig,coordinates,query_nuc_pos,refseq_nuc_pos,ins,dels,coverage
""")
    expected_summary = """\
No contigs found.(1-500)
"""

    plot = build_coverage_figure(coverage_csv)

    assert expected_summary == summarize_figure(plot)
def test_plot_genome_coverage_zero():
    """ A partial contig whose coverage is all zero.

    The expected summary has no Coverage line and no depth label.
    """
    coverage_csv = StringIO("""\
contig,coordinates,query_nuc_pos,refseq_nuc_pos,dels,coverage
1-HCV-1b-partial,,1,,0,0
1-HCV-1b-partial,,2,,0,0
1-HCV-1b-partial,,3,,0,0
1-HCV-1b-partial,,4,,0,0
1-HCV-1b-partial,,5,,0,0
1-HCV-1b-partial,,6,,0,0
""")
    expected_summary = """\
[1-500], Partial Blast Results(1-500)
[1-6], 1-HCV-1b-partial(1-500)
"""

    plot = build_coverage_figure(coverage_csv)

    assert expected_summary == summarize_figure(plot)
def test_plot_genome_coverage_partial_header():
    """ The final dash of the header banner may be narrower than 500.

    A 1010-base partial contig is expected to end the banner with
    [1001-1010].
    """
    coverage_csv = StringIO("""\
contig,coordinates,query_nuc_pos,refseq_nuc_pos,ins,dels,coverage
""")
    # Move to the end of the buffer so the rows land after the header.
    coverage_csv.seek(0, 2)
    coverage_csv.writelines(f'1-HCV-1a-partial,,{position},,0,0,5\n'
                            for position in range(1, 1011))
    # Rewind so the figure builder reads from the header line.
    coverage_csv.seek(0)
    expected_summary = """\
[1-500], [1001-1010], Partial Blast Results(1-1010)
Coverage 5x1010
[1-1010], 1-HCV-1a-partial - depth 5(1-1010)
"""

    plot = build_coverage_figure(coverage_csv)

    assert expected_summary == summarize_figure(plot)
def test_plot_genome_coverage_partial():
    """ A lone partial contig is plotted under a Partial Blast Results banner. """
    coverage_csv = StringIO("""\
contig,coordinates,query_nuc_pos,refseq_nuc_pos,ins,dels,coverage
1-HCV-1a-partial,,1,,0,0,5
1-HCV-1a-partial,,2,,0,0,5
1-HCV-1a-partial,,3,,0,0,7
1-HCV-1a-partial,,4,,0,0,5
1-HCV-1a-partial,,5,,0,0,5
1-HCV-1a-partial,,6,,0,0,5
""")
    expected_summary = """\
[1-500], Partial Blast Results(1-500)
Coverage 5x2, 7, 5x3
[1-6], 1-HCV-1a-partial - depth 7(1-500)
"""

    plot = build_coverage_figure(coverage_csv)

    assert expected_summary == summarize_figure(plot)
def test_plot_genome_coverage_offset_blast():
    """ A contig that starts before the reference shifts the whole figure.

    The first row sits at reference position -2, and every expected
    position is three higher than its input (5' starts at 4, not 1).
    """
    coverage_csv = StringIO("""\
contig,coordinates,query_nuc_pos,refseq_nuc_pos,ins,dels,coverage
1-HCV-1a,HCV-1a,1,-2,0,0,5
1-HCV-1a,HCV-1a,2,-1,0,0,5
1-HCV-1a,HCV-1a,3,0,0,0,7
1-HCV-1a,HCV-1a,4,1,0,0,5
1-HCV-1a,HCV-1a,5,2,0,0,5
1-HCV-1a,HCV-1a,6,3,0,0,5
1-HCV-1a,HCV-1a,7,,0,0,5
1-HCV-1a,HCV-1a,8,,0,0,5
1-HCV-1a,HCV-1a,9,,0,0,5
1-HCV-1a,HCV-1a,10,4,0,0,5
1-HCV-1a,HCV-1a,11,5,0,0,5
1-HCV-1a,HCV-1a,12,6,0,0,5
2-unknown-partial,,1,,0,0,6
2-unknown-partial,,2,,0,0,6
2-unknown-partial,,3,,0,0,6
""")
    hits_csv = StringIO("""\
contig_num,ref_name,score,match,pident,start,end,ref_start,ref_end
1,HCV-1a,30,0.33,90,10,12,4,6
""")
    expected_summary = """\
5'[4-344], C[345-917], E1[918-1493], E2[1494-2582], p7[2583-2771], \
NS2[2772-3422], NS3[3423-5315], NS4b[5478-6260], NS4a[5316-5477], \
NS5a[6261-7604], NS5b[7605-9377], 3'[9378-9649]
7--1.1->9
7--1.1->9
Coverage 5x2, 7, 5x6
[1-9], 1-HCV-1a - depth 7(1-9649), lightgreen{7-9}
[4-503], [1004-1503], [2004-2503], [3004-3503], [4004-4503], \
[5004-5503], [6004-6503], [7004-7503], [8004-8503], [9004-9503], \
Partial Blast Results(4-9649)
Coverage 6x3
[4-6], 2-unknown-partial - depth 6(1-9649)
"""

    plot = build_coverage_figure(coverage_csv, hits_csv)

    assert summarize_figure(plot) == expected_summary
def test_plot_genome_coverage_g2p():
    """ An assembled contig plotted alongside the HIV1-CON-XX-Consensus-seed.

    The consensus seed has the deeper coverage and is listed first in the
    expected summary.
    """
    coverage_csv = StringIO("""\
contig,coordinates,query_nuc_pos,refseq_nuc_pos,dels,coverage,link
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,1,2261,0,5,M
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,2,2262,0,5,M
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,3,2263,0,5,M
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,4,2264,0,5,M
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,5,2265,0,5,M
1-HIV1-G-CM-KP718923-seed,HIV1-B-FR-K03455-seed,6,2266,0,5,M
HIV1-CON-XX-Consensus-seed,HIV1-B-FR-K03455-seed,1,7201,0,100,M
HIV1-CON-XX-Consensus-seed,HIV1-B-FR-K03455-seed,2,7202,0,100,M
HIV1-CON-XX-Consensus-seed,HIV1-B-FR-K03455-seed,3,7203,0,100,M
HIV1-CON-XX-Consensus-seed,HIV1-B-FR-K03455-seed,4,7204,0,100,M
HIV1-CON-XX-Consensus-seed,HIV1-B-FR-K03455-seed,5,7205,0,100,M
HIV1-CON-XX-Consensus-seed,HIV1-B-FR-K03455-seed,6,7206,0,100,M
""")
    hits_csv = StringIO("""\
contig,ref_name,start,end,ref_start,ref_end
1-HIV1-G-CM-KP718923-seed,HIV1-G-CM-KP718923-seed,1,10,1653,1658
HIV1-CON-XX-Consensus-seed,HIV1-B-FR-K03455-seed,1,6,7201,7206
""")
    expected_summary = """\
5' LTR[1-634], gag[789-2289], vif[5040-5616], tat[8379-8469], nef[8796-9414]
tat[5831-6045], vpu[6061-6307], rev[8379-8653], 3' LTR[9086-9719]
pol[2085-5096], vpr[5558-5847], rev[5970-6045], env[6225-8795]
PR[2252-2549], RT[2549-3869], INT[4229-5093], V3[7109-7217], GP41[7757-8792]
2261--1.1->2266
Coverage 100x6
[7201-7206], HIV1-CON-XX-Consensus-seed - depth 100(1-9719)
2261--1.1->2266
Coverage 5x6
[2261-2266], 1-HIV1-G-CM-KP718923-seed - depth 5(1-9719)
"""

    plot = build_coverage_figure(coverage_csv, hits_csv)

    assert summarize_figure(plot) == expected_summary
def _count_bin_moves(source_bins, target_bins):
    """ Tally how many reads went from each source bin to each target bin.

    :param source_bins: mapping of read key to bin for the first remap
    :param target_bins: mapping of read key to bin for the second remap
    :return: [(source_bin, Counter({target_bin: count}))] sorted by source
        bin. Reads missing from source_bins are counted under None, and that
        entry is placed first when it exists.
    """
    all_keys = set(source_bins)
    all_keys.update(target_bins)
    moves = defaultdict(Counter)  # {source_bin: {target_bin: count}}
    for read_key in all_keys:
        source_bin = source_bins.get(read_key)
        target_bin = target_bins.get(read_key)
        moves[source_bin][target_bin] += 1

    # None can't be sorted against the other bins, so pull it out first.
    old_unmapped = moves.pop(None, None)
    sorted_moves = sorted(moves.items())
    if old_unmapped:
        sorted_moves.insert(0, (None, old_unmapped))
    return sorted_moves


def _select_blast_matches(blast_rows):
    """ Pick the BLAST rows to draw, visiting the highest scores first.

    Only rows whose contig_name contains '003' are considered, and only rows
    for the same ref_name as the first such row. A row is skipped when 10% or
    more of its positions were already covered by an accepted row.

    :param blast_rows: list of dicts with contig_name, ref_name, score,
        start, end, ref_start, and ref_end entries
    :return: generator of (start, end, ref_start, ref_end) tuples of ints
    """
    best_ref = None
    matched_positions = set()
    ranked_rows = sorted(blast_rows,
                         key=lambda match: int(match['score']),
                         reverse=True)
    for row in ranked_rows:
        if '003' not in row['contig_name']:
            continue
        if best_ref is None:
            best_ref = row['ref_name']
        elif row['ref_name'] != best_ref:
            continue
        start = int(row['start'])
        end = int(row['end'])
        new_positions = set(range(start, end))
        if not new_positions:
            # start >= end gives an empty range, and the collision fraction
            # below would divide by zero.
            # NOTE(review): range() excludes end; confirm whether BLAST end
            # positions are meant to be inclusive here.
            continue
        collisions = matched_positions & new_positions
        collision_fraction = len(collisions) / len(new_positions)
        if collision_fraction < 0.1:
            matched_positions |= new_positions
            yield start, end, int(row['ref_start']), int(row['ref_end'])


def main():
    """ Draw a diagram comparing how reads mapped in two remap runs.

    Reads the two remap files to count reads that moved between bins, stacks
    tracks from the two genome coverage figures into one figure, draws the
    selected BLAST matches from blast2_csv, and then draws a line for each
    move that holds at least 10% of its source bin's reads. The result is
    saved to compare_mapping_svg, or displayed when running under live
    coding. Each drawn move is also printed.
    """
    args = parse_args()
    sorted_moves = _count_bin_moves(load_read_bins(args.remap1_csv),
                                    load_read_bins(args.remap2_csv))

    diagram_path = args.compare_mapping_svg
    f = build_coverage_figure(args.genome_coverage1_csv)
    del f.elements[6:]
    # Each entry in elements unpacks as a (y, element) pair.
    ref_y, _ = f.elements[5]

    # NOTE(review): the fixed indexes assume a particular track layout in the
    # second figure - confirm against build_coverage_figure.
    f2 = build_coverage_figure(args.genome_coverage2_csv)
    _, coverage3 = f2.elements[4]
    _, contig3 = f2.elements[5]
    _, coverage1 = f2.elements[6]
    _, contig1 = f2.elements[7]
    del f2.elements[4:]

    f.w = max(f.w, f2.w)
    f.add(coverage3, gap=-4)
    f.add(contig3, gap=30)
    coverage1.a = 0
    f.add(coverage1, gap=-4)

    # Slide the contig's tracks left so the first one starts at zero.
    contig1_shift = contig1.tracks[0].a
    for track in contig1.tracks:
        if track.a >= contig1_shift:
            track.a -= contig1_shift
            track.b -= contig1_shift
    f.add(contig1)
    contig1_y = f.elements[-1][0]

    if __name__ != '__live_coding__':
        drawing_width = 970
    else:
        # Under live coding, fit the turtle canvas and display the drawing
        # instead of saving it.
        # noinspection PyProtectedMember
        turtle_screen = Turtle._screen
        drawing_width = turtle_screen.cv.cget('width') - 10
        diagram_path = None
    drawing = f.show(w=drawing_width)

    # Convert the y positions by subtracting them from the figure height.
    seed1_y = f.h - f.elements[5][0] - 10
    seed2_y = f.h - f.elements[7][0] + 25
    ref_y = f.h - ref_y
    contig_y = f.h - contig1_y + 25
    contig_x = 0
    x_scale = drawing_width / f.w

    blast_display = BlastDisplay(drawing, x_scale, ref_y, contig_x, contig_y)
    blast_rows = list(DictReader(args.blast2_csv))
    for start, end, ref_start, ref_end in _select_blast_matches(blast_rows):
        blast_display.add_match(start, end, ref_start, ref_end)
    blast_display.draw()

    for source_bin, bin_moves in sorted_moves:
        total = sum(bin_moves.values())
        for i, (target_bin, count) in enumerate(bin_moves.most_common()):
            fraction = count / total
            if fraction < 0.1:
                # most_common() is in descending order, so the rest are
                # smaller still.
                break
            if i == 0:
                print(source_bin, end=':')
            print(f'\t{target_bin}({fraction})')

            source_shift = 0
            target_shift = contig_x * x_scale
            if source_bin is None:
                # Read is missing from the first remap: start the line just
                # above the second seed, at the target position.
                source_x = target_bin
                source_y = seed2_y - 10
                source_shift = target_shift
            else:
                source_x = source_bin
                source_y = seed1_y
            if target_bin is None:
                # Read is missing from the second remap: end the line just
                # below the first seed, at the source position.
                target_x = source_bin
                target_y = seed1_y + 10
                target_shift = source_shift
            else:
                target_x = target_bin
                target_y = seed2_y

            # NOTE(review): presumably each bin covers 100 positions - verify
            # against load_read_bins.
            source_x *= 100 * x_scale
            target_x *= 100 * x_scale
            source_x += source_shift
            target_x += target_shift
            drawing.append(
                Line(source_x, source_y, target_x, target_y,
                     stroke='black',
                     stroke_width=2 * fraction,
                     stroke_opacity=0.25))

    if diagram_path is not None:
        drawing.saveSvg(diagram_path)
    else:
        display_image(drawing)