def run_flye(assembly, reads_fname, out_dir, threads, no_nucl_alignment):
    """Map reads onto one assembly with the bundled Flye mapping binary.

    Flye is built first through ``make_flye``; if that raises, a hint is
    printed and the process exits with status 2. The mapper's stdout and
    stderr are discarded (its own log goes to ``<out_dir>/mapping.log``) and
    its return code is ignored.

    Args:
        assembly: object providing ``fname``, ``compressed_fname``,
            ``kmers_fname``, ``chains_fname`` and ``max_aln_diff``.
        reads_fname: path passed to ``--reads``.
        out_dir: directory that receives ``mapping.log``.
        threads: value passed to ``--threads``.
        no_nucl_alignment: when true, ``--min-kmers`` is forced to
            ``1000000`` instead of ``MIN_CHAIN_KMERS``.
    """
    try:
        make_flye()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C is not
        # reported as a compilation failure.
        print(
            'Failed to compile Flye! Please try to compile it manually: create %s folder and run "make" in %s'
            % (dirname(ASSEMBLY_BIN), dirname(dirname(ASSEMBLY_BIN))))
        sys.exit(2)

    min_kmers = '1000000' if no_nucl_alignment else str(MIN_CHAIN_KMERS)
    cmd = [
        ASSEMBLY_BIN,
        '--reads', reads_fname,
        '--asm', assembly.compressed_fname or assembly.fname,
        '--kmers', abspath(assembly.kmers_fname),
        '--out-file', abspath(assembly.chains_fname),
        '--out-asm', 'draft_assembly.fasta',
        '--max-diff', str(assembly.max_aln_diff),
        '--genome-size', str(get_fasta_len(assembly.fname)),
        '--config', abspath(get_flye_cfg_fname()),
        '--log', join(out_dir, 'mapping.log'),
        '--min-kmers', min_kmers,
        '--threads', str(threads),
        '--min-ovlp', str(MIN_CHAIN_LEN),
        '--kmer', str(KMER_SIZE),
    ]
    # One shared sink, closed deterministically once the child has exited.
    with open("/dev/null", "w") as devnull:
        subprocess.call(cmd, stdout=devnull, stderr=devnull)
def set_params(fnames, threads):
    """Derive window sizes from the longest input assembly and store them in ``config``.

    Exits the process with status 1 when ``check_fasta_files`` rejects
    ``fnames``. Sets ``config.KMER_WINDOW_SIZE``, ``config.BP_WINDOW_SIZE``,
    ``config.MOVING_AVG_WINDOW_SIZE`` and ``config.MAX_THREADS``.
    """
    if not check_fasta_files(fnames):
        sys.exit(1)

    longest = max(get_fasta_len(fname) for fname in fnames)
    config.KMER_WINDOW_SIZE = max(500, longest // 150)

    # (exclusive upper bound on assembly length, breakpoint window size);
    # anything at or above the last bound gets the default of 1000.
    bp_window = 1000
    for limit, window in ((100000, 200), (1000000, 500)):
        if longest < limit:
            bp_window = window
            break
    config.BP_WINDOW_SIZE = bp_window

    # NOTE(review): min(20, max(20, x)) evaluates to 20 for every x, so the
    # length-dependent term has no effect - confirm the intended bounds.
    scaled = longest // config.BP_WINDOW_SIZE // 20
    config.MOVING_AVG_WINDOW_SIZE = min(20, max(20, scaled))
    config.MAX_THREADS = threads
def do(assemblies, reads_fname, reads_real_coords, out_dir, threads, no_reuse, no_nucl_alignment):
    """Map reads to every assembly, post-process the chains and plot the noise report.

    ``run_flye`` is scheduled through ``run_parallel`` for each assembly whose
    ``bed_fname`` does not exist yet, or for all of them when ``no_reuse`` is
    true. Afterwards every assembly is post-processed and its coverage is
    computed; the collected ``(errors, coverage)`` pairs are handed to
    ``make_plotly_noise``.
    """
    print("")
    print("*********************************")
    print("Read mapping started...")

    jobs = []
    for asm in assemblies:
        if exists(asm.bed_fname) and not no_reuse:
            continue  # an earlier run already produced this BED file
        threads_per_job = max(1, threads // len(assemblies))
        jobs.append((asm, reads_fname, out_dir, threads_per_job, no_nucl_alignment))
    run_parallel(run_flye, jobs, n_jobs=min(len(assemblies), threads))

    all_data = [
        (postprocess_chains(asm, reads_real_coords),
         calculate_coverage(get_fasta_len(asm.fname), asm.bed_fname))
        for asm in assemblies
    ]
    make_plotly_noise(assemblies, all_data, out_dir)
    print("Read mapping finished")
def do(assemblies, reads_fname, hifi_reads_fname, out_dir, tmp_dir):
    """Run four rounds of k-mer selection and polishing on every assembly.

    Output goes to ``<out_dir>/polished`` (created if missing). After each
    round ``assembly.fname`` is replaced by the value ``polish`` returns, so
    the next round starts from the previous round's result. Exits the process
    with status 2 if ``make_flye`` raises.

    Args:
        assemblies: objects providing ``fname`` and ``kmers_fname``;
            ``fname`` is overwritten in place.
        reads_fname: reads file passed to ``select_kmers.do`` and ``polish``.
        hifi_reads_fname: forwarded to ``select_kmers.do``.
        out_dir: parent of the ``polished`` output directory.
        tmp_dir: forwarded to ``select_kmers.do``.
    """
    n_rounds = 4

    print("")
    print("*********************************")
    print("Running polishing module...")
    out_dir = join(out_dir, "polished")
    if not exists(out_dir):
        os.makedirs(out_dir)

    try:
        make_flye()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C is not
        # reported as a compilation failure.
        print('Failed to compile Flye! Please try to compile it manually: create %s folder and run "make" in %s'
              % (dirname(POLISH_BIN), dirname(dirname(POLISH_BIN))))
        sys.exit(2)

    for i in range(n_rounds):
        # NOTE(review): reads_fname is passed twice, as in the original call -
        # confirm against the signature of select_kmers.do.
        select_kmers.do(assemblies, reads_fname, reads_fname, hifi_reads_fname,
                        out_dir, tmp_dir, no_reuse=True, only_polish=True)
        for assembly in assemblies:
            print("Polishing genome (%d/%d)" % (i + 1, n_rounds))
            assembly.fname = polish(assembly.fname, reads_fname, out_dir,
                                    assembly.kmers_fname,
                                    get_fasta_len(assembly.fname),
                                    config.MAX_THREADS, config.platform,
                                    get_flye_cfg_fname(), i)
    print("Polished assemblies saved to %s" % out_dir)
def _kmer_share(count, total):
    """Format ``count`` as ``"<percent> (<count>)"``; the percentage is 0.00 when ``total`` is 0."""
    percent = count * 100.0 / total if total else 0.0
    return "%.2f (%d)" % (percent, count)


def _kmer_window_counts(positions, seq_len, window):
    """Count the distinct ``positions`` falling into each ``window``-sized bin of ``[0, seq_len)``."""
    marks = [0] * seq_len
    for p in positions:
        marks[p] = 1
    return [sum(marks[s:s + window]) for s in range(0, seq_len, window)]


def do(assemblies, reads_fname, out_dir, no_reuse=False):
    """Classify solid k-mers of each assembly by how their read hits cluster.

    For every reference k-mer, the reference positions that reads assign to
    it are collected; k-mers with at least ``MIN_CLUMP_SIZE`` hits are passed
    to ``get_clusters`` and sorted into "no clumps", "single clump" and
    "multiple clumps". Single-clump k-mers are written to
    ``assembly.good_kmers_fname``, the three shares are written to
    ``<out_dir>/report/<name>_kmer_stats.txt`` and a windowed count plot is
    drawn with ``make_plot``.

    Args:
        assemblies: objects providing ``name``, ``label``, ``fname``,
            ``solid_kmers_fname`` and ``good_kmers_fname``.
        reads_fname: reads file forwarded to ``get_kmers_read_pos``.
        out_dir: output directory; files go into its ``report`` subfolder.
        no_reuse: when false, an assembly whose good-k-mer file and stats
            file both exist is skipped and its stats are read back instead.
    """
    print("")
    print("*********************************")
    print("K-mer analysis started...")

    kmer_stats_table = [['Assembly'] + [assembly.name for assembly in assemblies]]
    for title in ("K-mers forming single clump",
                  "K-mers forming multiple clumps",
                  "K-mers forming no clumps"):
        kmer_stats_table.append([title] + ["-" for _ in assemblies])

    for i, assembly in enumerate(assemblies):
        stats_fname = join(out_dir, "report", assembly.name + "_kmer_stats.txt")

        if exists(assembly.good_kmers_fname) and exists(stats_fname) and not no_reuse:
            print("Reusing latest results...")
            with open(stats_fname) as f:
                for row in (1, 2, 3):
                    # Drop the line terminator so reused cells look exactly
                    # like freshly computed ones.
                    value = f.readline().split("\t")[1]
                    kmer_stats_table[row][i + 1] = value.rstrip("\r\n")
            continue

        solid_kmers = get_kmers(assembly.solid_kmers_fname)
        assembly_len = get_fasta_len(assembly.fname)
        ref_kmers_pos, _ = get_kmers_positions(assembly.fname, solid_kmers)
        read_kmer_pos, reads_coords = get_kmers_read_pos(
            assembly, reads_fname, solid_kmers)

        no_clumps = []
        one_clump = []
        multi_clumps = []
        good_kmers = []
        for kmer, pos in ref_kmers_pos.items():
            # Reference coordinates that the reads assign to this k-mer.
            pos_in_ref = []
            for read_name, kmers_pos in read_kmer_pos.items():
                if kmer in kmers_pos and kmers_pos[kmer] in reads_coords[read_name]:
                    pos_in_ref.append(reads_coords[read_name][kmers_pos[kmer]])
            if not pos_in_ref or len(pos_in_ref) < MIN_CLUMP_SIZE:
                continue  # too few hits to classify this k-mer

            clusters = get_clusters(pos_in_ref)
            if not clusters:
                no_clumps.append(pos)
            elif len(clusters) == 1:
                one_clump.append(pos)
                good_kmers.append(kmer)
            else:
                multi_clumps.append(pos)

        with open(assembly.good_kmers_fname, "w") as f:
            f.writelines("%s\n" % kmer for kmer in good_kmers)

        # all_kmers is 0 when no k-mer reached MIN_CLUMP_SIZE; _kmer_share
        # then reports 0.00 instead of raising ZeroDivisionError.
        all_kmers = len(one_clump) + len(multi_clumps) + len(no_clumps)
        single_share = _kmer_share(len(one_clump), all_kmers)
        multi_share = _kmer_share(len(multi_clumps), all_kmers)
        none_share = _kmer_share(len(no_clumps), all_kmers)

        with open(stats_fname, "w") as f:
            f.write("Single clump\t%s\n" % single_share)
            f.write("Multi clump\t%s\n" % multi_share)
            f.write("No clumps\t%s\n" % none_share)
        kmer_stats_table[1][i + 1] = single_share
        kmer_stats_table[2][i + 1] = multi_share
        kmer_stats_table[3][i + 1] = none_share

        one_clump_vals = _kmer_window_counts(one_clump, assembly_len, KMER_WINDOW_SIZE)
        multi_clump_vals = _kmer_window_counts(multi_clumps, assembly_len, KMER_WINDOW_SIZE)
        no_clump_vals = _kmer_window_counts(no_clumps, assembly_len, KMER_WINDOW_SIZE)

        plot_fname = join(out_dir, "report", assembly.name + "_kmer_analysis.png")
        # Raw string: "\i" is not a valid escape sequence in a normal literal.
        make_plot(plot_fname, "K-mer analysis", assembly.label,
                  xlabel="Position", ylabel=r"$\it{k}$-mer counts",
                  list_vals=[one_clump_vals, multi_clump_vals, no_clump_vals],
                  legend=("Single clump", "Multiple clumps", "No clumps"),
                  max_x=assembly_len)

    # The table is built but its rendering is currently disabled:
    #draw_report_table("K-mer statistics", "", kmer_stats_table)
    print("K-mer analysis finished.")
def do(assemblies, reads_fname, monomers_fname, out_dir):
    """Compare the monomers found in reads with those of each assembly.

    Every read monomer whose start and end both have a mapped reference
    coordinate is assigned to a reference monomer through
    ``approx_binary_search``. For each reference monomer with at least
    ``MIN_COV`` assigned read monomers, the fraction carrying the same
    monomer name is plotted with ``make_plot``; the others are plotted as 1.
    The unit structure returned by ``analyze_unit_structure`` is then written
    to ``<out_dir>/<name>_units.txt``.

    Args:
        assemblies: objects providing ``name``, ``label`` and ``fname``.
        reads_fname: reads file forwarded to the helper functions.
        monomers_fname: monomers file forwarded to the helper functions.
        out_dir: output directory; the plot goes into its ``report`` subfolder.
    """
    print("")
    print("*********************************")
    print("Monomer analysis started...")
    reads_mm_structure = get_reads_monomer_structure(reads_fname, monomers_fname, out_dir)

    for assembly in assemblies:
        print("")
        print("Processing %s assembly..." % assembly.label)
        ref_mm_structure, ref_stats = get_ref_monomers(assembly, monomers_fname, out_dir)
        ref_mm_structure.sort(key=lambda x: x[1])
        make_plotly_html(assembly, ref_stats, out_dir)
        _, reads_coords = get_kmers_read_pos(assembly, reads_fname)
        assembly_len = get_fasta_len(assembly.fname)

        n_ref_monomers = len(ref_mm_structure)
        # reads_monomers[k] collects the read monomers assigned to reference
        # monomer k; its length is that monomer's coverage.
        reads_monomers = [[] for _ in range(n_ref_monomers)]
        for read_name, coord_dict in reads_coords.items():
            for (mm_name, mm_start, mm_end) in reads_mm_structure[read_name]:
                if mm_start not in coord_dict or mm_end not in coord_dict:
                    continue  # monomer borders have no reference coordinate
                mm_len = mm_end - mm_start  # length in read coordinates
                ref_start = coord_dict[mm_start]
                ref_end = coord_dict[mm_end]
                ref_start, ref_end = min(ref_start, ref_end), max(ref_start, ref_end)
                ref_i = approx_binary_search(ref_mm_structure, 1, 0,
                                             n_ref_monomers, ref_start)
                if ref_end - ref_start > 50:
                    ref_i2 = approx_binary_search(ref_mm_structure, 2, 0,
                                                  n_ref_monomers, ref_end)
                else:
                    ref_i2 = ref_i
                # Keep the monomer only if both borders agree on the index.
                if ref_i > -1 and ref_i2 == ref_i:
                    reads_monomers[ref_i].append((mm_name, mm_len, ref_start))

        read_support = []
        for ref_monomer, supporters in zip(ref_mm_structure, reads_monomers):
            # The emptiness check keeps the division safe even if MIN_COV <= 0.
            if supporters and len(supporters) >= MIN_COV:
                agreeing = sum(1 for m in supporters if m[0] == ref_monomer[0])
                read_support.append(agreeing * 1.0 / len(supporters))
            else:
                read_support.append(1)

        plot_fname = join(out_dir, "report", assembly.name + "_monomer_analysis.png")
        make_plot(plot_fname, "Monomer analysis", assembly.label,
                  xlabel="Position", ylabel="MonomersRatio",
                  plot_values=read_support, plot_color="blue",
                  ymax=1.05, max_x=assembly_len)

        #### UNIT ANALYSIS
        ref_unit_structure = analyze_unit_structure(ref_mm_structure)
        # Format only the file name, so a '%' in out_dir cannot break it.
        units_fname = join(out_dir, "%s_units.txt" % assembly.name)
        with open(units_fname, "w") as f:
            f.write("\t".join(["Unit", "Start", "End", "Monomer sequence\n"]))
            for i in range(len(ref_unit_structure)):
                unit = ref_unit_structure[i]
                if not unit:
                    continue
                f.write("\t".join(
                    str(s) for s in [i + 1, unit[0], unit[1], unit[2]]) + "\n")
        print("Total units: %d, units sequences saved to %s" %
              (len(ref_unit_structure), units_fname))
def do(assemblies, out_dir):
    """Compute and plot a breakpoint ratio along each assembly.

    For every assembly this reads the FASTA at ``assembly.fname``, the k-mer
    set from ``assembly.kmers_fname`` and the alignments in
    ``assembly.bed_fname`` (seven whitespace-separated fields per line), then
    passes two smoothed ratio tracks ("real" and "ideal") plus the
    low-coverage regions to ``make_plot`` with the target file name
    ``<out_dir>/report/<assembly.name>_bp_analysis.png``.

    NOTE(review): the source reached review with its indentation lost; the
    nesting below is reconstructed from the syntax - confirm against the
    repository version.
    """
    print("")
    print("*********************************")
    print("Breakpoint analysis started...")
    for assembly in assemblies:
        assembly_len = get_fasta_len(assembly.fname)
        # Per-position counters over the assembly: starts/ends count every
        # alignment line; the ideal_* variants count each read only once.
        starts = [0] * assembly_len
        ends = [0] * assembly_len
        ideal_starts = [0] * assembly_len
        ideal_ends = [0] * assembly_len
        coverage = [0] * assembly_len
        ideal_coverage = [0] * assembly_len
        tips = [0] * assembly_len
        rare_kmers = get_kmers(assembly.kmers_fname)
        # After this loop assembly_len/assembly_seq describe the LAST record
        # of the FASTA file.
        with open(assembly.fname) as handle:
            for record in SeqIO.parse(handle, 'fasta'):
                assembly_len = len(record.seq)
                assembly_seq = str(record.seq)
        # 1 at every position where a k-mer (or its reverse complement) from
        # rare_kmers starts.
        rare_kmers_by_pos = [0] * assembly_len
        for i in range(len(assembly_seq) - KMER_SIZE + 1):
            kmer = assembly_seq[i:i + KMER_SIZE]
            if kmer in rare_kmers or rev_comp(kmer) in rare_kmers:
                rare_kmers_by_pos[i] = 1
        used_reads = set()
        with open(assembly.bed_fname) as f:
            for line in f:
                fs = line.split()
                ref, ref_s, ref_e, read_name, align_start, align_end, read_len = fs
                ref_s, ref_e, align_start, align_end, read_len = map(
                    int, (ref_s, ref_e, align_start, align_end, read_len))
                align_start, align_end = min(align_start, align_end), max(
                    align_start, align_end)
                # Both alignment borders count as tips; ref_e is treated as
                # exclusive here (index ref_e - 1).
                tips[ref_s] += 1
                tips[ref_e - 1] += 1
                starts[ref_s] += 1
                ends[ref_e - 1] += 1
                # Only the first alignment line of a read feeds the ideal_* tracks.
                if read_name in used_reads: continue
                # If the unaligned read prefix would span more than
                # MAX_MISSED_KMERS marked positions, the start is moved back
                # by align_start (clamped at 0).
                # NOTE(review): the slice start is not clamped, so a negative
                # ref_s - align_start counts from the end of the list - confirm intent.
                if sum(rare_kmers_by_pos[ref_s - align_start:ref_s]
                       ) > MAX_MISSED_KMERS:
                    ideal_starts[max(0, ref_s - align_start)] += 1
                else:
                    ideal_starts[ref_s] += 1
                # Same idea for the end, extended by the unaligned suffix.
                # NOTE(review): this branch indexes ideal_ends[ref_e] while
                # ends uses ref_e - 1 - confirm the offset.
                if sum(rare_kmers_by_pos[ref_e:ref_e + read_len]) > MAX_MISSED_KMERS:
                    ideal_ends[min(assembly_len - 1, ref_e + read_len - align_end - 1)] += 1
                else:
                    ideal_ends[ref_e] += 1
                used_reads.add(read_name)
        # Running sums of starts minus ends give the coverage tracks; runs of
        # positions with coverage below MIN_BP_COV are recorded as
        # (first, last) pairs once coverage recovers.
        cur_cov = 0
        ideal_cur_cov = 0
        uncovered_regions = []
        prev_s, prev_e = -1, -1
        for i in range(assembly_len):
            cur_cov += starts[i]
            cur_cov -= ends[i]
            ideal_cur_cov += ideal_starts[i]
            ideal_cur_cov -= ideal_ends[i]
            ideal_coverage[i] = ideal_cur_cov
            coverage[i] = cur_cov
            if cur_cov < MIN_BP_COV:
                if prev_s != -1:
                    prev_e = i
                else:
                    prev_s = i
            elif prev_s != -1 and prev_e != -1:
                uncovered_regions.append((prev_s, prev_e))
                prev_s, prev_e = -1, -1
            else:
                prev_s, prev_e = -1, -1
        # Windows of BP_WINDOW_SIZE positions, advanced by half a window.
        factor = 2
        step = BP_WINDOW_SIZE // factor
        # Tips in the window divided by the reads present in it; 0 when the
        # window never reaches MIN_BP_COV.
        real_bp_ratio = [
            sum(tips[i:i + BP_WINDOW_SIZE]) * 1.0 /
            max(1, coverage[i] + sum(starts[i + 1:i + BP_WINDOW_SIZE]))
            if max(coverage[i:i + BP_WINDOW_SIZE]) >= MIN_BP_COV else 0
            for i in range(0, len(coverage), step)
        ]
        # NOTE(review): the ideal track is scaled by 1.1 where the real one
        # uses 1.0 - confirm this is deliberate.
        ideal_bp_ratio = [
            (sum(ideal_starts[i:i + BP_WINDOW_SIZE]) +
             sum(ideal_ends[i:i + BP_WINDOW_SIZE])) * 1.1 / max(
                 1, ideal_coverage[i] + sum(ideal_starts[i + 1:i + BP_WINDOW_SIZE]))
            if max(ideal_coverage[i:i + BP_WINDOW_SIZE]) >= MIN_BP_COV else 0
            for i in range(0, len(coverage), step)
        ]

        # Sliding-window sum over MOVING_AVG_WINDOW_SIZE values.
        # NOTE(review): the sum is divided by the constant 10, not by the
        # window size - confirm.
        def running_mean(data):
            cumsum = np.cumsum(np.insert(data, 0, 0))
            return (cumsum[MOVING_AVG_WINDOW_SIZE:] -
                    cumsum[:-MOVING_AVG_WINDOW_SIZE]) / 10

        # Zero the two outermost windows at each end of both tracks.
        for i in range(2):
            ideal_bp_ratio[i], ideal_bp_ratio[-i - 1] = 0, 0
            real_bp_ratio[i], real_bp_ratio[-i - 1] = 0, 0
        # Smoothed tracks, capped at 1.
        real_vals = [min(1, v) for v in running_mean(real_bp_ratio)]
        ideal_vals = [min(1, v) for v in running_mean(ideal_bp_ratio)]
        plot_fname = join(out_dir, "report", assembly.name + "_bp_analysis.png")
        # Low-coverage regions in window units; only those wider than 10
        # windows are kept.
        uncovered_bars = [(r[0] / step, r[1] / step) for r in uncovered_regions
                          if (r[1] / step - r[0] / step) > 10]
        make_plot(plot_fname, "Breakpoint", assembly.label,
                  xlabel="position", ylabel="breakpointRatio",
                  fill_values=real_vals, fill_color="red",
                  fill_values2=ideal_vals, fill_color2="gray",
                  ymax=1, max_x=assembly_len, bg_bars=uncovered_bars)
    print("Breakpoint analysis finished.")
def do(assemblies, reads_file, out_dir):
    """Run the pairwise discordance test on every pair of assemblies.

    Does nothing when fewer than two assemblies are given. For each pair the
    shared "good" k-mers are written to ``<out_dir>/shared_kmers.txt``. A read
    then scores +1 for each shared k-mer it places within 1000 positions of
    the reference position on the first assembly's coordinates, and -1 for
    each it places that close on the second assembly's. Reads whose net score
    exceeds ``KMER_SIZE`` in either direction are counted as voting reads and
    passed to ``draw_discordance_coverage``.
    """
    if len(assemblies) < 2:
        return

    print("")
    print("*********************************")
    print("Discordance test started...")

    max_shift = 1000  # largest accepted distance between read-derived and reference position

    for first, second in itertools.combinations(assemblies, 2):
        shared_kmers = get_kmers(first.good_kmers_fname).intersection(
            get_kmers(second.good_kmers_fname))
        with open(join(out_dir, "shared_kmers.txt"), "w") as f:
            f.writelines(k + "\n" for k in shared_kmers)

        ref_kmers_pos1, _ = get_kmers_positions(first.fname, shared_kmers)
        ref_kmers_pos2, _ = get_kmers_positions(second.fname, shared_kmers)
        read_kmer_pos1, reads_coords1 = get_kmers_read_pos(first, reads_file, shared_kmers)
        read_kmer_pos2, reads_coords2 = get_kmers_read_pos(second, reads_file, shared_kmers)

        score1 = 0
        score2 = 0
        selected_reads1 = []
        selected_reads2 = []
        for raw_name in reads_coords1.keys():
            read_name = slugify(raw_name)
            read_score = 0
            for kmer in shared_kmers:
                if (kmer in ref_kmers_pos1 and read_name in read_kmer_pos1
                        and kmer in read_kmer_pos1[read_name]
                        and read_kmer_pos1[read_name][kmer] in reads_coords1[read_name]):
                    mapped1 = reads_coords1[read_name][read_kmer_pos1[read_name][kmer]]
                    # The first assembly also scores when the position matches
                    # the second assembly's reference position.
                    if (abs(ref_kmers_pos1[kmer] - mapped1) <= max_shift
                            or abs(ref_kmers_pos2[kmer] - mapped1) <= max_shift):
                        score1 += 1
                        read_score += 1
                if (kmer in ref_kmers_pos2 and read_name in read_kmer_pos2
                        and kmer in read_kmer_pos2[read_name]
                        and read_kmer_pos2[read_name][kmer] in reads_coords2[read_name]):
                    mapped2 = reads_coords2[read_name][read_kmer_pos2[read_name][kmer]]
                    if abs(ref_kmers_pos2[kmer] - mapped2) <= max_shift:
                        score2 += 1
                        read_score -= 1

            if read_score > KMER_SIZE:
                selected_reads1.append(read_name)
            elif read_score < -KMER_SIZE:
                selected_reads2.append(read_name)

        total_discordance = score1 - score2
        print(
            "Discordance between %s and %s: %d. There are %d (%d) discordant reads voting for %s (%s)."
            % (first.name, second.name, total_discordance,
               len(selected_reads1), len(selected_reads2),
               first.name, second.name))

        plot_fname = join(out_dir, "report",
                          "discordance_%s_vs_%s.png" % (first.name, second.name))
        longest = max(get_fasta_len(first.fname), get_fasta_len(second.fname))
        draw_discordance_coverage(longest, first.name, second.name,
                                  first.bed_fname, second.bed_fname,
                                  selected_reads1, selected_reads2,
                                  "Discordance coverage", out_dir, plot_fname)
    print("Discordance test finished.")