def merge_bed(repeats_fpath, uncovered_fpath, insert_size, output_dirpath, err_path):
    """Combine long-repeat and uncovered BED regions into one sorted, merged BED file.

    Repeats shorter than *insert_size* are dropped (short repeats can be spanned
    by read pairs); uncovered regions are always kept.  The combined file is
    sorted with GNU sort and overlapping intervals are collapsed with
    'bedtools merge'.

    Returns the path to the merged BED file.
    """
    combined_bed_fpath = join(output_dirpath, 'skipped_regions.bed')
    with open(combined_bed_fpath, 'w') as out:
        if exists(repeats_fpath):
            with open(repeats_fpath) as in_f:
                for line in in_f:
                    fs = line.split('\t')
                    repeat_len = int(fs[2]) - int(fs[1])
                    # keep only repeats an insert cannot span
                    if repeat_len >= insert_size:
                        out.write(line)
        if exists(uncovered_fpath):
            with open(uncovered_fpath) as in_f:
                for line in in_f:
                    out.write(line)

    # bug fix: previously stdout/stderr handles were opened inline and never
    # closed; use context managers so descriptors are released deterministically
    sorted_bed_fpath = add_suffix(combined_bed_fpath, 'sorted')
    with open(sorted_bed_fpath, 'w') as sorted_out, open(err_path, 'a') as err_f:
        qutils.call_subprocess(['sort', '-k1,1', '-k2,2n', combined_bed_fpath],
                               stdout=sorted_out, stderr=err_f, logger=logger)
    merged_bed_fpath = add_suffix(combined_bed_fpath, 'merged')
    with open(merged_bed_fpath, 'w') as merged_out, open(err_path, 'a') as err_f:
        qutils.call_subprocess([bedtools_fpath('bedtools'), 'merge', '-i', sorted_bed_fpath],
                               stdout=merged_out, stderr=err_f, logger=logger)
    return merged_bed_fpath
def align_reference(ref_fpath, output_dir, using_reads='all', calculate_coverage=False):
    """Align reads to the reference and (optionally) compute its coverage.

    Side effects: may set qconfig.optimal_assembly_insert_size and write
    coverage/uncovered-region files under *output_dir*.

    Returns (sam_fpath, bam_fpath, uncovered_fpath); all three are None when
    alignment failed and uncovered regions could not be detected.
    """
    required_files = []
    ref_name = qutils.name_from_fpath(ref_fpath)
    cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    if using_reads != 'all':
        # per-read-type runs get their own coverage/uncovered file names
        cov_fpath = add_suffix(cov_fpath, using_reads)
        uncovered_fpath = add_suffix(uncovered_fpath, using_reads)
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')
    if not is_non_empty_file(uncovered_fpath):
        required_files.append(uncovered_fpath)
    if not is_non_empty_file(insert_size_fpath) and (using_reads == 'all' or using_reads == 'pe'):
        required_files.append(insert_size_fpath)

    temp_output_dir = join(output_dir, 'temp_output')
    if not isdir(temp_output_dir):
        os.makedirs(temp_output_dir)
    log_path = join(output_dir, 'reads_stats.log')
    err_fpath = join(output_dir, 'reads_stats.err')

    correct_chr_names, sam_fpath, bam_fpath = align_single_file(
        ref_fpath, output_dir, temp_output_dir, log_path, err_fpath, qconfig.max_threads,
        sam_fpath=qconfig.reference_sam, bam_fpath=qconfig.reference_bam,
        required_files=required_files, is_reference=True, alignment_only=True,
        using_reads=using_reads)

    if not qconfig.optimal_assembly_insert_size or qconfig.optimal_assembly_insert_size == 'auto':
        if using_reads == 'pe' and sam_fpath:
            insert_size, std_dev = calculate_insert_size(sam_fpath, output_dir, ref_name)
            if not insert_size:
                logger.info(' Failed calculating insert size.')
            else:
                qconfig.optimal_assembly_insert_size = insert_size
        elif using_reads == 'all' and is_non_empty_file(insert_size_fpath):
            # reuse insert size cached by a previous run; best-effort, so narrow
            # exceptions only (was a bare except with a leaked file handle)
            try:
                with open(insert_size_fpath) as in_f:
                    insert_size = int(in_f.readline())
                if insert_size:
                    qconfig.optimal_assembly_insert_size = insert_size
            except (IOError, ValueError):
                pass

    if not required_files:
        return sam_fpath, bam_fpath, uncovered_fpath
    if not sam_fpath:
        logger.info(' Failed detecting uncovered regions.')
        # bug fix: was 'return None, None' — every caller unpacks three values
        return None, None, None

    if calculate_coverage:
        bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
        bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))
        if is_non_empty_file(bam_sorted_fpath):
            logger.info(' Using existing sorted BAM-file: ' + bam_sorted_fpath)
        else:
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,
                          filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        if not is_non_empty_file(uncovered_fpath) and calculate_coverage:
            get_coverage(temp_output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                         log_path, err_fpath, correct_chr_names, cov_fpath,
                         uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return sam_fpath, bam_fpath, uncovered_fpath
def check_repeats_instances(coords_fpath, repeats_fpath, use_long_reads=False):
    """Keep only repeats that align to multiple places on the assembly.

    Parses a tab-separated coords file of alignments, collects per-contig
    alignment intervals whose matched-base count exceeds the configured insert
    size, then filters the repeats file down to repeats with more than one such
    instance whose merged instances cover >= 90% of the repeat length.
    With use_long_reads, overlapping instances (within REPEAT_CONF_INTERVAL)
    are collapsed into "solid" sub-repeats written with absolute coordinates.

    Returns (sorted_repeats_fpath, repeats_regions) where repeats_regions maps
    a reference name to its (start, end) repeat intervals.
    """
    query_instances = defaultdict(list)
    with open(coords_fpath) as f:
        for line in f:
            fs = line.split('\t')
            contig, align_start, align_end, strand, ref_name, ref_start = \
                fs[0], fs[2], fs[3], fs[4], fs[5], fs[7]
            align_start, align_end, ref_start = map(int, (align_start, align_end, ref_start))
            # convert 0-based starts to 1-based coordinates
            align_start += 1
            ref_start += 1
            matched_bases, bases_in_mapping = map(int, (fs[9], fs[10]))
            # only alignments longer than the insert size count as real repeat instances
            if matched_bases > qconfig.optimal_assembly_insert_size:
                query_instances[contig].append((align_start, align_end))
    repeats_regions = defaultdict(list)
    filtered_repeats_fpath = add_suffix(repeats_fpath, 'filtered')
    with open(filtered_repeats_fpath, 'w') as out_f:
        with open(repeats_fpath) as f:
            for line in f:
                fs = line.split()
                query_id = '%s:%s-%s' % (fs[0], fs[1], fs[2])
                # a genuine repeat must align to more than one location
                if query_id in query_instances and len(query_instances[query_id]) > 1:
                    # skip the first (primary) instance; de-duplicate and sort the rest
                    mapped_repeats = sorted(list(set(query_instances[query_id][1:])))
                    # merge overlapping instance intervals to measure total aligned span
                    merged_intervals = []
                    i_start, i_end = mapped_repeats[0]
                    merged_interval = (i_start, i_end)
                    for s, e in mapped_repeats[1:]:
                        if s <= merged_interval[1]:
                            merged_interval = (merged_interval[0], max(merged_interval[1], e))
                        else:
                            merged_intervals.append(merged_interval)
                            merged_interval = (s, e)
                    merged_intervals.append(merged_interval)
                    aligned_bases = sum([end - start + 1 for start, end in merged_intervals])
                    # instances must cover at least 90% of the repeat's length
                    if aligned_bases >= (int(fs[2]) - int(fs[1])) * 0.9:
                        if use_long_reads and len(mapped_repeats) > 1:
                            # collapse near-coincident instances (within the
                            # confidence interval) into "solid" repeat blocks
                            solid_repeats = []
                            full_repeat_pos = int(fs[1])
                            mapped_repeats.sort(key=lambda x: (x[1], x[1] - x[0]), reverse=True)
                            cur_repeat_start, cur_repeat_end = mapped_repeats[0]
                            for repeat_start, repeat_end in mapped_repeats[1:]:
                                if (cur_repeat_start >= repeat_start - REPEAT_CONF_INTERVAL and
                                        cur_repeat_end <= repeat_end + REPEAT_CONF_INTERVAL) or \
                                        (repeat_start >= cur_repeat_start - REPEAT_CONF_INTERVAL and
                                         repeat_end <= cur_repeat_end + REPEAT_CONF_INTERVAL):
                                    cur_repeat_start, cur_repeat_end = \
                                        min(repeat_start, cur_repeat_start), max(repeat_end, cur_repeat_end)
                                else:
                                    solid_repeats.append((cur_repeat_start, cur_repeat_end))
                                    cur_repeat_start, cur_repeat_end = repeat_start, repeat_end
                            solid_repeats.append((cur_repeat_start, cur_repeat_end))
                            # emit each solid block shifted to absolute reference coordinates
                            for repeat in solid_repeats:
                                out_f.write('\t'.join((fs[0], str(repeat[0] + full_repeat_pos),
                                                       str(repeat[1] + full_repeat_pos))) + '\n')
                                repeats_regions[fs[0]].append((repeat[0] + full_repeat_pos,
                                                               repeat[1] + full_repeat_pos))
                        else:
                            out_f.write(line)
                            repeats_regions[fs[0]].append((int(fs[1]), int(fs[2])))
    sorted_repeats_fpath = add_suffix(repeats_fpath, 'sorted')
    qutils.call_subprocess(['sort', '-k1,1', '-k2,2n', filtered_repeats_fpath],
                           stdout=open(sorted_repeats_fpath, 'w'), logger=logger)
    return sorted_repeats_fpath, repeats_regions
def align_reference(ref_fpath, output_dir, using_reads='all'):
    """Align reads to the reference and detect its uncovered regions.

    Older variant of align_reference (no calculate_coverage flag; uses
    qconfig.ideal_assembly_* settings and 'paired_end' as the read-type tag).

    Side effects: sets qconfig.reference_sam/reference_bam and possibly
    qconfig.ideal_assembly_insert_size.

    Returns (bam_fpath, uncovered_fpath); (None, None) when alignment failed.
    """
    required_files = []
    ref_name = qutils.name_from_fpath(ref_fpath)
    cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    if using_reads != 'all':
        # per-read-type runs get their own coverage/uncovered file names
        cov_fpath = add_suffix(cov_fpath, using_reads)
        uncovered_fpath = add_suffix(uncovered_fpath, using_reads)
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')
    if not is_non_empty_file(uncovered_fpath):
        required_files.append(uncovered_fpath)
    if not is_non_empty_file(insert_size_fpath) and (using_reads == 'all' or using_reads == 'paired_end'):
        required_files.append(insert_size_fpath)
    temp_output_dir = join(output_dir, 'temp_output')
    if not isdir(temp_output_dir):
        os.makedirs(temp_output_dir)
    log_path = join(output_dir, 'reads_stats.log')
    err_fpath = join(output_dir, 'reads_stats.err')
    correct_chr_names, sam_fpath, bam_fpath = align_single_file(
        ref_fpath, output_dir, temp_output_dir, log_path, err_fpath, qconfig.max_threads,
        sam_fpath=qconfig.reference_sam, bam_fpath=qconfig.reference_bam,
        required_files=required_files, is_reference=True, alignment_only=True,
        using_reads=using_reads)
    # cache alignment paths so later stages reuse them
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not qconfig.ideal_assembly_insert_size or qconfig.ideal_assembly_insert_size == 'auto':
        if using_reads == 'paired_end' and sam_fpath:
            # NOTE(review): here calculate_insert_size is treated as returning a
            # single value; the newer align_reference unpacks a (size, std_dev)
            # tuple — confirm which signature this file's helper actually has
            insert_size = calculate_insert_size(sam_fpath, output_dir, ref_name)
            if not insert_size:
                logger.info(' Failed calculating insert size.')
            else:
                qconfig.ideal_assembly_insert_size = insert_size
    if not required_files:
        return bam_fpath, uncovered_fpath
    if not sam_fpath:
        logger.info(' Failed detecting uncovered regions.')
        return None, None
    # keep only mapped reads, then sort, for coverage computation
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))
    if is_non_empty_file(bam_sorted_fpath):
        logger.info(' Using existing sorted BAM-file: ' + bam_sorted_fpath)
    else:
        sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,
                      filter_rule='not unmapped')
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
    if not is_non_empty_file(uncovered_fpath):
        get_coverage(temp_output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                     log_path, err_fpath, correct_chr_names, cov_fpath,
                     uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return bam_fpath, uncovered_fpath
def connect_with_matepairs(bam_fpath, output_dirpath, err_fpath):
    """Collect mate-pair connectivity intervals per reference sequence.

    Filters the BAM to properly-paired, non-supplementary, non-duplicate
    reads, name-sorts it, converts the pairs to BEDPE intervals, and groups
    the resulting (start, end) intervals by reference name.
    """
    filtered_fpath = add_suffix(bam_fpath, 'filtered')
    qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads),
                            '-h', '-f', 'bam', '-F',
                            'proper_pair and not supplementary and not duplicate', bam_fpath],
                           stdout=open(filtered_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
    ## sort by read names
    name_sorted_fpath = add_suffix(filtered_fpath, 'sorted')
    sort_bam(filtered_fpath, name_sorted_fpath, err_fpath, logger, sort_rule='-n')
    bed_fpath = bam_to_bed(output_dirpath, 'matepairs', name_sorted_fpath, err_fpath, logger,
                           bedpe=True, only_intervals=True)
    regions_by_ref = defaultdict(list)
    with open(bed_fpath) as bed_f:
        for record in bed_f:
            cols = record.split()
            regions_by_ref[cols[0]].append((int(cols[1]), int(cols[2])))
    return regions_by_ref
def fill_gaps_mate_pair(bam_fpath, ref_fpath, assembly_fpath, assembly_covered_regions, output_dir, uncovered_fpath, err_fpath):
    """Join assembly fragments across gaps that are bridged by mate pairs.

    Fragments covered both by the assembly and by mate-pair reads are merged
    (with N-filled gaps) whenever a mate-pair interval links consecutive
    fragments; otherwise each fragment becomes its own contig.

    Returns the path to the polished FASTA assembly.
    """
    matepair_reads_covered_regions = parse_uncovered_fpath(uncovered_fpath, ref_fpath, return_covered_regions=True)
    final_fasta = []
    matepair_regions = connect_with_matepairs(bam_fpath, output_dir, err_fpath)
    final_assembly_fpath = add_suffix(assembly_fpath, mp_polished_suffix)
    for name, seq in fastaparser.read_fasta(ref_fpath):
        # regions covered by both the assembly and mate-pair reads
        covered_regions = list(find_overlaps(assembly_covered_regions[name],
                                             matepair_reads_covered_regions[name], overlap=50))
        total_contigs = 0
        if name not in matepair_regions or len(covered_regions) == 1:
            # nothing to join: emit each covered region as a separate contig
            for region in covered_regions:
                final_fasta.append((name.split()[0] + "_" + str(total_contigs + 1),
                                    seq[region[0]: region[1]]))
                total_contigs += 1
        else:
            # greedily grow a run of fragments linked by mate-pair intervals;
            # flush the run (merged with Ns) whenever the chain breaks
            frags_to_merge = [covered_regions.pop(0)]
            sorted_mp_intervals = sorted(matepair_regions[name])
            while covered_regions:
                region2 = covered_regions.pop(0)
                if is_overlapped(frags_to_merge[-1], region2, sorted_mp_intervals):
                    frags_to_merge.append(region2)
                else:
                    merged_seq = merge_fragments_with_ns(seq, frags_to_merge)
                    final_fasta.append((name.split()[0] + "_" + str(total_contigs + 1), merged_seq))
                    total_contigs += 1
                    frags_to_merge = [region2]
            # flush the trailing run
            if frags_to_merge:
                merged_seq = merge_fragments_with_ns(seq, frags_to_merge)
                final_fasta.append((name.split()[0] + "_" + str(total_contigs + 1), merged_seq))
                total_contigs += 1
    fastaparser.write_fasta(final_assembly_fpath, final_fasta)
    return final_assembly_fpath
def check_repeats_instances(coords_fpath, repeats_fpath):
    """Keep only repeats with multiple strong alignment instances.

    Scans the coords file collecting, per contig, alignment scores that are
    within 80% of the contig's best score seen so far; repeats with more than
    one retained instance are copied to a '.filtered' version of the repeats
    file.

    Returns (filtered_repeats_fpath, repeats_regions) where repeats_regions
    maps a reference name to its (start, end) repeat intervals.
    """
    scores_by_query = dict()
    with open(coords_fpath) as coords_f:
        for record in coords_f:
            cols = record.split('\t')
            contig, align_start, align_end, strand, ref_name, ref_start = \
                cols[0], cols[2], cols[3], cols[4], cols[5], cols[7]
            align_start, align_end, ref_start = map(int, (align_start, align_end, ref_start))
            align_start += 1
            ref_start += 1
            matched_bases, bases_in_mapping = map(int, (cols[9], cols[10]))
            score = matched_bases
            if contig not in scores_by_query:
                scores_by_query[contig] = [score]
            elif score >= max(scores_by_query[contig]) * 0.8:
                # only near-best alignments count as genuine instances
                scores_by_query[contig].append(score)
    repeats_regions = defaultdict(list)
    filtered_repeats_fpath = add_suffix(repeats_fpath, 'filtered')
    with open(filtered_repeats_fpath, 'w') as out_f:
        with open(repeats_fpath) as repeats_f:
            for record in repeats_f:
                cols = record.split()
                query_id = '%s:%s-%s' % (cols[0], cols[1], cols[2])
                instances = scores_by_query.get(query_id)
                if instances is not None and len(instances) > 1:
                    out_f.write(record)
                    repeats_regions[cols[0]].append((int(cols[1]), int(cols[2])))
    return filtered_repeats_fpath, repeats_regions
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    """Call structural variants on one reference with GRIDSS; emit a BED file.

    Variant using the 'not unmapped and proper_pair' read filter.  Prepares a
    sorted, indexed BAM, runs gridss.CallVariants to produce a VCF, converts
    VCF breakends to BEDPE, and reformats that into *bed_fpath*.

    Returns the BED file path (reused as-is when already non-empty).
    """
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        # derive companion paths from the caller-provided BAM
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
        return bed_fpath
    if not isfile(bam_sorted_fpath):
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger,
                      filter_rule='not unmapped and proper_pair')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        # GRIDSS needs a BAM index
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                               stderr=open(err_fpath, 'a'), logger=logger)
    create_fai_file(cur_ref_fpath)
    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        # start from a clean working dir so stale GRIDSS state cannot interfere
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        env = os.environ.copy()
        # GRIDSS shells out to bwa, so it must be on PATH
        env["PATH"] += os.pathsep + bwa_dirpath
        bwa_index(cur_ref_fpath, err_fpath, logger)
        qutils.call_subprocess(['java', '-ea', '-Xmx' + str(max_mem) + 'g',
                                '-Dsamjdk.create_index=true',
                                '-Dsamjdk.use_async_io_read_samtools=true',
                                '-Dsamjdk.use_async_io_write_samtools=true',
                                '-Dsamjdk.use_async_io_write_tribble=true',
                                '-cp', get_gridss_fpath(), 'gridss.CallVariants',
                                'I=' + bam_sorted_fpath, 'O=' + vcf_fpath,
                                'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'),
                                'REFERENCE_SEQUENCE=' + cur_ref_fpath,
                                'WORKER_THREADS=' + str(max_threads),
                                'WORKING_DIR=' + vcf_output_dirpath],
                               stderr=open(err_fpath, 'a'), logger=logger, env=env)
    if is_non_empty_file(vcf_fpath):
        # convert VCF breakends to BEDPE, then to the project's BED layout
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        qutils.call_subprocess(['java', '-cp', get_gridss_fpath(),
                                'au.edu.wehi.idsv.VcfBreakendToBedpe',
                                'I=' + vcf_fpath, 'O=' + raw_bed_fpath,
                                'OF=' + filtered_bed_fpath, 'R=' + cur_ref_fpath,
                                'INCLUDE_HEADER=TRUE'],
                               stderr=open(err_fpath, 'a'), logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    """Call structural variants on one reference with GRIDSS; emit a BED file.

    Duplicate of the other process_one_ref variant except the read filter is
    'not unmapped' (pairing is not required).  Prepares a sorted, indexed BAM,
    runs gridss.CallVariants, converts breakends to BEDPE and reformats into
    *bed_fpath*.

    Returns the BED file path (reused as-is when already non-empty).
    """
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        # derive companion paths from the caller-provided BAM
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
        return bed_fpath
    if not isfile(bam_sorted_fpath):
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger,
                      filter_rule='not unmapped')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        # GRIDSS needs a BAM index
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                               stderr=open(err_fpath, 'a'), logger=logger)
    create_fai_file(cur_ref_fpath)
    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        # start from a clean working dir so stale GRIDSS state cannot interfere
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        env = os.environ.copy()
        # GRIDSS shells out to bwa, so it must be on PATH
        env["PATH"] += os.pathsep + bwa_dirpath
        bwa_index(cur_ref_fpath, err_fpath, logger)
        qutils.call_subprocess(['java', '-ea', '-Xmx' + str(max_mem) + 'g',
                                '-Dsamjdk.create_index=true',
                                '-Dsamjdk.use_async_io_read_samtools=true',
                                '-Dsamjdk.use_async_io_write_samtools=true',
                                '-Dsamjdk.use_async_io_write_tribble=true',
                                '-cp', get_gridss_fpath(), 'gridss.CallVariants',
                                'I=' + bam_sorted_fpath, 'O=' + vcf_fpath,
                                'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'),
                                'REFERENCE_SEQUENCE=' + cur_ref_fpath,
                                'WORKER_THREADS=' + str(max_threads),
                                'WORKING_DIR=' + vcf_output_dirpath],
                               stderr=open(err_fpath, 'a'), logger=logger, env=env)
    if is_non_empty_file(vcf_fpath):
        # convert VCF breakends to BEDPE, then to the project's BED layout
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        qutils.call_subprocess(['java', '-cp', get_gridss_fpath(),
                                'au.edu.wehi.idsv.VcfBreakendToBedpe',
                                'I=' + vcf_fpath, 'O=' + raw_bed_fpath,
                                'OF=' + filtered_bed_fpath, 'R=' + cur_ref_fpath,
                                'INCLUDE_HEADER=TRUE'],
                               stderr=open(err_fpath, 'a'), logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, output_dir, max_threads, err_fpath):
    """Merge per-read-set SAM files into one deduplicated BAM/SAM pair.

    Each non-empty temp SAM is converted to BAM and coordinate-sorted, the
    sorted BAMs are merged, duplicates are removed with sambamba markdup into
    *bam_fpath*, and that BAM is exported back to *sam_fpath*.

    Returns the intermediate merged (pre-markdup) BAM path.
    """
    merged_bam_fpath = add_suffix(bam_fpath, 'merged')
    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if is_non_empty_file(tmp_sam_fpath):
            tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
            tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
            if not is_non_empty_file(tmp_bam_sorted_fpath):
                # SAM -> BAM (no filtering), then coordinate sort for merging
                sambamba_view(tmp_sam_fpath, tmp_bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
                sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
            tmp_bam_fpaths.append(tmp_bam_sorted_fpath)
    qutils.call_subprocess([sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads),
                            merged_bam_fpath] + tmp_bam_fpaths,
                           stderr=open(err_fpath, 'a'), logger=logger)
    # remove duplicates ('-r') while writing the final BAM
    qutils.call_subprocess([sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads),
                            '--tmpdir', output_dir, merged_bam_fpath, bam_fpath],
                           stderr=open(err_fpath, 'a'), logger=logger)
    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return merged_bam_fpath
def run_aligner(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    """Align each read set to *ref_fpath* (minimap2 for long reads, bwa mem otherwise).

    Appends the produced SAM paths to *out_sam_fpaths* (mutated in place).
    For paired-end data, also deduplicates the BAM and records the maximum
    observed insert size into qconfig.optimal_assembly_insert_size and an
    '<ref_name>.is.txt' cache file next to *output_dir*.
    """
    bwa_cmd = bwa_fpath('bwa') + ' mem -t ' + str(max_threads)
    insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            if reads_type == 'pacbio' or reads_type == 'nanopore':
                # long reads go through minimap2 with the matching preset
                if reads_type == 'pacbio':
                    preset = ' -ax map-pb '
                else:
                    preset = ' -ax map-ont '
                cmdline = minimap_fpath() + ' -t ' + str(max_threads) + preset + ref_fpath + ' ' + reads
            else:
                # '-p' tells bwa the single file is interleaved paired-end
                cmdline = bwa_cmd + (' -p ' if reads_type == 'pe' else ' ') + ref_fpath + ' ' + reads
        else:
            read1, read2 = reads
            cmdline = bwa_cmd + ' ' + ref_fpath + ' ' + read1 + ' ' + read2
        output_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        bam_fpath = output_fpath.replace('.sam', '.bam')
        if not is_non_empty_file(output_fpath):
            qutils.call_subprocess(shlex.split(cmdline), stdout=open(output_fpath, 'w'),
                                   stderr=open(err_fpath, 'a'), logger=logger)
        # bug fix: the emptiness check was duplicated in two nested ifs
        if not is_non_empty_file(bam_fpath):
            sambamba_view(output_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
            if reads_type == 'pe':
                # remove duplicate read pairs, then replace the BAM in place
                bam_dedup_fpath = add_suffix(bam_fpath, 'dedup')
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'markdup', '-r', '-t',
                                        str(max_threads), '--tmpdir', output_dir,
                                        bam_fpath, bam_dedup_fpath],
                                       stderr=open(err_fpath, 'a'), logger=logger)
                if exists(bam_dedup_fpath):
                    shutil.move(bam_dedup_fpath, bam_fpath)
        if reads_type == 'pe':
            insert_size, std_dev = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
            # robustness: guard against a failed (None) insert-size estimate
            # before comparing, mirroring align_reference's 'if not insert_size'
            if insert_size and insert_size < qconfig.optimal_assembly_max_IS:
                insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        qconfig.optimal_assembly_insert_size = max(insert_sizes)
        ref_name = qutils.name_from_fpath(ref_fpath)
        insert_size_fpath = join(output_dir, '..', ref_name + '.is.txt')
        with open(insert_size_fpath, 'w') as out:
            out.write(str(qconfig.optimal_assembly_insert_size))
def get_joiners(ref_name, sam_fpath, bam_fpath, output_dirpath, err_fpath, using_reads):
    """Build per-reference joining intervals from primary alignments.

    Filters the BAM to primary mapped reads, name-sorts it, converts to BED
    (BEDPE for mate pairs), and collects (start, end) intervals per reference.
    For mate pairs, only intervals whose length is within one standard
    deviation of the estimated insert size are kept.

    Returns a defaultdict mapping reference name -> list of intervals.
    """
    bam_filtered_fpath = add_suffix(bam_fpath, 'filtered')
    if not is_non_empty_file(bam_filtered_fpath):
        filter_rule = 'not unmapped and not supplementary and not secondary_alignment'
        sambamba_view(bam_fpath, bam_filtered_fpath, qconfig.max_threads, err_fpath, logger,
                      filter_rule=filter_rule)
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    if not is_non_empty_file(bam_sorted_fpath):
        # name-sort ('-n') so mates are adjacent for BEDPE conversion
        sort_bam(bam_filtered_fpath, bam_sorted_fpath, err_fpath, logger, sort_rule='-n')
    bed_fpath = bam_to_bed(output_dirpath, using_reads, bam_sorted_fpath, err_fpath, logger,
                           bedpe=using_reads == 'mp')
    intervals = defaultdict(list)
    if using_reads == 'mp':
        # NOTE(review): if calculate_insert_size can return None (the loop
        # below guards on a falsy insert_size), the subtraction here would
        # raise before the loop — confirm the helper always returns numbers
        insert_size, std_dev = calculate_insert_size(sam_fpath, output_dirpath, ref_name,
                                                     reads_suffix='mp')
        min_is = insert_size - std_dev
        max_is = insert_size + std_dev
    with open(bed_fpath) as bed:
        for l in bed:
            fs = l.split()
            # for mate pairs, keep only intervals consistent with the insert size
            # (insert_size is only bound when using_reads == 'mp'; the
            # short-circuit keeps the other branch safe)
            if using_reads == 'mp' and insert_size:
                interval_len = int(fs[2]) - int(fs[1])
                if min_is <= abs(interval_len) <= max_is:
                    intervals[fs[0]].append((int(fs[1]), int(fs[2])))
            else:
                intervals[fs[0]].append((int(fs[1]), int(fs[2])))
    return intervals
def fill_gaps_single(ref_fpath, assembly_fpath, assembly_covered_regions, uncovered_fpath):
    """Split each reference sequence into contigs over regions covered by both
    the assembly and single reads (overlaps of at least 50 bp).

    Returns the path to the polished FASTA assembly.
    """
    reads_covered_regions = parse_uncovered_fpath(uncovered_fpath, ref_fpath,
                                                  return_covered_regions=True)
    out_fpath = add_suffix(assembly_fpath, single_polished_suffix)
    fasta_entries = []
    for name, seq in fastaparser.read_fasta(ref_fpath):
        base_name = name.split()[0]
        overlaps = find_overlaps(assembly_covered_regions[name],
                                 reads_covered_regions[name], overlap=50)
        for contig_no, (start, end) in enumerate(overlaps, start=1):
            fasta_entries.append((base_name + "_" + str(contig_no), seq[start: end]))
    fastaparser.write_fasta(out_fpath, fasta_entries)
    return out_fpath
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, max_threads, err_fpath):
    """Merge per-read-set alignments into one BAM and export it as SAM.

    Older variant without deduplication.  Returns *sam_fpath*.
    """
    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if is_non_empty_file(tmp_sam_fpath):
            tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
            tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
            if not is_non_empty_file(tmp_bam_sorted_fpath):
                # NOTE(review): unlike the newer merge_sam_files variant, no
                # SAM->BAM conversion is performed before this sort, so
                # tmp_bam_fpath may not exist yet — verify sort_bam's input
                # expectations or whether the BAM is produced elsewhere
                sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
            tmp_bam_fpaths.append(tmp_bam_sorted_fpath)
    qutils.call_subprocess([sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads),
                            bam_fpath] + tmp_bam_fpaths,
                           stderr=open(err_fpath, 'a'), logger=logger)
    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return sam_fpath
def seq_to_kmc_db(tmp_dirpath, log_fpath, err_fpath, fasta_fpath=None, seq=None, name=None,
                  is_ref=False, intersect_with=None, kmer_fraction=1):
    """Build a KMC k-mer database from a FASTA file or a raw sequence.

    When no *fasta_fpath* is given, *seq* is written to a temporary FASTA named
    after *name* (with a 'reference' suffix when *is_ref*) and reuse of cached
    KMC output is disabled.  If *intersect_with* is given, the resulting
    database is intersected with it.

    Returns the path to the KMC database.
    """
    reuse_allowed = bool(fasta_fpath)
    if not reuse_allowed:
        # materialize the in-memory sequence as a temporary FASTA
        fasta_fpath = join(tmp_dirpath, name + '.fasta')
        if is_ref:
            fasta_fpath = add_suffix(fasta_fpath, 'reference')
        with open(fasta_fpath, 'w') as tmp_fasta:
            tmp_fasta.write(seq)
    db_fpath = count_kmers(tmp_dirpath, fasta_fpath, log_fpath, err_fpath, can_reuse=reuse_allowed)
    if intersect_with:
        db_fpath = intersect_kmers(tmp_dirpath, [db_fpath, intersect_with], log_fpath, err_fpath)
    return db_fpath
def align_ideal_assembly(ref_fpath, assembly_fpath, output_dir, log_fpath, err_fpath):
    """Align an ideal assembly back to the reference and compute uncovered regions.

    Runs bwa mem, converts SAM to BAM, keeps mapped reads, sorts, and feeds the
    result to get_coverage to produce the '.uncovered' file.

    Returns the path to the uncovered-regions file.
    """
    sam_fpath = join(output_dir, basename(assembly_fpath) + '.sam')
    bam_fpath = sam_fpath.replace('.sam', '.bam')
    bam_mapped_fpath = add_suffix(bam_fpath, 'mapped')
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    if not is_non_empty_file(bam_fpath):
        bwa_index(ref_fpath, err_fpath, logger)
        qutils.call_subprocess([bwa_fpath('bwa'), 'mem', '-t', str(qconfig.max_threads),
                                ref_fpath, assembly_fpath],
                               stdout=open(sam_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
        # '-S' reads SAM input and writes BAM
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads),
                                '-h', '-f', 'bam', '-S', sam_fpath],
                               stdout=open(bam_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
    if not is_non_empty_file(bam_sorted_fpath):
        # keep mapped reads only, then coordinate-sort for coverage computation
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads),
                                '-h', '-f', 'bam', '-F', 'not unmapped', bam_fpath],
                               stdout=open(bam_mapped_fpath, 'w'), stderr=open(err_fpath, 'a'),
                               logger=logger)
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
    cov_fpath = join(output_dir, basename(assembly_fpath) + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    ref_name = qutils.name_from_fpath(ref_fpath)
    correct_chr_names = get_correct_names_for_chroms(output_dir, ref_fpath, sam_fpath, err_fpath,
                                                     assembly_fpath, logger)
    get_coverage(output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_fpath, err_fpath,
                 correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath,
                 create_cov_files=False)
    return uncovered_fpath
def get_physical_coverage(output_dirpath, ref_name, bam_fpath, log_path, err_fpath, cov_fpath, chr_len_fpath):
    """Compute raw physical (fragment) coverage from properly paired reads.

    Filters the BAM to proper, non-supplementary, non-duplicate pairs,
    name-sorts it, converts pairs to BEDPE fragments, and runs genome coverage
    over those fragments.

    Returns the raw coverage file path, or None when bamToBed is unavailable.
    """
    if not isfile(bedtools_fpath('bamToBed')):
        logger.info(' Failed calculating physical coverage...')
        return None
    raw_cov_fpath = add_suffix(cov_fpath, 'raw')
    if not is_non_empty_file(raw_cov_fpath):
        logger.info(' Calculating physical coverage...')
        ## keep properly mapped, unique, and non-duplicate read pairs only
        bam_filtered_fpath = join(output_dirpath, ref_name + '.filtered.bam')
        sambamba_view(bam_fpath, bam_filtered_fpath, qconfig.max_threads, err_fpath, logger,
                      filter_rule='proper_pair and not supplementary and not duplicate')
        ## sort by read names
        bam_filtered_sorted_fpath = join(output_dirpath, ref_name + '.filtered.sorted.bam')
        sort_bam(bam_filtered_fpath, bam_filtered_sorted_fpath, err_fpath, logger, sort_rule='-n')
        bed_fpath = bam_to_bed(output_dirpath, ref_name, bam_filtered_sorted_fpath, err_fpath,
                               logger, bedpe=True)
        calculate_genome_cov(bed_fpath, raw_cov_fpath, chr_len_fpath, err_fpath, logger)
    return raw_cov_fpath
def run_bwa(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    """Align each read set to *ref_fpath* with 'bwa mem'.

    Appends the produced SAM paths to *out_sam_fpaths* (mutated in place).
    For paired-end sets, records the largest insert size below the configured
    maximum into qconfig.ideal_assembly_insert_size.
    """
    base_cmd = bwa_fpath('bwa') + ' mem -t ' + str(max_threads)
    observed_insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            # a single file of non-single reads is interleaved paired-end ('-p')
            interleaved_flag = ' ' if reads_type == 'single' else ' -p '
            cmd = base_cmd + interleaved_flag + ref_fpath + ' ' + reads
        else:
            first_fpath, second_fpath = reads
            cmd = base_cmd + ' ' + ref_fpath + ' ' + first_fpath + ' ' + second_fpath
        aligned_sam_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        if not is_non_empty_file(aligned_sam_fpath):
            qutils.call_subprocess(shlex.split(cmd), stdout=open(aligned_sam_fpath, 'w'),
                                   stderr=open(err_fpath, 'a'), logger=logger)
        if reads_type == 'paired_end':
            cur_insert_size = calculate_insert_size(aligned_sam_fpath, output_dir,
                                                    basename(sam_fpath))
            if cur_insert_size < qconfig.ideal_assembly_max_IS:
                observed_insert_sizes.append(cur_insert_size)
        out_sam_fpaths.append(aligned_sam_fpath)
    if observed_insert_sizes:
        qconfig.ideal_assembly_insert_size = max(observed_insert_sizes)
def download_wgsmaster_contigs(ref_id, ref_fpath):
    """Download WGS-master contigs for *ref_id* from NCBI and gunzip them into *ref_fpath*.

    Queries esummary for the WGS project code and latest genome version, then
    probes successively older versions on the NCBI FTP until a downloadable
    '.fsa_nt.gz' archive is found.  Returns None either way; on total failure
    it now returns quietly instead of crashing.
    """
    temp_fpath = add_suffix(ref_fpath, 'tmp') + '.gz'
    response = try_send_request(
        ncbi_url + 'esummary.fcgi?db=nuccore&id=%s&rettype=text&validate=false' % ref_id)
    xml_tree = ET.fromstring(response)
    # the 'Extra' field packs the WGS project code and version, e.g. ...|CAAB01
    download_system = None
    genome_version = 0
    for field in xml_tree[0]:
        if field.get('Name') == 'Extra':
            download_system = field.text.split('|')[-1][:6]
            genome_version = int(field.text.split('|')[3].split('.')[-1])
            break
    fsize = None
    ftp_response = None
    bsize = 1048576  # 1 MiB download chunks
    while genome_version != 0 and not fsize:
        fname = "%s.%s.fsa_nt.gz" % (download_system, genome_version)
        url = "ftp://ftp.ncbi.nlm.nih.gov/sra/wgs_aux/%s/%s/%s/%s" % (
            download_system[:2], download_system[2:4], download_system, fname)
        try:
            ftp_response = urlopen(url)
            meta = ftp_response.info()
            # NOTE(review): getheaders() is the Python 2 httplib API — on
            # Python 3 this raises and we fall through to an older version;
            # confirm intended interpreter
            fsize = int(meta.getheaders("Content-length")[0])
        except Exception:  # narrowed from a bare except
            fsize = None
            # try the previous genome version
            genome_version -= 1
    if not fsize or ftp_response is None:
        # bug fix: previously fell through and crashed with NameError on the
        # unbound FTP response when every version failed to download
        return
    with open(temp_fpath, 'wb') as f:
        while True:
            chunk = ftp_response.read(bsize)  # renamed from 'buffer' (shadowed builtin)
            if not chunk:
                break
            f.write(chunk)
    with open(ref_fpath, 'w') as f:
        subprocess.call(['gunzip', '-c', temp_fpath], stdout=f)
    os.remove(temp_fpath)
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Simulate a theoretically optimal assembly of *ref_fpath*.

    Pipeline: (optionally) align reads to the reference to find uncovered
    regions, detect repeats with Red, keep uniquely covered regions, then
    either scaffold them with long reads / mate pairs or emit each region long
    enough as a contig.

    Returns the path of the resulting FASTA, or None on failure / unsupported
    platform; reuses a previously generated assembly when one exists.
    """
    logger.print_timestamp()
    logger.main_info("Simulating Optimal Assembly...")
    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath), qconfig.reads_stats_dirname)
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads='all', calculate_coverage=True)
    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    # result file name encodes the insert size and any polishing mode
    ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (
        ref_basename, qconfig.optimal_assembly_basename, insert_size)
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    # a pre-generated assembly may also live next to the original reference
    original_ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    ref_prepared_optimal_assembly = os.path.join(
        os.path.dirname(original_ref_fpath), prepared_optimal_assembly_basename)
    if os.path.isfile(result_fpath) or os.path.isfile(ref_prepared_optimal_assembly):
        already_done_fpath = result_fpath if os.path.isfile(
            result_fpath) else ref_prepared_optimal_assembly
        logger.notice(
            ' Will reuse already generated Optimal Assembly with insert size %d (%s)'
            % (insert_size, already_done_fpath))
        return already_done_fpath

    if qconfig.platform_name == 'linux_32':
        # Red has no 32-bit Linux build
        logger.warning(
            ' Sorry, can\'t create Optimal Assembly on this platform, skipping...'
        )
        return None

    # fetch the Red repeat-detection binary
    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red', red_dirpath, 'red',
                                          platform_specific=True, is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning(' Sorry, can\'t create Optimal Assembly, skipping...')
        return None

    log_fpath = os.path.join(output_dirpath, 'optimal_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath)
    if unique_covered_regions is None:
        logger.error(
            ' Failed to create Optimal Assembly, see log for details: ' + log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []
    if long_reads or qconfig.mate_pairs:
        # scaffold unique regions using joining reads
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath, bam_fpath,
                              tmp_dir, log_fpath, join_reads)
        uncovered_regions = parse_uncovered_fpath(
            uncovered_fpath, ref_fpath, return_covered_regions=False
        ) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom],
                                                 joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom], region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq), ref_coords_to_output,
                                          repeats_regions[chrom], uncovered_regions[chrom])
    else:
        # no joining reads: each sufficiently long unique region becomes a contig
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append(
                        (chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info(' ' + 'Theoretically optimal Assembly saved to ' + result_fpath)
    logger.notice('You can copy it to ' + ref_prepared_optimal_assembly +
                  ' and QUAST will reuse it in further runs against the same reference (' +
                  original_ref_fpath + ')')
    if not qconfig.debug:
        shutil.rmtree(tmp_dir)
    logger.main_info('Done.')
    return result_fpath
def run_processing_reads(contigs_fpaths, main_ref_fpath, meta_ref_fpaths, ref_labels, temp_output_dir, output_dir,
                         log_path, err_fpath):
    """Align reads to all assemblies (and the reference, if given) and post-process the results.

    Orchestrates: parallel read alignment, structural-variation (SV) search via trivial
    deletions and optionally GRIDSS, coverage-track generation for Icarus, and (for
    combined/meta references) splitting the reference SAM per sub-reference.

    Parameters:
        contigs_fpaths: list of assembly FASTA paths to align reads against.
        main_ref_fpath: reference FASTA path or None (no reference mode).
        meta_ref_fpaths: list of per-genome reference paths in meta mode, else falsy.
        ref_labels: mapping of reference sequence name -> reference label (used when
            splitting the SAM by sub-references).
        temp_output_dir: directory for intermediate SAM/BAM files.
        output_dir: directory for final BED/coverage outputs.
        log_path, err_fpath: log and error-output file paths passed to subprocesses.

    Returns:
        (bed_fpath, cov_fpath, physical_cov_fpath) — each may be None if the
        corresponding artifact was skipped or could not be produced.

    Side effects: mutates qconfig (sam_fpaths, bam_fpaths, reference_sam, reference_bam,
    possibly no_sv) and writes report statistics and read files to output_dir.
    """
    required_files = []
    bed_fpath, cov_fpath, physical_cov_fpath = None, None, None
    if main_ref_fpath:
        ref_name = qutils.name_from_fpath(main_ref_fpath)

        # Output artifacts; user-supplied paths (qconfig.*) take precedence over defaults.
        bed_fpath = qconfig.bed or join(output_dir, ref_name + '.bed')
        cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
        physical_cov_fpath = qconfig.phys_cov_fpath or join(output_dir, ref_name + '.physical.cov')
        required_files = [bed_fpath, cov_fpath, physical_cov_fpath]

        # Decide whether SV search is possible/needed.
        if qconfig.no_sv:
            logger.info(' Will not search Structural Variations (--fast or --no-sv is specified)')
            bed_fpath = None
        elif is_non_empty_file(bed_fpath):
            logger.info(' Using existing BED-file: ' + bed_fpath)
        elif not qconfig.forward_reads and not qconfig.interlaced_reads:
            # SV search needs paired-end reads unless an alignment is already provided.
            if not qconfig.reference_sam and not qconfig.reference_bam:
                logger.info(' Will not search Structural Variations (needs paired-end reads)')
                bed_fpath = None
                qconfig.no_sv = True
        # Coverage tracks are only needed for the Icarus HTML viewer.
        if qconfig.create_icarus_html:
            if is_non_empty_file(cov_fpath):
                is_correct_file = check_cov_file(cov_fpath)
                if is_correct_file:
                    logger.info(' Using existing reads coverage file: ' + cov_fpath)
            if is_non_empty_file(physical_cov_fpath):
                logger.info(' Using existing physical coverage file: ' + physical_cov_fpath)
        else:
            logger.info(' Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
            cov_fpath = None
            physical_cov_fpath = None
        # If everything we would produce already exists, nothing is "required" downstream.
        if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
                (not qconfig.create_icarus_html or (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
            required_files = []

    # One alignment job per assembly plus (optionally) one for the reference.
    n_jobs = min(qconfig.max_threads, len(contigs_fpaths) + 1)
    max_threads_per_job = max(1, qconfig.max_threads // n_jobs)
    sam_fpaths = qconfig.sam_fpaths or [None] * len(contigs_fpaths)
    bam_fpaths = qconfig.bam_fpaths or [None] * len(contigs_fpaths)
    parallel_align_args = [(contigs_fpath, output_dir, temp_output_dir, log_path, err_fpath, max_threads_per_job,
                            sam_fpaths[index], bam_fpaths[index], index)
                           for index, contigs_fpath in enumerate(contigs_fpaths)]
    if main_ref_fpath:
        # Reference job is appended LAST; results for it are read back via index -1 below.
        parallel_align_args.append((main_ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                    max_threads_per_job, qconfig.reference_sam, qconfig.reference_bam, None, required_files, True))
    correct_chr_names, sam_fpaths, bam_fpaths = run_parallel(align_single_file, parallel_align_args, n_jobs)
    qconfig.sam_fpaths = sam_fpaths[:len(contigs_fpaths)]
    qconfig.bam_fpaths = bam_fpaths[:len(contigs_fpaths)]
    add_statistics_to_report(output_dir, contigs_fpaths, main_ref_fpath)
    save_reads(output_dir)

    if not main_ref_fpath:
        return None, None, None

    # Last slot of each result list corresponds to the reference job appended above.
    correct_chr_names = correct_chr_names[-1]
    sam_fpath, bam_fpath = sam_fpaths[-1], bam_fpaths[-1]
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not required_files:
        return bed_fpath, cov_fpath, physical_cov_fpath
    if not all([sam_fpath, bam_fpath]):
        logger.info(' Failed searching structural variations.')
        return None, None, None

    sam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(sam_fpath, 'sorted'))
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))
    if is_non_empty_file(sam_sorted_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        # Filter out unmapped reads, sort, then convert back to SAM.
        if not is_non_empty_file(bam_sorted_fpath):
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger, filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        sambamba_view(bam_sorted_fpath, sam_sorted_fpath, qconfig.max_threads, err_fpath, logger)
    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(temp_output_dir, main_ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                                                     log_path, err_fpath, correct_chr_names, cov_fpath, physical_cov_fpath)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info(' Splitting SAM-file by references...')
        # Parse the SAM header: collect @SQ lengths and keep raw header lines for re-use.
        headers = []
        seq_lengths = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_lengths[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        ref_files = {}
        if meta_ref_fpaths:
            # Prepare one SAM file per sub-reference; existing split files are reused.
            global ref_sam_fpaths
            for cur_ref_fpath in meta_ref_fpaths:
                cur_ref_name = qutils.name_from_fpath(cur_ref_fpath)
                ref_sam_fpath = join(temp_output_dir, cur_ref_name + '.sam')
                ref_sam_fpaths[cur_ref_fpath] = ref_sam_fpath
                if is_non_empty_file(ref_sam_fpath):
                    logger.info(' Using existing split SAM-file for %s: %s' % (cur_ref_name, ref_sam_fpath))
                    ref_files[cur_ref_name] = None
                else:
                    # Write a reduced header: first line (if not @SQ), matching @SQ lines, last line.
                    ref_sam_file = open(ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        ref_sam_file.write(headers[0] + '\n')
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == cur_ref_name:
                            ref_sam_file.write(h + '\n')
                    ref_sam_file.write(headers[-1] + '\n')
                    ref_files[cur_ref_name] = ref_sam_file
                    need_ref_splitting = True

        trivial_deletions_fpath = \
            search_trivial_deletions(temp_output_dir, sam_sorted_fpath, ref_files, ref_labels, seq_lengths, need_ref_splitting)
        if get_gridss_fpath() and isfile(get_gridss_fpath()):
            # NOTE(review): GRIDSS is best-effort — any failure is silently ignored and we
            # fall back to trivial deletions only. A narrower except + debug log would be safer.
            try:
                gridss_sv_fpath = search_sv_with_gridss(main_ref_fpath, bam_mapped_fpath, meta_ref_fpaths, temp_output_dir, err_fpath)
                qutils.cat_files([gridss_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except:
                pass
        if isfile(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    # Final reporting; empty-but-existing BED means "searched, found nothing".
    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info(' Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info(' No structural variations were found.')
            else:
                logger.main_info(' Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info(' Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info(' Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    """Align one assembly to the reference with nucmer and run the full contig analysis.

    Pipeline: run (or reuse) the nucmer alignment, parse coords into Mapping objects,
    load reference sequences and (optionally) SNP calls, then run analyze_contigs /
    analyze_coverage and write per-assembly reports (Icarus, misassemblies, unaligned).

    Parameters:
        is_cyclic: whether the reference is circular (passed through to analysis).
        index: assembly index, used only for log-line prefixes.
        contigs_fpath: (possibly preprocessed) assembly FASTA path.
        output_dirpath: directory for per-assembly report files.
        ref_fpath: reference FASTA path.
        old_contigs_fpath: original (pre-correction) assembly path, passed to the aligner.
        bed_fpath: SV BED path (unused here directly; part of the established signature).
        parallel_by_chr: split the nucmer run by chromosome across threads.
        threads: thread count for the aligner.

    Returns:
        (nucmer_status, result_dict, aligned_lengths, misassemblies_in_contigs,
         aligned_lengths_by_contigs); on alignment failure the last four are empty.

    Side effects: writes many report files under output_dirpath; temporarily mutates
    qconfig thresholds during large-block analysis; may gzip nucmer output.
    """
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    # In space-efficient mode all per-assembly reports are discarded via /dev/null.
    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        # Report the specific failure mode, clean up, and bail out with empty results.
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error(' ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ':' + coords_fpath + 'doesn\'t exist.\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}  # contig name -> list of Mapping objects
    coords_file = open(coords_fpath)
    coords_filtered_file = open(coords_filtered_fpath, 'w')
    # Copy the two header lines of the coords file through to the filtered file.
    coords_filtered_file.write(coords_file.readline())
    coords_filtered_file.write(coords_file.readline())
    for line in coords_file:
        if line.strip() == '':
            break
        assert line[0] != '='
        #Clear leading spaces from nucmer output
        #Store nucmer lines in an array
        mapping = Mapping.from_line(line)
        aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    references = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        references[name] = seq
        log_out_f.write('\tLoaded [%s]\n' % name)

    #Loading the SNP calls
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')

    used_snps_file = None
    snps = {}  # ref name -> contig name -> ref position -> list of SNP records
    if qconfig.show_snps:
        prev_line = None
        for line in open_gzipsafe(show_snps_fpath):
            #print "$line";
            line = line.split()
            if not line[0].isdigit():
                continue
            if prev_line and line == prev_line:  # skip exact duplicate rows
                continue
            ref = line[10]
            ctg = line[11]
            pos = int(line[0])  # Kolya: python don't convert int<->str types automatically
            loc = int(line[3])  # Kolya: same as above
            # if (! exists $line[11]) { die "Malformed line in SNP file. Please check that show-snps has completed succesfully.\n$line\n[$line[9]][$line[10]][$line[11]]\n"; }
            if pos in snps.setdefault(ref, {}).setdefault(ctg, {}):
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos].append(
                    SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2]))
            else:
                snps.setdefault(ref, {}).setdefault(ctg, {})[pos] = [
                    SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2])]
            prev_line = line
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any) — currently always one whole-chromosome region each.
    regions = {}
    ref_lens = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq in references.items():
        regions.setdefault(name, []).append([1, len(seq)])
        ref_lens[name] = len(seq)
        total_regions += 1
        total_reg_len += ref_lens[name]
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    # Bundle all open output handles; closed together via close_handlers() below.
    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic)

    if qconfig.large_genome:
        log_out_f.write('Analyzing large blocks...\n')
        large_misassembly_fpath = add_suffix(misassembly_fpath, 'large_blocks') if not qconfig.space_efficient else '/dev/null'
        ca_large_output = CAOutput(stdout_f=log_out_f, misassembly_f=open(large_misassembly_fpath, 'w'),
                                   coords_filtered_f=coords_filtered_file, used_snps_f=open('/dev/null', 'w'),
                                   icarus_out_f=open('/dev/null', 'w'))
        # Temporarily swap global thresholds to the large-genome values, then restore.
        min_alignment, extensive_mis_threshold = qconfig.min_alignment, qconfig.extensive_misassembly_threshold
        qconfig.min_alignment, qconfig.extensive_misassembly_threshold = qconfig.LARGE_MIN_ALIGNMENT, qconfig.LARGE_EXTENSIVE_MIS_THRESHOLD
        result.update(analyze_contigs(ca_large_output, contigs_fpath, '/dev/null', '/dev/null',
                                      aligns, ref_features, ref_lens, is_cyclic, large_misassemblies_search=True)[0])
        qconfig.min_alignment, qconfig.extensive_misassembly_threshold = min_alignment, extensive_mis_threshold

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        # Per-chromosome alignment TSV + "unique" contigs (mostly aligned to one reference).
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    if qconfig.is_combined_ref:
                        ref_name = ref_labels_by_chromosomes[chr_name]
                        align_by_contigs = defaultdict(int)
                        for align in aligns:
                            align_by_contigs[align.contig] += align.len2
                        for contig, aligned_len in align_by_contigs.items():
                            if contig in used_contigs:
                                continue
                            used_contigs.add(contig)
                            # SPAdes-style contig names carry length/coverage in the name itself.
                            len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
                            if len_cov_pattern.findall(contig):
                                contig_len = len_cov_pattern.findall(contig)[0][0]
                                contig_cov = len_cov_pattern.findall(contig)[0][1]
                                # >90% of the contig aligned to this reference => treat as unique.
                                if aligned_len / float(contig_len) > 0.9:
                                    unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Build the theoretical Upper Bound Assembly for a reference genome.

    Aligns the provided reads to the reference, masks repeats (via the Red tool) and
    uncovered regions, and emits the remaining uniquely-covered reference fragments as
    a FASTA "assembly". Long reads or mate pairs, when available, are additionally used
    to scaffold fragments across repeats.

    Parameters:
        ref_fpath: (possibly preprocessed) reference FASTA used for computation.
        original_ref_fpath: user-supplied reference path; used for naming and for
            locating/advertising a reusable prepared assembly next to it.
        output_dirpath: directory for the resulting FASTA, logs, and temp files.

    Returns:
        Path to the resulting FASTA, or None if prerequisites are missing or a step fails.
    """
    logger.print_timestamp()
    logger.main_info("Generating Upper Bound Assembly...")

    # Prerequisite checks: read-processing tools, supported platform, Red repeat finder.
    if not reads_analyzer.compile_reads_analyzer_tools(logger):
        logger.warning(' Sorry, can\'t create Upper Bound Assembly '
                       '(failed to compile necessary third-party read processing tools [bwa, bedtools, minimap2]), skipping...')
        return None

    if qconfig.platform_name == 'linux_32':
        logger.warning(' Sorry, can\'t create Upper Bound Assembly on this platform '
                       '(only linux64 and macOS are supported), skipping...')
        return None

    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red', red_dirpath, 'red', platform_specific=True, is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning(' Sorry, can\'t create Upper Bound Assembly '
                       '(failed to install/download third-party repeat finding tool [Red]), skipping...')
        return None

    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    # Result file names encode the insert size and the polishing mode (long reads / MP).
    ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (ref_basename, qconfig.optimal_assembly_basename, insert_size)
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    original_ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    if long_reads:
        prepared_optimal_assembly_basename = add_suffix(prepared_optimal_assembly_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        prepared_optimal_assembly_basename = add_suffix(prepared_optimal_assembly_basename, mp_polished_suffix)
    ref_prepared_optimal_assembly = os.path.join(
        os.path.dirname(original_ref_fpath), prepared_optimal_assembly_basename)
    # Reuse a previously generated assembly if one matches the current insert size.
    already_done_fpath = check_prepared_optimal_assembly(insert_size, result_fpath, ref_prepared_optimal_assembly)
    if already_done_fpath:
        return already_done_fpath

    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath), qconfig.reads_stats_dirname)
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads='all', calculate_coverage=True)
        # align_reference may have auto-estimated the insert size; if so, rename outputs
        # accordingly and re-check for an existing prepared assembly.
        if qconfig.optimal_assembly_insert_size != 'auto' and qconfig.optimal_assembly_insert_size != insert_size:
            calculated_insert_size = qconfig.optimal_assembly_insert_size
            result_fpath = result_fpath.replace('is' + str(insert_size), 'is' + str(calculated_insert_size))
            prepared_optimal_assembly_basename = prepared_optimal_assembly_basename.replace(
                'is' + str(insert_size), 'is' + str(calculated_insert_size))
            insert_size = calculated_insert_size
            ref_prepared_optimal_assembly = os.path.join(
                os.path.dirname(original_ref_fpath), prepared_optimal_assembly_basename)
            already_done_fpath = check_prepared_optimal_assembly(
                insert_size, result_fpath, ref_prepared_optimal_assembly)
            if already_done_fpath:
                return already_done_fpath

    log_fpath = os.path.join(output_dirpath, 'upper_bound_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath, use_long_reads=long_reads)
    if unique_covered_regions is None:
        logger.error(' Failed to create Upper Bound Assembly, see log for details: ' + log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        # Scaffolding mode: pair up unique regions joined by long reads / mate pairs.
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath, bam_fpath, tmp_dir, log_fpath, join_reads)
        uncovered_regions = parse_bed(uncovered_fpath) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom], joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom], region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq), ref_coords_to_output,
                                          repeats_regions[chrom], uncovered_regions[chrom])
    else:
        # Plain mode: emit each sufficiently long uniquely-covered region as a contig.
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append((chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info(' ' + 'Theoretical Upper Bound Assembly is saved to ' + result_fpath)
    logger.notice('(on reusing *this* Upper Bound Assembly in the *future* evaluations on *the same* dataset)\n'
                  '\tThe next time, you can simply provide this file as an additional assembly (you could also rename it to UpperBound.fasta for the clarity). '
                  'In this case, you do not need to specify --upper-bound-assembly and provide files with reads (--pe1/pe2, etc).\n'
                  '\t\tOR\n'
                  '\tYou can copy ' + result_fpath + ' to ' + ref_prepared_optimal_assembly + '. '
                  'The next time you evaluate assemblies with --upper-bound-assembly option and against the same reference (' + original_ref_fpath + ') and '
                  'the same reads (or if you specify the insert size of the paired-end reads explicitly with --est-insert-size ' + str(insert_size) + '), '
                  'QUAST will reuse this Upper Bound Assembly.\n')

    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Simulate an Ideal Assembly for a reference genome using SPAdes.

    Optionally aligns paired-end reads to find uncovered regions, preprocesses the
    reference accordingly, then runs SPAdes with K equal to the (odd) insert size and
    optionally polishes the result with mate pairs / unpaired reads.

    Parameters:
        ref_fpath: (possibly preprocessed) reference FASTA used for computation.
        original_ref_fpath: user-supplied reference path; used for naming and for
            locating/advertising a reusable prepared assembly next to it.
        output_dirpath: directory for the resulting FASTA, logs, and temp files.

    Returns:
        Path to the resulting FASTA, or None if prerequisites are missing or SPAdes fails.
    """
    logger.print_timestamp()
    logger.main_info("Simulating Ideal Assembly...")

    uncovered_fpath = None
    # BUG FIX: the original condition tested qconfig.reference_sam twice and never
    # tested qconfig.reference_bam, so a run that supplied only a reference BAM
    # skipped reference alignment entirely (cf. the sibling Upper Bound `do()` which
    # checks reads/sam/bam).
    if qconfig.paired_reads or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath, join(dirname(output_dirpath), qconfig.reads_stats_dirname), using_reads='paired_end')

    insert_size = qconfig.ideal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.ideal_assembly_default_IS

    # SPAdes K must be odd, and K is set to the insert size below.
    if insert_size % 2 == 0:
        insert_size += 1
        logger.notice(' Current implementation cannot work with even insert sizes, '
                      'will use the closest odd value (%d)' % insert_size)

    # Result file names encode the insert size and the polishing mode.
    ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (ref_basename, qconfig.ideal_assembly_basename, insert_size)
    if qconfig.paired_reads and qconfig.unpaired_reads:
        result_basename = add_suffix(result_basename, single_polished_suffix)
    if qconfig.paired_reads and qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    original_ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(original_ref_fpath))
    prepared_ideal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.ideal_assembly_basename, insert_size)
    ref_prepared_ideal_assembly = os.path.join(
        os.path.dirname(original_ref_fpath), prepared_ideal_assembly_basename)

    # Reuse a previously generated assembly (local result or one stored next to the reference).
    if os.path.isfile(result_fpath) or os.path.isfile(ref_prepared_ideal_assembly):
        already_done_fpath = result_fpath if os.path.isfile(result_fpath) else ref_prepared_ideal_assembly
        logger.notice(' Will reuse already generated Ideal Assembly with insert size %d (%s)'
                      % (insert_size, already_done_fpath))
        return already_done_fpath

    if qconfig.platform_name == 'linux_32':
        logger.warning(' Sorry, can\'t create Ideal Assembly on this platform, skipping...')
        return None

    base_aux_dir = os.path.join(qconfig.LIBS_LOCATION, 'ideal_assembly')
    configs_dir = os.path.join(base_aux_dir, 'configs')
    binary_fpath = download_external_tool('spades', os.path.join(base_aux_dir, 'bin'), 'spades', platform_specific=True)
    if not os.path.isfile(binary_fpath):
        logger.warning(' Sorry, can\'t create Ideal Assembly, skipping...')
        return None

    log_fpath = os.path.join(output_dirpath, 'spades.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    # Mask uncovered regions in the reference, then run SPAdes on the processed copy.
    processed_ref_fpath = preprocess_reference(ref_fpath, tmp_dir, uncovered_fpath)

    dst_configs = os.path.join(tmp_dir, 'configs')
    main_config = os.path.join(dst_configs, 'config.info')
    dir_util._path_created = {}  # see http://stackoverflow.com/questions/9160227/dir-util-copy-tree-fails-after-shutil-rmtree
    dir_util.copy_tree(configs_dir, dst_configs, preserve_times=False)

    prepare_config_spades(main_config, insert_size, processed_ref_fpath, tmp_dir)

    log_file = open(log_fpath, 'w')
    spades_output_fpath = os.path.join(tmp_dir, 'K%d' % insert_size, 'ideal_assembly.fasta')

    logger.info(' ' + 'Running SPAdes with K=' + str(insert_size) + '...')
    return_code = qutils.call_subprocess(
        [binary_fpath, main_config], stdout=log_file, stderr=log_file, indent=' ')
    if return_code != 0 or not os.path.isfile(spades_output_fpath):
        logger.error(' Failed to create Ideal Assembly, see log for details: ' + log_fpath)
        return None

    if qconfig.mate_pairs or qconfig.unpaired_reads:
        spades_output_fpath = polish_assembly(ref_fpath, spades_output_fpath, output_dirpath, tmp_dir)

    shutil.move(spades_output_fpath, result_fpath)
    logger.info(' ' + 'Ideal Assembly saved to ' + result_fpath)
    logger.notice('You can copy it to ' + ref_prepared_ideal_assembly +
                  ' and QUAST will reuse it in further runs against the same reference (' + original_ref_fpath + ')')

    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath