def run_minimap(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads):
    """Run minimap on contigs against a reference and return the tool's exit code.

    The aligner's stdout is written to ``out_fpath`` (overwritten) and its
    stderr is appended to ``log_err_fpath``.

    Args:
        out_fpath: path of the file that receives the aligner's stdout.
        ref_fpath: path to the reference passed to the aligner.
        contigs_fpath: path to the contigs passed to the aligner.
        log_err_fpath: path of the log file that stderr is appended to.
        index: value passed to ``qutils.index_to_str`` to build the log indent.
        max_threads: thread count passed to the aligner via ``-t``.

    Returns:
        Whatever ``qutils.call_subprocess`` returns (used by callers as the
        return code of the run).
    """
    # Stricter preset only for high identity thresholds on a single reference.
    preset = 'asm5' if qconfig.min_IDY >= 95 and not qconfig.is_combined_ref else 'asm10'
    # -s -- min CIGAR score, -z -- affects how often to stop alignment extension, -B -- mismatch penalty
    # -O -- gap penalty, -r -- max gap size
    mask_level = '1' if qconfig.is_combined_ref else '0.9'
    num_alignments = '100' if qconfig.is_combined_ref else '50'
    additional_options = ['-B5', '-O4,16', '--no-long-join', '-r', str(qconfig.MAX_INDEL_LENGTH),
                          '-N', num_alignments, '-s', str(qconfig.min_alignment), '-z', '200']
    # The tuned options are skipped for large genomes; only the preset is used then.
    cmdline = [minimap_fpath(), '-c', '-x', preset] + (additional_options if not qconfig.large_genome else []) + \
              ['--mask-level', mask_level, '--cs', '-t', str(max_threads), ref_fpath, contigs_fpath]
    # Context managers make sure both handles are closed (and flushed) once the
    # subprocess call returns, instead of relying on garbage collection.
    with open(out_fpath, 'w') as out_f, open(log_err_fpath, 'a') as err_f:
        return_code = qutils.call_subprocess(cmdline, stdout=out_f, stderr=err_f,
                                             indent=' ' + qutils.index_to_str(index))
    return return_code
def align_kmers(output_dir, ref_fpath, kmers_fpath, log_err_fpath, max_threads):
    """Align k-mers to the reference and collect their hits per reference sequence.

    The aligner output is written to ``<output_dir>/kmers.coords`` and then
    parsed as tab-separated records; lines with fewer than 10 fields are skipped.

    Args:
        output_dir: directory in which ``kmers.coords`` is created.
        ref_fpath: path to the reference passed to the aligner.
        kmers_fpath: path to the k-mers file passed to the aligner.
        log_err_fpath: path of the log file that stderr is appended to.
        max_threads: thread count passed to the aligner via ``-t``.

    Returns:
        A tuple ``(kmers_by_chrom, kmers_pos_by_chrom)`` of ``defaultdict(list)``
        objects keyed by the third column of each record: the first maps to the
        integer-converted first column, the second to the integer-converted
        fourth column, in file order.

    Raises:
        ValueError: if the first or fourth column of a parsed record is not an
            integer.
    """
    out_fpath = join(output_dir, 'kmers.coords')
    cmdline = [minimap_fpath(), '-ax', 'sr', '-s202', '--frag=no', '-t', str(max_threads), ref_fpath, kmers_fpath]
    # Close both handles as soon as the call returns so the output is fully
    # flushed before it is re-opened for reading below.
    with open(out_fpath, 'w') as out_f, open(log_err_fpath, 'a') as err_f:
        qutils.call_subprocess(cmdline, stdout=out_f, stderr=err_f, indent=' ')

    kmers_pos_by_chrom = defaultdict(list)
    kmers_by_chrom = defaultdict(list)
    with open(out_fpath) as f:
        for line in f:
            fs = line.split('\t')
            if len(fs) < 10:
                # Too short to be an alignment record (e.g. header lines).
                continue
            contig, chrom, pos = fs[0], fs[2], fs[3]
            kmers_pos_by_chrom[chrom].append(int(pos))
            # presumably k-mer names are integer ids -- verify against the k-mer writer
            kmers_by_chrom[chrom].append(int(contig))
    return kmers_by_chrom, kmers_pos_by_chrom
def get_unique_covered_regions(ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath,
                               use_long_reads=False):
    """Detect long repeats in the reference and return the regions left after removing them.

    Steps performed:
      1. Symlink the reference as ``*.fa`` into a fresh ``tmp_red`` directory and
         run the repeat masking binary on it (skipped when a non-empty ``.rpt``
         file already exists in ``tmp_dir``).
      2. Keep only repeats whose length is at least ``insert_size``.
      3. Extract those repeats with bedtools and align them back to the
         reference (skipped when a non-empty coords file already exists).
      4. Hand the results to ``check_repeats_instances`` and
         ``remove_repeat_regions``.

    Args:
        ref_fpath: path to the reference FASTA.
        tmp_dir: working directory for intermediate files.
        log_fpath: log file receiving the output of the external tools.
        binary_fpath: path to the repeat masking executable.
        insert_size: minimal repeat length (end - start) to keep.
        uncovered_fpath: forwarded to ``remove_repeat_regions``.
        use_long_reads: forwarded to ``check_repeats_instances``.

    Returns:
        ``(unique_covered_regions, repeats_regions)`` on success, or
        ``(None, None)`` when the masking tool failed or produced no ``.rpt`` file.
    """
    red_genome_dir = os.path.join(tmp_dir, 'tmp_red')
    if isdir(red_genome_dir):
        shutil.rmtree(red_genome_dir)
    os.makedirs(red_genome_dir)

    ref_name = qutils.name_from_fpath(ref_fpath)
    ref_symlink = os.path.join(red_genome_dir, ref_name + '.fa')  ## Red recognizes only *.fa files
    if os.path.islink(ref_symlink):
        os.remove(ref_symlink)
    os.symlink(ref_fpath, ref_symlink)

    logger.info(' ' + 'Running repeat masking tool...')
    repeats_fpath = os.path.join(tmp_dir, ref_name + '.rpt')
    if is_non_empty_file(repeats_fpath):
        return_code = 0
        logger.info(' ' + 'Using existing file ' + repeats_fpath + '...')
    else:
        # Use ONE handle for both streams: opening the log twice in 'w' mode gives
        # two independent file offsets, so stdout and stderr overwrite each other.
        with open(log_fpath, 'w') as log_f:
            return_code = qutils.call_subprocess(
                [binary_fpath, '-gnm', red_genome_dir, '-rpt', tmp_dir, '-frm', '2', '-min', '5'],
                stdout=log_f, stderr=log_f, indent=' ')

    if return_code != 0 or not exists(repeats_fpath):
        return None, None

    long_repeats_fpath = os.path.join(tmp_dir, ref_name + '.long.rpt')
    with open(long_repeats_fpath, 'w') as out:
        with open(repeats_fpath) as in_f:
            for line in in_f:
                fields = line.split('\t')
                repeat_len = int(fields[2]) - int(fields[1])
                if repeat_len >= insert_size:
                    # Drop the first character of the record
                    # (presumably a leading '>' in the .rpt format -- TODO confirm).
                    out.write(line[1:])

    repeats_fasta_fpath = os.path.join(tmp_dir, ref_name + '.fasta')
    coords_fpath = os.path.join(tmp_dir, ref_name + '.rpt.coords.txt')
    if not is_non_empty_file(coords_fpath):
        # Remove a possibly stale FASTA index so that it is not reused.
        fasta_index_fpath = ref_fpath + '.fai'
        if exists(fasta_index_fpath):
            os.remove(fasta_index_fpath)
        # NOTE(review): mode 'w' truncates the log here, discarding whatever the
        # masking tool wrote above; kept as in the original -- confirm whether
        # append ('a') was intended.
        with open(log_fpath, 'w') as log_f:
            qutils.call_subprocess([bedtools_fpath('bedtools'), 'getfasta', '-fi', ref_fpath, '-bed',
                                    long_repeats_fpath, '-fo', repeats_fasta_fpath],
                                   stderr=log_f, indent=' ')
        cmdline = [minimap_fpath(), '-c', '-x', 'asm10', '-N', '50', '--mask-level', '1', '--no-long-join',
                   '-r', '100', '-t', str(qconfig.max_threads), '-z', '200', ref_fpath, repeats_fasta_fpath]
        # Handles are closed on exit so the coords file is complete before it is parsed.
        with open(coords_fpath, 'w') as coords_f, open(log_fpath, 'a') as log_f:
            qutils.call_subprocess(cmdline, stdout=coords_f, stderr=log_f)

    filtered_repeats_fpath, repeats_regions = check_repeats_instances(coords_fpath, long_repeats_fpath,
                                                                      use_long_reads)
    unique_covered_regions = remove_repeat_regions(ref_fpath, filtered_repeats_fpath, uncovered_fpath)
    return unique_covered_regions, repeats_regions
def run_aligner(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    """Align every read set to the reference and convert the results to BAM.

    For each entry of ``read_fpaths`` a SAM file named after ``sam_fpath`` with
    the suffix ``<reads_type><1-based index>`` is produced (unless a non-empty
    one already exists), converted to BAM with ``sambamba_view`` and, for
    ``reads_type == 'pe'``, de-duplicated with ``sambamba markdup``.

    Side effects:
        * every produced SAM path is appended to ``out_sam_fpaths``;
        * for paired-end reads, insert sizes below
          ``qconfig.optimal_assembly_max_IS`` are collected; if any were found,
          their maximum is stored in ``qconfig.optimal_assembly_insert_size`` and
          written to ``<output_dir>/../<ref_name>.is.txt``.

    Args:
        read_fpaths: iterable whose items are either a single path (``str``) or
            a pair of paths ``(read1, read2)``.
        ref_fpath: path to the reference passed to the aligner.
        sam_fpath: base path from which per-read-set SAM/BAM names are derived.
        out_sam_fpaths: list that receives the produced SAM paths (mutated).
        output_dir: directory used for temporary and insert-size files.
        err_fpath: log file that stderr of the external tools is appended to.
        max_threads: thread count passed to the external tools.
        reads_type: one of ``'pacbio'``, ``'nanopore'``, ``'pe'`` or another
            type handled by the default bwa branch.
    """
    # Commands are built as argument lists rather than as one string that is
    # later split with shlex: a split string breaks on paths containing
    # whitespace or quote characters.
    bwa_cmd = [bwa_fpath('bwa'), 'mem', '-t', str(max_threads)]
    insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            if reads_type == 'pacbio' or reads_type == 'nanopore':
                preset = 'map-pb' if reads_type == 'pacbio' else 'map-ont'
                cmdline = [minimap_fpath(), '-t', str(max_threads), '-ax', preset, ref_fpath, reads]
            else:
                # A single file with reads_type 'pe' gets bwa's '-p' flag.
                cmdline = bwa_cmd + (['-p'] if reads_type == 'pe' else []) + [ref_fpath, reads]
        else:
            read1, read2 = reads
            cmdline = bwa_cmd + [ref_fpath, read1, read2]

        output_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        bam_fpath = output_fpath.replace('.sam', '.bam')
        if not is_non_empty_file(output_fpath):
            # Close the handles right after the call so the SAM file is complete
            # before the conversion step reads it.
            with open(output_fpath, 'w') as out_f, open(err_fpath, 'a') as err_f:
                qutils.call_subprocess(cmdline, stdout=out_f, stderr=err_f, logger=logger)
        if not is_non_empty_file(bam_fpath):
            sambamba_view(output_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
            if reads_type == 'pe':
                bam_dedup_fpath = add_suffix(bam_fpath, 'dedup')
                with open(err_fpath, 'a') as err_f:
                    qutils.call_subprocess([sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads),
                                            '--tmpdir', output_dir, bam_fpath, bam_dedup_fpath],
                                           stderr=err_f, logger=logger)
                # Replace the BAM with its de-duplicated version only if markdup produced one.
                if exists(bam_dedup_fpath):
                    shutil.move(bam_dedup_fpath, bam_fpath)
        if reads_type == 'pe':
            insert_size, _std_dev = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
            if insert_size < qconfig.optimal_assembly_max_IS:
                insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        qconfig.optimal_assembly_insert_size = max(insert_sizes)
        ref_name = qutils.name_from_fpath(ref_fpath)
        insert_size_fpath = join(output_dir, '..', ref_name + '.is.txt')
        with open(insert_size_fpath, 'w') as out:
            out.write(str(qconfig.optimal_assembly_insert_size))
def align_kmers(output_dir, ref_fpath, kmers_fpath, log_err_fpath, max_threads):
    """Align k-mers to the reference and return the path of the resulting coords file.

    The aligner's stdout is written to ``<output_dir>/kmers.coords``
    (overwritten) and its stderr is appended to ``log_err_fpath``.

    Args:
        output_dir: directory in which ``kmers.coords`` is created.
        ref_fpath: path to the reference passed to the aligner.
        kmers_fpath: path to the k-mers file passed to the aligner.
        log_err_fpath: path of the log file that stderr is appended to.
        max_threads: thread count passed to the aligner via ``-t``.

    Returns:
        Path to the written ``kmers.coords`` file. The aligner's return code is
        not checked here.
    """
    out_fpath = join(output_dir, 'kmers.coords')
    cmdline = [minimap_fpath(), '-c', '-r1', '-s101', '-A1', '-B10', '-O39,81', '--secondary=no',
               '-t', str(max_threads), ref_fpath, kmers_fpath]
    # Context managers close (and flush) both handles before the path is
    # returned to the caller, instead of leaking them until garbage collection.
    with open(out_fpath, 'w') as out_f, open(log_err_fpath, 'a') as err_f:
        qutils.call_subprocess(cmdline, stdout=out_f, stderr=err_f, indent=' ')
    return out_fpath