def _split_reference_by_priority(cnf, features_bed_fpath):
    features = ['CDS', 'Exon', 'Transcript', 'Gene']
    info('Splitting the reference file into ' + ', '.join(features))
    features_and_beds = []
    for f in features:
        # Bind f per iteration: a plain `lambda x: x[6] == f` would capture the loop
        # variable by reference, so every filter would end up testing the last feature.
        features_and_beds.append((f, BedTool(features_bed_fpath).filter(lambda x, ftype=f: x[6] == ftype)))
    return features_and_beds

def make_region_reports(view, work_dir, samples, target, genome, depth_thresholds):
    bed_fpath = target.bed_fpath or target.wgs_bed_fpath

    if all(can_reuse(s.targqc_region_tsv, [s.bam, bed_fpath]) for s in samples):
        debug('All region reports exist, reusing')
        return [s.targqc_region_tsv for s in samples]

    info('Calculating coverage statistics for CDS and exon regions from RefSeq...')

    depth_thresholds_by_sample = dict()
    for s in samples:
        depth_thresholds_by_sample[s.name] = depth_thresholds

    debug()
    debug('Running sambamba...')
    sambamba_depth_output_fpaths = view.run(sambamba_depth,
        [[s.work_dir, bed_fpath, s.bam, depth_thresholds_by_sample[s.name], None, s.name]
         for s in samples])
    assert len(sambamba_depth_output_fpaths) == len(samples), \
        'Number of sambamba results (' + str(len(sambamba_depth_output_fpaths)) + \
        ') does not match the number of samples (' + str(len(samples)) + ')'

    debug()
    debug('Parsing sambamba results and writing results...')
    view.run(_proc_sambamba_depth,
        [[sambamba_output_fpath, s.targqc_region_tsv, s.name, depth_thresholds_by_sample[s.name]]
         for sambamba_output_fpath, s in zip(sambamba_depth_output_fpaths, samples)])

    info('Done.')
    return [s.targqc_region_tsv for s in samples]

def _do_run(cmd, checks, env=None, output_fpath=None, input_fpath=None):
    """Run the command and check results, raising errors for issues.
    """
    cmd, shell_arg, executable_arg = _normalize_cmd_args(cmd)
    s = subprocess.Popen(cmd, shell=shell_arg, executable=executable_arg,
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                         close_fds=True, env=env)
    debug_stdout = collections.deque(maxlen=100)
    while 1:
        line = s.stdout.readline()
        if line:
            if six.PY3:
                line = line.decode(errors='replace')
            debug_stdout.append(line)
            info(' ' + line.rstrip())
        exitcode = s.poll()
        if exitcode is not None:
            # Drain any output remaining after the process has finished
            for line in s.stdout:
                if six.PY3:
                    line = line.decode(errors='replace')
                debug_stdout.append(line)
            if exitcode != 0:
                error_msg = " ".join(cmd) if not isinstance(cmd, six.string_types) else cmd
                error_msg += "\n"
                error_msg += "".join(debug_stdout)
                s.communicate()
                s.stdout.close()
                raise subprocess.CalledProcessError(exitcode, cmd=cmd, output=error_msg)
            else:
                break
    s.communicate()
    s.stdout.close()
    # Check for problems not identified by shell return codes
    if checks:
        for check in checks:
            if not check(output_fpath, input_fpath):
                raise IOError("External command failed")

def align(work_dir, sample_name, l_fpath, r_fpath, bwa, smb, bwa_prefix, dedup=True, threads=1):
    info('Running bwa to align reads...')
    bam_fpath = make_bam_fpath(work_dir)
    if can_reuse(bam_fpath, [l_fpath, r_fpath]):
        return bam_fpath

    tmp_dirpath = join(work_dir, 'sambamba_tmp_dir')
    safe_mkdir(tmp_dirpath)

    bwa_cmdline = (
        '{bwa} mem -t {threads} -v 2 {bwa_prefix} {l_fpath} {r_fpath} | '
        '{smb} view /dev/stdin -t {threads} -f bam -S -o - | '
        '{smb} sort /dev/stdin -t {threads} --tmpdir {tmp_dirpath} -o {bam_fpath}'
    ).format(**locals())
    run(bwa_cmdline, output_fpath=bam_fpath, stdout_to_outputfile=False)

    if dedup:
        dedup_bam_fpath = add_suffix(bam_fpath, 'dedup')
        dedup_cmdl = '{smb} markdup -t {threads} {bam_fpath} {dedup_bam_fpath}'.format(**locals())
        run(dedup_cmdl, output_fpath=dedup_bam_fpath, stdout_to_outputfile=False)
        verify_bam(dedup_bam_fpath)
        os.rename(dedup_bam_fpath, bam_fpath)

    sambamba.index_bam(bam_fpath)

    # Legacy pipeline, kept for reference:
    # samtools view -b -S -u - |
    # sambamba sort -N -t 8 -m 682M --tmpdir /Molly/saveliev/cancer-dream-syn3/work/align/syn3-normal/split/tx/tmpwdXndE/syn3-normal-sort-1_20000000-sorttmp-full
    # -o /Molly/saveliev/cancer-dream-syn3/work/align/syn3-normal/split/tx/tmpwdXndE/syn3-normal-sort-1_20000000.bam
    # /dev/stdin
    #
    # if dedup:
    #     info()
    #     info('Calling SamBlaster to mark duplicates')
    #     markdup_sam_fpath = markdup_sam(sam_fpath, samblaster)
    #     if markdup_sam_fpath:
    #         sam_fpath = markdup_sam_fpath
    #     info()
    #
    # info('Converting to BAM')
    # cmdline = sambamba.get_executable() + ' view -t {threads} -S -f bam {sam_fpath}'.format(**locals())
    # run(cmdline, output_fpath=bam_fpath, reuse=cfg.reuse_intermediate)
    #
    # info()
    # info('Sorting BAM')
    # prefix = splitext(sorted_bam_fpath)[0]
    # cmdline = sambamba.get_executable() + ' sort -t {threads} {bam_fpath} -o {sorted_bam_fpath}'.format(**locals())
    # run(cmdline, output_fpath=sorted_bam_fpath, stdout_to_outputfile=False, reuse=cfg.reuse_intermediate)

    return bam_fpath

def _make_targetcov_symlinks(samples):
    for sample in samples:
        new_link = join(
            dirname(dirname(sample.targetcov_detailed_txt)),
            basename(sample.targetcov_detailed_txt))
        if exists(new_link):
            os.unlink(new_link)
        symlink_plus(sample.targetcov_detailed_txt, new_link)
        info('TargetCov TXT symlink saved to ' + new_link)

def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # mean target coverage is much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex

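# Usage sketch (hypothetical paths; FEMALE_Y_COVERAGE_FACTOR and
# AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX are this module's configured thresholds):
#
#   sex = determine_sex('/tmp/work', 'sample.bam', avg_depth=48.2,
#                       genome='hg19', target_bed='panel.bed')
#   # -> 'F', 'M', or None when sex cannot be called (no key regions for the genome,
#   #    no overlap with the capture target, or average depth below the threshold)
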
def get_padded_bed_file(work_dir, bed, padding, fai_fpath):
    # bedtools slop accepts a .fai file as its "genome" file: it only needs
    # the chromosome names and lengths to bound the padding
    genome_fpath = fai_fpath
    info('Making bed file for padded regions...')
    bedtools = which('bedtools')
    cmdline = '{bedtools} slop -i {bed} -g {genome_fpath} -b {padding}'.format(**locals())
    output_fpath = intermediate_fname(work_dir, bed, 'padded')
    call_process.run(cmdline, output_fpath=output_fpath)
    return output_fpath

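# Example (hypothetical files): pad every region in capture.bed by 10 bp on each
# side, bounded by the chromosome lengths recorded in the reference .fai file:
#
#   padded = get_padded_bed_file('/tmp/work', 'capture.bed', 10, 'hg19.fa.fai')
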
def run(cmd, output_fpath=None, input_fpath=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    env = os.environ.copy()
    if env_vars:
        for k, v in env_vars.items():
            if v is None:
                if k in env:
                    del env[k]
            else:
                env[k] = v

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpath):
        try:
            info(' '.join(str(x) for x in _cmd) if not isinstance(_cmd, six.string_types) else _cmd)
            _do_run(_cmd, checks, env, _output_fpath, _input_fpath)
        except:
            # Remove a partially written output file before re-raising
            # (the cleanup must come before the raise, or it is dead code)
            if output_fpath and isfile(output_fpath):
                os.remove(output_fpath)
            raise

    if output_fpath:
        if stdout_tx:
            with file_transaction(None, output_fpath) as tx_out_file:
                if stdout_to_outputfile:
                    cmd += ' > ' + tx_out_file
                else:
                    # Rewrite the output path inside the command line to the
                    # transactional copy, preserving any surrounding quotes
                    cmd += '\n'
                    cmd = cmd.replace(' ' + output_fpath + ' ', ' ' + tx_out_file + ' ') \
                             .replace(' "' + output_fpath + '" ', ' "' + tx_out_file + '" ') \
                             .replace(' \'' + output_fpath + '\' ', ' \'' + tx_out_file + '\' ') \
                             .replace(' ' + output_fpath + '\n', ' ' + tx_out_file) \
                             .replace(' "' + output_fpath + '"\n', ' "' + tx_out_file + '"') \
                             .replace(' \'' + output_fpath + '\'\n', ' \'' + tx_out_file + '\'') \
                             .replace('\n', '')
                _try_run(cmd, tx_out_file, input_fpath)
        else:
            _try_run(cmd, output_fpath, input_fpath)
    else:
        _try_run(cmd, None, input_fpath)

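# Usage sketches (hypothetical commands and paths). With stdout_to_outputfile=True
# (the default), stdout is redirected into a transactional copy of output_fpath;
# with stdout_to_outputfile=False, the command writes output_fpath itself, and the
# path inside the command line is rewritten to the transactional copy:
#
#   run('grep -v ^# input.vcf', output_fpath='body.vcf')
#   run('sort -k1,1 -k2,2n input.bed -o sorted.bed',
#       output_fpath='sorted.bed', stdout_to_outputfile=False)
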
def count_read_pairs(s_name, work_dir, fastq_fpath):
    from targqc.utilz.logger import info

    pairs_counts_fpath = make_pair_counts_fpath(work_dir)
    if can_reuse(pairs_counts_fpath, fastq_fpath):
        with open(pairs_counts_fpath) as f:
            return int(f.read().strip())
    else:
        info('Counting read pairs in ' + s_name + ', writing to ' + pairs_counts_fpath)
        pairs_number = _count_records_in_fastq(fastq_fpath)
        with open(pairs_counts_fpath, 'w') as out:
            out.write(str(pairs_number))
        return pairs_number

def bgzip_and_tabix(fpath, reuse=False, tabix_parameters='', **kwargs):
    gzipped_fpath = fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if reuse and \
            file_exists(gzipped_fpath) and \
            (getctime(gzipped_fpath) >= getctime(fpath) if file_exists(fpath) else True) and \
            file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed file and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = which('bgzip')
    tabix = which('tabix')
    if not bgzip:
        err('Cannot index file because bgzip is not found')
    if not tabix:
        err('Cannot index file because tabix is not found')
    if not bgzip or not tabix:  # cannot proceed unless both tools are available
        return fpath

    if isfile(gzipped_fpath):
        os.remove(gzipped_fpath)
    if isfile(tbi_fpath):
        os.remove(tbi_fpath)

    info('BGzipping ' + fpath)
    cmdline = '{bgzip} {fpath}'.format(**locals())
    call_process.run(cmdline)

    info('Tabixing ' + gzipped_fpath)
    cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())
    call_process.run(cmdline)

    return gzipped_fpath

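# Example (hypothetical path): compress and index a sorted BED file; returns
# 'regions.bed.gz' (with 'regions.bed.gz.tbi' alongside), or the original path
# when bgzip/tabix are not available:
#
#   gz = bgzip_and_tabix('regions.bed', tabix_parameters='-p bed')
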
def set_up_dirs(proc_name, output_dir=None, work_dir=None, log_dir=None):
    """ Creates output_dir and work_dir, and sets up the log
    """
    output_dir = safe_mkdir(adjust_path(output_dir or join(os.getcwd(), proc_name)), 'output_dir')
    debug('Saving results into ' + output_dir)

    work_dir = safe_mkdir(work_dir or join(output_dir, 'work'), 'working directory')
    info('Using work directory ' + work_dir)

    log_fpath = set_up_log(log_dir or safe_mkdir(join(work_dir, 'log')), proc_name + '.log')

    return output_dir, work_dir, log_fpath

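# Example: creates ./myproc and ./myproc/work if needed, and starts logging to
# ./myproc/work/log/myproc.log:
#
#   output_dir, work_dir, log_fpath = set_up_dirs('myproc')
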
def find_fastq_pairs(fpaths):
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            sname, l_fpath, r_fpath = None, None, None
            if fn.endswith('_1'):
                sname = fn[:-2]
                l_fpath = fpath
            if fn.endswith('_R1'):
                sname = fn[:-3]
                l_fpath = fpath
            if fn.endswith('_2'):
                sname = fn[:-2]
                r_fpath = fpath
            if fn.endswith('_R2'):
                sname = fn[:-3]
                r_fpath = fpath

            if sname:
                m = re.match(r'(.*)_S\d+', sname)
                if m:
                    sname = m.group(1)
                sname = sname.replace('-', '_')
            else:
                sname = fn
                info('Cannot detect read direction for ' + fn + ', using the file name as the sample name')

            l, r = fastqs_by_sample_name.get(sname, (None, None))
            if l and l_fpath:
                critical('Duplicated left FastQ files for ' + sname + ': ' + l + ' and ' + l_fpath)
            if r and r_fpath:
                critical('Duplicated right FastQ files for ' + sname + ': ' + r + ' and ' + r_fpath)
            fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r

    return fixed_fastqs_by_sample_name

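# Example (hypothetical file names). Left/right reads are recognized by the
# _1/_R1 and _2/_R2 suffixes, an optional Illumina _S<n> chunk is stripped,
# and dashes in sample names are normalized to underscores:
#
#   find_fastq_pairs(['sampleA_S1_R1.fastq.gz', 'sampleA_S1_R2.fastq.gz'])
#   # -> {'sampleA': ('sampleA_S1_R1.fastq.gz', 'sampleA_S1_R2.fastq.gz')}
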
def annotate_target(work_dir, target_bed, genome_build):
    output_fpath = intermediate_fname(work_dir, target_bed, 'ann')
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical('Error: annotate_bed.py not found in PATH, please install TargQC.')

    cmdline = '{annotate_bed_py} {target_bed} -g {genome_build} -o {output_fpath}'.format(**locals())
    run(cmdline, output_fpath, stdout_to_outputfile=False)

    output_fpath = clean_bed(output_fpath, work_dir)
    return output_fpath

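# Usage sketch (hypothetical paths): produces an annotated, cleaned copy of the
# target BED under work_dir, reusing it on reruns:
#
#   ann_bed = annotate_target('/tmp/work', 'capture.bed', 'hg19')
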
def _prep_report_data(sample, depth_stats, reads_stats, indels_stats, target_stats,
                      target, num_pairs_by_sample, genome, depth_threshs, fai_fpath=None):
    sample.avg_depth = depth_stats['ave_depth']

    if num_pairs_by_sample and sample.name in num_pairs_by_sample:
        reads_stats['original_num_reads'] = num_pairs_by_sample[sample.name] * 2

    chrom_lengths = reference_data.get_chrom_lengths(genome=genome, fai_fpath=fai_fpath)
    if 'Y' in chrom_lengths or 'chrY' in chrom_lengths:
        reads_stats['gender'] = determine_sex(sample.work_dir, sample.bam,
                                              depth_stats['ave_depth'], genome,
                                              target.get_capture_bed())
        info()

    if 'bases_by_depth' in depth_stats:
        depth_stats['bases_within_threshs'], depth_stats['rates_within_threshs'] = calc_bases_within_threshs(
            depth_stats['bases_by_depth'],
            target_stats['target_size'] if not target.is_wgs else target_stats['reference_size'],
            depth_threshs)

        if depth_stats['median_depth'] > 0:
            depth_stats['wn_20_percent'] = calc_rate_within_normal(
                depth_stats['bases_by_depth'],
                depth_stats['median_depth'],
                target_stats['target_size'] if not target.is_wgs else target_stats['reference_size'])

    if target_stats['target_size']:
        target.bases_num = target_stats['target_size']
        target.fraction = target_stats['target_fraction']
    else:
        target.bases_num = target_stats['reference_size']

    reads_stats['mapped_dedup'] = number_of_mapped_reads(sample.work_dir, sample.bam, dedup=True)

    if not target.is_wgs:
        reads_stats['mapped_dedup_on_target'] = number_mapped_reads_on_target(
            sample.work_dir, target.get_capture_bed().cut(range(3)).saveas().fn, sample.bam,
            dedup=True, target_name='target') or 0
        reads_stats['mapped_dedup_on_padded_target'] = number_mapped_reads_on_target(
            sample.work_dir, target.padded_bed_fpath, sample.bam,
            dedup=True, target_name='padded_target') or 0
    else:
        cds_bed = get_merged_cds(genome)
        info('Using the CDS reference BED to calc "reads on CDS"')
        reads_stats['mapped_dedup_on_exome'] = number_mapped_reads_on_target(
            sample.work_dir, cds_bed, sample.bam, dedup=True, target_name='exome') or 0

    return depth_stats, reads_stats, indels_stats

def _get_qualimap_version(tool_cmdline):
    cmdline = tool_cmdline + ' -version'  # actually, Qualimap doesn't have a -version option
    version = None
    with subprocess.Popen(cmdline, stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT, shell=True).stdout as stdout:
        out = stdout.read().strip()
        if six.PY3 and isinstance(out, bytes):
            out = out.decode(errors='replace')
        flag = 'QualiMap v.'
        # The original check `out.startswith(flag) >= 0` was always true, since
        # startswith returns a bool; test for the banner substring instead
        if flag in out:
            version = out.split(flag)[-1].strip()
    if not version:
        info('WARNING: could not determine Qualimap version, using 1.0')
        return '1.0'
    if len(version.split('.')) > 2:  # keep only major.minor
        version = '.'.join(version.split('.')[:2])
    return version

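# A sketch of the expected parsing (the banner format 'QualiMap v.2.2.1' is an
# assumption inferred from the parsing code above):
#
#   _get_qualimap_version('qualimap')  # -> '2.2' (trimmed to major.minor)
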
def make_general_reports(view, samples, target, genome, depth_threshs, bed_padding,
                         num_pairs_by_sample=None, reuse=False, is_debug=False,
                         reannotate=False, fai_fpath=None):
    if all(all(can_reuse(fp, [s.bam, target.qualimap_bed_fpath] if target.bed else s.bam)
               for fp in _qualimap_outputs(s))
           for s in samples):
        debug('All QualiMap files for all samples exist and are newer than the BAMs and BEDs, reusing')
    else:
        info('Running QualiMap...')
        view.run(runner.run_qualimap,
                 [[s.work_dir, s.qualimap_dirpath, _qualimap_outputs(s),
                   s.bam, genome, target.qualimap_bed_fpath, view.cores_per_job]
                  for s in samples])

        for s in samples:
            for fp in _qualimap_outputs(s):
                verify_file(fp, is_critical=True)

    summary_reports = []

    for sample in samples:
        info('-' * 70)
        info(sample.name)
        debug('-' * 70)
        debug('Parsing QualiMap results...')
        depth_stats, reads_stats, indels_stats, target_stats = parse_qualimap_results(sample)

        _prep_report_data(sample, depth_stats, reads_stats, indels_stats, target_stats,
                          target, num_pairs_by_sample, genome, depth_threshs, fai_fpath=fai_fpath)

        r = _build_report(depth_stats, reads_stats, indels_stats, sample, target,
                          depth_threshs, bed_padding, sample_num=len(samples),
                          is_debug=is_debug, reannotate=reannotate)
        summary_reports.append(r)

    return summary_reports

def read_samples(args):
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        info('Found ' + str(len(bam_by_sample)) + ' BAM file' + ('s' if len(bam_by_sample) > 1 else ''))

    input_not_bam = [verify_file(fpath) for fpath in args if adjust_path(fpath) not in bam_by_sample]
    input_not_bam = [fpath for fpath in input_not_bam if fpath]
    fastqs_by_sample = dict()
    if not input_not_bam and not bam_by_sample:
        critical('No correct input files')
    if input_not_bam:
        info('Found ' + str(len(input_not_bam)) + ' correct non-BAM input files')
        fastqs_by_sample = find_fastq_pairs(input_not_bam)
        if fastqs_by_sample:
            info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')
        intersection = set(fastqs_by_sample.keys()) & set(bam_by_sample.keys())
        if intersection:
            critical('The following samples have both BAM and FastQ inputs: ' + ', '.join(intersection))

    return fastqs_by_sample, bam_by_sample

def main():
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ebl.SUPPORTED_GENOMES),
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        critical('Error: please specify the genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ebl.BedCols.FEATURE] == 'CDS')
    features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)

def run_qualimap(work_dir, output_dir, output_fpaths, bam_fpath, genome, bed_fpath=None, threads=1):
    info('Analysing ' + bam_fpath)
    safe_mkdir(dirname(output_dir))
    safe_mkdir(output_dir)

    mem_m = get_qualimap_max_mem(bam_fpath)
    mem = str(int(mem_m)) + 'M'
    mem_cmdl = '--java-mem-size=' + mem

    cmdline = (find_executable() +
               ' bamqc --skip-duplicated -nt {threads} {mem_cmdl} -nr 5000 '
               '-bam {bam_fpath} -outdir {output_dir}')
    if genome.startswith('hg') or genome.startswith('GRCh'):
        cmdline += ' -gd HUMAN'
    if genome.startswith('mm'):
        cmdline += ' -gd MOUSE'
    if bed_fpath:
        cmdline += ' -gff {bed_fpath}'
        debug('Using amplicons/capture panel ' + bed_fpath)
    cmdline = cmdline.format(**locals())

    if not all(can_reuse(fp, [bam_fpath, bed_fpath] if bed_fpath else [bam_fpath]) for fp in output_fpaths):
        for fp in output_fpaths:
            if isfile(fp):
                os.remove(fp)
        try:
            run(cmdline, env_vars=dict(DISPLAY=None))
        except subprocess.CalledProcessError as e:
            if 'The alignment file is unsorted.' in e.output:
                info()
                info('BAM file is unsorted; trying to sort and rerun QualiMap')
                sorted_bam_fpath = sort_bam(bam_fpath)
                cmdline = cmdline.replace(bam_fpath, sorted_bam_fpath)
                run(cmdline, env_vars=dict(DISPLAY=None))

    if not all(verify_file(fp, cmp_f=[bam_fpath, bed_fpath] if bed_fpath else [bam_fpath])
               for fp in output_fpaths):
        critical('Some of the QualiMap results were not generated')

    return output_dir

def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    info('Converting the BAM to BED to save some memory.')
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = '{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'.format(**locals())
    info(cmdline)
    os.system(cmdline)
    bam_bed_fpath = verify_file(bam_bed_fpath)
    if bam_bed_fpath:
        info('Done, saved to ' + bam_bed_fpath)
    else:
        err('Error, result is non-existent or empty')
    return bam_bed_fpath

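# Usage sketch (hypothetical path): writes 'sample.bed.gz' next to the BAM and
# returns its path, or None if the result is missing or empty:
#
#   bed_gz = bam_to_bed_nocnf('sample.bam')
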
def _annotate(bed, ref_bed, chr_order, fai_fpath, work_dir, ori_col_num,
              high_confidence=False, reannotate=False, is_debug=False, **kwargs):
    # if genome:
    #     genome_fpath = cut(fai_fpath, 2, output_fpath=intermediate_fname(work_dir, fai_fpath, 'cut2'))
    #     intersection = bed.intersect(ref_bed, sorted=True, wao=True, g='<(cut -f1,2 ' + fai_fpath + ')')
    #     intersection = bed.intersect(ref_bed, sorted=True, wao=True, genome=genome.split('-')[0])
    # else:

    intersection_bed = None
    intersection_fpath = None

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        if count_bed_cols(fai_fpath) == 2:
            debug('Fai fields size is 2 ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('Fai fields is ' + str(count_bed_cols(fai_fpath)) + ', not 2')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0
    met = set()

    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(lambda: OrderedDefaultDict(lambda: defaultdict(list)))
    # off_targets = list()

    expected_fields_num = ori_col_num + len(ebl.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            critical(f'Cannot parse the reference BED file - unexpected number of fields '
                     f'({len(inters_fields_list)}) in {inters_fields_list} '
                     f'(expected at least {expected_fields_num})')

        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        overlap_fields = [None for _ in ebl.BedCols.cols]
        overlap_fields[:len(intersection_fields[ori_col_num:-1])] = intersection_fields[ori_col_num:-1]

        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]
        e_chr = overlap_fields[0]
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, \
            f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        # fs = [None for _ in ebl.BedCols.cols]
        # fs[:3] = [a_chr, a_start, a_end]
        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))

        if e_chr == '.':
            total_off_target += 1
            # off_targets.append(fs)
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
        else:
            # fs[3:-1] = db_feature_fields[3:-1]
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))

            e_gene = overlap_fields[ebl.BedCols.GENE]
            if keep_gene_column and e_gene != a_gene:
                overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ebl.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][transcript_id].append((overlap_fields, overlap_size))

    info(' Total annotated regions: ' + str(total_annotated))
    info(' Total unique annotated regions: ' + str(total_uniq_annotated))
    info(' Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order, **kwargs)

    return annotated

def _save_best_details_for_each_gene(depth_threshs, samples, output_dir):
    metric_storage = get_detailed_metric_storage(depth_threshs)

    report = PerRegionSampleReport(sample='Best', metric_storage=metric_storage)
    report.add_record('Sample', 'contains best values from all samples: ' +
                      ', '.join([s.name for s in samples]))

    total_regions = 0
    fpaths = [s.targqc_region_tsv for s in samples if verify_file(s.targqc_region_tsv)]
    if not fpaths:
        err('No targetcov detailed per-gene report was generated; skipping.')
        return None

    open_tsv_files = [open(fpath) for fpath in fpaths]

    # Skip the '##' comment lines; the first '#' line is the header
    first_col = 0
    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break
        l = lines_for_each_sample[0]
        if l.startswith('##'):
            continue
        elif l.startswith('#'):
            if l.startswith('#Sample'):
                first_col = 1
            break

    while True:
        lines_for_each_sample = [next(f, None) for f in open_tsv_files]
        if not all(lines_for_each_sample):
            break

        if all([not l.startswith('#') and ('Whole-Gene' in l or 'Gene-Exon' in l)
                for l in lines_for_each_sample]):
            shared_fields = lines_for_each_sample[0].split('\t')[first_col:first_col + 9]
            reg = report.add_row()
            reg.add_record('Chr', get_val(shared_fields[0]))
            reg.add_record('Start', get_int_val(shared_fields[1]))
            reg.add_record('End', get_int_val(shared_fields[2]))
            reg.add_record('Size', get_int_val(shared_fields[3]))
            reg.add_record('Gene', get_val(shared_fields[4]))
            reg.add_record('Strand', get_val(shared_fields[5]))
            reg.add_record('Feature', get_val(shared_fields[6]))
            reg.add_record('Biotype', get_val(shared_fields[7]))
            reg.add_record('Transcript', get_val(shared_fields[8]))

            min_depths, ave_depths, stddevs, withins = ([], [], [], [])
            percents_by_threshs = {t: [] for t in depth_threshs}

            for l in lines_for_each_sample:
                fs = l.split('\t')
                min_depths.append(get_int_val(fs[first_col + 9]))
                ave_depths.append(get_float_val(fs[first_col + 10]))
                stddevs.append(get_float_val(fs[first_col + 11]))
                withins.append(get_float_val(fs[first_col + 12]))
                for t, f in zip(depth_threshs, fs[first_col + 13:]):
                    percents_by_threshs[t].append(get_float_val(f))

            # counting bests
            reg.add_record('Min depth', select_best(min_depths))
            reg.add_record('Ave depth', select_best(ave_depths))
            reg.add_record('Std dev', select_best(stddevs, max))
            reg.add_record('W/n 20% of median depth', select_best(withins))
            for t in depth_threshs:
                reg.add_record('{}x'.format(t), select_best(percents_by_threshs[t]))

            total_regions += 1

    for f in open_tsv_files:
        f.close()

    gene_report_basename = add_suffix(samples[0].targqc_region_tsv, 'best')
    txt_rep_fpath = report.save_txt(join(output_dir, gene_report_basename + '.txt'))
    tsv_rep_fpath = report.save_tsv(join(output_dir, gene_report_basename + '.tsv'))
    info('')
    info('Best values for the regions (total ' + str(total_regions) + ') saved into:')
    info('  ' + txt_rep_fpath)
    return txt_rep_fpath

def _build_report(depth_stats, reads_stats, mm_indels_stats, sample, target,
                  depth_threshs, bed_padding, sample_num, is_debug=False, reannotate=False):
    report = SampleReport(sample, metric_storage=get_header_metric_storage(
        depth_threshs, is_wgs=target.bed_fpath is None, padding=bed_padding))

    def _add(_metric_name, _val, url=None):
        return report.add_record(_metric_name, _val, silent=(sample_num > 1 and not is_debug), url=url)

    _add('Qualimap', 'Qualimap', url=relpath(sample.qualimap_html_fpath, sample.dirpath))
    if reads_stats.get('gender') is not None:
        _add('Sex', reads_stats['gender'])

    debug('* General coverage statistics *')
    _add('Reads', reads_stats['total'])
    _add('Mapped reads', reads_stats['mapped'])
    # _add('Unmapped reads', reads_stats['total'] - reads_stats['mapped'])
    percent_mapped = 1.0 * (reads_stats['mapped'] or 0) / reads_stats['total'] if reads_stats['total'] else None
    if percent_mapped is not None and percent_mapped > 1.0:
        percent_mapped = 1.0
    _add('Percentage of mapped reads', percent_mapped)

    # percent_unmapped = 1.0 * (reads_stats['total'] - reads_stats['mapped']) / reads_stats['total'] if reads_stats['total'] else None
    # assert percent_unmapped <= 1.0 or percent_unmapped is None, str(percent_unmapped)
    # _add('Percentage of unmapped reads', percent_unmapped)

    if reads_stats.get('mapped_paired') is not None:
        total_paired_reads_percent = 1.0 * (reads_stats['mapped_paired'] or 0) / reads_stats['total'] \
            if reads_stats['total'] else None
        if total_paired_reads_percent is not None and total_paired_reads_percent > 1.0:
            total_paired_reads_percent = 1.0
        _add('Properly paired mapped reads percent', total_paired_reads_percent)

    # if reads_stats.get('paired') is not None:
    #     total_paired_reads_percent = 1.0 * (reads_stats['paired'] or 0) / reads_stats['total'] if reads_stats['total'] else None
    #     assert total_paired_reads_percent <= 1.0 or total_paired_reads_percent is None, str(total_paired_reads_percent)
    #     _add('Properly paired reads percent', total_paired_reads_percent)

    # if dedup_bam_stats:
    #     dup_rate = 1 - (1.0 * dedup_bam_stats['mapped'] / bam_stats['mapped']) if bam_stats['mapped'] else None
    _add('Duplication rate', reads_stats['dup_rate'])
    # _add('Dedupped mapped reads', reads_stats['mapped'] - reads_stats[''])
    _add('Median GC', reads_stats['median_gc'])
    _add('Median insert size', reads_stats['median_ins_size'])

    debug()

    if not target.is_wgs:
        debug('* Target coverage statistics *')
        if target.original_bed_fpath:
            _add('Target', target.original_bed_fpath)
            if count_bed_cols(target.original_bed_fpath) == 3 or reannotate:
                _add('Ready target (clean, sorted and annotated)', target.capture_bed_fpath)
        else:
            _add('Target', target.capture_bed_fpath)
        _add('Bases in target', target.bases_num)
        _add('Percentage of reference', target.fraction)
        _add('Regions in target', target.regions_num)
        _add('Scope', 'targeted')
        _add('Genes in target', len(target.gene_keys_list))
    else:
        debug('* Genome coverage statistics *')
        _add('Target', 'whole genome')
        _add('Reference size', target.bases_num)
        _add('Scope', 'WGS')

    trg_type = 'target' if not target.is_wgs else 'genome'

    if 'bases_within_threshs' in depth_stats:
        bases_within_threshs = depth_stats['bases_within_threshs']
        v_covered_bases_in_targ = list(bases_within_threshs.items())[0][1]
        v_percent_covered_bases_in_targ = 1.0 * (v_covered_bases_in_targ or 0) / target.bases_num \
            if target.bases_num else None
        if v_percent_covered_bases_in_targ is not None and v_percent_covered_bases_in_targ > 1.0:
            v_percent_covered_bases_in_targ = 1.0
        _add('Covered bases in ' + trg_type, v_covered_bases_in_targ)
        _add('Percentage of ' + trg_type + ' covered by at least 1 read', v_percent_covered_bases_in_targ)

    if not target.is_wgs:
        debug('Getting number of mapped reads on target...')
        # mapped_reads_on_target = number_mapped_reads_on_target(cnf, target_info.bed, bam_fpath)
        if 'mapped_dedup_on_target' in reads_stats:
            # _add('Reads mapped on target', reads_stats['mapped_on_target'])
            debug('Unique mapped reads on target: ' + str(reads_stats['mapped_dedup_on_target']))
            percent_mapped_dedup_on_target = \
                1.0 * reads_stats['mapped_dedup_on_target'] / reads_stats['mapped_dedup'] \
                if reads_stats['mapped_dedup'] != 0 else None
            if percent_mapped_dedup_on_target is not None and percent_mapped_dedup_on_target > 1.0:
                percent_mapped_dedup_on_target = 1.0
            _add('Percentage of reads mapped on target', percent_mapped_dedup_on_target)

            percent_mapped_dedup_off_target = \
                1.0 * (reads_stats['mapped_dedup'] - reads_stats['mapped_dedup_on_target']) / reads_stats['mapped_dedup'] \
                if reads_stats['mapped_dedup'] != 0 else None
            if percent_mapped_dedup_off_target is not None and percent_mapped_dedup_off_target > 1.0:
                percent_mapped_dedup_off_target = 1.0
            _add('Percentage of reads mapped off target', percent_mapped_dedup_off_target)

            percent_usable = 1.0 * reads_stats['mapped_dedup_on_target'] / reads_stats['total'] \
                if reads_stats['total'] != 0 else None
            # for the edge case where multimappers cause the number of alignments
            # to be higher than the number of reads
            if percent_usable is not None and percent_usable > 1.0:
                percent_usable = 1.0
            _add('Percentage of usable reads', percent_usable)

        read_bases_on_targ = int(target.bases_num * depth_stats['ave_depth'])  # sum of all coverages
        _add('Read bases mapped on target', read_bases_on_targ)

        if 'mapped_dedup_on_padded_target' in reads_stats:
            # _add('Reads mapped on padded target', reads_stats['mapped_reads_on_padded_target'])
            percent_mapped_on_padded_target = \
                1.0 * reads_stats['mapped_dedup_on_padded_target'] / reads_stats['mapped_dedup'] \
                if reads_stats['mapped_dedup'] else None
            if percent_mapped_on_padded_target is not None and percent_mapped_on_padded_target > 1.0:
                percent_mapped_on_padded_target = 1.0
            _add('Percentage of reads mapped on padded target', percent_mapped_on_padded_target)

    elif 'mapped_dedup_on_exome' in reads_stats:
        # _add('Reads mapped on target', reads_stats['mapped_on_target'])
        percent_mapped_on_exome = 1.0 * reads_stats['mapped_dedup_on_exome'] / reads_stats['mapped_dedup'] \
            if reads_stats['mapped_dedup'] != 0 else None
        if percent_mapped_on_exome:
            if percent_mapped_on_exome > 1.0:
                percent_mapped_on_exome = 1.0
            _add('Percentage of reads mapped on exome', percent_mapped_on_exome)
            percent_mapped_off_exome = 1.0 - percent_mapped_on_exome
            _add('Percentage of reads mapped off exome', percent_mapped_off_exome)

        percent_usable = 1.0 * reads_stats['mapped_dedup'] / reads_stats['total'] \
            if reads_stats['total'] != 0 else None
        if percent_usable is not None and percent_usable > 1.0:
            percent_usable = 1.0
        _add('Percentage of usable reads', percent_usable)

    debug()
    _add('Mean ' + trg_type + ' coverage depth', depth_stats['ave_depth'])
    if 'original_num_reads' in reads_stats:
        _add('Original reads', reads_stats['original_num_reads'])
        times_downsampled = 1.0 * reads_stats['original_num_reads'] / reads_stats['total']
        est_full_cov = times_downsampled * depth_stats['ave_depth']
        _add('Estimated ' + trg_type + ' full coverage depth', est_full_cov)
    _add('Median ' + trg_type + ' coverage depth', depth_stats['median_depth'])
    if depth_stats['median_depth'] > 0:
        _add('Std. dev. of ' + trg_type + ' coverage depth', depth_stats['stddev_depth'])
    # _add('Minimal ' + trg_type + ' coverage depth', depth_stats['min_depth'])
    # _add('Maximum ' + trg_type + ' coverage depth', depth_stats['max_depth'])
    if 'wn_20_percent' in depth_stats:
        if depth_stats['wn_20_percent'] > 1.0:
            depth_stats['wn_20_percent'] = 1.0
        _add('Percentage of ' + trg_type + ' within 20% of med depth', depth_stats['wn_20_percent'])

    if 'bases_within_threshs' in depth_stats:
        for depth, bases in depth_stats['bases_within_threshs'].items():
            fraction_val = 1.0 * (bases or 0) / target.bases_num if target.bases_num else 0
            if fraction_val > 1.0:
                fraction_val = 1.0
            if fraction_val > 0:
                _add('Part of ' + trg_type + ' covered at least by ' + str(depth) + 'x', fraction_val)
    debug()

    _add('Read mean length', reads_stats['ave_len'])
    _add('Read min length', reads_stats['min_len'])
    _add('Read max length', reads_stats['max_len'])
    _add('Mean Mapping Quality', mm_indels_stats['mean_mq'])
    _add('Mismatches', mm_indels_stats['mismatches'])
    _add('Insertions', mm_indels_stats['insertions'])
    _add('Deletions', mm_indels_stats['deletions'])
    _add('Homopolymer indels', mm_indels_stats['homo_indels'])

    debug()
    info('Saving reports...')
    report.save_json(sample.targqc_json_fpath)
    report.save_txt(sample.targqc_txt_fpath)
    report.save_html(sample.targqc_html_fpath, caption='Target coverage statistics for ' + sample.name)
    debug()
    debug('Saved to ' + dirname(report.txt_fpath))
    return report

def proc_fastq(samples, parall_view, work_dir, bwa_prefix, downsample_to, num_pairs_by_sample=None, dedup=True):
    num_pairs_by_sample = num_pairs_by_sample or dict()
    if downsample_to:
        # Read pairs counts
        debug()
        if all(s.name in num_pairs_by_sample for s in samples):
            debug('Using read pairs counts extracted from FastQC reports')
        elif all(can_reuse(make_pair_counts_fpath(join(work_dir, s.name)), s.l_fpath) for s in samples):
            debug('Reusing pairs counts, reading from files')
            num_pairs_by_sample = {s.name: int(open(make_pair_counts_fpath(join(work_dir, s.name))).read().strip())
                                   for s in samples}
        else:
            info('Counting read pairs')
            num_pairs = parall_view.run(count_read_pairs,
                [[s.name, safe_mkdir(join(work_dir, s.name)), s.l_fpath] for s in samples])
            num_pairs_by_sample = {s.name: pairs_count for s, pairs_count in zip(samples, num_pairs)}

        # Downsampling
        debug()
        if all(can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.l_fpath), s.l_fpath) and
               can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.r_fpath), s.r_fpath)
               for s in samples):
            debug('Reusing downsampled FastQ')
            for s in samples:
                s.l_fpath = make_downsampled_fpath(join(work_dir, s.name), s.l_fpath)
                s.r_fpath = make_downsampled_fpath(join(work_dir, s.name), s.r_fpath)
        else:
            if isinstance(downsample_to, float):
                info('Downsampling FastQ to ' + str(float(downsample_to)) + ' fraction of reads')
            else:
                info('Downsampling FastQ to ' + str(int(downsample_to)) + ' read pairs')
            fastq_pairs = parall_view.run(downsample,
                [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, downsample_to,
                  num_pairs_by_sample.get(s.name)] for s in samples])
            for s, (l_r, r_r) in zip(samples, fastq_pairs):
                s.l_fpath = l_r
                s.r_fpath = r_r
    else:
        info('Skipping downsampling')

    debug()
    if all(can_reuse(make_bam_fpath(join(work_dir, s.name)), [s.l_fpath, s.r_fpath]) for s in samples):
        debug('All downsampled BAMs exist, reusing')
        for s in samples:
            s.bam = make_bam_fpath(join(work_dir, s.name))
    else:
        # which() returns None when bwa is not in PATH (the original
        # `isfile(bwa)` check would raise a TypeError in that case)
        bwa = which('bwa')
        smb = sambamba.get_executable()
        if not (bwa and smb):
            if not bwa:
                err('Error: bwa is required for the alignment pipeline')
            if not smb:
                err('Error: sambamba is required for the alignment pipeline')
            critical('Tools required for alignment not found')
        info('Aligning reads to the reference')
        bam_fpaths = parall_view.run(align,
            [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, bwa, smb, bwa_prefix,
              dedup, parall_view.cores_per_job] for s in samples])

        # verify_bam returns None for invalid BAMs, so filter those out before comparing counts
        bam_fpaths = [b for b in (verify_bam(b) for b in bam_fpaths) if b]
        if len(bam_fpaths) < len(samples):
            critical('Some samples were not aligned successfully.')
        for bam, s in zip(bam_fpaths, samples):
            s.bam = bam

    return num_pairs_by_sample

def start_targqc(work_dir, output_dir, samples, target_bed_fpath, parallel_cfg, bwa_prefix,
                 fai_fpath=None,
                 genome=config.genome,
                 depth_threshs=config.depth_thresholds,
                 downsample_to=config.downsample_fraction,
                 padding=config.padding,
                 dedup=config.dedup,
                 num_pairs_by_sample=None,
                 reannotate=config.reannotate,
                 ):
    d = get_description()
    info('*' * len(d))
    info(d)
    info('*' * len(d))
    info()

    fai_fpath = fai_fpath or ref.get_fai(genome)

    target = Target(work_dir, output_dir, fai_fpath, padding=padding, bed_fpath=target_bed_fpath,
                    reannotate=reannotate, genome=genome, is_debug=logger.is_debug)

    fastq_samples = [s for s in samples if not s.bam and s.l_fpath and s.r_fpath]
    from targqc.utilz.parallel import parallel_view
    if fastq_samples:
        if not bwa_prefix:
            critical('--bwa-prefix is required when running from fastq')
        with parallel_view(len(fastq_samples), parallel_cfg, join(work_dir, 'sge_fastq')) as view:
            num_pairs_by_sample = proc_fastq(fastq_samples, view, work_dir, bwa_prefix,
                                             downsample_to, num_pairs_by_sample, dedup=dedup)

    info()
    for s in samples:
        if s.bam:
            info(s.name + ': using alignment ' + s.bam)

    with parallel_view(len(samples), parallel_cfg, join(work_dir, 'sge_bam')) as view:
        info('Sorting BAMs...')
        sorted_bams = view.run(sort_bam, [[s.bam, safe_mkdir(join(work_dir, s.name))] for s in samples])
        for s, sorted_bam in zip(samples, sorted_bams):
            s.bam = sorted_bam

        if all(can_reuse(s.bam + '.bai', s.bam) for s in samples):
            debug('BAM indexes exist')
        else:
            info('Indexing BAMs...')
            view.run(index_bam, [[s.bam] for s in samples])

        info('Making general reports...')
        make_general_reports(view, samples, target, genome, depth_threshs, padding,
                             num_pairs_by_sample, is_debug=logger.is_debug,
                             reannotate=reannotate, fai_fpath=fai_fpath)

    info()
    info('*' * 70)
    tsv_fpath, html_fpath = make_tarqc_html_report(output_dir, work_dir, samples, bed_fpath=target_bed_fpath)
    info('TargQC summary saved in: ')
    info('  ' + html_fpath)
    info('  ' + tsv_fpath)

    info()
    with parallel_view(len(samples), parallel_cfg, join(work_dir, 'sge_bam')) as view:
        info('Making region-level reports...')
        make_region_reports(view, work_dir, samples, target, genome, depth_threshs)

    info()
    info('*' * 70)
    tsv_region_rep_fpath = combined_regional_reports(work_dir, output_dir, samples)

    info()
    info('*' * 70)
    info('TargQC summary saved in: ')
    info('  ' + html_fpath)
    info('  ' + tsv_fpath)
    info('Per-region coverage statistics saved into:')
    info('  ' + tsv_region_rep_fpath)

    return html_fpath

def downsample(work_dir, sample_name, fastq_left_fpath, fastq_right_fpath, downsample_to, num_pairs=None):
    """ Get N random read pairs from a FastQ file without reading the whole thing into memory.
        Modified from: http://www.biostars.org/p/6544/
    """
    sample_name = sample_name or splitext(''.join(
        lc if lc == rc else '' for lc, rc in zip(fastq_left_fpath, fastq_right_fpath)))[0]

    l_out_fpath = make_downsampled_fpath(work_dir, fastq_left_fpath)
    r_out_fpath = make_downsampled_fpath(work_dir, fastq_right_fpath)
    if can_reuse(l_out_fpath, [fastq_left_fpath, fastq_right_fpath]):
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)

    if num_pairs is None:
        info(sample_name + ': counting number of reads in fastq...')
        num_pairs = _count_records_in_fastq(fastq_left_fpath)
    if num_pairs > LIMIT:
        info(sample_name + ': the number of reads is higher than ' + str(LIMIT) +
             ', sampling from only the first ' + str(LIMIT))
        num_pairs = LIMIT
    info(sample_name + ': ' + str(num_pairs) + ' reads')

    num_downsample_pairs = int(downsample_to * num_pairs) if isinstance(downsample_to, float) else downsample_to
    if num_pairs <= num_downsample_pairs:
        info(sample_name + ': and it is less than ' + str(num_downsample_pairs) + ', so no downsampling.')
        return fastq_left_fpath, fastq_right_fpath
    else:
        info(sample_name + ': downsampling to ' + str(num_downsample_pairs))
        rand_records = sorted(random.sample(range(num_pairs), num_downsample_pairs))

    info('Opening ' + fastq_left_fpath)
    fh1 = open_gzipsafe(fastq_left_fpath)
    info('Opening ' + fastq_right_fpath)
    fh2 = open_gzipsafe(fastq_right_fpath) if fastq_right_fpath else None

    out_files = (l_out_fpath, r_out_fpath) if r_out_fpath else (l_out_fpath,)

    written_records = 0
    with file_transaction(work_dir, out_files) as tx_out_files:
        if isinstance(tx_out_files, six.string_types):
            tx_out_f1, tx_out_f2 = tx_out_files, None
        else:
            tx_out_f1, tx_out_f2 = tx_out_files
        info('Opening ' + str(tx_out_f1) + ' to write')
        sub1 = open_gzipsafe(tx_out_f1, "w")
        sub2 = None
        if tx_out_f2:
            info('Opening ' + str(tx_out_f2) + ' to write')
            sub2 = open_gzipsafe(tx_out_f2, "w")
        rec_no = -1
        for rr in rand_records:
            while rec_no < rr:
                rec_no += 1
                for i in range(4):
                    fh1.readline()
                if fh2:
                    for i in range(4):
                        fh2.readline()
            for i in range(4):
                sub1.write(fh1.readline())
                if sub2:
                    sub2.write(fh2.readline())
            written_records += 1
            if written_records % 10000 == 0:
                info(sample_name + ': written ' + str(written_records) + ', rec_no ' + str(rec_no + 1))
            if rec_no > num_pairs:
                info(sample_name + ': reached the limit of ' + str(num_pairs) + ' read lines, stopping.')
                break
        info(sample_name + ': done, written ' + str(written_records) + ', rec_no ' + str(rec_no))
        fh1.close()
        sub1.close()
        if fastq_right_fpath:
            fh2.close()
            sub2.close()

    info(sample_name + ': done downsampling, saved to ' + l_out_fpath + ' and ' + r_out_fpath +
         ', total ' + str(written_records) + ' paired reads written')
    return l_out_fpath, r_out_fpath

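# Usage sketch (hypothetical paths). downsample_to may be a float fraction or an
# absolute number of pairs; LIMIT caps how many leading records are sampled from:
#
#   l, r = downsample('/tmp/work', 'sampleA', 'sampleA_R1.fq.gz', 'sampleA_R2.fq.gz',
#                     downsample_to=500000)   # keep 500k random pairs
#   l, r = downsample('/tmp/work', 'sampleA', 'sampleA_R1.fq.gz', 'sampleA_R2.fq.gz',
#                     downsample_to=0.1)      # keep 10% of pairs
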
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):
    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))

    # if reannotate:
    #     bed = BedTool(input_bed_fpath).cut([0, 1, 2])
    #     keep_gene_column = False
    # else:
    #     if col_num > 4:
    #         bed = BedTool(input_bed_fpath).cut([0, 1, 2, 3])
    #     keep_gene_column = True

    # features_bed = features_bed.saveas()
    # cols = features_bed.field_count()
    # if cols < 12:
    #     features_bed = features_bed.each(lambda f: f + ['.'] * (12 - cols))
    if high_confidence:
        features_bed = features_bed.filter(ebl.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ebl.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(
        lambda x: x[ebl.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
        # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ebl.BedCols.names[i] for i in ebl.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])

            if extended:
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
            out.write('\t'.join(header) + '\n')
            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])
                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1

    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath

def check_md5(work_dir, fpath, file_type, silent=False):
    md5_fpath = join(work_dir, file_type + '_md5.txt')
    new_md5 = md5(fpath)
    info('md5 of ' + fpath + ' is ' + str(new_md5))

    prev_md5 = None
    if isfile(md5_fpath):
        with open(md5_fpath) as f:
            prev_md5 = f.read()
    else:
        info('Previous md5 file ' + md5_fpath + ' does not exist')
    info('Checking previous md5 from ' + md5_fpath + ': ' + str(prev_md5))

    if prev_md5 == new_md5:
        if not silent:
            debug('Reusing previous ' + file_type.upper() + ' files.')
        return True
    else:
        if not silent:
            info('Pre-processing input ' + file_type.upper() + ' file')
        if prev_md5:
            if not silent:
                info('Prev ' + file_type.upper() + ' md5: ' + str(prev_md5))
                info('New ' + file_type.upper() + ' md5: ' + str(new_md5))
        with open(md5_fpath, 'w') as f:
            f.write(str(new_md5))
        return False
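# A hypothetical usage sketch for check_md5(): deciding whether pre-processing
# can be skipped because the input file has not changed (paths are examples):
#
#     if check_md5('work', '/data/sample1_R1.fastq.gz', 'fastq'):
#         debug('Input FastQ unchanged since the last run, reusing results')
#     else:
#         info('Input FastQ changed, re-running pre-processing')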
def proc_fastq(samples, parall_view, work_dir, bwa_prefix, downsample_to, num_pairs_by_sample=None, dedup=True):
    num_pairs_by_sample = num_pairs_by_sample or dict()
    if downsample_to:
        # Read pairs counts
        debug()
        if all(s.name in num_pairs_by_sample for s in samples):
            debug('Using read pairs counts extracted from FastQC reports')
        elif all(can_reuse(make_pair_counts_fpath(join(work_dir, s.name)), s.l_fpath) for s in samples):
            debug('Reusing pairs counts, reading from files')
            num_pairs_by_sample = {s.name: int(open(make_pair_counts_fpath(join(work_dir, s.name))).read().strip())
                                   for s in samples}
        else:
            info('Counting read pairs')
            num_pairs = parall_view.run(count_read_pairs,
                [[s.name, safe_mkdir(join(work_dir, s.name)), s.l_fpath] for s in samples])
            num_pairs_by_sample = {s.name: pairs_count for s, pairs_count in zip(samples, num_pairs)}

        # Downsampling
        debug()
        if all(can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.l_fpath), s.l_fpath) and
               can_reuse(make_downsampled_fpath(join(work_dir, s.name), s.r_fpath), s.r_fpath)
               for s in samples):
            debug('Reusing downsampled FastQ')
            for s in samples:
                s.l_fpath = make_downsampled_fpath(join(work_dir, s.name), s.l_fpath)
                s.r_fpath = make_downsampled_fpath(join(work_dir, s.name), s.r_fpath)
        else:
            if isinstance(downsample_to, float):
                info('Downsampling FastQ to ' + str(float(downsample_to)) + ' fraction of reads')
            else:
                info('Downsampling FastQ to ' + str(int(downsample_to)) + ' read pairs')
            fastq_pairs = parall_view.run(downsample,
                [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, downsample_to,
                  num_pairs_by_sample.get(s.name)] for s in samples])
            for s, (l_r, r_r) in zip(samples, fastq_pairs):
                s.l_fpath = l_r
                s.r_fpath = r_r
    else:
        info('Skipping downsampling')

    debug()
    if all(can_reuse(make_bam_fpath(join(work_dir, s.name)), [s.l_fpath, s.r_fpath]) for s in samples):
        debug('All downsampled BAMs exist, reusing')
        for s in samples:
            s.bam = make_bam_fpath(join(work_dir, s.name))
    else:
        bwa = which('bwa')
        if not bwa or not isfile(bwa):  # which() returns None when bwa is not in PATH
            critical('BWA not found in PATH')
        smb = sambamba.get_executable()
        if not (bwa and smb):
            if not bwa:
                err('Error: bwa is required for the alignment pipeline')
            if not smb:
                err('Error: sambamba is required for the alignment pipeline')
            critical('Tools required for alignment not found')
        info('Aligning reads to the reference')
        bam_fpaths = parall_view.run(align,
            [[join(work_dir, s.name), s.name, s.l_fpath, s.r_fpath, bwa, smb, bwa_prefix,
              dedup, parall_view.cores_per_job] for s in samples])

        bam_fpaths = [verify_bam(b) for b in bam_fpaths]
        if not all(bam_fpaths):  # verify_bam returns None for BAMs that failed verification
            critical('Some samples were not aligned successfully.')
        for bam, s in zip(bam_fpaths, samples):
            s.bam = bam

    return num_pairs_by_sample
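# A hypothetical end-to-end call of proc_fastq() (the sample objects and the
# parallel view are whatever the surrounding pipeline already provides; the
# reference prefix and the read-pair count are examples):
#
#     pairs_by_sample = proc_fastq(samples, parall_view, 'work',
#                                  bwa_prefix='/refs/hg19.fa',
#                                  downsample_to=500000)  # 500k read pairs per sample
#     # after this call, each s.bam points at the aligned, deduplicated, indexed BAM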
def _annotate(bed, ref_bed, chr_order, fai_fpath, work_dir, ori_col_num,
              high_confidence=False, reannotate=False, is_debug=False, **kwargs):
    # if genome:
    #     genome_fpath = cut(fai_fpath, 2, output_fpath=intermediate_fname(work_dir, fai_fpath, 'cut2'))
    #     intersection = bed.intersect(ref_bed, sorted=True, wao=True, g='<(cut -f1,2 ' + fai_fpath + ')')
    #     intersection = bed.intersect(ref_bed, sorted=True, wao=True, genome=genome.split('-')[0])
    # else:

    intersection_bed = None
    intersection_fpath = None

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    if is_debug:
        intersection_fpath = join(work_dir, 'intersection.bed')
        if isfile(intersection_fpath):
            info('Loading from ' + intersection_fpath)
            intersection_bed = BedTool(intersection_fpath)
    if not intersection_bed:
        if fai_fpath and count_bed_cols(fai_fpath) == 2:
            debug('Fai fields size is 2 ' + fai_fpath)
            intersection_bed = bed.intersect(ref_bed, wao=True, g=fai_fpath)
        else:
            debug('Fai file is not provided or does not have exactly 2 fields, '
                  'intersecting without a genome file')
            intersection_bed = bed.intersect(ref_bed, wao=True)
    if is_debug and not isfile(intersection_fpath):
        intersection_bed.saveas(intersection_fpath)
        debug('Saved intersection to ' + intersection_fpath)

    total_annotated = 0
    total_uniq_annotated = 0
    total_off_target = 0

    met = set()

    overlaps_by_tx_by_gene_by_loc = OrderedDefaultDict(lambda: OrderedDefaultDict(lambda: defaultdict(list)))
    # off_targets = list()

    expected_fields_num = ori_col_num + len(ebl.BedCols.cols[:-4]) + 1
    for i, intersection_fields in enumerate(intersection_bed):
        inters_fields_list = list(intersection_fields)
        if len(inters_fields_list) < expected_fields_num:
            critical(f'Cannot parse the reference BED file - unexpected number of fields '
                     f'({len(inters_fields_list)}, expected at least {expected_fields_num}) '
                     f'in line {i}: {inters_fields_list}')

        a_chr, a_start, a_end = intersection_fields[:3]
        a_extra_columns = intersection_fields[3:ori_col_num]

        overlap_fields = [None for _ in ebl.BedCols.cols]
        overlap_fields[:len(intersection_fields[ori_col_num:-1])] = intersection_fields[ori_col_num:-1]

        keep_gene_column = not reannotate
        a_gene = None
        if keep_gene_column:
            a_gene = a_extra_columns[0]

        e_chr = overlap_fields[0]
        overlap_size = int(intersection_fields[-1])
        assert e_chr == '.' or a_chr == e_chr, \
            f'Error on line {i}: chromosomes don\'t match ({a_chr} vs {e_chr}). Line: {intersection_fields}'

        # fs = [None for _ in ebl.BedCols.cols]
        # fs[:3] = [a_chr, a_start, a_end]
        reg = (a_chr, int(a_start), int(a_end), tuple(a_extra_columns))

        if e_chr == '.':
            total_off_target += 1
            # off_targets.append(fs)
            overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
        else:
            # fs[3:-1] = db_feature_fields[3:-1]
            total_annotated += 1
            if (a_chr, a_start, a_end) not in met:
                total_uniq_annotated += 1
                met.add((a_chr, a_start, a_end))

            e_gene = overlap_fields[ebl.BedCols.GENE]
            if keep_gene_column and e_gene != a_gene:
                overlaps_by_tx_by_gene_by_loc[reg][a_gene] = OrderedDefaultDict(list)
            else:
                transcript_id = overlap_fields[ebl.BedCols.ENSEMBL_ID]
                overlaps_by_tx_by_gene_by_loc[reg][e_gene][transcript_id].append((overlap_fields, overlap_size))

    info('  Total annotated regions: ' + str(total_annotated))
    info('  Total unique annotated regions: ' + str(total_uniq_annotated))
    info('  Total off target regions: ' + str(total_off_target))
    info('Resolving ambiguities...')
    annotated = _resolve_ambiguities(overlaps_by_tx_by_gene_by_loc, chr_order, **kwargs)

    return annotated
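# For reference, each row of a wao=True intersection concatenates: the original
# query BED fields, the matched feature's fields (or '.'-padded fields if there
# is no overlap), and the overlap size in bp as the last column. An illustrative
# row for a 4-column input BED (values are made up):
#
#     chr1  100  200  GENE1  chr1  150  250  ENST00000000001  ...  50
#
# which is why the loop above slices [:ori_col_num] for the query region,
# [ori_col_num:-1] for the feature fields, and [-1] for the overlap size.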
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
def determine_sex(work_dir, bam_fpath, ave_depth, genome, target_bed=None):
    info()
    info('Determining sex')

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = male_bed.count()
    info('Male region total size: ' + str(male_area_size))

    if target_bed:
        male_bed = BedTool(target_bed).intersect(male_bed).merge()
        target_male_area_size = male_bed.count()
        if target_male_area_size < male_area_size * MALE_TARGET_REGIONS_FACTOR:
            info('Target male region total size is ' + str(target_male_area_size) +
                 ', which is less than the checked male regions size * ' + str(MALE_TARGET_REGIONS_FACTOR) +
                 ' (' + str(male_area_size * MALE_TARGET_REGIONS_FACTOR) + ') - cannot determine sex')
            return None
        else:
            info('Target male region total size is ' + str(target_male_area_size) +
                 ', which is higher than the checked male regions size * ' + str(MALE_TARGET_REGIONS_FACTOR) +
                 ' (' + str(male_area_size * MALE_TARGET_REGIONS_FACTOR) + '). ' +
                 'Determining sex based on coverage in those regions.')
    else:
        info('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')

    index_bam(bam_fpath)
    chry_cov_output_fpath = sambamba_depth(work_dir, male_bed, bam_fpath, [])
    chry_mean_coverage = get_mean_cov(chry_cov_output_fpath)
    info('Y key regions average depth: ' + str(chry_mean_coverage))
    ave_depth = float(ave_depth)
    info('Sample average depth: ' + str(ave_depth))
    if ave_depth < AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        info('Sample average depth is too low (less than ' + str(AVE_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
             ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        info('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = ave_depth / chry_mean_coverage
        info('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            info('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                 ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            info('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                 ' times higher than Y depth - it\'s male')
            sex = 'M'
    info('Sex is ' + sex)
    info()
    return sex
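# The decision rule that both determine_sex() variants above share, isolated
# into a sketch (the helper name is hypothetical; the threshold default is
# this module's constant):
def _sex_from_depths(sample_depth, chry_depth, female_factor=FEMALE_Y_COVERAGE_FACTOR):
    if chry_depth == 0:
        return 'F'  # no coverage at all in the chrY key regions
    # female if the overall depth is much higher than the chrY depth
    return 'F' if sample_depth / chry_depth > female_factor else 'M'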