def _correct_qualimap_insert_size_histogram(work_dir, samples):
    """Replace the Qualimap insert size histogram with the Picard one.

    For every sample:

    1. A Qualimap v.1 style ``raw_data`` directory is renamed to the
       Qualimap v.2 style ``raw_data_qualimapReport`` directory (or removed
       when the v.2 directory already exists).
    2. Unless a valid Qualimap histogram is already present and
       ``tc.reuse_intermediate`` is set, the rows that follow the
       ``## HISTOGRAM`` section header (and its column-title line) of the
       Picard file are written over the Qualimap histogram file.

    :param work_dir: working directory handed to ``file_transaction``.
    :param samples: iterable of objects exposing
        ``qualimap_ins_size_hist_fpath`` and ``picard_ins_size_hist_txt_fpath``.
    """
    for s in samples:
        qualimap2_dirname = dirname(s.qualimap_ins_size_hist_fpath)
        qualimap1_dirname = qualimap2_dirname.replace('raw_data_qualimapReport', 'raw_data')

        # If the path has no 'raw_data_qualimapReport' component, both names
        # are identical; the v.1 clean-up must then be skipped, otherwise
        # rmtree() would wipe the directory holding the histogram itself.
        if qualimap1_dirname != qualimap2_dirname and exists(qualimap1_dirname):
            if not exists(qualimap2_dirname):
                shutil.move(qualimap1_dirname, qualimap2_dirname)
            else:
                shutil.rmtree(qualimap1_dirname)
        elif not exists(qualimap2_dirname):
            continue  # no data from both Qualimap v.1 and Qualimap v.2

        # if qualimap histogram exists and reuse_intermediate, skip
        if verify_file(s.qualimap_ins_size_hist_fpath, silent=True) and tc.reuse_intermediate:
            continue
        if not verify_file(s.picard_ins_size_hist_txt_fpath):
            continue

        with open(s.picard_ins_size_hist_txt_fpath, 'r') as picard_f:
            # Fast-forward past the '## HISTOGRAM' marker and the single
            # column-title line that follows it.
            for line in picard_f:
                if line.startswith('## HISTOGRAM'):
                    next(picard_f, None)
                    break
            with file_transaction(work_dir, s.qualimap_ins_size_hist_fpath) as tx:
                with open(tx, 'w') as qualimap_f:
                    for line in picard_f:
                        qualimap_f.write(line)
def combined_regional_reports(work_dir, output_dir, samples):
    """Merge the per-sample regional TSV reports into a single TSV file.

    The combined file is written to ``output_dir`` under the base name of the
    first sample's report. The header row is taken from the first sample
    whose report verifies and is prefixed with a ``sample`` column; every
    data row is prefixed with the name of the sample it came from.

    Returns ``(None, None)`` when no sample has a valid report, otherwise the
    path to the combined TSV.
    """
    if not any(verify_file(s.targqc_region_tsv, silent=True) for s in samples):
        return None, None

    tsv_region_rep_fpath = join(output_dir, basename(samples[0].targqc_region_tsv))
    debug('Combining regional reports, writing to ' + tsv_region_rep_fpath)

    with file_transaction(work_dir, tsv_region_rep_fpath) as tx_tsv:
        with open(tx_tsv, 'w') as tsv_out:
            # Lazily walk only the samples that actually have a report, so the
            # running index counts reports merged so far.
            reported = (s for s in samples
                        if s.targqc_region_tsv and verify_file(s.targqc_region_tsv))
            for report_idx, s in enumerate(reported):
                with open(s.targqc_region_tsv) as tsv_in:
                    for line_idx, line in enumerate(tsv_in):
                        if line_idx > 0:
                            tsv_out.write(s.name + '\t' + line)
                        elif report_idx == 0:
                            # header row: kept once, from the first report only
                            tsv_out.write('sample\t' + line)

    return tsv_region_rep_fpath
def _make_wgs_regions_file(self, work_dir, genome=None):
    """Write a BED of the features to report for WGS and return its path.

    For every gene one transcript is selected (the first one according to
    ``tx_priority_sort_key``), and all non-gene features recorded for that
    transcript are written, sorted with ``get_sort_key(chr_order)``.
    The result is stored in ``self.wgs_bed_fpath`` and reused when
    ``can_reuse`` says it is up to date with the Ensembl GTF.

    :param work_dir: directory for the output BED and the file transaction.
    :param genome: genome build name; falls back to ``cfg.genome``.
    """
    # Resolve the default once so the reuse check looks at the same genome
    # the features are later read for.
    genome = genome or cfg.genome

    self.wgs_bed_fpath = join(work_dir, 'targqc_features_to_report.bed')
    if can_reuse(self.wgs_bed_fpath, ebl.ensembl_gtf_fpath(genome)):
        return self.wgs_bed_fpath

    chr_order = reference_data.get_chrom_order(genome)

    r_by_tx_by_gene = OrderedDefaultDict(lambda: defaultdict(list))
    all_features = ebl.get_all_features(genome, high_confidence=True)

    debug('Select best transcript to report')
    for r in all_features:
        if r[ebl.BedCols.FEATURE] != 'gene':
            gene = r[ebl.BedCols.GENE]
            tx_id = r[ebl.BedCols.ENSEMBL_ID]
            r_by_tx_by_gene[gene][tx_id].append(r.fields)

    with file_transaction(work_dir, self.wgs_bed_fpath) as tx_fpath:
        with open(tx_fpath, 'w') as out:
            for _gname, r_by_tx in r_by_tx_by_gene.items():
                transcripts = [x for xx in r_by_tx.values() for x in xx
                               if x[ebl.BedCols.FEATURE] == 'transcript']
                if not transcripts:
                    continue
                # min() picks the same record that heads the sorted list
                best_tx = min(transcripts, key=tx_priority_sort_key)
                best_tx_id = best_tx[ebl.BedCols.ENSEMBL_ID]
                for r in sorted(r_by_tx[best_tx_id], key=get_sort_key(chr_order)):
                    out.write('\t'.join(str(f) for f in r) + '\n')

    return self.wgs_bed_fpath
def _correct_qualimap_insert_size_histogram(samples):
    """Replace the Qualimap insert size histogram with the Picard one.

    Per sample, a Qualimap v.1 ``raw_data`` directory is first renamed to the
    Qualimap v.2 ``raw_data_qualimapReport`` name (or removed when the v.2
    directory is already there). Then, unless a valid Qualimap histogram
    exists and ``cfg.reuse_intermediate`` is set, the Picard histogram rows
    found after the ``## HISTOGRAM`` marker and its column-title line are
    written in place of the Qualimap histogram.

    :param samples: iterable of objects exposing
        ``qualimap_ins_size_hist_fpath`` and ``picard_ins_size_hist_txt_fpath``.
    """
    for s in samples:
        hist_dirname = dirname(s.qualimap_ins_size_hist_fpath)
        legacy_dirname = hist_dirname.replace('raw_data_qualimapReport', 'raw_data')

        # Only treat the v.1 directory as "legacy" when it is really a
        # different path; with identical names rmtree() below would delete
        # the very directory the histogram lives in.
        has_legacy_dir = legacy_dirname != hist_dirname and exists(legacy_dirname)
        if has_legacy_dir:
            if exists(hist_dirname):
                shutil.rmtree(legacy_dirname)
            else:
                shutil.move(legacy_dirname, hist_dirname)
        elif not exists(hist_dirname):
            continue  # no data from both Qualimap v.1 and Qualimap v.2

        # if qualimap histogram exists and reuse_intermediate, skip
        if verify_file(s.qualimap_ins_size_hist_fpath, silent=True) and cfg.reuse_intermediate:
            continue
        if not verify_file(s.picard_ins_size_hist_txt_fpath):
            continue

        with open(s.picard_ins_size_hist_txt_fpath, 'r') as picard_f:
            for line in picard_f:
                if line.startswith('## HISTOGRAM'):
                    # the line right after the marker holds column titles
                    next(picard_f, None)
                    break
            with file_transaction(None, s.qualimap_ins_size_hist_fpath) as tx:
                with open(tx, 'w') as qualimap_f:
                    for line in picard_f:
                        qualimap_f.write(line)
def main():
    """Extract canonical CDS regions for a genome build into a BED file.

    Reads the genome build from ``-g/--genome``, filters all stored features
    down to canonical CDS records, and saves the first six columns to
    ``<package>/../<genome>/bed/CDS-canonical.bed``.
    """
    parser = OptionParser()
    parser.add_option(
        '-g', '--genome',
        dest='genome',
        help='Genome build. Accepted values: ' + ', '.join(ebl.SUPPORTED_GENOMES),
    )
    opts, _args = parser.parse_args()

    if not opts.genome:
        critical('Error: please, specify genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')
    cds_only = features_bed.filter(lambda x: x[ebl.BedCols.FEATURE] == 'CDS')
    canonical_cds = cds_only.filter(ebl.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    bed_dirpath = join(dirname(__file__), pardir, genome, 'bed')
    output_fpath = adjust_path(join(bed_dirpath, 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        canonical_cds.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)
def partition_gtf(gtf, coding=False, out_file=False):
    """Return a GTF file of all non-coding or coding transcripts.

    The GTF must be annotated with gene_biotype = "protein_coding" or have
    the source column set to the biotype for all coding transcripts. Set
    ``coding`` to True to get only the coding features, False to get only the
    non-coding ones. Features whose biotype cannot be looked up are dropped
    in both modes.

    :param gtf: path to the input GTF.
    :param coding: select coding (True) or non-coding (False) features.
    :param out_file: output path; when falsy a temporary ``.gtf`` file is
        created. An already existing ``out_file`` is returned untouched.
    :return: path to the partitioned GTF.
    """
    if out_file and file_exists(out_file):
        return out_file
    if not out_file:
        # Only the reserved name is needed; close the handle straight away
        # instead of leaving it to the garbage collector.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".gtf") as tmp:
            out_file = tmp.name

    want_coding = bool(coding)

    def _wanted(biotype):
        # an unknown biotype never matches, whichever partition is requested
        return bool(biotype) and (biotype == "protein_coding") == want_coding

    biotype_lookup = _biotype_lookup_fn(gtf)
    db = get_gtf_db(gtf)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in db.all_features():
                if _wanted(biotype_lookup(feature)):
                    out_handle.write(str(feature) + "\n")
    return out_file
def _make_wgs_regions_file(self, work_dir, genome=None):
    """Build the BED file of features reported for WGS samples.

    Non-gene features are grouped by gene and transcript; for each gene the
    transcript ranked first by ``tx_priority_sort_key`` is kept and its
    features are written in ``get_sort_key(chr_order)`` order. The path is
    stored in ``self.wgs_bed_fpath`` and also returned; an up-to-date file
    (per ``can_reuse`` against the Ensembl GTF) is reused as is.

    :param work_dir: directory for the output BED and the file transaction.
    :param genome: genome build name; ``cfg.genome`` is used when omitted.
    """
    # Use one resolved genome everywhere, including the reuse check, which
    # previously received the raw (possibly None) argument.
    genome = genome or cfg.genome

    self.wgs_bed_fpath = join(work_dir, 'targqc_features_to_report.bed')
    if can_reuse(self.wgs_bed_fpath, ebl.ensembl_gtf_fpath(genome)):
        return self.wgs_bed_fpath

    chr_order = reference_data.get_chrom_order(genome)

    r_by_tx_by_gene = OrderedDefaultDict(lambda: defaultdict(list))
    all_features = ebl.get_all_features(genome, high_confidence=True)

    debug('Select best transcript to report')
    for r in all_features:
        if r[ebl.BedCols.FEATURE] == 'gene':
            continue
        r_by_tx_by_gene[r[ebl.BedCols.GENE]][r[ebl.BedCols.ENSEMBL_ID]].append(r.fields)

    with file_transaction(work_dir, self.wgs_bed_fpath) as tx_fpath:
        with open(tx_fpath, 'w') as out:
            for _gname, r_by_tx in r_by_tx_by_gene.items():
                transcript_records = (
                    x for xx in r_by_tx.values() for x in xx
                    if x[ebl.BedCols.FEATURE] == 'transcript')
                ranked_tx_ids = [
                    x[ebl.BedCols.ENSEMBL_ID]
                    for x in sorted(transcript_records, key=tx_priority_sort_key)
                ]
                if not ranked_tx_ids:
                    continue
                for r in sorted(r_by_tx[ranked_tx_ids[0]], key=get_sort_key(chr_order)):
                    out.write('\t'.join(str(f) for f in r) + '\n')

    return self.wgs_bed_fpath
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    """Sort a BED file with the external ``gsort`` tool.

    The chromosome order comes from ``fai_fpath`` or, when that is not given,
    from the .fai of ``genome``. When ``output_bed_fpath`` is omitted an
    intermediate ``sorted`` file name in ``work_dir`` is used. An up-to-date
    output (per ``can_reuse``) is returned without re-sorting.

    :return: path to the sorted BED file.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    cmdline = 'gsort {input_bed_fpath} {fai_fpath}'.format(
        input_bed_fpath=input_bed_fpath, fai_fpath=fai_fpath)
    with file_transaction(work_dir, output_bed_fpath) as tx:
        run(cmdline, output_fpath=tx)

    return output_bed_fpath
def filter_bed_with_gene_set(bed_fpath, gene_keys_set, output_fpath):
    """Copy to ``output_fpath`` the BED records whose gene is in a key set.

    Every non-empty line of ``bed_fpath`` must have exactly four
    tab-separated columns (chrom, start, end, gene). A line is kept when its
    ``(gene, chrom)`` pair is a member of ``gene_keys_set``.
    """
    with file_transaction(None, output_fpath) as tx:
        with open(bed_fpath) as inp, open(tx, 'w') as out:
            for line in inp:
                record = line.strip('\n')
                if not record:
                    continue
                chrom, _start, _end, gene = record.split('\t')
                if (gene, chrom) in gene_keys_set:
                    out.write(line)
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """Guess the sample sex from the coverage of chrY key regions.

    The mean coverage of the chrY key regions registered for ``genome``
    (restricted to their overlap with ``target_bed`` when one is given) is
    compared with the sample average depth.

    :param work_dir: directory for temporary and intermediate files.
    :param bam_fpath: path to the BAM file; required.
    :param avg_depth: sample average depth (anything ``float()`` accepts).
    :param genome: genome build name, matched by substring against the keys
        of ``chry_key_regions_by_genome``.
    :param target_bed: optional capture target BED.
    :return: ``'M'``, ``'F'``, or ``None`` when sex cannot be determined (no
        key regions for the genome, no overlap with the target, or average
        depth below ``AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX``).
    """
    debug()
    debug('Determining sex')

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))

    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    # The messages below used the invalid escape "\s" where an apostrophe
    # was meant; they now read "it's".
    if chry_mean_coverage == 0:
        debug("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's female")
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's male")
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    """Sort the regions of a BED file and return the path to the result.

    The chromosome order is taken from ``chr_order`` or, when that is empty,
    derived from ``fai_fpath`` or from the .fai of ``genome``. Chromosomes
    missing from the order get rank -1. Lines starting with ``#`` are
    copied to the output as they are met, ahead of the sorted regions;
    blank lines are dropped. An up-to-date output (per ``can_reuse``) is
    returned without re-sorting.
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('Either of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for raw_line in f:
                    stripped = raw_line.strip()
                    if not stripped:
                        continue
                    if stripped.startswith('#'):
                        out.write(raw_line)
                        continue

                    columns = stripped.split('\t')
                    chrom = columns[0]
                    regions.append(Region(
                        chrom, int(columns[1]), int(columns[2]),
                        columns[3:], chr_order.get(chrom, -1)))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    out_columns = [region.chrom, str(region.start), str(region.end)]
                    out_columns += list(region.other_fields)
                    out.write('\t'.join(out_columns) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath
def run(cmd, output_fpath=None, input_fpath=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.

    :param cmd: command to run (a shell string when ``output_fpath`` is used
        with ``stdout_tx``, since the command text is edited in that case).
    :param output_fpath: file the command produces. Any existing file at this
        path is removed before running.
    :param input_fpath: passed through to ``_do_run``.
    :param checks: checks passed to ``_do_run``; defaults to
        ``[file_nonempty_check]``.
    :param stdout_to_outputfile: with ``stdout_tx``, redirect stdout to the
        transactional file; otherwise occurrences of ``output_fpath`` inside
        ``cmd`` are replaced with the transactional path.
    :param stdout_tx: write through ``file_transaction``.
    :param reuse: return early when ``output_fpath`` (or ``output_fpath`` +
        ``.gz``) already verifies.
    :param env_vars: environment overrides; a ``None`` value unsets the
        variable.
    :return: ``output_fpath`` when an existing output is reused, else None.
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    env = os.environ.copy()
    if env_vars:
        for k, v in env_vars.items():
            if v is None:
                env.pop(k, None)
            else:
                env[k] = v

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpath):
        # log the command line, then execute it; errors propagate unchanged
        info(_cmd if isinstance(_cmd, six.string_types) else ' '.join(str(x) for x in _cmd))
        _do_run(_cmd, checks, env, _output_fpath, _input_fpath)

    if output_fpath:
        if isfile(output_fpath):
            os.remove(output_fpath)

        if stdout_tx:
            with file_transaction(None, output_fpath) as tx_out_file:
                if stdout_to_outputfile:
                    cmd += ' > ' + tx_out_file
                else:
                    # A trailing newline marks end-of-command so that a path
                    # in the last position can be matched like one followed
                    # by a space.
                    cmd += '\n'
                    # Swap the output path for the transactional one, bare or
                    # wrapped in double/single quotes. The quoting is kept
                    # intact; earlier the opening quote was dropped, leaving
                    # an unbalanced quote in the command.
                    for quote in ('', '"', "'"):
                        old = ' ' + quote + output_fpath + quote
                        new = ' ' + quote + tx_out_file + quote
                        cmd = cmd.replace(old + ' ', new + ' ').replace(old + '\n', new)
                    cmd = cmd.replace('\n', '')
                _try_run(cmd, tx_out_file, input_fpath)
        else:
            _try_run(cmd, output_fpath, input_fpath)
    else:
        _try_run(cmd, None, input_fpath)
def _make_padded_bed(self, work_dir, fai_fpath, padding):
    """Create (or reuse) the padded, sorted and merged version of ``self.bed``.

    Returns ``None`` for WGS; otherwise a ``BedTool`` over
    ``self.padded_bed_fpath``, which is set as a side effect.

    :param work_dir: directory for the intermediate file.
    :param fai_fpath: genome file handed to ``slop`` as ``g``.
    :param padding: number of bases handed to ``slop`` as ``b``.
    """
    if self.is_wgs:
        return None

    self.padded_bed_fpath = intermediate_fname(work_dir, self.capture_bed_fpath, 'padded')
    if not can_reuse(self.padded_bed_fpath, self.capture_bed_fpath):
        slopped = self.bed.slop(b=padding, g=fai_fpath)
        padded_bed = slopped.sort().merge()
        with file_transaction(work_dir, self.padded_bed_fpath) as tx:
            padded_bed.saveas(tx)
        verify_file(self.padded_bed_fpath, is_critical=True)

    return BedTool(self.padded_bed_fpath)
def merge_overlaps(work_dir, bed_fpath, distance=None):
    """Merge BED file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like
    FreeBayes that don't collapse BEDs prior to using them.

    :param work_dir: directory for the intermediate ``merged`` file.
    :param bed_fpath: BED file to merge.
    :param distance: when truthy, passed to ``BedTool.merge`` as ``d``.
    :return: path to the merged BED file.
    """
    output_fpath = intermediate_fname(work_dir, bed_fpath, 'merged')
    already_merged = isfile(output_fpath) and verify_file(output_fpath, cmp_f=bed_fpath)
    if already_merged:
        return output_fpath

    merge_kwargs = {}
    if distance:
        merge_kwargs['d'] = distance

    with file_transaction(work_dir, output_fpath) as tx:
        BedTool(bed_fpath).merge(**merge_kwargs).saveas(tx)
    return output_fpath
def clean_bed(bed_fpath, work_dir):
    """Write a cleaned copy of a BED file and return its path.

    Records with an empty chromosome, or whose chromosome field starts with
    ``#``, a space, ``track`` or ``browser``, are dropped, and
    ``remove_invalid()`` is applied. An up-to-date cleaned file (per
    ``can_reuse``) is returned as is.
    """
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if can_reuse(clean_fpath, bed_fpath):
        return clean_fpath

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    def _is_data_record(x):
        # header-like lines surface as records with a special "chromosome"
        return x.chrom and not any(
            x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser'])

    bed = BedTool(bed_fpath).filter(_is_data_record).remove_invalid()
    with file_transaction(work_dir, clean_fpath) as tx_out_file:
        bed.saveas(tx_out_file)
    verify_bed(clean_fpath, is_critical=True)
    debug('Saved clean BED file into ' + clean_fpath)
    return clean_fpath
def tx2genefile(gtf, out_file=None):
    """Write out a file of transcript->gene mappings.

    Uses the ``tx2gene.csv`` installed next to the GTF if it exists, then an
    already existing ``out_file``; otherwise writes a new ``out_file`` with
    one ``transcript,gene`` line per mapping.

    :return: path to the mapping file.
    """
    installed_tx2gene = os.path.join(os.path.dirname(gtf), "tx2gene.csv")
    for existing in (installed_tx2gene, out_file):
        if file_exists(existing):
            return existing

    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for tx_id, gene_id in transcript_to_gene(gtf).items():
                out_handle.write(",".join((tx_id, gene_id)) + "\n")
    return out_file
def _make_qualimap_bed(self, work_dir):
    """Create (or reuse) the 6-column BED that is passed to Qualimap.

    ``self.bed`` is sorted and merged; each record's non-empty fields are
    then padded up to six columns with, in order, the record index, ``1.0``
    and ``+``. Returns ``None`` for WGS, otherwise the path, which is also
    stored in ``self.qualimap_bed_fpath``.
    """
    if self.is_wgs:
        return None

    self.qualimap_bed_fpath = intermediate_fname(work_dir, self.capture_bed_fpath, 'qualimap_ready')
    if can_reuse(self.qualimap_bed_fpath, self.capture_bed_fpath):
        return self.qualimap_bed_fpath

    debug('Merging and saving BED into required bed6 format for Qualimap')
    merged_bed = self.bed.sort().merge()
    with file_transaction(work_dir, self.qualimap_bed_fpath) as tx:
        with open(tx, 'w') as out:
            for idx, interval in enumerate(merged_bed):
                present = [field for field in list(interval) if field]
                # name / score / strand placeholders for the missing columns
                defaults = [str(idx), "1.0", "+"]
                row = present + defaults[:6 - len(present)]
                out.write("\t".join(row) + "\n")

    verify_file(self.qualimap_bed_fpath, is_critical=True)
    return self.qualimap_bed_fpath
def _make_target_bed(self, bed_fpath, work_dir, output_dir, is_debug,
                     padding=None, fai_fpath=None, genome=None, reannotate=False):
    """Prepare the target BED file and the attributes derived from it.

    Pipeline, each step reusing its intermediate file when ``can_reuse``
    allows it: clean (at most 4 columns, header-like and invalid records
    removed) -> sort -> overlap with genome features (only for genomes in
    ``ebl.SUPPORTED_GENOMES``) -> final ``remove_invalid`` pass.

    Sets ``self.bed_fpath``, ``self.bed``, ``self.capture_bed_fpath``,
    ``self.gene_keys_set``, ``self.gene_keys_list`` and ``self.regions_num``,
    then builds the Qualimap BED and, when ``padding`` is truthy, the padded
    BED. ``is_debug`` is accepted but not used here.
    """
    # 1. Clean
    clean_target_bed_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if not can_reuse(clean_target_bed_fpath, bed_fpath):
        debug()
        debug('Cleaning target BED file...')
        bed = BedTool(bed_fpath)
        if bed.field_count() > 4:
            bed = bed.cut(range(4))
        bed = bed.filter(
            lambda x: x.chrom and not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))
        bed = bed.remove_invalid()
        with file_transaction(work_dir, clean_target_bed_fpath) as tx:
            bed.saveas(tx)
        debug('Saved to ' + clean_target_bed_fpath)
        verify_file(clean_target_bed_fpath, is_critical=True)

    # 2. Sort
    sort_target_bed_fpath = intermediate_fname(work_dir, clean_target_bed_fpath, 'sorted')
    if not can_reuse(sort_target_bed_fpath, clean_target_bed_fpath):
        debug()
        debug('Sorting target BED file...')
        sort_target_bed_fpath = sort_bed(
            clean_target_bed_fpath,
            output_bed_fpath=sort_target_bed_fpath,
            fai_fpath=fai_fpath)
        debug('Saved to ' + sort_target_bed_fpath)
        verify_file(sort_target_bed_fpath, is_critical=True)

    # 3. Annotate / overlap with genome features
    if genome not in ebl.SUPPORTED_GENOMES:
        ann_target_bed_fpath = sort_target_bed_fpath
    else:
        ann_target_bed_fpath = intermediate_fname(work_dir, sort_target_bed_fpath, 'ann_plus_features')
        if not can_reuse(ann_target_bed_fpath, sort_target_bed_fpath):
            debug()
            overlap_kwargs = dict(work_dir=work_dir, genome=genome, extended=True, only_canonical=True)
            if BedTool(sort_target_bed_fpath).field_count() == 3 or reannotate:
                debug('Annotating target BED file and collecting overlapping genome features')
                # reannotate is forwarded only on this path; the other call
                # relies on overlap_with_features' own default
                overlap_kwargs['reannotate'] = reannotate
            else:
                debug('Overlapping with genomic features:')
            overlap_with_features(sort_target_bed_fpath, ann_target_bed_fpath, **overlap_kwargs)
            debug('Saved to ' + ann_target_bed_fpath)
            verify_file(ann_target_bed_fpath, is_critical=True)

    # 4. Final clean-up
    final_clean_target_bed_fpath = intermediate_fname(work_dir, ann_target_bed_fpath, 'clean')
    if not can_reuse(final_clean_target_bed_fpath, ann_target_bed_fpath):
        bed = BedTool(ann_target_bed_fpath).remove_invalid()
        with file_transaction(work_dir, final_clean_target_bed_fpath) as tx:
            bed.saveas(tx)
        verify_file(final_clean_target_bed_fpath, is_critical=True)

    self.bed_fpath = final_clean_target_bed_fpath
    self.bed = BedTool(self.bed_fpath)

    capture_bed_basepath = join(output_dir, basename(bed_fpath))
    self.capture_bed_fpath = add_suffix(capture_bed_basepath, 'clean_sorted_ann')
    if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
        with file_transaction(work_dir, self.capture_bed_fpath) as tx:
            self.get_capture_bed().saveas(tx)

    self.gene_keys_set, self.gene_keys_list = get_genes_from_bed(bed_fpath)
    self.regions_num = self.get_capture_bed().count()

    self._make_qualimap_bed(work_dir)
    if padding:
        self._make_padded_bed(work_dir, fai_fpath, padding)
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):
    """Annotate BED regions with Ensembl features and save the result.

    The input BED is sorted, overlapped with exon / CDS / stop_codon /
    transcript features (optionally filtered by ``high_confidence``,
    ``only_canonical`` and ``coding_only``), and written to ``output_fpath``.

    Output columns: the first 6 by default, the first 4 with ``short``, all
    but the last with ``extended`` (which takes precedence over ``short``).
    The last column is appended when the input had extra columns worth
    keeping. In ``extended`` mode three ``##`` description lines and a
    header row precede the data.

    :param reannotate: forced to True when the input has only 3 columns.
    :param is_debug: also dump the regions and features to ``work_dir``.
    :param kwargs: forwarded to ``_annotate``.
    :return: ``output_fpath``.
    """
    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_col_num = BedTool(input_bed_fpath).field_count()
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    ori_bed = BedTool(input_bed_fpath)

    if high_confidence:
        features_bed = features_bed.filter(ebl.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ebl.protein_coding_filter)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(
        lambda x: x[ebl.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')

    # high_confidence is passed as False here: the filter was applied above
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ebl.BedCols.names[i] for i in ebl.BedCols.cols]

    if not reannotate and ori_col_num == 4:
        # no need to report the original gene field if we are not re-annotating
        add_ori_extra_fields = False
    else:
        add_ori_extra_fields = ori_col_num > 3

    def _select_columns(row):
        # the same column selection is applied to the header and to each record
        if extended:
            selected = row[:-1]
        elif short:
            selected = row[:4]
        else:
            selected = row[:6]
        if add_ori_extra_fields:
            selected.append(row[-1])
        return selected

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = _select_columns(full_header)

            if extended:
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
                out.write('\t'.join(header) + '\n')

            for full_fields in annotated:
                fields = _select_columns(full_fields)
                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1

    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
def _proc_sambamba_depth(sambamba_depth_output_fpath, output_fpath, sample_name, depth_thresholds):
    """Convert ``sambamba depth`` region output into a per-region TSV report.

    The ``#`` header line of the sambamba output is used to locate the
    ``readCount`` and ``meanCoverage`` columns and triggers writing of the
    report header. Every other line becomes one report row: region
    coordinates and size, annotation columns addressed through
    ``ebl.BedCols``, the average depth, and the columns found between
    ``meanCoverage`` and the last column (one per depth threshold).
    Empty, ``None`` and ``.`` values are written as ``.``.

    :param sambamba_depth_output_fpath: sambamba output to read.
    :param output_fpath: report to write; reused when ``can_reuse`` allows.
    :param sample_name: not used by this function.
    :param depth_thresholds: thresholds used to name the ``at{N}x`` columns.
    :return: ``output_fpath``.
    """
    missing_values = ['', None, '.']

    if can_reuse(output_fpath, sambamba_depth_output_fpath):
        return output_fpath

    debug('Reading coverage statistics and writing regions to ' + output_fpath)

    def write_line(f, fields):
        f.write('\t'.join(fields) + '\n')

    def as_percent(value):
        return (value + '%') if value not in missing_values else '.'

    read_count_col = None
    mean_cov_col = None
    total_regions_count = 0

    with file_transaction(None, output_fpath) as tx:
        with open(sambamba_depth_output_fpath) as sambabma_depth_file, open(tx, 'w') as out:
            for line in sambabma_depth_file:
                if line.startswith('#'):
                    fs = line.split('\t')
                    # NOTE(review): the +1 shift is kept from the original
                    # code; presumably it accounts for a column present in
                    # data rows but not in the header - confirm against
                    # actual sambamba output.
                    read_count_col = fs.index('readCount') + 1
                    mean_cov_col = fs.index('meanCoverage') + 1
                    write_line(out, [
                        'chrom',
                        'start',
                        'end',
                        'size',
                        'gene',
                        'exon',
                        'strand',
                        'feature',
                        'biotype',
                        'transcript',
                        'trx_overlap',
                        'exome_overlap',
                        'cds_overlap',
                        'avg_depth',
                    ] + ['at{}x'.format(ths) for ths in depth_thresholds])
                    continue

                fs = line.strip('\n').split('\t')
                chrom = fs[0]
                start, end = int(fs[1]), int(fs[2])
                region_size = end - start
                # when readCount sits where the gene column would be, the
                # input carries no gene annotation
                gene_name = fs[ebl.BedCols.GENE] if read_count_col != ebl.BedCols.GENE else '.'
                exon = fs[ebl.BedCols.EXON]
                strand = fs[ebl.BedCols.STRAND]
                feature = fs[ebl.BedCols.FEATURE]
                biotype = fs[ebl.BedCols.BIOTYPE]
                transcript = fs[ebl.BedCols.ENSEMBL_ID]
                transcript_overlap = fs[ebl.BedCols.TX_OVERLAP_PERCENTAGE]
                exome_overlap = fs[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE]
                cds_overlap = fs[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE]
                avg_depth = float(fs[mean_cov_col])

                # threshold rates follow meanCoverage; the last column is skipped
                rates_within_threshs = fs[mean_cov_col + 1:-1]

                row = [
                    chrom,
                    start,
                    end,
                    region_size,
                    gene_name,
                    exon,
                    strand,
                    feature,
                    biotype,
                    transcript,
                    as_percent(transcript_overlap),
                    as_percent(exome_overlap),
                    as_percent(cds_overlap),
                    avg_depth,
                ] + rates_within_threshs
                write_line(out, [str(v) if v not in missing_values else '.' for v in row])

                total_regions_count += 1
                if total_regions_count > 0 and total_regions_count % 10000 == 0:
                    debug('  Processed {0:,} regions'.format(total_regions_count))

    # Report the number of rows actually written (the former counter was a
    # dict that never received any entries, so the log always said 0).
    debug('Total regions: ' + str(total_regions_count))
    return output_fpath
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """Determine the sample sex by comparing chrY key-region depth to average depth.

    The chrY key regions are looked up in ``chry_key_regions_by_genome`` by
    the first key that is a substring of ``genome``; with ``target_bed``
    they are intersected with the target first. The sample is called female
    when chrY mean coverage is 0 or when ``avg_depth / chrY coverage``
    exceeds ``FEMALE_Y_COVERAGE_FACTOR``, male otherwise.

    :param work_dir: directory for temporary and intermediate files.
    :param bam_fpath: path to the BAM file; required.
    :param avg_depth: sample average depth (converted with ``float()``).
    :param genome: genome build name.
    :param target_bed: optional capture target BED.
    :return: ``'M'``, ``'F'``, or ``None`` if sex cannot be determined.
    """
    debug()
    debug('Determining sex')

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if not target_bed:
        debug('WGS, determining sex based on chrY key regions coverage.')
    else:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))

    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    # Apostrophes below replace the invalid "\s" escape of the old messages.
    if chry_mean_coverage == 0:
        debug("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        # mean target coverage much higher than chrY coverage means female
        is_female = factor > FEMALE_Y_COVERAGE_FACTOR
        if is_female:
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's female")
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's male")
        sex = 'F' if is_female else 'M'

    debug('Sex is ' + sex)
    debug()
    return sex
def downsample(work_dir, sample_name, fastq_left_fpath, fastq_right_fpath, downsample_to, num_pairs=None):
    """Get N random records from a fastq file (pair) without reading the
    whole thing into memory.

    Modified from: http://www.biostars.org/p/6544/

    :param work_dir: directory for the downsampled files and the transaction.
    :param sample_name: name used in log messages; when empty it is derived
        from the characters the two input paths have in common.
    :param fastq_left_fpath: first-mate fastq (possibly gzipped).
    :param fastq_right_fpath: second-mate fastq.
    :param downsample_to: a float is treated as a fraction of ``num_pairs``,
        anything else as the absolute number of records to keep.
    :param num_pairs: number of records in the input; counted when ``None``
        and capped at ``LIMIT``.
    :return: ``(left_path, right_path)`` of the downsampled files, or the
        input paths when the input is already small enough.
    """
    sample_name = sample_name or splitext(''.join(
        lc if lc == rc else '' for lc, rc in zip(fastq_left_fpath, fastq_right_fpath)))[0]

    l_out_fpath = make_downsampled_fpath(work_dir, fastq_left_fpath)
    r_out_fpath = make_downsampled_fpath(work_dir, fastq_right_fpath)
    if can_reuse(l_out_fpath, [fastq_left_fpath, fastq_right_fpath]):
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)
    if num_pairs is None:
        info(sample_name + ': counting number of reads in fastq...')
        num_pairs = _count_records_in_fastq(fastq_left_fpath)
    if num_pairs > LIMIT:
        info(sample_name + ' the number of reads is higher than ' + str(LIMIT) +
             ', sampling from only first ' + str(LIMIT))
        num_pairs = LIMIT
    info(sample_name + ': ' + str(num_pairs) + ' reads')

    num_downsample_pairs = int(downsample_to * num_pairs) if isinstance(downsample_to, float) else downsample_to
    if num_pairs <= num_downsample_pairs:
        info(sample_name + ': and it is less than ' + str(num_downsample_pairs) + ', so no downsampling.')
        return fastq_left_fpath, fastq_right_fpath

    info(sample_name + ': downsampling to ' + str(num_downsample_pairs))
    rand_records = sorted(random.sample(range(num_pairs), num_downsample_pairs))

    info('Opening ' + fastq_left_fpath)
    fh1 = open_gzipsafe(fastq_left_fpath)
    fh2 = None
    sub1 = sub2 = None
    written_records = 0
    try:
        # The second-mate handles were already optional below; the log lines
        # and the unpacking around them are now guarded the same way.
        if fastq_right_fpath:
            info('Opening ' + fastq_right_fpath)
            fh2 = open_gzipsafe(fastq_right_fpath)

        out_files = (l_out_fpath, r_out_fpath) if r_out_fpath else (l_out_fpath,)
        with file_transaction(work_dir, out_files) as tx_out_files:
            if isinstance(tx_out_files, six.string_types):
                tx_out_files = (tx_out_files,)
            tx_out_f1 = tx_out_files[0]
            # stays None for single-file transactions instead of being unbound
            tx_out_f2 = tx_out_files[1] if len(tx_out_files) > 1 else None

            try:
                info('Opening ' + str(tx_out_f1) + ' to write')
                sub1 = open_gzipsafe(tx_out_f1, "w")
                if r_out_fpath and tx_out_f2:
                    info('Opening ' + str(tx_out_f2) + ' to write')
                    sub2 = open_gzipsafe(tx_out_f2, "w")

                rec_no = -1
                for rr in rand_records:
                    # skip forward to the sampled record (4 lines per record)
                    while rec_no < rr:
                        rec_no += 1
                        for i in range(4):
                            fh1.readline()
                        if fh2:
                            for i in range(4):
                                fh2.readline()
                    for i in range(4):
                        sub1.write(fh1.readline())
                        if sub2:
                            sub2.write(fh2.readline())
                    written_records += 1
                    if written_records % 10000 == 0:
                        info(sample_name + ': written ' + str(written_records) + ', rec_no ' + str(rec_no + 1))
                    if rec_no > num_pairs:
                        # the message tail used to be passed as a second
                        # positional argument instead of being concatenated
                        info(sample_name + ' reached the limit of ' + str(num_pairs) +
                             ' read lines, stopping.')
                        break
                info(sample_name + ': done, written ' + str(written_records) + ', rec_no ' + str(rec_no))
            finally:
                # outputs must be closed before the transaction is finalized
                if sub1:
                    sub1.close()
                if sub2:
                    sub2.close()
    finally:
        fh1.close()
        if fh2:
            fh2.close()

    saved_to = l_out_fpath + (' and ' + r_out_fpath if r_out_fpath else '')
    info(sample_name + ': done downsampling, saved to ' + saved_to +
         ', total ' + str(written_records) + ' paired reads written')
    return l_out_fpath, r_out_fpath
def _make_target_bed(self, bed_fpath, work_dir, output_dir, is_debug,
                     padding=None, fai_fpath=None, genome=None, reannotate=False):
    """ Prepare the target BED file in stages (clean -> sort -> annotate -> clean),
        reusing any intermediate file for which can_reuse() reports it can be reused.

        Sets self.bed_fpath, self.bed, self.capture_bed_fpath, self.gene_keys_set,
        self.gene_keys_list and self.regions_num, then builds the Qualimap BED and,
        if `padding` is truthy, the padded BED.

        `is_debug` is accepted but not used in this method.
    """
    skipped_prefixes = ('#', ' ', 'track', 'browser')

    def _has_regular_chrom(interval):
        # Keep intervals with a non-empty chromosome name that does not look like a header line
        return interval.chrom and not interval.chrom.startswith(skipped_prefixes)

    # Stage 1: keep at most 4 columns, drop header-like and invalid lines
    cleaned_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')
    if not can_reuse(cleaned_fpath, bed_fpath):
        debug()
        debug('Cleaning target BED file...')
        target = BedTool(bed_fpath)
        if target.field_count() > 4:
            target = target.cut(range(4))
        target = target.filter(_has_regular_chrom)
        target = target.remove_invalid()
        with file_transaction(work_dir, cleaned_fpath) as tx:
            target.saveas(tx)
        debug('Saved to ' + cleaned_fpath)
        verify_file(cleaned_fpath, is_critical=True)

    # Stage 2: sort
    sorted_fpath = intermediate_fname(work_dir, cleaned_fpath, 'sorted')
    if not can_reuse(sorted_fpath, cleaned_fpath):
        debug()
        debug('Sorting target BED file...')
        sorted_fpath = sort_bed(cleaned_fpath, output_bed_fpath=sorted_fpath, fai_fpath=fai_fpath)
        debug('Saved to ' + sorted_fpath)
        verify_file(sorted_fpath, is_critical=True)

    # Stage 3: overlap with genomic features, only for supported genomes
    if genome not in ebl.SUPPORTED_GENOMES:
        annotated_fpath = sorted_fpath
    else:
        annotated_fpath = intermediate_fname(work_dir, sorted_fpath, 'ann_plus_features')
        if not can_reuse(annotated_fpath, sorted_fpath):
            debug()
            overlap_kwargs = dict(work_dir=work_dir, genome=genome, extended=True, only_canonical=True)
            if BedTool(sorted_fpath).field_count() == 3 or reannotate:
                debug('Annotating target BED file and collecting overlapping genome features')
                # `reannotate` is forwarded only in this branch; the other one
                # relies on the callee's default.
                overlap_kwargs['reannotate'] = reannotate
            else:
                debug('Overlapping with genomic features:')
            overlap_with_features(sorted_fpath, annotated_fpath, **overlap_kwargs)
            debug('Saved to ' + annotated_fpath)
            verify_file(annotated_fpath, is_critical=True)

    # Stage 4: drop invalid records once more after annotation
    final_fpath = intermediate_fname(work_dir, annotated_fpath, 'clean')
    if not can_reuse(final_fpath, annotated_fpath):
        final_bed = BedTool(annotated_fpath).remove_invalid()
        with file_transaction(work_dir, final_fpath) as tx:
            final_bed.saveas(tx)
        verify_file(final_fpath, is_critical=True)

    self.bed_fpath = final_fpath
    self.bed = BedTool(self.bed_fpath)

    # Capture BED is written to the output directory under the original file name + suffix
    capture_base = join(output_dir, basename(bed_fpath))
    self.capture_bed_fpath = add_suffix(capture_base, 'clean_sorted_ann')
    if not can_reuse(self.capture_bed_fpath, self.bed_fpath):
        with file_transaction(work_dir, self.capture_bed_fpath) as tx:
            self.get_capture_bed().saveas(tx)

    # Gene keys are read from the original input BED
    self.gene_keys_set, self.gene_keys_list = get_genes_from_bed(bed_fpath)
    self.regions_num = self.get_capture_bed().count()

    self._make_qualimap_bed(work_dir)
    if padding:
        self._make_padded_bed(work_dir, fai_fpath, padding)
def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False, only_canonical=False,
             coding_only=False, short=False, extended=False, is_debug=False, **kwargs):
    """ Annotate BED regions with overlapping features and write a tab-separated file.

        :param input_bed_fpath: BED file to annotate; it is sorted before use
        :param output_fpath: destination file, written via file_transaction()
        :param work_dir: directory for intermediate files and the bedtools temp dir
        :param genome: genome name used to fetch the features, the .fai file and the
            chromosome order; when falsy, chromosome order is taken from the input BED
        :param reannotate: forced to True when the input BED has exactly 3 columns
        :param high_confidence: keep only features passing ebl.high_confidence_filter
        :param only_canonical: keep only features passing ebl.get_only_canonical_filter(genome)
        :param coding_only: keep only features passing ebl.protein_coding_filter
        :param short: write the first 4 columns instead of the default first 6
        :param extended: write all columns but the last, and prepend '##' comment
            lines describing the overlap-percentage columns; overrides `short`
        :param is_debug: save the regions and the filtered features into work_dir
        :param kwargs: forwarded to _annotate()
        :return: output_fpath
    """
    debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        # str() so the message can still be built when genome is None (the default).
        # NOTE(review): critical() is presumably fatal - confirm.
        critical('Genome ' + str(genome) + ' is not supported. Supported: ' +
                 ', '.join(ebl.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_col_num = BedTool(input_bed_fpath).field_count()
    # A 3-column BED has nothing to keep, so it is always re-annotated
    reannotate = reannotate or ori_col_num == 3

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    ori_bed = BedTool(input_bed_fpath)

    if high_confidence:
        features_bed = features_bed.filter(ebl.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ebl.protein_coding_filter)

    info('Extracting features from Ensembl GTF')
    kept_feature_types = ['exon', 'CDS', 'stop_codon', 'transcript']
    features_bed = features_bed.filter(lambda x: x[ebl.BedCols.FEATURE] in kept_feature_types)

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')

    # high_confidence is passed as False here; the filter, if requested, was applied above
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ebl.BedCols.names[i] for i in ebl.BedCols.cols]

    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    def _select_columns(row):
        # Same column choice for the header and for every data row
        if extended:
            selected = row[:-1]
        elif short:
            selected = row[:4]
        else:
            selected = row[:6]
        if add_ori_extra_fields:
            selected.append(row[-1])
        return selected

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            if extended:
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ebl.BedCols.names[ebl.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
                out.write('\t'.join(_select_columns(full_header)) + '\n')
            for full_fields in annotated:
                fields = _select_columns(full_fields)
                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1

    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath
def downsample(work_dir, sample_name, fastq_left_fpath, fastq_right_fpath, downsample_to, num_pairs=None):
    """ Write a random subset of FASTQ records (4 lines each) to new files,
        streaming the input so it is never held in memory as a whole.
        Modified from: http://www.biostars.org/p/6544/

        :param work_dir: passed to make_downsampled_fpath() and file_transaction()
        :param sample_name: label used in log messages; when empty, it is derived from
            the characters the two input paths have in common
        :param fastq_left_fpath: path to the first (or only) FASTQ file
        :param fastq_right_fpath: path to the mate FASTQ file; may be falsy for
            single-end data
        :param downsample_to: a float is treated as a fraction of num_pairs, any other
            value as an absolute number of records
        :param num_pairs: number of records in the input; counted from the left file
            when None. Capped at LIMIT, so only the first LIMIT records are sampled.
        :return: (left_path, right_path). These are the downsampled files, or the
            original input paths when the input is already small enough.
    """
    sample_name = sample_name or splitext(''.join(
        lc if lc == rc else ''
        for lc, rc in zip(fastq_left_fpath, fastq_right_fpath or fastq_left_fpath)))[0]

    l_out_fpath = make_downsampled_fpath(work_dir, fastq_left_fpath)
    r_out_fpath = make_downsampled_fpath(work_dir, fastq_right_fpath) if fastq_right_fpath else None
    if can_reuse(l_out_fpath, [f for f in (fastq_left_fpath, fastq_right_fpath) if f]):
        return l_out_fpath, r_out_fpath

    info('Processing ' + sample_name)
    if num_pairs is None:
        info(sample_name + ': counting number of reads in fastq...')
        num_pairs = _count_records_in_fastq(fastq_left_fpath)
    if num_pairs > LIMIT:
        info(sample_name + ' the number of reads is higher than ' + str(LIMIT) +
             ', sampling from only first ' + str(LIMIT))
        num_pairs = LIMIT
    info(sample_name + ': ' + str(num_pairs) + ' reads')

    if isinstance(downsample_to, float):
        num_downsample_pairs = int(downsample_to * num_pairs)
    else:
        num_downsample_pairs = downsample_to

    if num_pairs <= num_downsample_pairs:
        info(sample_name + ': and it is less than ' + str(num_downsample_pairs) + ', so no downsampling.')
        return fastq_left_fpath, fastq_right_fpath
    info(sample_name + ': downsampling to ' + str(num_downsample_pairs))

    # Sorted 0-based indices of the records to keep; sorted so that the input
    # can be consumed in a single forward pass.
    rand_records = sorted(random.sample(range(num_pairs), num_downsample_pairs))

    out_files = (l_out_fpath, r_out_fpath) if r_out_fpath else (l_out_fpath,)
    written_records = 0
    fh1 = fh2 = None
    try:
        info('Opening ' + fastq_left_fpath)
        fh1 = open_gzipsafe(fastq_left_fpath)
        if fastq_right_fpath:
            info('Opening ' + fastq_right_fpath)
            fh2 = open_gzipsafe(fastq_right_fpath)

        with file_transaction(work_dir, out_files) as tx_out_files:
            # The transaction yields either a single path or a sequence of paths.
            if isinstance(tx_out_files, six.string_types):
                tx_paths = [tx_out_files]
            else:
                tx_paths = list(tx_out_files)
            tx_out_f1 = tx_paths[0]
            tx_out_f2 = tx_paths[1] if len(tx_paths) > 1 else None

            sub1 = sub2 = None
            try:
                info('Opening ' + str(tx_out_f1) + ' to write')
                sub1 = open_gzipsafe(tx_out_f1, "w")
                if r_out_fpath and tx_out_f2 is not None:
                    info('Opening ' + str(tx_out_f2) + ' to write')
                    sub2 = open_gzipsafe(tx_out_f2, "w")

                rec_no = -1  # index of the last record consumed from the input
                for rr in rand_records:
                    # Skip the records strictly before the selected one...
                    while rec_no < rr - 1:
                        rec_no += 1
                        for _ in range(4):
                            fh1.readline()
                        if fh2:
                            for _ in range(4):
                                fh2.readline()
                    # ...then copy record number `rr` itself (and its mate).
                    for _ in range(4):
                        sub1.write(fh1.readline())
                        if sub2:
                            sub2.write(fh2.readline())
                    rec_no += 1
                    written_records += 1
                    if written_records % 10000 == 0:
                        info(sample_name + ': written ' + str(written_records) +
                             ', rec_no ' + str(rec_no + 1))
                info(sample_name + ': done, written ' + str(written_records) + ', rec_no ' + str(rec_no))
            finally:
                # Outputs are closed while still inside the transaction block,
                # as in the original code.
                for out_fh in (sub1, sub2):
                    if out_fh is not None:
                        out_fh.close()
    finally:
        for in_fh in (fh1, fh2):
            if in_fh is not None:
                in_fh.close()

    saved_to = l_out_fpath + (' and ' + r_out_fpath if r_out_fpath else '')
    info(sample_name + ': done downsampling, saved to ' + saved_to +
         ', total ' + str(written_records) + ' paired reads written')
    return l_out_fpath, r_out_fpath