def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run('gsort {input_bed_fpath} {fai_fpath}'.format(**locals()), output_fpath=tx)

    return output_bed_fpath

def partition_gtf(gtf, coding=False, out_file=False):
    """
    Return a GTF file of all coding or all non-coding transcripts. The GTF must
    be annotated with gene_biotype = "protein_coding", or have the source column
    set to the biotype, for all coding transcripts. Set coding to True to get
    only the coding transcripts, False to get only the non-coding ones.
    """
    if out_file and file_exists(out_file):
        return out_file
    if not out_file:
        out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".gtf").name

    if coding:
        pred = lambda biotype: biotype and biotype == "protein_coding"
    else:
        pred = lambda biotype: biotype and biotype != "protein_coding"

    biotype_lookup = _biotype_lookup_fn(gtf)

    db = get_gtf_db(gtf)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in db.all_features():
                biotype = biotype_lookup(feature)
                if pred(biotype):
                    out_handle.write(str(feature) + "\n")
    return out_file

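# A minimal, self-contained sketch of the same biotype split done with plain text
# parsing instead of the gffutils-backed get_gtf_db(). The regex, function name and
# output paths are illustrative assumptions, not part of the original module.
import re

def _partition_gtf_sketch(gtf, coding_out, noncoding_out):
    biotype_re = re.compile(r'(?:gene_|transcript_)?biotype "([^"]+)"')
    with open(gtf) as inp, open(coding_out, 'w') as cod, open(noncoding_out, 'w') as non:
        for line in inp:
            if line.startswith('#'):
                continue
            m = biotype_re.search(line)
            biotype = m.group(1) if m else None
            if biotype == 'protein_coding':
                cod.write(line)
            elif biotype:
                non.write(line)
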
def batch_callable_bed(bam_files, output_bed_file, work_dir, genome_fasta_file, min_depth,
                       parall_view=None):
    """ Pick 3 random samples and compute callable regions for them: a trade-off
        between looping through every sample in a huge batch and hitting a single
        sample with outstanding coverage.
    """
    if can_reuse(output_bed_file, bam_files):
        return output_bed_file

    work_dir = safe_mkdir(join(work_dir, 'callable_work'))
    # random.seed(1234)  # seeding random for reproducibility
    # bam_files = random.sample(bam_files, min(len(bam_files), 3))

    if parall_view:
        callable_beds = parall_view.run(_calculate, [
            [bf, work_dir, genome_fasta_file, min_depth] for bf in bam_files])
    else:
        with parallel_view(len(bam_files), ParallelCfg(threads=len(bam_files)), work_dir) as parall_view:
            callable_beds = parall_view.run(_calculate, [
                [bf, work_dir, genome_fasta_file, min_depth] for bf in bam_files])

    good_overlap_sample_fraction = 0.8  # pick regions that have coverage in 80% of samples
    good_overlap_count = max(1, good_overlap_sample_fraction * len(callable_beds))
    info(f'Intersecting callable regions and picking good overlaps with >={good_overlap_count} '
         f'samples ({100 * good_overlap_sample_fraction}% of {len(callable_beds)})')

    with file_transaction(work_dir, output_bed_file) as tx:
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        intersection = pybedtools.BedTool() \
            .multi_intersect(i=callable_beds) \
            .filter(lambda r: len(r[4].split(',')) >= good_overlap_count)
        intersection.saveas(tx)
    info(f'Saved to {output_bed_file}')
    return output_bed_file

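# A minimal sketch of the multi_intersect + "covered in enough samples" filter used
# above, on two tiny made-up BED files. The file names, temp dir and data are
# illustrative; the 0.8 threshold mirrors good_overlap_sample_fraction.
import os
import tempfile
import pybedtools

def _good_overlap_sketch():
    tmp_dir = tempfile.mkdtemp()
    a = pybedtools.BedTool('chr1\t0\t100\nchr1\t200\t300\n', from_string=True) \
        .saveas(os.path.join(tmp_dir, 'a.bed'))
    b = pybedtools.BedTool('chr1\t50\t150\nchr1\t200\t300\n', from_string=True) \
        .saveas(os.path.join(tmp_dir, 'b.bed'))
    min_samples = max(1, 0.8 * 2)  # 80% of 2 samples, i.e. effectively both
    # column 4 of multiIntersectBed output lists which inputs cover the interval
    return pybedtools.BedTool() \
        .multi_intersect(i=[a.fn, b.fn]) \
        .filter(lambda r: len(r[4].split(',')) >= min_samples)
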
def main():
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ba.SUPPORTED_GENOMES),
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        critical('Error: please, specify genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ba.BedCols.FEATURE] == 'CDS')
    features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))

    info('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    info('Done, saved to ' + output_fpath)

def filter_bed_with_gene_set(bed_fpath, gene_keys_set, output_fpath):
    with file_transaction(None, output_fpath) as tx:
        with open(bed_fpath) as inp, open(tx, 'w') as out:
            for l in inp:
                if l.strip('\n'):
                    chrom, start, end, gene = l.strip('\n').split('\t')
                    if (gene, chrom) in gene_keys_set:
                        out.write(l)

def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')

    index_bam(bam_fpath)
    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex

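# The sex call above reduces to a small decision rule; here it is isolated as a pure
# function for clarity. The parameter names mirror the constants used above, but the
# default values are illustrative assumptions, not the module's actual thresholds.
def _call_sex_sketch(avg_depth, chry_depth, min_avg_depth=5, female_y_factor=10):
    if avg_depth < min_avg_depth:
        return None      # sample too shallow to call
    if chry_depth == 0:
        return 'F'       # no Y coverage at all
    return 'F' if avg_depth / chry_depth > female_y_factor else 'M'
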
def sample_callable_bed(bam_file, output_bed_file, work_dir, genome_fasta_file, min_depth):
    """Retrieve callable regions for a sample, subset by defined analysis regions.
    """
    callable_bed = _calculate(bam_file, work_dir, genome_fasta_file, min_depth)
    if not can_reuse(output_bed_file, callable_bed):
        with file_transaction(work_dir, output_bed_file) as tx_out_file:
            callable_regions = pybedtools.BedTool(callable_bed).filter(lambda x: x.name == 'CALLABLE')
            callable_regions.saveas(tx_out_file)
    return output_bed_file

def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('Either of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath

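# A minimal stand-alone sketch of the same idea: derive chromosome order from a .fai
# index and sort BED lines by (chromosome order, start, end). Pure stdlib; the function
# name and file paths are placeholders.
def _sort_bed_sketch(bed_path, fai_path, out_path):
    with open(fai_path) as f:
        chrom_order = {line.split('\t')[0]: i for i, line in enumerate(f)}
    with open(bed_path) as f:
        lines = [l.rstrip('\n').split('\t') for l in f if l.strip() and not l.startswith('#')]
    lines.sort(key=lambda fs: (chrom_order.get(fs[0], -1), int(fs[1]), int(fs[2])))
    with open(out_path, 'w') as out:
        for fs in lines:
            out.write('\t'.join(fs) + '\n')
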
def run(cmd, output_fpath=None, input_fpaths=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    if input_fpaths is not None:
        if isinstance(input_fpaths, str):
            input_fpaths = [input_fpaths]
        for fpath in input_fpaths:
            verify_file(fpath, is_critical=True)

    env = _get_env(env_vars)
    # info('env: ' + str(env))

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpaths):
        try:
            info(' '.join(str(x) for x in _cmd) if not isinstance(_cmd, str) else _cmd)
            _do_run(_cmd, checks, env, _output_fpath, _input_fpaths)
        except:
            raise

    if output_fpath:
        if isfile(output_fpath):
            os.remove(output_fpath)

    if output_fpath:
        if stdout_tx:
            with file_transaction(None, output_fpath) as tx_out_file:
                if stdout_to_outputfile:
                    cmd += ' > ' + tx_out_file
                else:
                    cmd += '\n'
                    cmd = cmd.replace(' ' + output_fpath + ' ', ' ' + tx_out_file + ' ') \
                             .replace(' "' + output_fpath + '" ', ' ' + tx_out_file + '" ') \
                             .replace(' \'' + output_fpath + '\' ', ' ' + tx_out_file + '\' ') \
                             .replace(' ' + output_fpath + '\n', ' ' + tx_out_file) \
                             .replace(' "' + output_fpath + '"\n', ' ' + tx_out_file + '"') \
                             .replace(' \'' + output_fpath + '\'\n', ' ' + tx_out_file + '\'') \
                             .replace('\n', '')
                _try_run(cmd, tx_out_file, input_fpaths)
        else:
            _try_run(cmd, output_fpath, input_fpaths)
    else:
        _try_run(cmd, None, input_fpaths)

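# file_transaction() is what makes the stdout redirection above safe: the command
# writes to a temporary path, which is only moved into place on success. A minimal
# stdlib-only sketch of that pattern, assuming file_transaction() behaves this way;
# the helper name is hypothetical and the real context manager is richer.
import os
import shutil
import subprocess
import tempfile

def _run_with_tx_output_sketch(cmd, output_fpath):
    fd, tx_path = tempfile.mkstemp(dir=os.path.dirname(output_fpath) or '.')
    os.close(fd)
    try:
        with open(tx_path, 'w') as out:
            subprocess.run(cmd, shell=True, stdout=out, check=True)
        shutil.move(tx_path, output_fpath)  # publish the finished file only on success
    finally:
        if os.path.exists(tx_path):        # clean up the temp file if the command failed
            os.remove(tx_path)
    return output_fpath
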
def merge_overlaps(work_dir, bed_fpath, distance=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like
    FreeBayes that don't collapse BEDs prior to using them.
    """
    output_fpath = intermediate_fname(work_dir, bed_fpath, 'merged')
    if isfile(output_fpath) and verify_file(output_fpath, cmp_f=bed_fpath):
        return output_fpath
    with file_transaction(work_dir, output_fpath) as tx:
        kwargs = dict(d=distance) if distance else dict()
        BedTool(bed_fpath).merge(**kwargs).saveas(tx)
    return output_fpath

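# A quick illustration of the docstring's example with pybedtools: the two overlapping
# intervals collapse into one, and d= additionally joins intervals separated by a gap.
# Purely a usage sketch; the coordinates are made up.
import pybedtools

def _merge_sketch():
    bed = pybedtools.BedTool('chr1\t1\t100\nchr1\t90\t100\n', from_string=True)
    print(bed.merge())       # -> chr1  1  100
    print(bed.merge(d=50))   # also joins intervals up to 50 bp apart
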
def clean_bed(bed_fpath, work_dir):
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')

    if not can_reuse(clean_fpath, bed_fpath):
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        bed = BedTool(bed_fpath)
        bed = bed.filter(lambda x: x.chrom and
                         not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))
        bed = bed.remove_invalid()
        with file_transaction(work_dir, clean_fpath) as tx_out_file:
            bed.saveas(tx_out_file)
        verify_bed(clean_fpath, is_critical=True)
        debug('Saved clean BED file into ' + clean_fpath)

    return clean_fpath

def merge_overlaps(work_dir, bed_fpath, distance=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like
    FreeBayes that don't collapse BEDs prior to using them.
    """
    output_fpath = intermediate_fname(work_dir, bed_fpath, 'merged')
    if isfile(output_fpath) and verify_file(output_fpath, cmp_f=bed_fpath):
        return output_fpath
    with file_transaction(work_dir, output_fpath) as tx:
        import pybedtools
        kwargs = dict(d=distance) if distance else dict()
        pybedtools.BedTool(bed_fpath).merge(**kwargs).saveas(tx)
    return output_fpath

def tx2genefile(gtf, out_file=None):
    """
    write out a file of transcript->gene mappings.
    use the installed tx2gene.csv if it exists, else write a new one out
    """
    installed_tx2gene = os.path.join(os.path.dirname(gtf), "tx2gene.csv")
    if file_exists(installed_tx2gene):
        return installed_tx2gene
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for k, v in transcript_to_gene(gtf).items():
                out_handle.write(",".join([k, v]) + "\n")
    return out_file

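# transcript_to_gene() is not shown here; below is a minimal stand-in that pulls
# transcript_id -> gene_id pairs straight from the GTF attribute column with a regex.
# It assumes Ensembl-style attributes and is only a sketch of the mapping step.
import re

def _transcript_to_gene_sketch(gtf):
    tx2gene = {}
    attr_re = re.compile(r'gene_id "([^"]+)".*transcript_id "([^"]+)"')
    with open(gtf) as inp:
        for line in inp:
            if line.startswith('#'):
                continue
            m = attr_re.search(line)
            if m:
                gene_id, tx_id = m.groups()
                tx2gene[tx_id] = gene_id
    return tx2gene
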
def clean_bed(bed_fpath, work_dir):
    clean_fpath = intermediate_fname(work_dir, bed_fpath, 'clean')

    if not can_reuse(clean_fpath, bed_fpath):
        import pybedtools
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        bed = pybedtools.BedTool(bed_fpath)
        bed = bed.filter(lambda x: x.chrom and
                         not any(x.chrom.startswith(e) for e in ['#', ' ', 'track', 'browser']))
        bed = bed.remove_invalid()
        with file_transaction(work_dir, clean_fpath) as tx_out_file:
            bed.saveas(tx_out_file)
        verify_bed(clean_fpath, is_critical=True)
        debug('Saved clean BED file into ' + clean_fpath)

    return clean_fpath

def run(cmd, output_fpath=None, input_fpath=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    env = _get_env(env_vars)

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpath):
        try:
            info(' '.join(str(x) for x in _cmd) if not isinstance(_cmd, str) else _cmd)
            _do_run(_cmd, checks, env, _output_fpath, _input_fpath)
        except:
            raise

    if output_fpath:
        if isfile(output_fpath):
            os.remove(output_fpath)

    if output_fpath:
        if stdout_tx:
            with file_transaction(None, output_fpath) as tx_out_file:
                if stdout_to_outputfile:
                    cmd += ' > ' + tx_out_file
                else:
                    cmd += '\n'
                    cmd = cmd.replace(' ' + output_fpath + ' ', ' ' + tx_out_file + ' ') \
                             .replace(' "' + output_fpath + '" ', ' ' + tx_out_file + '" ') \
                             .replace(' \'' + output_fpath + '\' ', ' ' + tx_out_file + '\' ') \
                             .replace(' ' + output_fpath + '\n', ' ' + tx_out_file) \
                             .replace(' "' + output_fpath + '"\n', ' ' + tx_out_file + '"') \
                             .replace(' \'' + output_fpath + '\'\n', ' ' + tx_out_file + '\'') \
                             .replace('\n', '')
                _try_run(cmd, tx_out_file, input_fpath)
        else:
            _try_run(cmd, output_fpath, input_fpath)
    else:
        _try_run(cmd, None, input_fpath)

def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = adjust_path(output_bed_fpath) if output_bed_fpath \
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    regions = []

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('Either of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    with open(input_bed_fpath) as f:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath

def main():
    options = [
        (['-g', '--genome'], dict(
            dest='genome',
            help='Genome build. Accepted values: ' + ', '.join(ebl.SUPPORTED_GENOMES),
        )),
        (['-c', '--canonical'], dict(
            dest='canonical',
            action='store_true',
            help='Use canonical only',
        )),
    ]
    parser = OptionParser()
    for args, kwargs in options:
        parser.add_option(*args, **kwargs)
    opts, args = parser.parse_args()

    if not opts.genome:
        logger.critical('Error: please, specify genome build name with -g (e.g. `-g hg19`)')
    genome = opts.genome

    logger.debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        logger.critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ebl.SUPPORTED_GENOMES))

    logger.warn('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x: x[ebl.BedCols.FEATURE] == 'CDS')
    if opts.canonical:
        features_bed = features_bed.filter(ebl.get_only_canonical_filter(genome))

    logger.warn('Saving CDS regions...')
    output_fpath = adjust_path(join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        features_bed.cut(range(6)).saveas(tx)
    logger.warn('Done, saved to ' + output_fpath)

def _calculate(bam_file, work_dir, genome_fasta_file, min_depth):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    output_prefix = os.path.join(work_dir, bam_samplename(bam_file))

    callability_annotation_file = output_prefix + '.callable.bed'
    if not can_reuse(callability_annotation_file, bam_file):
        info(f'Calculating coverage at {bam_file}')
        run(f'goleft depth --q 1 --mincov {min_depth} --reference {genome_fasta_file} --ordered'
            f' --prefix {output_prefix} {bam_file}')

    callable_file = output_prefix + '.callable.CALLABLE.bed'
    if not can_reuse(callable_file, callability_annotation_file):
        with file_transaction(None, callable_file) as tx:
            pybedtools.BedTool(callability_annotation_file)\
                .filter(lambda x: x.name == 'CALLABLE')\
                .saveas(tx)

    return callable_file

def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    debug()
    debug('Determining sex')
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')

    index_bam(bam_fpath)
    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    if chry_mean_coverage == 0:
        debug('Y depth is 0 - it\'s female')
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s female')
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  ' times higher than Y depth - it\'s male')
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex

def annotate(input_bed_fpath, output_fpath, work_dir, genome=None,
             reannotate=True, high_confidence=False,
             only_canonical=False, coding_only=False, short=False,
             extended=False, is_debug=False, **kwargs):

    debug('Getting features from storage')
    features_bed = ba.get_all_features(genome)
    if features_bed is None:
        critical('Genome ' + genome + ' is not supported. Supported: ' + ', '.join(ba.SUPPORTED_GENOMES))

    if genome:
        fai_fpath = reference_data.get_fai(genome)
        chr_order = reference_data.get_chrom_order(genome)
    else:
        fai_fpath = None
        chr_order = bed_chrom_order(input_bed_fpath)

    input_bed_fpath = sort_bed(input_bed_fpath, work_dir=work_dir, chr_order=chr_order, genome=genome)

    ori_bed = BedTool(input_bed_fpath)
    ori_col_num = ori_bed.field_count()
    reannotate = reannotate or ori_col_num == 3
    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'bedtools')))
    ori_bed = BedTool(input_bed_fpath)

    if high_confidence:
        features_bed = features_bed.filter(ba.high_confidence_filter)
    if only_canonical:
        features_bed = features_bed.filter(ba.get_only_canonical_filter(genome))
    if coding_only:
        features_bed = features_bed.filter(ba.protein_coding_filter)
    # unique_tx_by_gene = find_best_tx_by_gene(features_bed)

    info('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(lambda x:
        x[ba.BedCols.FEATURE] in ['exon', 'CDS', 'stop_codon', 'transcript'])
        # x[ebl.BedCols.ENSEMBL_ID] == unique_tx_by_gene[x[ebl.BedCols.GENE]])

    info('Overlapping regions with Ensembl data')
    if is_debug:
        ori_bed = ori_bed.saveas(join(work_dir, 'bed.bed'))
        debug(f'Saved regions to {ori_bed.fn}')
        features_bed = features_bed.saveas(join(work_dir, 'features.bed'))
        debug(f'Saved features to {features_bed.fn}')
    annotated = _annotate(ori_bed, features_bed, chr_order, fai_fpath, work_dir, ori_col_num,
                          high_confidence=False, reannotate=reannotate, is_debug=is_debug, **kwargs)

    full_header = [ba.BedCols.names[i] for i in ba.BedCols.cols]
    add_ori_extra_fields = ori_col_num > 3
    if not reannotate and ori_col_num == 4:
        add_ori_extra_fields = False  # no need to report the original gene field if we are not re-annotating

    info('Saving annotated regions...')
    total = 0
    with file_transaction(work_dir, output_fpath) as tx:
        with open(tx, 'w') as out:
            header = full_header[:6]
            if short:
                header = full_header[:4]
            if extended:
                header = full_header[:-1]
            if add_ori_extra_fields:
                header.append(full_header[-1])

            if extended:
                out.write('## ' + ba.BedCols.names[ba.BedCols.TX_OVERLAP_PERCENTAGE] +
                          ': part of region overlapping with transcripts\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.EXON_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with exons\n')
                out.write('## ' + ba.BedCols.names[ba.BedCols.CDS_OVERLAPS_PERCENTAGE] +
                          ': part of region overlapping with protein coding regions\n')
                out.write('\t'.join(header) + '\n')

            for full_fields in annotated:
                fields = full_fields[:6]
                if short:
                    fields = full_fields[:4]
                if extended:
                    fields = full_fields[:-1]
                if add_ori_extra_fields:
                    fields.append(full_fields[-1])

                out.write('\t'.join(map(_format_field, fields)) + '\n')
                total += 1

    debug('Saved ' + str(total) + ' total annotated regions')
    return output_fpath

def _make_snp_file(dbsnp_snps_file, genome_build, output_file,
                   autosomal_locations_limit=175, min_snp_amount=30):
    if can_reuse(output_file, dbsnp_snps_file):
        return output_file

    locs_by_gene = defaultdict(list)
    total_locs = 0
    for i, interval in enumerate(BedTool(dbsnp_snps_file)):
        if is_sex_chrom(interval.chrom):
            continue
        pos = int(interval.start) + 1
        annots = interval.name.split('|')
        # if len(annots) == 2:
        #     rsid, gene = interval.name.split('|')
        #     ref = interval[4]
        # else:
        rsid, gene, ref, alts = interval.name.split('|')
        loc = (interval.chrom, pos, rsid, gene, ref, alts)
        locs_by_gene[gene].append(loc)
        total_locs += 1

    random.seed(1234)  # seeding random for reproducibility

    # Selecting random genes
    gnames = random.sample(locs_by_gene.keys(), min(len(locs_by_gene), autosomal_locations_limit))
    locs_by_gene = {g: locs_by_gene[g] for g in gnames}

    # Selecting random SNPs in each gene
    # min_locs_per_gene = min(len(locs) for locs in locs_by_gene.values())
    # if pick_unclustered:
    #     locs_per_gene = min(autosomal_locations_limit / len(gnames), min_locs_per_gene)
    #     while locs_per_gene * len(gnames) < min_snp_amount:
    #         locs_per_gene = math.ceil(float(min_snp_amount) / len(gnames))
    #     selected_locs_by_gene = {g: random.sample(locs_by_gene[g], locs_per_gene) for g in gnames}
    #     selected_locs = [l for gene_locs in selected_locs_by_gene.values() for l in gene_locs]
    # else:
    all_locs = [l for gene_locs in locs_by_gene.values() for l in gene_locs]

    # Selecting unclustered SNPs within genes
    non_clustered_locs = []
    prev_pos = 0
    for (chrom, pos, rsid, gene, ref, alts) in all_locs:
        if 0 < pos - prev_pos < 500:
            continue
        else:
            prev_pos = pos
            non_clustered_locs.append((chrom, pos, rsid, gene, ref, alts))

    # Selecting random SNPs within the limit
    selected_locs = random.sample(non_clustered_locs, min(len(non_clustered_locs), autosomal_locations_limit))

    # Sorting final locations
    chrom_order = get_chrom_order(genome_build)
    selected_locs.sort(key=lambda a: (chrom_order.get(a[0], -1), a[1:]))

    log.debug('Selected the following autosomal SNPs:')
    for (chrom, pos, rsid, gene, ref, alts) in selected_locs:
        log.debug(' ' + chrom + ':' + str(pos) + '\t' + rsid + '\t' + gene + '\t' + ref + '>' + ','.join(alts))

    with file_transaction(None, output_file) as tx:
        with open(tx, 'w') as out:
            for (chrom, pos, rsid, gene, ref, alts) in selected_locs:
                out.write('\t'.join([chrom, str(pos - 1), str(pos),
                                     rsid + '|' + gene + '|' + ref + '|' + alts]) + '\n')

    return output_file

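# The de-clustering step above, isolated as a simplified sketch: keep a position only
# if it is at least 500 bp away from the last position kept. Unlike the loop above it
# sorts the positions first and ignores the chromosome/gene grouping, so treat it as
# an illustration of the rule rather than the function's exact behaviour.
def _drop_clustered_sketch(positions, min_gap=500):
    kept, prev = [], -min_gap
    for pos in sorted(positions):
        if pos - prev >= min_gap:
            kept.append(pos)
            prev = pos
    return kept
# e.g. _drop_clustered_sketch([100, 300, 900, 1000, 2000]) -> [100, 900, 2000]
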