def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
    self.raw_name = name
    self.name = self.raw_name.replace('.', '_')
    self.dirpath = verify_dir(join(self.bcbio_project.final_dir, self.name))
    if not verify_dir(self.dirpath, silent=silent):
        if not silent:
            critical(
                f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                f'{self.bcbio_project.final_dir}. Please check consistency between the YAML '
                f'{self.bcbio_project.bcbio_yaml_fpath} and the directories in `final`: '
                f'to every "description" value in YAML, there should be a corresponding folder with the '
                f'same name in `final`. You can use the `-e` option to exclude samples (comma-separated) '
                f'from consideration, if you are sure that missing folders are expected.')
        else:
            return False

    self.var_dirpath = join(self.dirpath, BcbioProject.var_dir)
    self.bam = self.find_bam(silent=silent)

    if self.is_rnaseq:
        gene_counts = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
        if isfile(gene_counts) and verify_file(gene_counts):
            self.counts_file = gene_counts
        elif not silent:
            warn('Counts for ' + self.name + ' not found')
    else:
        if variantcallers_data:
            self._set_variant_files(variantcallers_data, ensemble=ensemble)
        elif not silent:
            warn('No variant callers set in config, skipping finding VCF files')
    return True
def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
    self.raw_name = name
    self.name = self.raw_name.replace('.', '_')
    self.rgid = self.name
    self.dirpath = verify_dir(join(self.parent_project.final_dir, self.name))
    if not verify_dir(self.dirpath, silent=silent):
        critical(
            f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
            f'{self.parent_project.final_dir}. Please check consistency between the YAML '
            f'{self.parent_project.bcbio_yaml_fpath} and the directories in `final`: '
            f'to every "description" value in YAML, there should be a corresponding folder with the '
            f'same name in `final`. You can use the `-e` option to exclude samples (comma-separated) '
            f'from consideration, if you are sure that missing folders are expected.')

    self.bam = self.find_bam(silent=silent)

    if self.is_rnaseq:
        gene_counts = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
        if isfile(gene_counts) and verify_file(gene_counts):
            self.counts_file = gene_counts
        elif not silent:
            warn('Counts for ' + self.name + ' not found')
    else:
        if variantcallers_data:
            self._set_variant_callers(variantcallers_data, ensemble=ensemble)
        elif not silent:
            warn('No variant callers set in config, skipping finding VCF files')
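# A standalone sketch (hypothetical sample name) of the normalisation both versions of
# _set_name_and_paths above perform: dots in the bcbio "description" value are replaced
# with underscores, and the result must match a folder name under `final/`.
raw_name = 'Sample.T1'
name = raw_name.replace('.', '_')
assert name == 'Sample_T1'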
def set_genomes_dir(new_genomes_dir=None):
    global genomes_dir
    if new_genomes_dir:
        # genomes_dir was provided explicitly (in paths.yaml or with --genomes-dir)
        verify_dir(new_genomes_dir, is_critical=True)
        genomes_dir = new_genomes_dir
    else:
        genomes_dir = find_genomes_dir()
    if not genomes_dir:
        critical('Could not detect genomes dir. Please specify one with --genomes-dir or $UMCCRISE_GENOMES')
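# A minimal usage sketch of the fallback order in set_genomes_dir above, assuming only
# that find_genomes_dir() may consult the $UMCCRISE_GENOMES environment variable named
# in the error message:
#
#     set_genomes_dir('/data/genomes')   # explicit path: verified and used as-is
#     set_genomes_dir()                  # no path given: falls back to find_genomes_dir()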
def set_final_dir(bcbio_cnf, config_dir, final_dir=None, create_dir=False):
    if final_dir:
        return final_dir
    elif 'upload' in bcbio_cnf and 'dir' in bcbio_cnf['upload']:
        final_dirname = bcbio_cnf['upload']['dir']
        final_dir = adjust_path(join(config_dir, final_dirname))
        if create_dir:
            safe_mkdir(final_dir)
        verify_dir(final_dir, 'upload directory specified in the bcbio config', is_critical=True)
    else:
        final_dir = abspath(join(config_dir, pardir, 'final'))
        if create_dir:
            safe_mkdir(final_dir)
        if not verify_dir(final_dir):
            critical('If the final directory is not named "final", please specify it in the bcbio config.')
    return final_dir
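# A standalone sketch (hypothetical POSIX layout) of the default-path branch in
# set_final_dir above: with no `upload.dir` in the config, `final` is expected to sit
# next to the config dir.
from os.path import abspath, join, pardir

config_dir = '/data/proj/config'
final_dir = abspath(join(config_dir, pardir, 'final'))
assert final_dir == '/data/proj/final'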
def load_data(data_dir, name, genome, reuse=False):
    print(f"reuse: {reuse}")
    data_dir = verify_dir(data_dir, is_critical=True)
    bam_files = glob.glob(join(data_dir, '*.bam'))
    assert bam_files, 'No BAM files in ' + data_dir

    if name.startswith('--name'):
        name = name.split('--name')[1]

    bed_file = None
    bed_files = glob.glob(join(data_dir, '*.bed'))
    if bed_files:
        assert len(bed_files) == 1, 'Multiple BED files in ' + data_dir + ': ' + str(bed_files)
        bed_file = bed_files[0]

    sample_by_bam = dict()
    for bam_file in bam_files:
        # sample_by_bam[bam_file] = check_output('goleft samplename ' + bam_file)
        sample_by_bam[bam_file] = bam_samplename(bam_file)

    _add_project(
        bam_by_sample={sample_by_bam[bf]: bf for bf in bam_files},
        project_name=name,
        bed_file=bed_file,
        use_callable=not bed_file,
        data_dir=data_dir,
        genome=genome,
        min_depth=DEPTH_CUTOFF,
        reuse_files=reuse)
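# A standalone sketch (hypothetical files and sample names) of the bam_by_sample
# inversion load_data builds above before calling _add_project:
sample_by_bam = {'/data/a.bam': 'sampleA', '/data/b.bam': 'sampleB'}
bam_files = list(sample_by_bam)
bam_by_sample = {sample_by_bam[bf]: bf for bf in bam_files}
assert bam_by_sample == {'sampleA': '/data/a.bam', 'sampleB': '/data/b.bam'}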
def _set_date_dir(bcbio_cnf, final_dir, date_dir, create_dir=False, silent=False):
    if not date_dir:
        fc_date = bcbio_cnf.get('fc_date')
        fc_name = bcbio_cnf.get('fc_name') or 'project'
        if fc_date:
            # Date dirpath is from bcbio and named after fc_name, not our own project name
            date_dir = join(final_dir, fc_date + '_' + fc_name)
            if not create_dir and not verify_dir(date_dir, silent=True):
                critical('Error: no project directory of format {fc_date}_{fc_name} or {fc_name}_{fc_date}')
        else:
            if isdir(join(final_dir, 'project')):  # bcbio-CWL?
                date_dir = join(final_dir, 'project')
                if not silent:
                    info('Using the datestamp dir from bcbio-CWL: ' + date_dir)
            else:
                regexs = [fr'^\d\d\d\d-[01][0-9]-[0-3][0-9]_{fc_name}']
                date_dirs = [join(final_dir, dirpath)
                             for dirpath in listdir(final_dir)
                             if any(re.match(regex, dirpath) for regex in regexs)]
                if len(date_dirs) == 0:
                    raise NoDateStampsException('Error: no datestamp directory!')
                elif len(date_dirs) == 1:
                    date_dir = date_dirs[0]
                else:
                    dates = [(tuple(map(int, basename(d).split('_')[0].split('-'))), d) for d in date_dirs]
                    newest_date, newest_dir = sorted(dates, reverse=True)[0]
                    newest_dirs = [d_dir for d_dir in date_dirs if d_dir == newest_dir]
                    if len(newest_dirs) > 1:
                        raise MultipleDateStampsException(
                            f'Error: multiple datestamp directories found, and can\'t select the most '
                            f'recent one because there are multiple latest dirs: {newest_dirs}')
                    date_dir = newest_dirs[0]
                if not silent:
                    info('Using the datestamp dir: ' + date_dir)
    if create_dir:
        safe_mkdir(date_dir)
    return date_dir
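# A standalone sketch (hypothetical dir names) of the newest-datestamp selection above:
# the YYYY-MM-DD prefix is parsed into an int tuple, so reverse tuple sorting picks
# the most recent directory.
from os.path import basename

date_dirs = ['/final/2019-01-02_project', '/final/2019-03-15_project']
dates = [(tuple(map(int, basename(d).split('_')[0].split('-'))), d) for d in date_dirs]
newest_date, newest_dir = sorted(dates, reverse=True)[0]
assert newest_dir == '/final/2019-03-15_project'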
def _check_dir_not_empty(dirpath, description=None):
    assert verify_dir(dirpath, description=description), dirpath
    contents = [join(dirpath, fname) for fname in os.listdir(dirpath)
                if not fname.startswith('.')]
    assert len(contents) >= 1, dirpath + ': ' + str(contents)
    assert all(verify_file(realpath(fpath), is_critical=True)
               for fpath in contents if isfile(realpath(fpath))), \
        dirpath + ': ' + str(contents)
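# A standalone sketch of the hidden-file filtering _check_dir_not_empty uses above:
# dotfiles are ignored when deciding whether a directory counts as empty.
fnames = ['.DS_Store', 'report.html', '.nextflow.log']
visible = [f for f in fnames if not f.startswith('.')]
assert visible == ['report.html']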
def main(paths, output_dir, genome, depth):
    log.init(True)

    bed_files = [verify_file(f, is_critical=True) for f in paths if isfile(f)]

    bcbio_projs = []
    dirs = [verify_dir(f, is_critical=True) for f in paths if isdir(f)]
    if dirs:
        for d in dirs:
            proj = BcbioProject()
            proj.load_from_bcbio_dir(d, proc_name='clearup')
            bcbio_projs.append(proj)

    build_snps_panel(bcbio_projs, bed_files, safe_mkdir(output_dir), genome)
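# A hypothetical invocation sketch for main above: `paths` may mix BED files and bcbio
# project dirs; files become SNP-panel inputs, dirs are loaded as bcbio projects.
#
#     main(['/data/bcbio_run', '/data/panel.bed'],
#          output_dir='clearup_out', genome='GRCh37', depth=10)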
def __init__(self, genome, filt_cnf, tricky_regions_dir, transcripts_fpath,
             reg_exp_sample=None, platform=None):
    self.all_reject_counter = OrderedDefaultDict(int)
    self.all_counter = OrderedDefaultDict(int)
    self.gene_blacklist_counter = OrderedDefaultDict(int)
    self.region_blacklist_counter = OrderedDefaultDict(int)

    compendia_fpath = verify_file(filt_ref_data.compendia(genome), 'compendia_ms7_hotspot')
    actionable_fpath = verify_file(filt_ref_data.actionable(genome), 'actionable')
    filter_common_snp_fpath = verify_file(filt_ref_data.common_snp(genome), 'filter_common_snp')
    filter_common_arti_fpath = verify_file(filt_ref_data.common_art(genome), 'filter_common_artifacts')
    splice_fpath = verify_file(filt_ref_data.splice(genome), 'splice')
    suppressors_fpath = verify_file(filt_ref_data.suppressors(), 'suppressors')
    oncogenes_fpath = verify_file(filt_ref_data.oncogenes(), 'oncogenes')
    ruledir = verify_dir(filt_ref_data.ruledir(), 'ruledir')
    snpeffect_polymorph_fpath = verify_file(filt_ref_data.snpeffect_export_polymorphic(), 'snpeffect_export_polymorphic')
    actionable_hotspot_fpath = verify_file(filt_ref_data.actionable_hotspot(), 'actionable_hotspot')
    specific_mutations_fpath = verify_file(filt_ref_data.specific_mutations(), 'specific_mutations')
    last_critical_aa_fpath = verify_file(filt_ref_data.last_critical_aa(), 'last_critical_aa')
    incidentalome_dir = verify_dir(filt_ref_data.incidentalome_dir(), 'incidentalome')
    comments_fpath = verify_file(filt_ref_data.ngs_reports_comments(), 'ngs_reports_comments')
    if not all([compendia_fpath,
                actionable_fpath,
                filter_common_snp_fpath,
                filter_common_arti_fpath,
                splice_fpath,
                suppressors_fpath,
                oncogenes_fpath,
                ruledir,
                snpeffect_polymorph_fpath,
                actionable_hotspot_fpath,
                specific_mutations_fpath,
                last_critical_aa_fpath,
                incidentalome_dir,
                comments_fpath,
                ]):
        logger.err('Error: some of the required files are not found or empty (see above)')

    self.suppressors = parse_genes_list(adjust_path(suppressors_fpath))
    self.oncogenes = parse_genes_list(adjust_path(oncogenes_fpath))

    self.reg_exp_sample = reg_exp_sample
    self.platform = platform

    transcripts_fpath = verify_file(transcripts_fpath, silent=True)
    if transcripts_fpath:
        logger.info('Using canonical transcripts from ' + transcripts_fpath)
        with open(transcripts_fpath) as f:
            self.transcripts = [tr.strip().split('.')[0] for tr in f]

    self.max_ratio = filt_cnf['max_ratio']
    self.max_sample_cnt = filt_cnf['max_sample_cnt']

    self.min_freq = filt_cnf['min_freq']  # for all variants
    self.act_min_freq = filt_cnf['act_min_freq']
    self.act_min_freq = self.act_min_freq or self.min_freq // 2
    self.germline_min_freq = filt_cnf['germline_min_freq']

    self.filt_depth = filt_cnf['filt_depth']
    self.min_vd = filt_cnf['min_vd']
    self.min_gmaf = filt_cnf['min_gmaf']

    self.keep_utr_intronic = filt_cnf['keep_utr_intronic']
    self.keep_whole_genome = filt_cnf['keep_whole_genome']
    self.keep_hla = filt_cnf['keep_hla']
    self.damage_p_value = filt_cnf.get('damage_p_value')

    logger.info('Parsing filtering data...')
    self.tp53_groups = {'Group 1': parse_mut_tp53(join(ruledir, 'DNE.txt')),
                        'Group 2': parse_mut_tp53(join(ruledir, 'TA0-25.txt')),
                        'Group 3': parse_mut_tp53(join(ruledir, 'TA25-50_SOM_10x.txt'))}

    self.splice_positions_by_gene = defaultdict(set)
    for l in iter_lines(splice_fpath):
        pos, g = l.split('\t')
        self.splice_positions_by_gene[g].add(pos)

    self.last_critical_aa_pos_by_gene = dict()
    for l in iter_lines(last_critical_aa_fpath):
        g, aa_pos, _ = l.split('\t')
        self.last_critical_aa_pos_by_gene[g] = int(aa_pos)

    self.filter_snp = set()
    for l in iter_lines(filter_common_snp_fpath):
        fields = l.split('\t')
        self.filter_snp.add('-'.join(fields[1:5]))

    self.snpeff_snp = set()
    self.snpeff_snp_rsids = set()
    for l in iter_lines(snpeffect_polymorph_fpath):
        fields = l.split('\t')
        snpeff_aachg = fields[2]
        snpeff_rsid = fields[5]
        if len(fields) > 11 and fields[11]:
            snpeff_gene = fields[11]
            self.snpeff_snp.add('-'.join([snpeff_gene, snpeff_aachg]))
        elif snpeff_rsid != '-':
            self.snpeff_snp_rsids.add(snpeff_rsid)

    self.filter_artifacts = set()
    self.filter_rules_by_gene = defaultdict(list)
    for l in iter_lines(filter_common_arti_fpath):
        fields = l.split('\t')
        if fields[5] == 'rule':
            gene, chrom, start, end, action, _, _, _, note = fields[:9]
            rule = Rule(gene, chrom=chrom, start=int(start), end=int(end), action=action, note=note)
            self.filter_rules_by_gene[gene].append(rule)
        else:
            gene, chrom, start, ref, alt = fields[:5]
            self.filter_artifacts.add('-'.join([chrom, start, ref, alt]))

    self.actionable_hotspot_by_gene = defaultdict(dict)
    self.common_snps_by_gene = defaultdict(set)
    with open(actionable_hotspot_fpath) as f:
        for l in f:
            l = l.replace('\n', '')
            if not l or l.startswith('##'):
                continue
            fields = l.split('\t')
            gene = fields[0]
            prot_change = fields[1]
            if gene.startswith('#'):  # VUS, no special treatment for now
                gene = gene[1:]
            elif gene.startswith('^'):
                gene = gene[1:]
                self.common_snps_by_gene[gene].add(prot_change)
            else:
                is_somatic = fields[2] == 'somatic'
                self.actionable_hotspot_by_gene[gene][prot_change] = 'somatic' if is_somatic else 'germline'

    self.ngs_reports_comments = defaultdict(dict)
    with open(comments_fpath) as f:
        for r in csv.DictReader((row for row in f if not row.startswith('#')), delimiter='\t'):
            gene = r['Gene']
            prot_change = r['AA_Change']
            if gene.startswith('^'):
                gene = gene[1:]  # remove leading ^ character, e.g. ^EGFR -> EGFR
                is_somatic = 'somatic' in r['Note']
                self.actionable_hotspot_by_gene[gene][prot_change] = 'somatic' if is_somatic else 'germline'
            else:
                self.ngs_reports_comments[gene][prot_change] = r['Note']

    self.act_somatic = dict()
    self.act_germline = set()
    self.rules = defaultdict(list)
    for l in iter_lines(actionable_fpath):
        fields = l.split('\t')
        if fields[7] == 'germline':
            key = '-'.join(fields[1:5])
            self.act_germline.add(key)
        elif fields[7] == 'somatic':
            change = fields[8].strip()
            if fields[6] == 'rule':
                if fields[4] == '*' and len(fields[3]) == 1:
                    key = '-'.join(fields[1:4])
                    self.act_somatic[key] = change
                else:
                    indel_type = ''
                    if 'indel' in fields[5]:
                        indel_type = 'indel'
                    elif 'ins' in fields[5]:
                        indel_type = 'ins'
                    elif 'del' in fields[5]:
                        indel_type = 'del'
                    rule = Rule(gene=fields[0],
                                chrom=fields[1],
                                start=int(fields[2]),
                                end=int(fields[3]),
                                length=int(fields[4]),
                                required_inframe='inframe' in fields[5],
                                indel_type=indel_type,
                                change=change)
                    self.rules[rule.gene].append(rule)
                # elif fields[5] == inframe_del:
                #     self.rules[inframe_del].setdefault(fields[0], []).append([fields[1]] + [int(f) for f in fields[2:5]])
                # elif fields[5] == inframe_ins:
                #     self.rules[inframe_ins].setdefault(fields[0], []).append([fields[1]] + [int(f) for f in fields[2:5]])
            else:
                key = '-'.join(fields[1:5])
                self.act_somatic[key] = change

    self.hotspot_nucleotides = set()
    self.hotspot_proteins = set()
    for l in iter_lines(compendia_fpath):
        fields = l.split('\t')
        if fields[5].startswith('g.'):
            continue
        self.hotspot_nucleotides.add('-'.join(fields[1:5]))
        if not fields[6]:
            continue
        self.hotspot_proteins.add('-'.join([fields[0], fields[6]]))

    logger.info('Parsing gene blacklists...')
    anno_cfg = get_anno_config()
    self.gene_blacklists_by_reason = parse_gene_blacklists(anno_cfg['blacklist']['genes'], incidentalome_dir)
    for r in self.gene_blacklists_by_reason.keys():
        self.gene_blacklist_counter[r] = 0
    self.gene_blacklist_counter['hardfilter'] = 0
    # self.gene_to_soft_filter = list(iter_lines(join(incidentalome_dir, 'soft_filter.txt')))

    # self.region_blacklists_by_reason = dict()
    # if tricky_regions_dir:
    #     info('Parsing region blacklists...')
    #     self.region_blacklists_by_reason = load_tricky_regions(anno_cfg['blacklist']['regions'], tricky_regions_dir)
    #     for r in self.region_blacklists_by_reason.keys():
    #         self.region_blacklist_counter[r] = 0

    logger.info('Parsing actionable rules and specific mutations...')
    self.tier_by_specific_mutations, self.tier_by_type_by_region_by_gene, self.sensitizations_by_gene \
        = parse_specific_mutations(specific_mutations_fpath)

    if not all([self.rules, self.splice_positions_by_gene, self.act_somatic,
                self.act_germline, self.actionable_hotspot_by_gene]):
        if not self.rules:
            logger.err('No rules, cannot proceed')
        if not self.splice_positions_by_gene:
            logger.err('No tp53_positions, cannot proceed')
        if not self.act_somatic:
            logger.err('No act_somatic, cannot proceed')
        if not self.act_germline:
            logger.err('No act_germline, cannot proceed')
        if not self.actionable_hotspot_by_gene:
            logger.err('No actionable_hotspots, cannot proceed')

    self.status = None
    self.reason_by_status = None

    self.output_f = None
    self.fm_output_f = None
    self.rejected_output_f = None
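# A standalone sketch (hypothetical variant row) of the 'chrom-start-ref-alt' keys the
# parser above builds for the filter_snp, act_somatic and act_germline lookups:
fields = ['TP53', 'chr17', '7578406', 'C', 'T', '', '', 'germline', '']
key = '-'.join(fields[1:5])
assert key == 'chr17-7578406-C-T'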