def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
    """Set the sample names and locate its folder, BAM and counts/variant files.

    :param name: sample name; dots are replaced with underscores to get `self.name`
    :param variantcallers_data: variant callers configuration; when falsy, a warning
        is logged instead of setting variant callers
    :param ensemble: forwarded to `_set_variant_callers`
    :param silent: suppress warnings and pass `silent` on to the lookups
    """
    self.raw_name = name
    self.name = self.raw_name.replace('.', '_')
    self.rgid = self.name

    # The sample folder is expected at <final_dir>/<name>
    self.dirpath = verify_dir(join(self.parent_project.final_dir, self.name))
    if not verify_dir(self.dirpath, silent=silent):
        critical(f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                 f'{self.parent_project.final_dir}. Please check consistency between the YAML '
                 f'{self.parent_project.bcbio_yaml_fpath} and the directories in `final`: '
                 f'to every "description" value in YAML, there should be a corresponding folder with the '
                 f'same name in `final`. You can use `-e` option to exclude samples (comma-separated) '
                 f'from consideration, if you are sure that missing folders are expected.')

    self.bam = self.find_bam(silent=silent)

    if not self.is_rnaseq:
        # Non-RNAseq samples: set up variant callers if any are configured
        if variantcallers_data:
            self._set_variant_callers(variantcallers_data, ensemble=ensemble)
        elif not silent:
            warn('No variant callers set in config, skipping finding VCF files')
        return

    # RNAseq samples: look for the gene counts file
    gene_counts = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
    if isfile(gene_counts) and verify_file(gene_counts):
        self.counts_file = gene_counts
    elif not silent:
        warn('Counts for ' + self.name + ' not found')
def _add_to_ngb(work_dir, project_name, bam_by_sample, genome_build, bed_file, p_view):
    """Register the project's BAMs and the dbSNP file in NGB, on a best-effort basis.

    Nothing is done unless `is_us()` or `is_uk()` is true. A missing `az.ngb`
    module is reported with a warning. Any exception raised while adding the data
    is printed and logged as an error rather than propagated.
    """
    if not (is_us() or is_uk()):
        return

    try:
        from az.ngb import add_bcbio_project_to_ngb, add_data_to_ngb, add_file_to_ngb
    except ImportError:
        log.warn(
            'If you want to, install NGS Reporting with `conda install -v vladsaveliev ngs_reporting`'
        )
        return

    log.info('Exposing project to NGB...')
    try:
        dataset = project_name + '_Fingerprints'
        add_data_to_ngb(work_dir, p_view, bam_by_sample, dict(), dataset,
                        bed_file=bed_file, genome=genome_build)
        add_file_to_ngb(work_dir, get_dbsnp(genome_build), genome_build, dataset, dataset,
                        skip_if_added=True)
    except Exception:
        # Exporting is optional: report the failure and carry on
        traceback.print_exc()
        log.err('Error: cannot export to NGB')
    log.info('*' * 70)
def find_somatic_vcf(self, silent=False, caller=None):
    """Look for the batch's somatic VCF and store the first match in `self.somatic_vcf`.

    Candidate locations, in order of preference:
      1. <tumor-dir>/<tumor-name>-<caller>.vcf.gz
      2. <date-dir>/<batch>-<caller>-annotated.vcf.gz
      3. <date-dir>/<batch>-<caller>.vcf.gz

    :param silent: do not log info messages and warnings
    :param caller: variant caller name; defaults to `self.somatic_caller`
    """
    caller = caller or self.somatic_caller
    if not caller:
        if not silent:
            warn(f'Batch {self.name} have no variant caler info assigned, skipping finding somatic VCF')
        return

    # in datestamp. cwl-bcbio writes there
    cwl_vcf = adjust_path(join(self.parent_project.date_dir, self.name + '-' + caller + '.vcf.gz'))
    # in datestamp. bcbio before 1.1.6
    legacy_vcf = adjust_path(join(self.parent_project.date_dir, self.name + '-' + caller + '-annotated.vcf.gz'))
    # in sample dir. starting from bcbio 1.1.6, ~ Dec 2019
    sample_vcf = adjust_path(join(self.tumors[0].dirpath, self.tumors[0].name + '-' + caller + '.vcf.gz'))

    candidates = (
        (sample_vcf,
         f'Found somatic VCF in <final-dir>/<tumor-name>/<tumor-name>-{caller}.vcf.gz (conventional bcbio): '),
        (legacy_vcf,
         f'Found somatic VCF in <date-dir>/<batch>-{caller}-annotated.vcf.gz (bcbio < v1.1.6)): '),
        (cwl_vcf,
         f'Found somatic VCF in project/<batch>-{caller}.vcf.gz (CWL bcbio): '),
    )
    for vcf_fpath, found_msg in candidates:
        if isfile(vcf_fpath):
            verify_file(vcf_fpath, is_critical=True)
            if not silent:
                info(found_msg + vcf_fpath)
            self.somatic_vcf = vcf_fpath
            return

    if not silent:
        warn(f'Could not find somatic variants files for batch {self.name}, caller {caller} neither as '
             f'{self.parent_project.final_dir}/<tumor-name>/<tumor-name>-{caller}.vcf.gz (conventional bcbio), nor as '
             f'{self.parent_project.date_dir}/<batch>-{caller}-annotated.vcf.gz (bcbio < v1.1.6), nor as '
             f'project/<batch>-{caller}.vcf.gz (CWL bcbio).')
def run_simple(cmd, silent=False):
    """Run the provided command, logging details and checking for errors.

    The command is always executed through the shell returned by `find_bash()`.

    :param cmd: a command line string, or a sequence of items that are converted
        to strings and joined with spaces into the command line
    :param silent: do not log the command line before running it
    :raises subprocess.CalledProcessError: when the command exits with a non-zero status
    """
    # With shell=True, subprocess treats only the first item of a sequence as the
    # command and passes the rest as arguments to the shell itself. Sequences are
    # therefore joined so that the executed command line is exactly the logged one.
    if not isinstance(cmd, str):
        cmd = ' '.join(str(x) for x in cmd)
    if not silent:
        warn(cmd)
    # NOTE(review): the command line is interpreted by the shell, so it must come
    # from a trusted source.
    subprocess.check_call(cmd, shell=True, executable=find_bash())
def find_germline_vcf(self, silent=False, caller=None):
    """Look for the batch's germline VCF and store the first match in `self.germline_vcf`.

    Both candidates live in the project date directory and are named after the
    first normal sample:
      1. <date-dir>/<normal-name>-germline-<caller>.vcf.gz
      2. <date-dir>/<normal-name>-germline-<caller>-annotated.vcf.gz

    :param silent: do not log info messages and warnings
    :param caller: variant caller name; defaults to `self.germline_caller`
    """
    caller = caller or self.germline_caller
    if not caller:
        if not silent:
            warn(f'Batch {self.name} have no variant caler info assigned, skipping finding germline VCF')
        return

    # starting from bcbio 1.1.6, ~ Dec 2019
    current_vcf = adjust_path(join(self.parent_project.date_dir,
                                   f'{self.normals[0].name}-germline-{caller}.vcf.gz'))
    # bcbio before 1.1.6
    legacy_vcf = adjust_path(join(self.parent_project.date_dir,
                                  f'{self.normals[0].name}-germline-{caller}-annotated.vcf.gz'))

    candidates = (
        (current_vcf,
         f'Found germline VCF in <date-dir>/<normal-name>-germline-{caller}.vcf.gz: '),
        (legacy_vcf,
         f'Found germline VCF in <date-dir>/<normal-name>-germline-{caller}-annotated.vcf.gz (bcbio < v1.1.6)): '),
    )
    for vcf_fpath, found_msg in candidates:
        if isfile(vcf_fpath):
            verify_file(vcf_fpath, is_critical=True)
            if not silent:
                info(found_msg + vcf_fpath)
            self.germline_vcf = vcf_fpath
            return

    if not silent:
        warn(f'Could not find germline variants files for batch {self.name}, caller {caller} neither as '
             f'<date-dir>/<normal-name>-germline-{caller}.vcf.gz, nor as '
             f'<date-dir>/<normal-name>-germline-{caller}-annotated.vcf.gz (bcbio < v1.1.6)')
def find_sv_vcf(self, silent=False, caller=False):
    """Look for the batch's structural variants VCF and store the first match in `self.sv_vcf`.

    Candidate locations, in order of preference:
      1. <tumor-dir>/<batch>-sv-prioritize-<caller>.vcf.gz
      2. <tumor-dir>/<batch>-<caller>.vcf.gz
      3. <date-dir>/<tumor-name>-<caller>-prioritized.vcf.gz
      4. <date-dir>/<tumor-name>-<caller>.vcf.gz

    :param silent: do not log info messages and warnings
    :param caller: SV caller name; any falsy value falls back to `self.sv_caller`
    """
    caller = caller or self.sv_caller
    if not caller:
        # Same behaviour as the somatic/germline finders: without a caller name
        # there is no file name to look for.
        if not silent:
            warn(f'Batch {self.name} has no SV caller info assigned, skipping finding SV VCF')
        return

    tumor = self.tumors[0]
    date_dir = self.parent_project.date_dir
    # (path, human-readable location pattern used in the log message)
    candidates = (
        (join(tumor.dirpath, f'{self.name}-sv-prioritize-{caller}.vcf.gz'),
         f'<tumor>/<batch>-sv-prioritize-{caller}.vcf.gz'),
        (join(tumor.dirpath, f'{self.name}-{caller}.vcf.gz'),
         f'<tumor>/<batch>-{caller}.vcf.gz'),
        # CWL?
        (join(date_dir, f'{tumor.name}-{caller}-prioritized.vcf.gz'),
         f'<date-dir>/<tumor-name>-{caller}-prioritized.vcf.gz'),
        (join(date_dir, f'{tumor.name}-{caller}.vcf.gz'),
         f'<date-dir>/<tumor-name>-{caller}.vcf.gz'),
    )
    for vcf_fpath, location in candidates:
        if isfile(vcf_fpath):
            verify_file(vcf_fpath, is_critical=True)
            if not silent:
                info(f'Found SV VCF in {location}: ' + vcf_fpath)
            # Every branch records the result in `sv_vcf` and logs the path that
            # was actually found.
            self.sv_vcf = vcf_fpath
            return

    if not silent:
        warn(f'Could not find SV VCF file for batch {self.name}, caller {caller} neither under sample folder as '
             f'<tumor>/<batch>(-sv-prioritize)-{caller}.vcf.gz (conventional bcbio), '
             f'nor in the project folder as project/<tumor>-{caller}(-prioritized).vcf.gz (CWL bcbio).')
def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
    """Set the sample names and locate its folder, BAM and counts/variant files.

    :param name: sample name; dots are replaced with underscores to get `self.name`
    :param variantcallers_data: variant callers configuration; when falsy, a warning
        is logged instead of setting variant files
    :param ensemble: forwarded to `_set_variant_files`
    :param silent: suppress warnings; also turns a missing sample folder into a
        `False` return value instead of a critical error
    :return: False when the sample folder is missing and `silent` is set, True otherwise
    """
    self.raw_name = name
    self.name = self.raw_name.replace('.', '_')

    # The sample folder is expected at <final_dir>/<name>
    self.dirpath = verify_dir(join(self.bcbio_project.final_dir, self.name))
    if not verify_dir(self.dirpath, silent=silent):
        if silent:
            return False
        critical(f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                 f'{self.bcbio_project.final_dir}. Please check consistency between the YAML '
                 f'{self.bcbio_project.bcbio_yaml_fpath} and the directories in `final`: '
                 f'to every "description" value in YAML, there should be a corresponding folder with the '
                 f'same name in `final`. You can use `-e` option to exclude samples (comma-separated) '
                 f'from consideration, if you are sure that missing folders are expected.')

    self.var_dirpath = join(self.dirpath, BcbioProject.var_dir)
    self.bam = self.find_bam(silent=silent)

    if self.is_rnaseq:
        # RNAseq samples: look for the gene counts file
        gene_counts = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
        if isfile(gene_counts) and verify_file(gene_counts):
            self.counts_file = gene_counts
        elif not silent:
            warn('Counts for ' + self.name + ' not found')
    elif variantcallers_data:
        self._set_variant_files(variantcallers_data, ensemble=ensemble)
    elif not silent:
        warn('No variant callers set in config, skipping finding VCF files')
    return True
def find_bam(self, silent=False):
    """Find the alignment file (CRAM or BAM) of the sample.

    The sample folder is searched first for `<name>-ready.cram`, `<name>-ready.bam`
    and `<name>-sort.bam`, in this order. If none is found and the sample was
    run from a BAM input, the input file from the sample's `files` entry is used.

    :param silent: do not warn when nothing is found
    :return: path to the alignment file, or None when nothing is found
    """
    name = self.get_name_for_files()
    for ext in ('-ready.cram', '-ready.bam', '-sort.bam'):
        fpath = adjust_path(join(self.dirpath, name + ext))
        if verify_file(fpath):
            return fpath

    # `files` holds a single path or a list of paths; a sample without input
    # files (missing or empty entry) has no input BAM to fall back to.
    input_file = self.sample_info.get('files')
    if input_file and not isinstance(input_file, str):
        input_file = input_file[0]
    if isinstance(input_file, str) and input_file.endswith('.bam'):
        debug('Bcbio was run from BAM input')
        if not input_file.startswith('/'):
            # Relative input paths are resolved against the bcbio work directory
            input_file = abspath(join(self.bcbio_project.work_dir, input_file))
        if verify_file(input_file):
            debug('Using BAM file from input YAML ' + input_file)
            return input_file
        debug('Input BAM file for sample ' + self.name + ' in YAML ' + input_file + ' does not exist')

    if not silent:
        warn('No BAM or CRAM file found for ' + self.name)
def find_bam(self, silent=False):
    """Find the alignment file (BAM or CRAM) of the sample.

    The sample folder is searched first for `<name>-ready.bam`, `<name>-ready.cram`
    and `<name>-sort.bam`, in this order. If none is found and the sample was
    run from a BAM input, the input file from the sample's `files` entry is used.

    :param silent: do not warn when nothing is found
    :return: path to the alignment file, or None when nothing is found
    """
    name = self.get_name_for_files()
    for ext in ('-ready.bam', '-ready.cram', '-sort.bam'):
        fpath = adjust_path(join(self.dirpath, name + ext))
        if verify_file(fpath):
            return fpath

    # `files` holds a single path or a list of paths; a sample without input
    # files (missing or empty entry) has no input BAM to fall back to.
    input_file = self.sample_info.get('files')
    if input_file and not isinstance(input_file, str):
        input_file = input_file[0]
    if isinstance(input_file, str) and input_file.endswith('.bam'):
        debug('Bcbio was run from BAM input')
        if not input_file.startswith('/'):
            # Relative input paths are resolved against the bcbio work directory
            input_file = abspath(join(self.parent_project.work_dir, input_file))
        if verify_file(input_file):
            debug('Using BAM file from input YAML ' + input_file)
            return input_file
        debug('Input BAM file for sample ' + self.name + ' in YAML ' + input_file + ' does not exist')

    if not silent:
        warn('No BAM or CRAM file found for ' + self.name)
def pair_dragen_directories(paths):
    """Pair DRAGEN tumor/normal and normal output directories by normal sample name.

    Notes carried over from the original implementation:
      * Tumor and normal sample names are extracted from the BAM header: the sample
        name is the 'SM' (sample) field of the '@RG' (read group) header line.
      * Tumor or normal identity is inferred from the BAM filename: a BAM with the
        '_tumor.bam' suffix is the tumor, otherwise it is the normal.
      * The subject identifier comes from the DRAGEN output directory name.
      * A one-to-one pairing of tumor/normal and normal directories is assumed,
        i.e. no multiple tumor/normal runs to a single normal run.

    :param paths: DRAGEN output directories
    :return: list with one dict per paired normal sample, holding the keys
        'normal_run', 'tumor_normal_run' and 'subject_id'; unpaired directories
        are reported with a warning and left out
    """
    # Group the runs by normal sample name so that normal and tumor/normal meet
    runs_by_normal = dict()
    for path in paths:
        if is_dragen_tumor_normal_directory(path):
            dir_type = 'tumor_normal_run'
        else:
            dir_type = 'normal_run'
        samples = get_samples_from_dragen_dir_bams(path)
        # Ensure we have found normal names
        if 'normal' not in samples:
            critical(
                f'Could not find normal sample name for DRAGEN directory {path}'
            )
        runs = runs_by_normal.setdefault(samples['normal'], dict())
        assert dir_type not in runs
        # The samples dict doubles as the run record: path, prefix and subject ID
        # are stored next to the sample names.
        runs[dir_type] = samples
        samples['path'] = path
        samples['prefix'] = get_dragen_output_prefix(path)
        samples['subject_id'] = get_subject_id_from_dragen_dir(path)

    # Tell paired runs from unpaired ones
    paired = list()
    unpaired = list()
    for runs in runs_by_normal.values():
        if 'normal_run' in runs and 'tumor_normal_run' in runs:
            # Ensure we have collected only one subject id for this set of inputs
            assert len({run['subject_id'] for run in runs.values()}) == 1
            runs['subject_id'] = runs['normal_run']['subject_id']
            paired.append(runs)
        else:
            unpaired.extend((run_type, run['path']) for run_type, run in runs.items())

    # Emit warning for unpaired paths
    if unpaired:
        paths_unpaired_str = '\n\t'.join(f'{run_type}: {run_path}' for run_type, run_path in unpaired)
        warn(f'could not pair DRAGEN directories:\n\t{paths_unpaired_str}')
    return paired
def run_simple(cmd, env_vars=None, silent=False):
    """Run the provided command, logging details and checking for errors.

    :param cmd: command to run; normalized by `_normalize_cmd_args`, which also
        decides the `shell` and `executable` arguments of the call
    :param env_vars: passed to `_get_env` to build the process environment
    :param silent: do not log the command before running it
    :raises subprocess.CalledProcessError: when the command exits with a non-zero status
    """
    env = _get_env(env_vars)
    cmd, shell_arg, executable_arg = _normalize_cmd_args(cmd)

    if not silent:
        # The normalized command is either a string or a sequence of arguments
        if isinstance(cmd, str):
            cmd_line = cmd
        else:
            cmd_line = ' '.join(str(x) for x in cmd)
        warn(cmd_line)

    subprocess.check_call(cmd, shell=shell_arg, executable=executable_arg, env=env)
def find_vcf_file_from_sample_dir(sample, silent=False, caller=None):
    """Find the sample's VCF for the given caller in the sample folder.

    Gzipped files are preferred over uncompressed ones. Within each kind, the
    sample folder itself is checked first, then `var/`, then `var/raw/`.

    :param sample: sample whose `dirpath` is searched
    :param silent: do not log info messages and warnings
    :param caller: variant caller name; defaults to the project's `somatic_caller`
    :return: path to the first VCF found, or None
    """
    caller = caller or sample.bcbio_project.somatic_caller
    vcf_fname = sample.get_name_for_files() + '-' + caller + '.vcf'
    var_dirpath = join(sample.dirpath, 'var')
    raw_dirpath = join(var_dirpath, 'raw')

    # (path, message when found, message when not found), in lookup order
    lookups = [
        (adjust_path(join(sample.dirpath, vcf_fname + '.gz')),
         'Found VCF ', 'Not found VCF '),
        (adjust_path(join(var_dirpath, vcf_fname + '.gz')),
         'Found VCF in the var/ dir ', 'Not found VCF in the var/ dir '),
        (adjust_path(join(raw_dirpath, vcf_fname + '.gz')),
         'Found VCF in the var/raw/ dir ', 'Not found VCF in the var/raw/ dir '),
        (adjust_path(join(sample.dirpath, vcf_fname)),
         'Found uncompressed VCF ', 'Not found uncompressed VCF '),
        (adjust_path(join(var_dirpath, vcf_fname)),
         'Found uncompressed VCF in the var/ dir ', 'Not found VCF in the var/ dir '),
        (adjust_path(join(raw_dirpath, vcf_fname)),
         'Found uncompressed VCF in the var/raw/ dir ', 'Not found VCF in the var/raw/ dir '),
    ]
    for vcf_fpath, found_msg, missing_msg in lookups:
        if isfile(vcf_fpath):
            verify_file(vcf_fpath, is_critical=True)
            if not silent:
                info(found_msg + vcf_fpath)
            return vcf_fpath
        debug(missing_msg + vcf_fpath)

    if not silent:
        warn('Warning: no VCF found for ' + sample.name + ' (' + caller + '), gzip or uncompressed version in and outside '
             'the var directory. Phenotype is ' + str(sample.phenotype))
    return None
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """Determine the sample sex by comparing chrY key regions coverage with the average depth.

    :param work_dir: directory for temporary and intermediate files
    :param bam_fpath: alignment file; required, and indexed before the coverage is calculated
    :param avg_depth: average coverage depth of the sample (converted to float)
    :param genome: genome build name; matched by substring against the keys of
        `chry_key_regions_by_genome`
    :param target_bed: capture target regions; when given, only the part of the
        chrY key regions overlapping the target is used
    :return: 'M', 'F', or None when the sex cannot be determined
    """
    debug()
    debug('Determining sex')

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for key, regions in chry_key_regions_by_genome.items():
        if key in genome:
            male_bed = BedTool(regions)
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        # Restrict the key regions to the capture target
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))
    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    # The messages below used the invalid escape sequence "\s", which printed a
    # literal backslash; a plain apostrophe is what was meant.
    if chry_mean_coverage == 0:
        debug("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's female")
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's male")
            sex = 'M'

    debug('Sex is ' + sex)
    debug()
    return sex
def tmpdir():
    """Yield a directory created by `make_tmpdir()` and remove it afterwards.

    The removal runs in a `finally` clause; a failure to remove the directory is
    only reported with a warning.
    NOTE(review): presumably wrapped with `contextlib.contextmanager` outside of
    this view - confirm.
    """
    tmp_dirpath = make_tmpdir()
    try:
        yield tmp_dirpath
    finally:
        try:
            shutil.rmtree(tmp_dirpath)
        except OSError:
            warn('Warning: cannot clean up temporary dir ' + tmp_dirpath)
def find_qc_files(self, dst_dir, exclude_files=None, include_files=None):
    """
    Parses bcbio MultiQC file list and collects all QC files belonging to this batch

    :param dst_dir: destination directory, passed on to `_extract_qc_file` for every file
    :param exclude_files: a regex pattern or a list of patterns; matching files are skipped
    :param include_files: a regex pattern or a list of patterns; when given, only matching
        files are taken
    :return: list of the values returned by `_extract_qc_file` for the files that were found
    """
    mq_dir = join(self.parent_project.date_dir, 'multiqc')
    mq_filelist = join(mq_dir, 'list_files_final.txt')
    verify_file(mq_filelist, is_critical=True)

    # A single pattern may be given as a plain string
    if isinstance(exclude_files, str):
        exclude_files = [exclude_files]
    if isinstance(include_files, str):
        include_files = [include_files]

    # Cromwell?
    cwl_targz = join(mq_dir, 'multiqc-inputs.tar.gz')
    cwl_prefix = 'call-multiqc_summary/execution/qc/multiqc/'
    tar = None
    tar_f_by_fp = dict()
    qc_files_not_found = []
    qc_files_found = []
    try:
        if isfile(cwl_targz):
            info(f'Found CWL MultiQC output {cwl_targz}, extracting required QC files from the archive')
            tar = tarfile.open(cwl_targz)
            for member in tar.getmembers():
                rel_fp = member.name
                if cwl_prefix in rel_fp:
                    rel_fp = rel_fp.split(cwl_prefix)[1]
                tar_f_by_fp[rel_fp] = tar.extractfile(member)

        with open(mq_filelist) as inp:
            for fp in [l.strip() for l in inp if l.strip()]:
                if fp == 'trimmed' or fp.endswith('/trimmed'):
                    continue  # back-compatibility with bcbio
                if exclude_files and any(re.search(ptn, fp) for ptn in exclude_files):
                    continue
                if include_files and not any(re.search(ptn, fp) for ptn in include_files):
                    continue
                new_fp = _extract_qc_file(fp, dst_dir, self.parent_project.final_dir, tar_f_by_fp)
                if new_fp:
                    qc_files_found.append(new_fp)
                else:
                    qc_files_not_found.append(fp)
    finally:
        # The archive members are read while the files are extracted, so the
        # archive can only be closed once the whole list has been processed.
        if tar is not None:
            tar.close()

    if qc_files_not_found:
        warn('-')
        warn(f'Some QC files from list {mq_filelist} were not found:' +
             ''.join('\n  ' + fpath for fpath in qc_files_not_found))
    return qc_files_found
def get_dbsnp_multi_mafs(genome_cfg):
    """Return the verified `dbsnp_multi_mafs` path from the genome configuration.

    :param genome_cfg: genome section of the system configuration
    :return: the result of `verify_file` for the configured path (verification is
        critical), or None with a warning when the key is not configured
    """
    if 'dbsnp_multi_mafs' in genome_cfg:
        return verify_file(
            genome_cfg['dbsnp_multi_mafs'],
            is_critical=True,
            description='dbSNP multi mafs file in system configuration file')

    warn(
        'Warning: dbsnp_multi_mafs not provided in the system configuration file for the genome.'
    )
    return None
def update_batches(self, samples, silent=False):
    """Group the samples into batches and set up the batches' callers and VCFs.

    A `BcbioBatch` is created for every batch name mentioned by the samples.
    Samples with the 'normal' phenotype are added as normals, all others as
    tumors. Batches without tumors are dropped from the result.

    :param samples: samples to distribute into batches
    :param silent: do not log warnings; also passed on to the VCF lookups
    :return: dict of batches that have at least one tumor, by batch name
    """
    batch_names = {bn for s in samples for bn in s.batch_names}
    batch_by_name = {bn: BcbioBatch(name=bn, parent_project=self) for bn in batch_names}

    for sample in samples:
        for bn in sample.batch_names:
            batch = batch_by_name[bn]
            batch.name = bn
            sample.batches.append(batch)
            if sample.phenotype == 'normal':
                batch.add_normal(sample)
            else:
                batch.add_tumor(sample)

    # Removing batches that do not have matching tumor samples
    batch_by_name = {bn: b for bn, b in batch_by_name.items() if b.tumors}

    # setting up batch properties
    for b in batch_by_name.values():
        for t in b.tumors:
            t.normal_matches = b.normals

    # setting variant caller names for batches: taken from the first tumor and
    # the first normal of the batch
    for b in batch_by_name.values():
        if b.tumors[0].somatic_caller is not None:
            b.somatic_caller = b.tumors[0].somatic_caller
        elif not silent:
            warn(f"Sample {b.name} doesn't have somatic variant callers info, skip assinging "
                 f"variant caller to batch {b.name}")

        if b.normals:
            if b.normals[0].germline_caller is not None:
                b.germline_caller = b.normals[0].germline_caller
            elif not silent:
                warn(f"Sample {b.name} doesn't have germline variant callers info, "
                     f"skip assinging germline variant caller to batch {b.name}")

    # finding vcfs
    if not self.is_rnaseq:
        for b in batch_by_name.values():
            if b.tumors:
                b.find_somatic_vcf(silent=silent)
                b.find_sv_vcf(silent=silent)
            if b.normals:
                b.find_germline_vcf(silent=silent)

    return batch_by_name
def workdir(cnf):
    """Yield the work directory of the run, creating a temporary one if needed.

    When `cnf.work_dir` is set, it is verified (critically) and yielded as is.
    Otherwise a directory created by `make_tmpdir()` is stored in `cnf.work_dir`,
    yielded, and removed afterwards - also when the consumer raises.
    NOTE(review): presumably wrapped with `contextlib.contextmanager` outside of
    this view - confirm.

    :param cnf: configuration object with a `work_dir` attribute
    """
    if cnf.work_dir:
        verify_dir(cnf.work_dir, is_critical=True)
        yield cnf.work_dir
        return

    cnf.work_dir = make_tmpdir()
    try:
        yield cnf.work_dir
    finally:
        # Clean up even if the body using the directory failed, so that failed
        # runs do not leave temporary directories behind.
        try:
            shutil.rmtree(cnf.work_dir)
        except OSError:
            warn('Warning: cannot clean up temporary dir ' + cnf.work_dir)
def calc_genomic_bp_pos(self):
    """Convert the breakpoint's transcript offset into a genomic coordinate.

    On success, sets `self.bp_genomic_pos` and `self.bp_is_in_intron`.

    :return: True on success; False when the offset cannot be converted, or when
        the conversion returns -1 (reported as the fusion taking the whole transcript)
    """
    genomic_coord, is_in_intron = FusionSide.offset_to_genome_coord(
        self.trx, self.bp_offset)

    if genomic_coord is None:
        # Report the transcript and the offset that failed to convert
        # (`genomic_coord` is always None in this branch).
        logger.critical(
            f' Error: could not convert transcript {self.trx.id} offset {self.bp_offset} to genomic coordinate'
        )
        return False

    if genomic_coord == -1:
        logger.warn(
            f' Fusion in {self} takes the whole transcript {self.trx.id}. That\'s suspicious, so we are skipping it.'
        )
        return False

    self.bp_genomic_pos = genomic_coord
    self.bp_is_in_intron = is_in_intron
    return True
def load_filt_cfg(filt_cnf_fpath=None, target_type=None, vardict_min_freq=None, is_wgs=False):
    """Load the variant filtering configuration.

    Specify either `filt_cnf_fpath`, or `target_type`, or `vardict_min_freq` and
    `is_wgs`. Without a config path, the path is looked up in `filt_cnf_fpaths` by
    target type. Without a target type, the type is chosen as follows:
      * no min freq given: 'exome'
      * min freq <= 0.005: 'panel'
      * `is_wgs` is None: filtering is skipped (None is returned)
      * `is_wgs` is True: 'genome'
      * otherwise: 'exome'

    :return: the configuration dict (its `variant_filtering` section when there
        is one) completed with the defaults and with the `filt_cnf_fpath` key set,
        or None when filtering is skipped
    """
    if not filt_cnf_fpath:
        if not target_type:
            if vardict_min_freq is None:
                target_type = 'exome'
                info(
                    'Neither min freq not filt config was provided, using settings for exome'
                )
            elif vardict_min_freq <= 0.005:
                info('Filtering config: min_allele_fraction=' + str(vardict_min_freq) +
                     ' which is less 0.005, setting config for panel')
                target_type = 'panel'
            elif is_wgs is None:  # coverage interval is not defined
                warn(
                    'Coverage interval is not defined, skipping variant filtering'
                )
                return None
            elif is_wgs is True:
                target_type = 'genome'
                info('Filtering config: setting config for genome')
            else:
                target_type = 'exome'
                info('Filtering config: min_allele_fraction=' + str(vardict_min_freq) +
                     ' which is higher than 0.005, setting config for exome')

        assert target_type in filt_cnf_fpaths, \
            'filt_cnf_fpath=' + str(filt_cnf_fpath) + '; ' + str(target_type) + ' not in ' + str(filt_cnf_fpaths.keys())
        filt_cnf_fpath = filt_cnf_fpaths[target_type]

    cfg = load_yaml_config(filt_cnf_fpath)
    # The settings may be nested under the `variant_filtering` section
    section = cfg.get('variant_filtering')
    if section and isinstance(section, dict):
        cfg = section
    cfg = fill_dict_from_defaults(cfg, filt_info_defaults)
    cfg['filt_cnf_fpath'] = filt_cnf_fpath
    return cfg
def main():
    """Save the CDS features of a genome build into `<genome>/bed/CDS-canonical.bed`.

    Command line options:
      -g, --genome     genome build (required)
      -c, --canonical  additionally apply the canonical-only filter
    """
    parser = OptionParser()
    parser.add_option(
        '-g', '--genome',
        dest='genome',
        help='Genome build. Accepted values: ' + ', '.join(ebl.SUPPORTED_GENOMES),
    )
    parser.add_option(
        '-c', '--canonical',
        dest='canonical',
        action='store_true',
        help='Use canonical only',
    )
    opts, _ = parser.parse_args()

    if not opts.genome:
        logger.critical(
            'Error: please, specify genome build name with -g (e.g. `-g hg19`)'
        )
    genome = opts.genome

    logger.debug('Getting features from storage')
    features_bed = ebl.get_all_features(genome)
    if features_bed is None:
        logger.critical('Genome ' + genome + ' is not supported. Supported: ' +
                        ', '.join(ebl.SUPPORTED_GENOMES))

    logger.warn('Extracting features from Ensembl GTF')
    features_bed = features_bed.filter(
        lambda x: x[ebl.BedCols.FEATURE] == 'CDS')
    if opts.canonical:
        features_bed = features_bed.filter(
            ebl.get_only_canonical_filter(genome))

    logger.warn('Saving CDS regions...')
    # The output goes next to the package, into <genome>/bed/
    output_fpath = adjust_path(
        join(dirname(__file__), pardir, genome, 'bed', 'CDS-canonical.bed'))
    with file_transaction(None, output_fpath) as tx:
        # Only the first 6 columns are kept
        features_bed.cut(range(6)).saveas(tx)
    logger.warn('Done, saved to ' + output_fpath)
def set_samples(self, bcbio_cnf, include_samples=None, exclude_samples=None):
    """Load the samples of the project from the bcbio config and set up the batches.

    :param bcbio_cnf: parsed bcbio YAML; its `details` list is read
    :param include_samples: sample or batch names to keep (dots are replaced with underscores)
    :param exclude_samples: sample or batch names to skip (dots are replaced with underscores)
    """
    debug('Reading sample details...')
    exclude_samples = [s.replace('.', '_') for s in exclude_samples] if exclude_samples else None
    include_samples = [s.replace('.', '_') for s in include_samples] if include_samples else None

    # First pass - to get extra batch IDs that we need to include to have batches consistent,
    # and to collect the available names for error reporting. The names are needed for
    # the exclusion error as well, so they are collected whenever any filter is given.
    extra_batches = set()
    all_sample_names = set()
    all_batch_names = set()
    if include_samples or exclude_samples:
        for sample_info in bcbio_cnf['details']:
            sname, batch_names = BcbioSample.parse_sample_ids(sample_info)
            all_sample_names.add(sname)
            all_batch_names |= set(batch_names)
            if include_samples and sname in include_samples:
                for b in batch_names:
                    if b not in include_samples and b not in (exclude_samples or []):
                        extra_batches.add(b)

    # Second pass - including/excluding, and creating BcbioSample objects
    for sample_info in bcbio_cnf['details']:
        s = BcbioSample.load_from_sample_info(
            sample_info,
            bcbio_project=self,
            include_samples=include_samples,
            exclude_samples=exclude_samples,
            extra_batches=extra_batches,
            silent=self.silent)
        if s:
            self.samples.append(s)

    if not self.samples:
        # Names are sorted to keep the report stable between runs
        available = (f'Available samples from the YAML file {self.bcbio_yaml_fpath}:\n'
                     f'{", ".join(sorted(all_sample_names))}\nbatches: {", ".join(sorted(all_batch_names))}')
        if exclude_samples:
            critical(f'Error: no samples left with the exclusion of '
                     f'batch/sample name(s): {", ".join(exclude_samples)}\n' + available)
        if include_samples:
            critical(f'Error: could not find a batch or a sample with the name(s): '
                     f'{", ".join(include_samples)}\n' + available)
        critical(f'Error: could not parse any batch or samples in the bcbio project. '
                 f'Please check the bcbio YAML file: {self.bcbio_yaml_fpath}')

    not_found_samples = [s.name for s in self.samples if not s.bam]
    if not_found_samples and not self.silent:
        warn(f'Warning: BAM files not found for {len(not_found_samples)}/{len(self.samples)} samples')

    self.samples.sort(key=lambda _s: _s.key_to_sort())
    self.batch_by_name = self.update_batches(self.samples, self.silent)

    def _check_dup_props(prop, is_critical=False):
        # A property is copied to the project only when all samples agree on it
        _vals = {s_.__dict__.get(prop) for s_ in self.samples}
        if len(_vals) > 1:
            (critical if is_critical else err)('Got different ' + prop + ' values in samples in ' + self.project_name)
        else:
            self.__dict__[prop] = _vals.pop()

    _check_dup_props('genome_build')
    _check_dup_props('variant_regions_bed')
    _check_dup_props('coverage_bed')
    _check_dup_props('sv_regions_bed')
    _check_dup_props('is_rnaseq')
    _check_dup_props('min_allele_fraction')
    _check_dup_props('is_wgs', is_critical=False)
    _check_dup_props('coverage_interval', is_critical=False)

    if self.is_rnaseq:
        debug('RNAseq')
    elif self.coverage_interval:
        debug('Coverage interval: ' + str(self.coverage_interval))

    for s in self.samples:
        for caller in s.variantcallers:
            self.samples_by_caller[(caller, s.phenotype == 'germline')].append(s)

    debug('Done loading bcbio project ' + self.project_name)
def _parse_gtf_attributes(annotations):
    """Parse the attributes column of a GTF record (`key "value"; key "value";`) into a dict."""
    parsed = dict()
    # The column ends with a semicolon, which must not end up in the last value
    for chunk in annotations.strip().rstrip(';').split('; '):
        key, _, value = chunk.strip().partition(' ')
        if key:
            parsed[key.strip('"')] = value.strip().strip('"')
    return parsed


def main(genome=None, gtf_path=None, all_transcripts=False, principal=False, only_key_genes=False,
         gene_list=None, biotypes='', features=''):
    """Write the regions of a GTF file to stdout as 4-column BED lines (chrom, start, end, gene).

    :param genome: genome build; selects the default GTF and the canonical transcripts,
        and for `hg*` builds adds the `chr` prefix to chromosome names
    :param gtf_path: GTF file; when not given, it is located with `hpc_utils`
    :param all_transcripts: do not restrict the records to canonical transcripts
    :param principal: use only the principal canonical transcript of every gene
    :param only_key_genes: keep only the genes returned by `get_key_genes_set()`
    :param gene_list: file with the genes to keep (used when `only_key_genes` is not set)
    :param biotypes: comma-separated gene biotypes to keep
    :param features: comma-separated feature types to keep
    """
    out = sys.stdout

    # GTF
    if not gtf_path:
        try:
            from hpc_utils import hpc
        except ImportError:
            critical('GTF file is needed. Either install hpc_utils, or provide GTF with --gtf')
        else:
            if genome == 'GRCh37':
                gtf_path = os.path.join(hpc.get_ref_file(key='pyensembl_data'),
                                        'GRCh37/ensembl75/Homo_sapiens.GRCh37.75.gtf.gz')
            else:
                gtf_path = os.path.join(hpc.get_ref_file(key='pyensembl_data'),
                                        'GRCh38/ensembl95/Homo_sapiens.GRCh38.95.gtf.gz')

    # Genes
    key_genes = None
    if only_key_genes:
        key_genes = get_key_genes_set()
    elif gene_list:
        key_genes = get_genes_from_file(gene_list)

    # Transcripts
    transcripts_by_gid = None
    if not all_transcripts:
        if principal:
            transcripts_by_gid = {
                k: [v] for k, v in
                canon_transcript_per_gene(genome, only_principal=True, use_gene_id=True).items()
            }
        else:
            transcripts_by_gid = canon_transcript_per_gene(genome, use_gene_id=True)

    # Options
    biotypes = biotypes.strip()
    if biotypes:
        biotypes = biotypes.split(',')
    features = features.strip()
    if features:
        features = features.split(',')

    genes_set = set()
    genes_without_canon = set()
    region_cnt = 0  # number of regions written to the output

    warn(f'Parsing {gtf_path}')
    with open_gzipsafe(gtf_path) as f:
        for l in f:
            if l.startswith('#') or not l.strip():
                continue

            fields = l.strip().split('\t')
            try:
                chrom, _, feature, start, end, _, _, _, annotations = fields
            except ValueError:
                warn(f'Cannot read fields {str(fields)}')
                raise

            if genome and genome.startswith('hg') and not chrom.startswith('chr'):
                chrom = 'chr' + chrom

            if features and feature not in features:
                continue

            annotations = _parse_gtf_attributes(annotations)
            gene_name = annotations['gene_name']

            # Applies to both the key genes set and the genes loaded from `gene_list`
            if key_genes is not None and gene_name not in key_genes:
                continue

            if biotypes and annotations['gene_biotype'] not in biotypes:
                continue

            if transcripts_by_gid:
                canon_transcript_ids = transcripts_by_gid.get(annotations['gene_id'])
                if not canon_transcript_ids:
                    genes_without_canon.add(gene_name)
                    continue
                # Gene-level records carry no transcript_id and are skipped here
                if annotations.get('transcript_id') not in canon_transcript_ids:
                    continue

            # GTF coordinates are 1-based inclusive, BED is 0-based half-open
            start = int(start) - 1
            end = int(end)
            if end - start >= 3:
                out.write('\t'.join([chrom, str(start), str(end), gene_name]) + '\n')
                genes_set.add(gene_name)
                region_cnt += 1
                if region_cnt % 10000 == 0:
                    warn(f'Processed {len(genes_set)} genes, written {region_cnt} regions...')

    warn(f'Done. Processed {len(genes_set)} genes, written {region_cnt} regions')
    if genes_without_canon:
        warn(f'No canonical transcript for {len(genes_without_canon)} gene ids')
def set_samples(self, bcbio_cnf, exclude_samples=None, include_samples=None):
    """Load the samples of the project from the bcbio config and set up the batches.

    :param bcbio_cnf: parsed bcbio YAML; its `details` list is read
    :param exclude_samples: sample or batch names to skip (dots are replaced with underscores)
    :param include_samples: sample or batch names to keep (dots are replaced with underscores)
    """
    debug('Reading sample details...')
    if exclude_samples:
        exclude_samples = [s.replace('.', '_') for s in exclude_samples]
    else:
        exclude_samples = None
    if include_samples:
        include_samples = [s.replace('.', '_') for s in include_samples]
    else:
        include_samples = None

    # First pass - just to get extra batch IDs that we need to include to have batches consistent
    extra_batches = set()
    if include_samples:
        excluded = exclude_samples or []
        for sample_info in bcbio_cnf['details']:
            sname, batch_names = BcbioSample.parse_sample_ids(sample_info)
            if sname not in include_samples:
                continue
            for b in batch_names:
                if b not in include_samples and b not in excluded:
                    extra_batches.add(b)

    # Second pass - including/excluding, and creating BcbioSample objects
    for sample_info in bcbio_cnf['details']:
        loaded = BcbioSample.load_from_sample_info(
            sample_info,
            bcbio_project=self,
            exclude_samples=exclude_samples,
            include_samples=include_samples,
            extra_batches=extra_batches,
            silent=self.silent)
        if loaded:
            self.samples.append(loaded)

    if not self.samples:
        if exclude_samples:
            critical(f'Error: no samples left with the exclusion of batch/sample name(s): {", ".join(exclude_samples)}.'
                     f'Check the YAML file for available options: {self.bcbio_yaml_fpath}.')
        if include_samples:
            critical(f'Error: could not find a batch or a sample with the name(s): {", ".join(include_samples)}. '
                     f'Check the YAML file for available options: {self.bcbio_yaml_fpath}')
        critical(f'Error: could not parse any batch or samples in the bcbio project. '
                 f'Please check the bcbio YAML file: {self.bcbio_yaml_fpath}')

    not_found_samples = [s.name for s in self.samples if not s.bam]
    if not_found_samples and not self.silent:
        warn(f'Warning: no BAM files not found for {len(not_found_samples)}/{len(self.samples)} samples')

    self.samples.sort(key=lambda _s: _s.key_to_sort())
    self.batch_by_name = self.update_batches(self.samples, self.silent)

    def _check_dup_props(prop, is_critical=False):
        # A property is copied to the project only when all samples agree on it
        values = {smp.__dict__.get(prop) for smp in self.samples}
        if len(values) > 1:
            report = critical if is_critical else err
            report('Got different ' + prop + ' values in samples in ' + self.project_name)
        else:
            self.__dict__[prop] = values.pop()

    for prop in ('genome_build', 'variant_regions_bed', 'coverage_bed', 'sv_regions_bed',
                 'is_rnaseq', 'min_allele_fraction', 'is_wgs', 'coverage_interval'):
        _check_dup_props(prop)

    if self.is_rnaseq:
        debug('RNAseq')
    elif self.coverage_interval:
        debug('Coverage interval: ' + str(self.coverage_interval))

    for smp in self.samples:
        for caller in smp.variantcallers:
            self.samples_by_caller[(caller, smp.phenotype == 'germline')].append(smp)

    debug('Done loading bcbio project ' + self.project_name)
def find_vcf_file(self, batch_name, silent=False, caller=None):
    """Locate the VCF of a batch for the given caller (defaults to `self.somatic_caller`).

    Candidate locations are probed in a fixed priority order; the first existing file is
    verified with `verify_file(..., is_critical=True)` and returned. Returns None when no
    candidate exists. With `silent=True`, the "found" info messages and the final warning
    are not emitted (the "not found" debug messages always are).
    """
    caller = caller or self.somatic_caller
    plain_fname = batch_name + '-' + caller + '.vcf'
    annotated_fname = batch_name + '-' + caller + '-annotated.vcf'

    def _locate(dirpath, fname):
        return adjust_path(join(dirpath, fname))

    annotated_gz_in_date = _locate(self.date_dir, annotated_fname + '.gz')    # in datestamp
    annotated_gz_in_raw = _locate(self.raw_var_dir, annotated_fname + '.gz')  # in datestamp/var/raw
    gz_in_date = _locate(self.date_dir, plain_fname + '.gz')                  # in datestamp
    gz_in_var = _locate(self.var_dir, plain_fname + '.gz')                    # in datestamp/var
    gz_in_raw = _locate(self.raw_var_dir, plain_fname + '.gz')                # in datestamp/var/raw
    plain_in_date = _locate(self.date_dir, plain_fname)                       # in datestamp
    plain_in_var = _locate(self.var_dir, plain_fname)                         # in datestamp/var
    plain_in_raw = _locate(self.raw_var_dir, plain_fname)                     # in datestamp/var/raw

    # (path, message prefix when found, message prefix when missing), in priority order
    candidates = (
        (annotated_gz_in_date,
         'Found annotated VCF in the datestamp dir ',
         'Not found annotated VCF in the datestamp dir '),
        (annotated_gz_in_raw,
         'Found annotated VCF in the datestamp/var/raw dir ',
         'Not found annotated VCF in the datestamp/var/raw dir '),
        (gz_in_date,
         'Found VCF in the datestamp dir ',
         'Not found VCF in the datestamp dir '),
        (gz_in_raw,
         'Found VCF in the datestamp/var/raw dir ',
         'Not found VCF in the datestamp/var/raw dir '),
        (plain_in_date,
         'Found uncompressed VCF in the datestamp dir ',
         'Not found uncompressed VCF in the datestamp dir '),
        (plain_in_raw,
         'Found uncompressed VCF in the datestamp/var/raw dir ',
         'Not found uncompressed VCF in the datestamp/var/raw dir '),
        (gz_in_var,
         'Found VCF in the datestamp/var dir ',
         'Not found VCF in the datestamp/var dir '),
        (plain_in_var,
         'Found uncompressed VCF in the datestamp/var dir ',
         'Not found uncompressed VCF in the datestamp/var dir '),
    )

    for fpath, found_prefix, missing_prefix in candidates:
        if not isfile(fpath):
            debug(missing_prefix + fpath)
            continue
        verify_file(fpath, is_critical=True)
        if not silent:
            info(found_prefix + fpath)
        return fpath

    if not silent:
        warn('Warning: no VCF found for batch ' + batch_name + ', ' + caller + ', gzip or '
             'uncompressed version in the datestamp directory.')
    return None
def main(prefix, output_bedpe, output_fasta=None, output_json=None, support=None, ensembl_release=None,
         peptide_flanking_len=None, debug=False):
    """Convert filtered pizzly fusion calls into a bedpe file (optionally also JSON and fasta).

    Reads `<prefix>-flat-filtered.tsv`, `<prefix>.json` and `<prefix>.fusions.fasta`.

    :param prefix: common path prefix of the pizzly output files.
    :param output_bedpe: path of the tab-separated bedpe file to write.
    :param output_fasta: if set, fasta records of the kept transcript events are written here.
    :param output_json: if set, the filtered pizzly JSON is written here.
    :param support: transcript events whose 'support' is below this value are skipped;
        None (the default) disables this filter.
    :param ensembl_release: passed to `EnsemblRelease`.
    :param peptide_flanking_len: passed to `Fusion.to_bedpe` and `_verify_peptides`.
    :param debug: passed to `logger.init`.
    """
    pizzly_flat_filt_fpath = prefix + '-flat-filtered.tsv'
    pizzly_json_fpath = prefix + '.json'
    input_fasta = prefix + '.fusions.fasta'
    output_bedpe = abspath(output_bedpe)

    logger.init(debug)

    ebl = EnsemblRelease(ensembl_release)

    # Reading filtered tsv
    filt_fusions = set()
    with open(pizzly_flat_filt_fpath) as f:
        for row in csv.DictReader(f, delimiter='\t'):
            filt_fusions.add((row['geneA.name'], row['geneB.name']))

    # Read json, keeping only gene pairs present in the filtered tsv
    json_data = {'genes': []}
    with open(pizzly_json_fpath) as f:
        data = json.load(f)
    for g_event in data['genes']:
        gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
        if (gene_a, gene_b) in filt_fusions:
            json_data['genes'].append(g_event)

    # Read fasta
    fasta_dict = SeqIO.index(input_fasta, 'fasta')

    filt_json_data = {'genes': []}
    filt_fasta_records = []
    filt_event_count = 0
    filt_transcript_event_count = 0

    # Write bedpe
    with open(output_bedpe, 'w') as bedpe_fh:
        bedpe_header = [
            'chr 5p',
            'start 5p',
            'end 5p',
            'chr 3p',
            'start 3p',
            'end 3p',
            'name',
            'tier',
            'strand 5p',
            'strand 3p',
            'support',
            'is canon bndry',
            'inframe',
            'peptide',
            'fusion pos',
            'nt in the break',
            'transcripts',
            'is canon intron dinuc',
        ]
        bedpe_writer = csv.DictWriter(bedpe_fh, fieldnames=bedpe_header, delimiter='\t')
        bedpe_writer.writeheader()

        # g_event keys: {'geneA', 'geneB', 'paircount', 'splitcount', 'transcripts', 'readpairs'}
        for g_event in json_data['genes']:
            gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']
            logger.info(gene_a + '>>' + gene_b)

            filt_g_event = {k: v for k, v in g_event.items() if k != 'readpairs'}
            filt_g_event['transcripts'] = []

            met_event_keys = set()    # collecting to get rid of duplicate transcript events
            met_peptide_keys = set()  # collecting to get rid of duplicate peptides
            bedpe_entries = []
            for t_event in g_event['transcripts']:
                # `support` defaults to None; comparing a number with None raises TypeError,
                # so the filter only applies when a threshold was actually given.
                if support is not None and t_event['support'] < support:
                    continue

                fusion = Fusion.create_from_pizzly_event(ebl, t_event)
                if not fusion:  # not a good transcript
                    continue

                # skipping duplicate events
                k = fusion.side_5p.trx.id, fusion.side_3p.trx.id, fusion.side_5p.bp_offset, fusion.side_3p.bp_offset
                if k in met_event_keys:
                    continue
                met_event_keys.add(k)

                # for writing filtered json
                filt_g_event['transcripts'].append(t_event)
                filt_transcript_event_count += 1

                # writing bedpe
                entry = fusion.to_bedpe(peptide_flanking_len)
                if not entry:
                    continue

                # skipping duplicate peptides
                k = entry['name'], entry['peptide']
                if k in met_peptide_keys:
                    continue
                met_peptide_keys.add(k)
                bedpe_entries.append(entry)

                # for writing filtered fasta
                pizzly_fasta_rec = fasta_dict[t_event['fasta_record']]
                _check_fusion_fasta(pizzly_fasta_rec, fusion)
                filt_fasta_records.append(pizzly_fasta_rec)

                if fusion.peptide:
                    _verify_peptides(pizzly_fasta_rec, fusion, peptide_flanking_len)

            if not bedpe_entries:
                logger.warn(f'All transcript events filtered out for fusion {gene_a}>>{gene_b}, skipping')
            else:
                filt_json_data['genes'].append(filt_g_event)
                filt_event_count += 1
                for bedpe_entry in bedpe_entries:
                    bedpe_writer.writerow(bedpe_entry)

    # Write filtered json
    if output_json:
        with open(output_json, 'w') as f:
            json.dump(filt_json_data, f, indent=4)

    # Write fasta
    if output_fasta:
        SeqIO.write(filt_fasta_records, output_fasta, 'fasta')

    logger.info()
    logger.info(f'Written {filt_transcript_event_count} transcript events '
                f'for {filt_event_count} fusions into bedpe: {output_bedpe}')
def to_bedpe(self, peptide_flanking_len=None):
    """Build the bedpe row (a dict keyed by column name) describing this fusion.

    Returns None when either side's `calc_genomic_bp_offset()` reports -1 as the breakpoint
    position. Otherwise sets `self.is_canonical_boundary`, calls
    `self.make_peptide(peptide_flanking_len)` and, if a peptide was produced, fills the
    peptide-related columns; those columns are 'NA' otherwise.
    """
    side_a, side_b = self.side_5p, self.side_3p

    pos_a, in_intron_a = side_a.calc_genomic_bp_offset()
    if pos_a == -1:
        logger.warn(
            f' Fusion in {self} takes the whole 5p transcript {self.side_5p.trx.id}. That\'s suspicious, so we are skipping it.'
        )
        return None

    pos_b, in_intron_b = side_b.calc_genomic_bp_offset()
    if pos_b == -1:
        logger.warn(
            f' Fusion in {self} takes the whole 3p transcript {self.side_3p.trx.id}. That\'s suspicious, so we are skipping it.'
        )
        return None

    self.is_canonical_boundary = in_intron_a and in_intron_b

    trx_a, trx_b = side_a.trx, side_b.trx
    # Only one coordinate per side carries the breakpoint, chosen by strand; the other is -1.
    entry = {
        'chr 5p': trx_a.contig,
        'start 5p': pos_a if trx_a.strand != '+' else -1,
        'end 5p': pos_a if trx_a.strand != '-' else -1,
        'chr 3p': trx_b.contig,
        'start 3p': -1 if trx_b.strand != '+' else pos_b,
        'end 3p': -1 if trx_b.strand != '-' else pos_b,
        'name': trx_a.gene.name + '>>' + trx_b.gene.name,
        'tier': self.tier,
        'strand 5p': trx_a.strand,
        'strand 3p': trx_b.strand,
        'support': self.support,
    }
    placeholder_columns = (
        'is canon bndry',
        'inframe',
        'peptide',
        'fusion pos',
        'nt in the break',
        'transcripts',
        'is canon intron dinuc',
    )
    entry.update(dict.fromkeys(placeholder_columns, 'NA'))

    self.make_peptide(peptide_flanking_len)
    if not self.peptide:
        return entry

    # ENST00000304636|ENST00000317840;ENST00000377795;ENST00000009530|ENST00000353334
    # 5' transcripts                 ;3' transcripts ;3' frameshift transcripts
    inframe = self.is_inframe
    trx_line = ';'.join([
        trx_a.transcript_id,
        trx_b.id if inframe else '',
        '' if inframe else trx_b.id,
    ])
    entry['is canon bndry'] = '1' if self.is_canonical_boundary else '0'
    entry['inframe'] = '1' if inframe else '0'
    entry['peptide'] = self.peptide
    entry['fusion pos'] = self.fusion_offset_in_peptide
    entry['nt in the break'] = self.num_of_nt_in_the_break
    entry['transcripts'] = trx_line
    return entry
def determine_sex(work_dir, bam_fpath, avg_depth, genome, target_bed=None):
    """Guess the sample sex by comparing chrY key-region coverage with the average depth.

    :param work_dir: directory for temporary files (pybedtools temp dir, 'male.bed').
    :param bam_fpath: BAM file; `critical` is called when it is empty/None.
    :param avg_depth: average sample depth, converted with `float()`.
    :param genome: genome name; matched by substring against the keys of
        `chry_key_regions_by_genome`.
    :param target_bed: optional capture target; when given, the key regions are
        intersected with it first.
    :return: 'M', 'F', or None when sex cannot be determined (no key regions for the
        genome, no overlap between target and key regions, or average depth below
        `AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX`).
    """
    debug()
    debug('Determining sex')

    pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))

    male_bed = None
    for k in chry_key_regions_by_genome:
        if k in genome:
            male_bed = BedTool(chry_key_regions_by_genome.get(k))
            break
    if not male_bed:
        warn('Warning: no male key regions for ' + genome + ', cannot identify sex')
        return None

    male_area_size = get_total_bed_size(male_bed)
    debug('Male region total size: ' + str(male_area_size))

    if target_bed:
        target_male_bed = join(work_dir, 'male.bed')
        with file_transaction(work_dir, target_male_bed) as tx:
            BedTool(target_bed).intersect(male_bed).merge().saveas(tx)
        target_male_area_size = get_total_bed_size(target_male_bed)
        if target_male_area_size == 0:
            debug('The male non-PAR region does not overlap with the capture target - cannot determine sex.')
            return None
        male_bed = target_male_bed
    else:
        debug('WGS, determining sex based on chrY key regions coverage.')

    info('Detecting sex by comparing the Y chromosome key regions coverage and average coverage depth.')
    if not bam_fpath:
        critical('BAM file is required.')
    index_bam(bam_fpath)

    chry_mean_coverage = _calc_mean_coverage(work_dir, male_bed, bam_fpath, 1)
    debug('Y key regions average depth: ' + str(chry_mean_coverage))
    avg_depth = float(avg_depth)
    debug('Sample average depth: ' + str(avg_depth))

    if avg_depth < AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX:
        debug('Sample average depth is too low (less than ' + str(AVG_DEPTH_THRESHOLD_TO_DETERMINE_SEX) +
              ') - cannot determine sex')
        return None

    # The messages below used the invalid escape "\s", which printed a literal backslash;
    # double-quoted strings carry the intended apostrophe.
    if chry_mean_coverage == 0:
        debug("Y depth is 0 - it's female")
        sex = 'F'
    else:
        factor = avg_depth / chry_mean_coverage
        debug('Sample depth / Y depth = ' + str(factor))
        if factor > FEMALE_Y_COVERAGE_FACTOR:  # if mean target coverage much higher than chrY coverage
            debug('Sample depth is more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's female")
            sex = 'F'
        else:
            debug('Sample depth is not more than ' + str(FEMALE_Y_COVERAGE_FACTOR) +
                  " times higher than Y depth - it's male")
            sex = 'M'
    debug('Sex is ' + sex)
    debug()
    return sex
def create_dragen_paired_directories_from_config(smconfig):
    """Build the single-entry list describing a DRAGEN tumor/normal pair from the config.

    Reads 'dragen_somatic_dir' and 'dragen_germline_dir' from `smconfig`, plus the optional
    overrides 'dragen_subject_id', 'dragen_tumor_id' and 'dragen_normal_id'.

    The subject id is taken from the config when provided; otherwise it is inferred from
    both directories, and `critical` is called when it cannot be inferred or the two
    directories disagree. Provided tumor/normal ids that differ from the ids discovered
    from BAM files only produce a warning.

    :return: a list with one dict with keys 'subject_id', 'tumor_normal_run', 'normal_run'.
    """
    # Set subject identifier
    tumor_subject_id_inferred = get_subject_id_from_dragen_dir(smconfig['dragen_somatic_dir'])
    normal_subject_id_inferred = get_subject_id_from_dragen_dir(smconfig['dragen_germline_dir'])
    if smconfig.get('dragen_subject_id'):
        subject_id = smconfig.get('dragen_subject_id')
    elif tumor_subject_id_inferred is None:
        critical('could not infer subject id from somatic dir, please specify with --dragen_subject_id')
    elif normal_subject_id_inferred is None:
        critical('could not infer subject id from germline dir, please specify with --dragen_subject_id')
    elif tumor_subject_id_inferred != normal_subject_id_inferred:
        critical(
            f'got different subject ids from somatic ({tumor_subject_id_inferred}) and germline'
            f' ({normal_subject_id_inferred}) dirs, please specify with --dragen_subject_id'
        )
    else:
        subject_id = tumor_subject_id_inferred

    # Set tumor identifier
    tumor_samples_inferred = get_samples_from_dragen_dir_bams(smconfig['dragen_somatic_dir'])
    if smconfig.get('dragen_tumor_id'):
        tumor_id = smconfig.get('dragen_tumor_id')
        if tumor_id != tumor_samples_inferred['tumor']:
            warn(
                f'provided DRAGEN tumor id ({tumor_id}) doesn\'t match id collected'
                f' from discovered BAM file ({tumor_samples_inferred["tumor"]})'
            )
    else:
        tumor_id = tumor_samples_inferred['tumor']

    # Set normal identifier
    normal_samples_inferred = get_samples_from_dragen_dir_bams(smconfig['dragen_germline_dir'])
    if smconfig.get('dragen_normal_id'):
        normal_id = smconfig.get('dragen_normal_id')
        # Compare against the normal seen in the germline dir...
        if normal_id != normal_samples_inferred['normal']:
            warn(
                f'provided DRAGEN normal id ({normal_id}) doesn\'t match id collected'
                f' from discovered BAM file ({normal_samples_inferred["normal"]})'
            )
        # ...and against the normal seen in the somatic dir, when one was discovered there.
        if 'normal' in tumor_samples_inferred and normal_id != tumor_samples_inferred['normal']:
            warn(
                f'provided DRAGEN normal id ({normal_id}) doesn\'t match id collected'
                f' from discovered BAM file ({tumor_samples_inferred["normal"]})'
            )
    else:
        normal_id = normal_samples_inferred['normal']

    # Create datastructure used for DragenProject init
    return [{
        'subject_id': subject_id,
        'tumor_normal_run': {
            'normal': normal_id,
            'tumor': tumor_id,
            'path': smconfig['dragen_somatic_dir'],
            'prefix': get_dragen_output_prefix(smconfig['dragen_somatic_dir'])
        },
        'normal_run': {
            'normal': normal_id,
            'path': smconfig['dragen_germline_dir'],
            'prefix': get_dragen_output_prefix(smconfig['dragen_germline_dir'])
        },
    }]
def _parse_gtf_attributes(raw):
    """Parse the GTF attributes column (`key "value"; key "value";`) into a dict of unquoted values."""
    attrs = {}
    # The column ends with a ';' terminator that must not leak into the last value.
    for item in raw.strip().rstrip(';').split('; '):
        key, _, value = item.strip().partition(' ')
        key = key.strip('"')
        if key:
            attrs[key] = value.strip().strip('"')
    return attrs


def main(genome=None, input_genomes_url=None, gtf_path=None, all_transcripts=False, principal=False,
         gene_list=None, biotypes='', features='', gene_contains=None):
    """Write GTF records as 6-column BED lines (chrom, start-1, end, gene name, '.', strand) to stdout.

    :param genome: genome name; selects the bundled GTF when `gtf_path` is not given, and
        a name starting with 'hg' makes chromosome names get a 'chr' prefix.
    :param input_genomes_url: passed to `refdata.find_genomes_dir` when the GTF is looked up.
    :param gtf_path: explicit GTF file; when empty, the `reference_data` package is required.
    :param all_transcripts: when False, only records of canonical transcripts are kept.
    :param principal: with `all_transcripts=False`, request only principal transcripts.
    :param gene_list: optional file with gene names to restrict the output to.
    :param biotypes: comma-separated gene biotypes to keep (empty keeps all).
    :param features: comma-separated GTF feature types to keep (empty keeps all).
    :param gene_contains: optional substring the gene name must contain.

    Records shorter than 3 bases are not written. Progress is reported through `warn`.
    """
    out = sys.stdout

    # GTF
    if not gtf_path:
        try:
            from reference_data import api as refdata
        except ImportError:
            critical('GTF file is needed. Either install reference_data, or provide GTF with --gtf')
        else:
            refdata.find_genomes_dir(input_genomes_url)
            if genome == 'GRCh37':
                gtf_path = os.path.join(
                    refdata.get_ref_file(genome, key='pyensembl_data'),
                    'GRCh37/ensembl75/Homo_sapiens.GRCh37.75.gtf.gz')
            else:
                gtf_path = os.path.join(
                    refdata.get_ref_file(genome, key='pyensembl_data'),
                    'GRCh38/ensembl95/Homo_sapiens.GRCh38.95.gtf.gz')

    # Genes
    target_genes = None
    if gene_list:
        target_genes = get_genes_from_file(gene_list)

    # Transcripts
    transcripts_by_gid = None
    if not all_transcripts:
        if principal:
            transcripts_by_gid = {
                k: [v]
                for k, v in canon_transcript_per_gene(genome, only_principal=True, use_gene_id=True).items()
            }
        else:
            transcripts_by_gid = canon_transcript_per_gene(genome, use_gene_id=True)

    # Options
    biotypes = biotypes.strip()
    if biotypes:
        biotypes = biotypes.split(',')

    features = features.strip()
    if features:
        features = features.split(',')

    # `genome` defaults to None, so guard before calling str methods on it.
    add_chr_prefix = bool(genome) and genome.startswith('hg')

    genes_set = set()
    genes_without_canon = set()

    warn(f'Parsing {gtf_path}')
    with open_gzipsafe(gtf_path) as f:
        region_cnt = 0
        for line in f:
            if line.startswith('#') or not line.strip():
                continue

            fields = line.strip().split('\t')
            try:
                chrom, _, feature, start, end, _, strand, _, annotations = fields
            except ValueError:  # not exactly 9 tab-separated columns
                warn(f'Cannot read fields {str(fields)}')
                raise

            if add_chr_prefix and not chrom.startswith('chr'):
                chrom = 'chr' + chrom

            if features and feature not in features:
                continue

            annotations = _parse_gtf_attributes(annotations)
            gene_name = annotations['gene_name']
            if target_genes and gene_name not in target_genes:
                continue

            if gene_contains is not None and gene_contains not in gene_name:
                continue

            if biotypes:
                biotype = annotations['gene_biotype']
                if biotype not in biotypes:
                    continue

            if transcripts_by_gid:
                gene_id = annotations['gene_id']
                transcript_id = annotations['transcript_id']
                canon_transcript_ids = transcripts_by_gid.get(gene_id)
                if not canon_transcript_ids:
                    genes_without_canon.add(gene_name)
                    continue
                if transcript_id not in canon_transcript_ids:
                    continue

            # GTF is 1-based inclusive, BED is 0-based half-open
            start = int(start) - 1
            end = int(end)
            if end - start >= 3:
                out.write('\t'.join([chrom, str(start), str(end), gene_name, '.', strand]) + '\n')
                genes_set.add(gene_name)
                region_cnt += 1
                if region_cnt % 10000 == 0:
                    warn(f'Processed {len(genes_set)} genes, written {region_cnt} regions...')

    warn(f'Done. Processed {len(genes_set)} genes, written {region_cnt} regions')
    if genes_without_canon:
        warn(f'No canonical transcript for {len(genes_without_canon)} gene ids')
def make_peptide(self, peptide_flanking_len=None):
    """Translate the fused transcript and store the resulting peptide on the fusion.

    On success sets `self.peptide`, `self.is_inframe`, `self.fusion_offset_in_peptide`
    and `self.num_of_nt_in_the_break`. Returns without setting them when the 5' transcript
    has no start codon, the breakpoint precedes the translation start, a STOP codon occurs
    in the 5' part or at the junction, the 3' part is empty or starts with a STOP, or the
    3' part has no STOP codon at all.

    :param peptide_flanking_len: if truthy, the 5' peptide is cut to this many trailing
        amino acids, and for in-frame fusions the 3' peptide is cut so that the junction
        plus the 3' part make up `peptide_flanking_len + 1` amino acids.
    """
    # 5' fasta
    if not self.side_5p.trx.contains_start_codon:
        logger.warn('No start codong in 5\' transcript')
        return
    transl_start = self.side_5p.trx.first_start_codon_spliced_offset
    if self.side_5p.bp_offset < transl_start:  # if the bp (t_end) falls before the beginning of translation
        return
    cds_seq_5p = Seq(self.side_5p.trx.sequence[transl_start:self.side_5p.bp_offset])
    fs_5p = len(cds_seq_5p) % 3
    if fs_5p != 0:
        logger.debug(f' Frameshift of 5p sequence: {fs_5p}')
    pep_5p = _translate_from_start_codon(cds_seq_5p, to_stop=False, name='5\' fasta')
    if '*' in pep_5p:
        logger.info(f' 5\' petide has a STOP codon before breakpoint. Skipping.')
        assert min(self.side_5p.trx.stop_codon_spliced_offsets) < self.side_5p.bp_offset, \
            'We also expect pyensembl to report a STOP codon before the breakpoint'
        return

    # 3' fasta. Getting full sequence in case if it's an FS event that will produce a novel stop codon
    fs_3p = (self.side_3p.bp_offset - self.side_3p.trx.first_start_codon_spliced_offset) % 3
    if fs_3p != 0:
        logger.debug(f' Frameshift of 3p sequence: {fs_3p}')
    seq_3p = Seq(self.side_3p.trx.sequence[self.side_3p.bp_offset:])

    # checking if the fusion produced a frameshift
    fusion_fs = (fs_5p + fs_3p) % 3
    if fusion_fs != 0:
        logger.debug(f" Result fusion frameshift: {fusion_fs}")
    is_inframe = fusion_fs == 0

    # junction peptide: the incomplete trailing codon of the 5' part, completed from the 3' part
    junction_codon = cds_seq_5p[len(cds_seq_5p) - fs_5p:]
    if junction_codon:  # == fs_5p != 0:
        start_3p_from = 3 - fs_5p
        junction_codon += seq_3p[:start_3p_from]
        junction_pep = junction_codon.translate()
        if junction_pep == '*':
            logger.info(f' Junction codon is STOP, skipping')
            return
    else:
        junction_pep = ''
        start_3p_from = 0

    # 3' peptide
    pep_3p = _trim3(seq_3p[start_3p_from:]).translate()
    # An empty translation (3' remainder shorter than one codon) must not be indexed;
    # it falls through to the "no STOP codon" exit below.
    if len(pep_3p) > 0 and pep_3p[0] == '*':
        logger.info(
            f' The new 3\' peptide starts from STOP ({seq_3p[start_3p_from:][:3]} '
            f'at position {self.side_3p.bp_offset}+{start_3p_from}), skipping translation.'
        )
        return
    if '*' not in pep_3p:
        logger.info(f' No STOP codon in fused peptide, skipping translation.')
        return
    pep_3p = _trim3(seq_3p[start_3p_from:]).translate(to_stop=True)

    logger.debug(
        f' 5\' peptide (len={len(pep_5p)}): '
        f'{pep_5p if len(pep_5p) < 99 else pep_5p[:48] + "..." + pep_5p[-48:]}'
    )
    if junction_pep:
        logger.debug(f' Junction peptide: {junction_pep}')
    logger.debug(
        f' 3\' peptide{f" (shifted by {fusion_fs} from original)" if fusion_fs else ""} '
        f'(len={len(pep_3p)}): '
        f'{pep_3p if len(pep_3p) < 99 else pep_3p[:48] + "..." + pep_3p[-48:]}'
    )

    # fusion peptide
    if peptide_flanking_len:
        # taking $(peptide_chunk_len) aminoacids from 5':
        pep_5p = pep_5p[-peptide_flanking_len:]
        # trying to make the total peptide to be of length $(peptide_chunk_len)*2+1:
        pep_3p = pep_3p[:peptide_flanking_len + 1 - len(junction_pep)] if is_inframe else pep_3p

    fusion_pep = pep_5p + junction_pep + pep_3p
    assert '*' not in fusion_pep

    self.peptide = fusion_pep
    self.is_inframe = is_inframe
    self.fusion_offset_in_peptide = peptide_flanking_len or len(pep_5p)
    self.num_of_nt_in_the_break = fs_5p
def _log(msg, silent, is_critical):
    """Report `msg`: through `critical` when `is_critical` is set, then through `warn` unless `silent`."""
    if is_critical:
        critical(msg)
    if silent:
        return
    warn(msg)
def main(prefix, output_bedpe, output_fasta=None, output_json=None, min_read_support=None, ensembl_release=None,
         peptide_flanking_len=None, debug=False, no_filtering=False, check_transcript=True, pizzly_ref_fa=None,
         reads=None, min_tpm=None):
    """Convert pizzly fusion calls into a bedpe file with peptides and, optionally, expression.

    Reads `<prefix>.json` and `<prefix>.fusions.fasta`. Sets the module-level
    `ENSEMBL_RELEASE` to `ensembl_release`.

    :param prefix: common path prefix of the pizzly output files.
    :param output_bedpe: path of the tab-separated bedpe file to write.
    :param output_fasta: if set, fasta records of the fusions with a peptide are written here.
    :param output_json: accepted but not used in this function body.
    :param min_read_support: fusions with lower `support` are skipped; None disables the filter.
    :param ensembl_release: passed to `EnsemblRelease`.
    :param peptide_flanking_len: passed to `make_peptide`; None or a negative value is
        passed on as None.
    :param debug: passed to `logger.init`.
    :param no_filtering: when exactly True, the read-support and TPM filters are skipped.
    :param check_transcript: when true, both transcripts must pass `_transcript_is_good`.
    :param pizzly_ref_fa: reference fasta passed to `requanitify_pizzly`; re-quantification
        runs only when both this and `reads` are given.
    :param reads: passed to `requanitify_pizzly`.
    :param min_tpm: entries with lower TPM are skipped; None disables the filter.
    """
    input_json_fpath = prefix + '.json'
    input_fasta = prefix + '.fusions.fasta'
    output_bedpe = abspath(output_bedpe)

    logger.init(debug)

    global ENSEMBL_RELEASE
    ENSEMBL_RELEASE = ensembl_release
    ebl = EnsemblRelease(ENSEMBL_RELEASE)

    # Read json
    with open(input_json_fpath) as f:
        data = json.load(f)
    json_data = {'genes': list(data['genes'])}

    # Read fasta
    fasta_dict = SeqIO.index(input_fasta, 'fasta')

    # `no_filtering` only disables filters when it is exactly True
    filtering_enabled = no_filtering is not True

    # First round: genomic coordinates and fasta
    logger.info(f'Round 1: reading {len(json_data["genes"])} gene-pairs events from pizzly JSON')
    fusions = []
    # g_event keys: {'geneA', 'geneB', 'paircount', 'splitcount', 'transcripts', 'readpairs'}
    for g_event in json_data['genes']:
        gene_a, gene_b = g_event['geneA']['name'], g_event['geneB']['name']

        met_fasta_keys = set()  # collecting to get rid of duplicate transcript events
        for t_event in g_event['transcripts']:
            fusion = Fusion.create_from_pizzly_event(ebl, t_event)
            if fusion is None:  # nothing to work with for this transcript event
                continue

            if check_transcript:
                if not _transcript_is_good(fusion.side_5p.trx) or not _transcript_is_good(fusion.side_3p.trx):
                    continue

            # The threshold defaults to None; comparing a number with None raises TypeError.
            if filtering_enabled and min_read_support is not None and fusion.support < min_read_support:
                continue

            calc_positions_ok = fusion.calc_genomic_positions()
            if not calc_positions_ok:
                continue

            # comparing our fasta to pizzly fasta
            fusion.fasta_rec = fasta_dict[t_event['fasta_record']]
            _check_fusion_fasta(fusion.fasta_rec, fusion)

            # duplicate fastas are not expected within one gene pair
            k = fusion.side_5p.trx.id, fusion.side_3p.trx.id, fusion.fasta
            assert k not in met_fasta_keys
            met_fasta_keys.add(k)
            fusions.append(fusion)

        if met_fasta_keys:
            logger.info(f'Keeping {len(met_fasta_keys)} fusion(s) for the event {gene_a}-{gene_b}')

    if not fusions:
        logger.warn('Finished: no fusions passed filtering')

    # Calculate expression of fused transcripts
    expr_by_fusion = None
    if reads and fusions:
        # filtered fasta for re-calling expression
        work_dir = safe_mkdir(splitext(output_bedpe)[0] + '_quant')
        fasta_path = join(work_dir, 'fusions.fasta')
        fasta_recs = [f.fasta_rec for f in fusions]
        SeqIO.write(fasta_recs, fasta_path, 'fasta')

        if pizzly_ref_fa:
            expr_by_fusion = requanitify_pizzly(pizzly_ref_fa, fasta_path, work_dir, reads)
            # expr_by_fusion = {fusion-fasta-id -> {length eff_length est_counts tpm}}

    # Second round: peptides and expression
    n_gene_pairs = len({(f.side_3p.trx.gene.name, f.side_5p.trx.gene.name) for f in fusions})
    logger.info()
    logger.info(f'Round 2: making peptides for {len(fusions)} events in {n_gene_pairs} genes pairs')
    met_peptide_keys = set()  # collecting to get rid of duplicate peptides
    bedpe_entries = []
    peptide_fusions = []
    # Defaults to None, which must not be compared with 0.
    if peptide_flanking_len is not None and peptide_flanking_len < 0:
        peptide_flanking_len = None

    for fusion in fusions:
        if fusion.side_3p.trx.contains_start_codon:
            logger.info(
                f'Translating {fusion.side_5p.trx.gene.name}>>{fusion.side_3p.trx.gene.name} fusion: {fusion}')
            fusion.make_peptide(peptide_flanking_len)
            if fusion.peptide:
                _verify_peptides(fusion.fasta_rec, fusion, peptide_flanking_len)

        # skipping duplicate peptides
        k = fusion.side_5p.trx.gene.name, fusion.side_3p.trx.gene.name, fusion.peptide
        if k in met_peptide_keys:
            logger.debug(f'Skipping peptide {k}: already added')
            continue
        met_peptide_keys.add(k)

        # writing bedpe
        entry = fusion.to_bedpe()

        # add expression
        if expr_by_fusion:
            entry.update(expr_by_fusion[fusion.fasta_rec.id])
            tpm = float(entry['tpm'])
            if filtering_enabled and min_tpm is not None and tpm < min_tpm:
                logger.debug(f'Skipping peptide {entry}: TPM={tpm} is below {min_tpm}')
                continue

        if fusion.peptide:
            peptide_fusions.append(fusion)
        bedpe_entries.append(entry)

    # Writing bedpe
    with open(output_bedpe, 'w') as bedpe_fh:
        bedpe_header = [
            'chr 5p',
            'start 5p',
            'end 5p',
            'chr 3p',
            'start 3p',
            'end 3p',
            'name',
            'tier',
            'strand 5p',
            'strand 3p',
            'support',
            'is canon bndry',
            'inframe',
            'peptide',
            'fusion pos',
            'nt in the break',
            'transcripts',
            'is canon intron dinuc',
        ]
        if expr_by_fusion:
            # expression columns are taken from the first quantification record
            bedpe_header.extend(next(iter(expr_by_fusion.values())).keys())
        bedpe_writer = csv.DictWriter(bedpe_fh, fieldnames=bedpe_header, delimiter='\t')
        bedpe_writer.writeheader()
        for bedpe_entry in bedpe_entries:
            bedpe_writer.writerow(bedpe_entry)

    # Write fasta
    if output_fasta:
        SeqIO.write([f.fasta_rec for f in peptide_fusions], output_fasta, 'fasta')

    n_peptide_gene_pairs = len({(f.side_3p.trx.gene.name, f.side_5p.trx.gene.name) for f in peptide_fusions})
    logger.info()
    logger.info(f'Written {len(peptide_fusions)} fusions in '
                f'{n_peptide_gene_pairs} '
                f'gene pairs good peptides bedpe: {output_bedpe}')