def find_germline_vcf(self, silent=False, caller=None):
    """
    Looks for the germline VCF of this batch and, when found, stores its path in `self.germline_vcf`.

    Both candidates live in the project date directory and are named after the first normal sample:
    `<normal-name>-germline-<caller>.vcf.gz` is tried first, then
    `<normal-name>-germline-<caller>-annotated.vcf.gz` (the layout of bcbio < v1.1.6).

    :param silent: do not log info and warning messages
    :param caller: germline variant caller name; falls back to `self.germline_caller`
    :return: None; `self.germline_vcf` is assigned only when one of the files exists
    """
    caller = caller or self.germline_caller
    if not caller:
        if not silent:
            warn(f'Batch {self.name} has no variant caller info assigned, skipping finding germline VCF')
        return

    date_dir = self.parent_project.date_dir
    normal_name = self.normals[0].name

    # in datestamp. starting from bcbio 1.1.6, ~ Dec 2019
    vcf_fpath_gz = adjust_path(join(date_dir, f'{normal_name}-germline-{caller}.vcf.gz'))
    # in datestamp. bcbio before 1.1.6
    vcf_old_fpath_gz = adjust_path(join(date_dir, f'{normal_name}-germline-{caller}-annotated.vcf.gz'))

    if isfile(vcf_fpath_gz):
        verify_file(vcf_fpath_gz, is_critical=True)
        if not silent:
            info(f'Found germline VCF in <date-dir>/<normal-name>-germline-{caller}.vcf.gz: ' + vcf_fpath_gz)
        self.germline_vcf = vcf_fpath_gz

    elif isfile(vcf_old_fpath_gz):
        verify_file(vcf_old_fpath_gz, is_critical=True)
        if not silent:
            info(f'Found germline VCF in <date-dir>/<normal-name>-germline-{caller}-annotated.vcf.gz '
                 f'(bcbio < v1.1.6): ' + vcf_old_fpath_gz)
        self.germline_vcf = vcf_old_fpath_gz

    elif not silent:
        warn(f'Could not find germline variants files for batch {self.name}, caller {caller} neither as '
             f'<date-dir>/<normal-name>-germline-{caller}.vcf.gz, nor as '
             f'<date-dir>/<normal-name>-germline-{caller}-annotated.vcf.gz (bcbio < v1.1.6)')
def find_bam(self, silent=False):
    """
    Returns the path to the alignment file of the sample, or None when nothing is found.

    The sample folder is probed for `<name>-ready.cram`, `<name>-ready.bam` and `<name>-sort.bam`
    (in that order). If none of them verifies, the first input file from the sample YAML record
    is used, provided it is a `.bam`; relative input paths are resolved against the project work dir.

    :param silent: do not warn when no BAM or CRAM file could be found
    """
    prefix = self.get_name_for_files()
    for suffix in ('-ready.cram', '-ready.bam', '-sort.bam'):
        candidate = adjust_path(join(self.dirpath, prefix + suffix))
        if verify_file(candidate):
            return candidate

    # Nothing in the sample folder: fall back to the input recorded in the YAML
    yaml_input = self.sample_info['files']
    if not isinstance(yaml_input, str):
        yaml_input = yaml_input[0]

    if isinstance(yaml_input, str) and yaml_input.endswith('.bam'):
        debug('Bcbio was run from BAM input')
        if not yaml_input.startswith('/'):
            yaml_input = abspath(join(self.bcbio_project.work_dir, yaml_input))
        if verify_file(yaml_input):
            debug('Using BAM file from input YAML ' + yaml_input)
            return yaml_input
        debug('Input BAM file for sample ' + self.name + ' in YAML ' + yaml_input + ' does not exist')

    if not silent:
        warn('No BAM or CRAM file found for ' + self.name)
def get_canonical_transcripts_ids(genome):
    """
    Builds a dict mapping gene name -> canonical transcript ID for the genome build.

    The build name is cut at the first '-' and GRCh37*/GRCh38* are mapped to hg19/hg38.
    Entries of the cancer replacement file (when it verifies) override the Ensembl-based ones.

    :param genome: genome build name
    :return: dict of transcript IDs by gene name, or None if the canonical transcripts file is missing
    """
    build = genome.split('-')[0]
    if build.startswith('GRCh37'):
        build = 'hg19'
    if build.startswith('GRCh38'):
        build = 'hg38'
    check_genome(build)

    canon_fpath = _get(join('{genome}', 'canon_transcripts_{genome}_ensembl.txt'), build)
    replacement_fpath = _get('canon_cancer_replacement.txt')

    canon_fpath = verify_file(canon_fpath, description='Canonical transcripts path')
    replacement_fpath = verify_file(replacement_fpath,
                                    description='Canonical cancer transcripts replacement path')
    if not canon_fpath:
        return None

    # Both files are two-column TSVs: gene name, transcript ID
    canon_tx_by_gname = {}
    with open(canon_fpath) as canon_f:
        for row in canon_f:
            gname, tx_id = row.strip('\n').split('\t')
            canon_tx_by_gname[gname] = tx_id

    if replacement_fpath:
        with open(replacement_fpath) as repl_f:
            for row in repl_f:
                gname, tx_id = row.strip('\n').split('\t')
                canon_tx_by_gname[gname] = tx_id

    return canon_tx_by_gname
def sort_bed_gsort(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, genome=None):
    """
    Sorts a BED file with the external `gsort` tool, ordering chromosomes as in a .fai index.

    :param input_bed_fpath: BED file to sort (verified critically)
    :param output_bed_fpath: where to write the result; by default an intermediate `sorted` file name
        derived from the input inside `work_dir`
    :param work_dir: directory for intermediate and transactional files
    :param fai_fpath: path to the .fai index; takes precedence over `genome`
    :param genome: genome build name used to look the .fai up when `fai_fpath` is not given
    :return: path to the sorted BED file
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath)
    elif genome:
        fai_fpath = verify_file(ref.get_fai(genome))
    else:
        critical('Either of fai_fpath or genome build name must be specified')

    with file_transaction(work_dir, output_bed_fpath) as tx:
        run(f'gsort {input_bed_fpath} {fai_fpath}', output_fpath=tx)

    return output_bed_fpath
def find_bam(self, silent=False):
    """
    Finds the alignment file for the sample.

    Tries `<name>-ready.bam`, `<name>-ready.cram`, `<name>-sort.bam` in the sample folder first;
    if none verifies, falls back to the first input file listed for the sample in the YAML
    when it has the `.bam` extension (relative paths are taken from the parent project work dir).

    :param silent: suppress the final warning when nothing is found
    :return: path to the BAM/CRAM file, or None
    """
    files_prefix = self.get_name_for_files()
    known_suffixes = ['-ready.bam', '-ready.cram', '-sort.bam']
    for known_suffix in known_suffixes:
        path_in_sample_dir = adjust_path(join(self.dirpath, files_prefix + known_suffix))
        if verify_file(path_in_sample_dir):
            return path_in_sample_dir

    source_file = self.sample_info['files']
    if not isinstance(source_file, str):
        # A list of inputs: only the first one is considered
        source_file = source_file[0]

    started_from_bam = isinstance(source_file, str) and source_file.endswith('.bam')
    if started_from_bam:
        debug('Bcbio was run from BAM input')
        if not source_file.startswith('/'):
            source_file = abspath(join(self.parent_project.work_dir, source_file))
        if not verify_file(source_file):
            debug('Input BAM file for sample ' + self.name + ' in YAML ' + source_file + ' does not exist')
        else:
            debug('Using BAM file from input YAML ' + source_file)
            return source_file

    if not silent:
        warn('No BAM or CRAM file found for ' + self.name)
def load_yaml_config(fpath):
    """
    Parses the bcbio YAML at `fpath` with `_load_yaml` and returns the result.

    The path is verified critically first. Any exception from the parser is logged together
    with its traceback and then reported through `critical`.
    """
    verify_file(fpath, is_critical=True)
    try:
        return _load_yaml(fpath)
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
def load_yaml_config(fpath):
    """
    Parses the bcbio YAML at `fpath` with `load_yaml` and returns the result.

    The path is verified critically first. Any exception raised while opening or parsing the
    file is logged together with its traceback and then reported through `critical`.

    :param fpath: path to the YAML file
    :return: the object produced by `load_yaml`
    """
    verify_file(fpath, is_critical=True)
    try:
        # The context manager guarantees the handle is closed even if parsing fails
        with open(fpath) as yaml_f:
            dic = load_yaml(yaml_f)
    except Exception:
        err(format_exc())
        critical('Could not parse bcbio YAML ' + fpath)
    else:
        return dic
def find_qc_files(self, dst_dir, exclude_files=None, include_files=None):
    """
    Parses bcbio MultiQC file list and collects all QC files belonging to this batch

    :param dst_dir: destination directory where the QC files will be copied to
    :param exclude_files: not include files matching these patterns (a regex string or a list of them)
    :param include_files: only include files matching these patterns (a regex string or a list of them)
    :return: list of the paths returned by `_extract_qc_file` for the files that were found
    """
    mq_dir = join(self.parent_project.date_dir, 'multiqc')
    mq_filelist = join(mq_dir, 'list_files_final.txt')
    verify_file(mq_filelist, is_critical=True)

    # Accept a single pattern as well as a list; normalise once instead of for every listed file
    if isinstance(exclude_files, str):
        exclude_files = [exclude_files]
    if isinstance(include_files, str):
        include_files = [include_files]

    # Cromwell?
    cwl_targz = join(mq_dir, 'multiqc-inputs.tar.gz')
    cwl_prefix = 'call-multiqc_summary/execution/qc/multiqc/'
    tar_f_by_fp = dict()
    if isfile(cwl_targz):
        info(f'Found CWL MultiQC output {cwl_targz}, extracting required QC files from the archive')
        # NOTE(review): the archive is not closed here because the member file objects collected
        # below are handed over to _extract_qc_file - presumably they are read from it later.
        tar = tarfile.open(cwl_targz)
        for member in tar.getmembers():
            rel_fp = member.name
            if cwl_prefix in rel_fp:
                # Index members by their path relative to the MultiQC folder
                rel_fp = rel_fp.split(cwl_prefix)[1]
            tar_f_by_fp[rel_fp] = tar.extractfile(member)

    qc_files_not_found = []
    qc_files_found = []

    with open(mq_filelist) as inp:
        for fp in [l.strip() for l in inp if l.strip()]:
            if fp == 'trimmed' or fp.endswith('/trimmed'):
                continue  # back-compatibility with bcbio
            if exclude_files and any(re.search(ptn, fp) for ptn in exclude_files):
                continue
            if include_files and not any(re.search(ptn, fp) for ptn in include_files):
                continue

            new_fp = _extract_qc_file(fp, dst_dir, self.parent_project.final_dir, tar_f_by_fp)
            if new_fp:
                qc_files_found.append(new_fp)
            else:
                qc_files_not_found.append(fp)

    if qc_files_not_found:
        warn('-')
        warn(f'Some QC files from list {mq_filelist} were not found:' +
             ''.join('\n  ' + fpath for fpath in qc_files_not_found))
    return qc_files_found
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    """
    Sorts the regions of a BED file by `Region.get_key()`, using a chromosome order.

    Comment lines (starting with '#') are copied to the output as they are met, i.e. ahead of all
    regions; blank lines are dropped. Chromosomes missing from `chr_order` get the order -1.

    :param input_bed_fpath: BED file to sort (verified critically)
    :param output_bed_fpath: output path; defaults to an intermediate `sorted` file name in `work_dir`
    :param work_dir: directory for intermediate and transactional files
    :param fai_fpath: .fai index to derive the chromosome order from when `chr_order` is not given
    :param chr_order: dict of chromosome order values by chromosome name
    :param genome: genome build name to look the .fai up when neither of the above is given
    :return: path to the sorted BED file
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(work_dir, input_bed_fpath, 'sorted')

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('Either of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    regions = []
    with open(input_bed_fpath) as bed, \
            file_transaction(work_dir, output_bed_fpath) as tx, \
            open(tx, 'w') as out:
        for raw_line in bed:
            stripped = raw_line.strip()
            if not stripped:
                continue
            if stripped.startswith('#'):
                out.write(raw_line)
                continue

            cols = stripped.split('\t')
            chrom = cols[0]
            start = int(cols[1])
            end = int(cols[2])
            regions.append(Region(chrom, start, end, cols[3:], chr_order.get(chrom, -1)))

        regions_in_order = sorted(regions, key=lambda r: r.get_key())
        for reg in regions_in_order:
            out.write('\t'.join([reg.chrom, str(reg.start), str(reg.end)] + list(reg.other_fields)) + '\n')

    debug('Sorted ' + str(len(regions)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath
def run(cmd, output_fpath=None, input_fpaths=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.

    :param cmd: the command; the output handling below appends to it, so it is expected
        to be a string whenever `output_fpath` is given with `stdout_tx`
    :param output_fpath: expected output file. An existing file is removed before the run.
    :param input_fpaths: a path or a list of paths; each one is verified critically before the run
    :param checks: checks passed on to `_do_run`; `[file_nonempty_check]` by default
    :param stdout_to_outputfile: redirect stdout of the command into the output file; otherwise
        occurrences of `output_fpath` inside the command are replaced by the transactional path
    :param stdout_tx: write the output through `file_transaction`
    :param reuse: skip the run if `output_fpath` (or `output_fpath + '.gz'`) already verifies
    :param env_vars: passed to `_get_env` to build the environment of the command
    :return: `output_fpath` when an existing output is reused, otherwise None
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    if input_fpaths is not None:
        if isinstance(input_fpaths, str):
            input_fpaths = [input_fpaths]
        for fpath in input_fpaths:
            verify_file(fpath, is_critical=True)

    env = _get_env(env_vars)

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpaths):
        info(' '.join(str(x) for x in _cmd) if not isinstance(_cmd, str) else _cmd)
        _do_run(_cmd, checks, env, _output_fpath, _input_fpaths)

    if output_fpath and isfile(output_fpath):
        os.remove(output_fpath)

    if not output_fpath:
        _try_run(cmd, None, input_fpaths)
    elif not stdout_tx:
        _try_run(cmd, output_fpath, input_fpaths)
    else:
        with file_transaction(None, output_fpath) as tx_out_file:
            if stdout_to_outputfile:
                cmd += ' > ' + tx_out_file
            else:
                # The trailing newline lets the patterns below also match an output path
                # standing at the very end of the command; all newlines are removed afterwards.
                cmd += '\n'
                # Replace the output path - bare, double- or single-quoted - with the transactional
                # one, keeping BOTH quotes so that the resulting shell command stays balanced.
                cmd = cmd.replace(' ' + output_fpath + ' ', ' ' + tx_out_file + ' ') \
                         .replace(' "' + output_fpath + '" ', ' "' + tx_out_file + '" ') \
                         .replace(' \'' + output_fpath + '\' ', ' \'' + tx_out_file + '\' ') \
                         .replace(' ' + output_fpath + '\n', ' ' + tx_out_file) \
                         .replace(' "' + output_fpath + '"\n', ' "' + tx_out_file + '"') \
                         .replace(' \'' + output_fpath + '\'\n', ' \'' + tx_out_file + '\'') \
                         .replace('\n', '')
            _try_run(cmd, tx_out_file, input_fpaths)
def find_sv_vcf(self, silent=False, caller=False):
    """
    Looks for the structural variants VCF of this batch and, when found, stores its path in `self.sv_vcf`.

    Candidates are tried in this order; the first existing one wins:
      1. `<tumor-dir>/<batch>-sv-prioritize-<caller>.vcf.gz`
      2. `<tumor-dir>/<batch>-<caller>.vcf.gz`
      3. `<date-dir>/<tumor-name>-<caller>-prioritized.vcf.gz` (CWL)
      4. `<date-dir>/<tumor-name>-<caller>.vcf.gz` (CWL)

    :param silent: do not log info and warning messages
    :param caller: SV caller name; any falsy value falls back to `self.sv_caller`
    :return: None; `self.sv_vcf` is assigned only when one of the files exists
    """
    caller = caller or self.sv_caller
    if not caller:
        # Without a caller name the file names below cannot be built meaningfully
        if not silent:
            warn(f'Batch {self.name} has no SV caller info assigned, skipping finding SV VCF')
        return

    tumor = self.tumors[0]
    date_dir = self.parent_project.date_dir

    # (path, human-readable layout used in the log message)
    candidates = [
        (join(tumor.dirpath, f'{self.name}-sv-prioritize-{caller}.vcf.gz'),
         f'<tumor>/<batch>-sv-prioritize-{caller}.vcf.gz'),
        (join(tumor.dirpath, f'{self.name}-{caller}.vcf.gz'),
         f'<tumor>/<batch>-{caller}.vcf.gz'),
        # CWL?
        (join(date_dir, f'{tumor.name}-{caller}-prioritized.vcf.gz'),
         f'<date-dir>/<tumor-name>-{caller}-prioritized.vcf.gz'),
        (join(date_dir, f'{tumor.name}-{caller}.vcf.gz'),
         f'<date-dir>/<tumor-name>-{caller}.vcf.gz'),
    ]

    for sv_fpath, layout in candidates:
        if isfile(sv_fpath):
            verify_file(sv_fpath, is_critical=True)
            if not silent:
                info(f'Found SV VCF in {layout}: ' + sv_fpath)
            # Every branch records the result in the same attribute
            self.sv_vcf = sv_fpath
            return

    if not silent:
        warn(f'Could not find SV VCF file for batch {self.name}, caller {caller} neither under sample folder as '
             f'<tumor>/<batch>(-sv-prioritize)-{caller}.vcf.gz (conventional bcbio), '
             f'nor in the project folder as project/<tumor>-{caller}(-prioritized).vcf.gz (CWL bcbio).')
def get_chrom_lengths(genome=None, fai_fpath=None):
    """
    Returns a list of (chromosome, length) tuples in file order.

    The lengths are read either from the file given as `fai_fpath` (a `.fai` index, or a `.fa`
    sequence parsed with Biopython), or from the index found for `genome` with `get_fai`.

    :param genome: genome build name, used when `fai_fpath` is not provided
    :param fai_fpath: path to a `.fai` or `.fa` file
    """
    assert genome or fai_fpath, f'One of genome or fai_fpath should be not None: genome={genome}, fai_fpath={fai_fpath}'

    if fai_fpath:
        fai_fpath = verify_file(fai_fpath, is_critical=True)
        if not fai_fpath.endswith(('.fai', '.fa')):
            critical('Error: .fai or .fa is accepted.')
    else:
        check_genome(genome)
        fai_fpath = get_fai(genome)

    chr_lengths = []

    if not fai_fpath.endswith('.fa'):
        debug('Reading genome index file (.fai) to get chromosome lengths')
        with open(fai_fpath, 'r') as handle:
            for row in handle:
                row = row.strip()
                if not row:
                    continue
                cols = row.split()
                # First two columns of an index row: name and length
                chr_lengths.append((cols[0], int(cols[1])))
        return chr_lengths

    debug('Reading genome sequence (.fa) to get chromosome lengths')
    with open(fai_fpath, 'r') as handle:
        from Bio import SeqIO
        for record in SeqIO.parse(handle, 'fasta'):
            chr_lengths.append((record.id, len(record.seq)))
    return chr_lengths
def find_multiqc_report(self):
    """
    Returns the path to the MultiQC report in the date directory, or None if there is none.

    `<date-dir>/<BcbioProject.multiqc_report_name>` is preferred over
    `<date-dir>/multiqc_postproc/multiqc_report.html`.
    """
    candidates = (
        join(self.date_dir, BcbioProject.multiqc_report_name),
        join(self.date_dir, 'multiqc_postproc', 'multiqc_report.html'),
    )
    return next((report for report in candidates if verify_file(report, silent=True)), None)
def canon_transcript_per_gene(genome, only_principal=False, use_gene_id=False):
    """
    Returns a dict of lists: all most confident transcripts per gene according to APPRIS:
    first one in list is PRINCIPAL, the rest are ALTERNATIVE

    If only_principal=True, returns a dict of str, which just one transcript per gene (PRINCIPAL)

    :param genome: genome build name; cut at the first '-', GRCh37*/GRCh38* are read as hg19/hg38
    :param only_principal: return only the PRINCIPAL transcript per gene
    :param use_gene_id: key the result by the gene ID column instead of the gene name column
    """
    build = genome.split('-')[0]
    if build.startswith('GRCh37'):
        build = 'hg19'
    if build.startswith('GRCh38'):
        build = 'hg38'
    check_genome(build)

    appris_fpath = _get_ensembl_file('appris_data.principal.txt', build)
    appris_fpath = verify_file(appris_fpath, is_critical=True, description='APPRIS file path')

    principal = dict()
    alternatives = defaultdict(list)

    with open(appris_fpath) as appris:
        for row in appris:
            # Exactly five tab-separated columns are expected in every row
            gene, geneid, enst, ccds, label = row.strip().split('\t')
            key = geneid if use_gene_id else gene
            if 'PRINCIPAL' in label:
                principal[key] = enst
            elif not only_principal and 'ALTERNATIVE' in label:
                alternatives[key].append(enst)

    if only_principal:
        return principal
    return {g: [t] + alternatives[g] for g, t in principal.items()}
def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
    """
    Sets the sample names and locates the sample folder and files in the bcbio `final` directory.

    Dots in the name are replaced with underscores to get the folder name. After the folder is
    verified, the BAM is searched for and, depending on `self.is_rnaseq`, either the counts file
    or the variant files are set up.

    :param name: sample name as written in the bcbio YAML
    :param variantcallers_data: variant callers configuration passed on to `_set_variant_files`
    :param ensemble: passed on to `_set_variant_files`
    :param silent: do not log warnings; a missing sample folder makes the method return False
        instead of reporting it through `critical`
    :return: True, or False if the sample folder is missing and `silent` is set
    """
    self.raw_name = name
    self.name = name.replace('.', '_')

    final_dir = self.bcbio_project.final_dir
    self.dirpath = verify_dir(join(final_dir, self.name))
    if not verify_dir(self.dirpath, silent=silent):
        if silent:
            return False
        critical(f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                 f'{self.bcbio_project.final_dir}. Please check consistency between the YAML '
                 f'{self.bcbio_project.bcbio_yaml_fpath} and the directories in `final`: '
                 f'to every "description" value in YAML, there should be a corresponding folder with the '
                 f'same name in `final`. You can use `-e` option to exclude samples (comma-separated) '
                 f'from consideration, if you are sure that missing folders are expected.')

    self.var_dirpath = join(self.dirpath, BcbioProject.var_dir)
    self.bam = self.find_bam(silent=silent)

    if self.is_rnaseq:
        counts_fpath = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
        if isfile(counts_fpath) and verify_file(counts_fpath):
            self.counts_file = counts_fpath
        elif not silent:
            warn('Counts for ' + self.name + ' not found')
    elif variantcallers_data:
        self._set_variant_files(variantcallers_data, ensemble=ensemble)
    elif not silent:
        warn('No variant callers set in config, skipping finding VCF files')

    return True
def find_mutation_file(self, passed=True, caller=None):
    """
    Returns the verified path to the sample's mutations file in the varfilter folder.

    :param passed: look for the file carrying the `vf.mut_pass_suffix` suffix
    :param caller: variant caller name; defaults to the project's somatic caller
    :return: whatever `verify_file` returns for the expected path (checked silently)
    """
    if not caller:
        caller = self.bcbio_project.somatic_caller

    expected_fpath = join(self.dirpath, BcbioProject.varfilter_dir, caller + '.' + vf.mut_file_ext)
    if passed:
        expected_fpath = add_suffix(expected_fpath, vf.mut_pass_suffix)

    return verify_file(expected_fpath, silent=True)
def find_coverage_stats(self):
    """
    Returns the verified path to `<sample-dir>/qc/coverage/<sample>_coverage.bed`.

    For samples with the 'germline' phenotype, a trailing `-germline` is removed from both
    the sample name and the sample folder path before the file path is built.
    """
    sample_name, sample_dir = self.name, self.dirpath
    if self.phenotype == 'germline':
        germline_suffix = re.compile(r'-germline$')
        sample_name = germline_suffix.sub('', sample_name)
        sample_dir = germline_suffix.sub('', sample_dir)

    coverage_bed = join(sample_dir, 'qc', 'coverage', sample_name + '_coverage.bed')
    return verify_file(coverage_bed, silent=True)
def read_samples(args):
    """
    Splits the input arguments into BAM inputs and FastQ pairs.

    :param args: input file paths
    :return: tuple (fastqs_by_sample, bam_by_sample) as produced by `find_fastq_pairs` and `find_bams`
    """
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        plural = 's' if len(bam_by_sample) > 1 else ''
        info('Found ' + str(len(bam_by_sample)) + ' BAM file' + plural)

    # NOTE(review): membership is tested against the keys of bam_by_sample - confirm that
    # find_bams keys its result by adjusted path and not by sample name.
    non_bam_inputs = []
    for fpath in args:
        if adjust_path(fpath) in bam_by_sample:
            continue
        verified = verify_file(fpath)
        if verified:
            non_bam_inputs.append(verified)

    if not non_bam_inputs and not bam_by_sample:
        critical('No correct input files')

    fastqs_by_sample = dict()
    if non_bam_inputs:
        info('Input ' + str(len(non_bam_inputs)) + ' correct input non-BAM files')
        fastqs_by_sample = find_fastq_pairs(non_bam_inputs)
        if fastqs_by_sample:
            info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')

        # A sample must not be provided both as BAM and as FastQ
        in_both = set(fastqs_by_sample.keys()) & set(bam_by_sample.keys())
        if in_both:
            critical('The following samples both had input BAMs and FastQ: ' + ', '.join(list(in_both)))

    return fastqs_by_sample, bam_by_sample
def _set_name_and_paths(self, name, variantcallers_data, ensemble=False, silent=False):
    """
    Sets the sample names and locates the sample folder and files in the bcbio `final` directory.

    The folder name (also used as `self.rgid`) is the YAML name with dots replaced by underscores.
    A missing folder is reported through `critical` regardless of `silent`.

    :param name: sample name as written in the bcbio YAML
    :param variantcallers_data: variant callers configuration passed on to `_set_variant_callers`
    :param ensemble: passed on to `_set_variant_callers`
    :param silent: do not log warnings about missing files
    """
    self.raw_name = name
    self.name = name.replace('.', '_')
    self.rgid = self.name

    project = self.parent_project
    self.dirpath = verify_dir(join(project.final_dir, self.name))
    sample_dir_ok = verify_dir(self.dirpath, silent=silent)
    if not sample_dir_ok:
        critical(f'Sample "{self.name}" specified in bcbio YAML is not found in the final directory '
                 f'{self.parent_project.final_dir}. Please check consistency between the YAML '
                 f'{self.parent_project.bcbio_yaml_fpath} and the directories in `final`: '
                 f'to every "description" value in YAML, there should be a corresponding folder with the '
                 f'same name in `final`. You can use `-e` option to exclude samples (comma-separated) '
                 f'from consideration, if you are sure that missing folders are expected.')

    self.bam = self.find_bam(silent=silent)

    if not self.is_rnaseq:
        if variantcallers_data:
            self._set_variant_callers(variantcallers_data, ensemble=ensemble)
        elif not silent:
            warn('No variant callers set in config, skipping finding VCF files')
        return

    counts_fpath = adjust_path(join(self.dirpath, self.get_name_for_files() + '-ready.counts'))
    if isfile(counts_fpath) and verify_file(counts_fpath):
        self.counts_file = counts_fpath
    elif not silent:
        warn('Counts for ' + self.name + ' not found')
def main(bcbio_dir, bed, depth, threads=None, isdebug=True):
    """
    Loads a bcbio project and genotypes all of its samples at the SNP positions from `bed`.

    :param bcbio_dir: bcbio project directory to load
    :param bed: BED file with SNP positions
    :param depth: accepted for interface compatibility; it is not used by this function
    :param threads: number of threads; when the optional `az` module is importable, its system
        config provides the scheduler settings and the default thread count
    :param isdebug: passed to `log.init`
    """
    snp_file = verify_file(bed)

    log.init(isdebug)

    try:
        import az
    except ImportError:
        # No cluster configuration available: run with the requested threads only
        parallel_cfg = ParallelCfg(threads=threads)
    else:
        sys_cfg = az.init_sys_cfg()
        parallel_cfg = ParallelCfg(
            scheduler=sys_cfg.get('scheduler'),
            queue=sys_cfg.get('queue'),
            resources=sys_cfg.get('resources'),
            threads=threads or sys_cfg.get('threads'),
            tag='clearup')

    log.info('Loading bcbio project from ' + bcbio_dir)
    log.info('-' * 70)
    proj = BcbioProject()
    proj.load_from_bcbio_dir(bcbio_dir, proc_name='clearup')
    log.info('Loaded ' + proj.final_dir)
    log_dir = safe_mkdir(join(proj.log_dir, 'clearup'))
    work_dir = safe_mkdir(join(proj.work_dir, 'clearup'))
    out_dir = safe_mkdir(join(proj.date_dir, 'clearup'))
    with parallel_view(len(proj.samples), parallel_cfg, log_dir) as parall_view:
        genotype(proj.samples, snp_file, parall_view, work_dir, out_dir, proj.genome_build)
def file_nonempty_check(output_fpath=None, input_fpaths=None):
    """
    Check for `run`: passes when there is no output file to check, or when the output verifies.

    :param output_fpath: output file to verify; None means there is nothing to check
    :param input_fpaths: not used by this check
    :return: True if `output_fpath` is None, otherwise the result of `verify_file(output_fpath)`
    """
    if output_fpath is None:
        return True

    verified = verify_file(output_fpath)
    if verified:
        return verified

    err(f'Did not find non-empty output file {output_fpath}')
    return verified
def lift_over(fpath, from_genome, to_genome):
    """
    Converts the coordinates in `fpath` between genome builds with the `liftOver` tool.

    The chain file `<from>To<To>.over.chain.gz` is looked up in the `over.chain` folder next to
    this module. Unmapped records are written to `<output>.unMapped`.

    :return: path of the output file: `fpath` with the `to_genome` suffix added
    """
    chain_fname = f'{from_genome}To{to_genome.title()}.over.chain.gz'
    chain_file = join(dirname(__file__), 'over.chain', chain_fname)
    if not verify_file(chain_file):
        log.critical(f'Error: conversion from {from_genome} to {to_genome} is not supported!')

    out_fpath = add_suffix(fpath, to_genome)
    liftover_cmd = f'liftOver {fpath} {chain_file} {out_fpath} {out_fpath}.unMapped'
    call_process.run(liftover_cmd)
    return out_fpath
def run(cmd, output_fpath=None, input_fpath=None, checks=None, stdout_to_outputfile=True,
        stdout_tx=True, reuse=False, env_vars=None):
    """Run the provided command, logging details and checking for errors.

    :param cmd: the command; the output handling below appends to it, so it is expected
        to be a string whenever `output_fpath` is given with `stdout_tx`
    :param output_fpath: expected output file. An existing file is removed before the run.
    :param input_fpath: input path(s), passed on to `_do_run` as is
    :param checks: checks passed on to `_do_run`; `[file_nonempty_check]` by default
    :param stdout_to_outputfile: redirect stdout of the command into the output file; otherwise
        occurrences of `output_fpath` inside the command are replaced by the transactional path
    :param stdout_tx: write the output through `file_transaction`
    :param reuse: skip the run if `output_fpath` (or `output_fpath + '.gz'`) already verifies
    :param env_vars: passed to `_get_env` to build the environment of the command
    :return: `output_fpath` when an existing output is reused, otherwise None
    """
    if output_fpath and reuse:
        if verify_file(output_fpath, silent=True):
            info(output_fpath + ' exists, reusing')
            return output_fpath
        if not output_fpath.endswith('.gz') and verify_file(output_fpath + '.gz', silent=True):
            info(output_fpath + '.gz exists, reusing')
            return output_fpath

    env = _get_env(env_vars)

    if checks is None:
        checks = [file_nonempty_check]

    def _try_run(_cmd, _output_fpath, _input_fpath):
        info(' '.join(str(x) for x in _cmd) if not isinstance(_cmd, str) else _cmd)
        _do_run(_cmd, checks, env, _output_fpath, _input_fpath)

    if output_fpath and isfile(output_fpath):
        os.remove(output_fpath)

    if not output_fpath:
        _try_run(cmd, None, input_fpath)
    elif not stdout_tx:
        _try_run(cmd, output_fpath, input_fpath)
    else:
        with file_transaction(None, output_fpath) as tx_out_file:
            if stdout_to_outputfile:
                cmd += ' > ' + tx_out_file
            else:
                # The trailing newline lets the patterns below also match an output path
                # standing at the very end of the command; all newlines are removed afterwards.
                cmd += '\n'
                # Replace the output path - bare, double- or single-quoted - with the transactional
                # one, keeping BOTH quotes so that the resulting shell command stays balanced.
                cmd = cmd.replace(' ' + output_fpath + ' ', ' ' + tx_out_file + ' ') \
                         .replace(' "' + output_fpath + '" ', ' "' + tx_out_file + '" ') \
                         .replace(' \'' + output_fpath + '\' ', ' \'' + tx_out_file + '\' ') \
                         .replace(' ' + output_fpath + '\n', ' ' + tx_out_file) \
                         .replace(' "' + output_fpath + '"\n', ' "' + tx_out_file + '"') \
                         .replace(' \'' + output_fpath + '\'\n', ' \'' + tx_out_file + '\'') \
                         .replace('\n', '')
            _try_run(cmd, tx_out_file, input_fpath)
def main(input_bed, output_file, output_features=False, genome=None,
         only_canonical=False, short=False, extended=False, high_confidence=False,
         ambiguities_method=False, coding_only=False, collapse_exons=False,
         work_dir=False, is_debug=False):
    """ Annotating BED file based on reference features annotations.

    :param input_bed: BED file to annotate
    :param output_file: path for the annotated BED file
    :param genome: genome build name (required)
    :param short: mutually exclusive with `extended` and `output_features`
    :param extended: forced to True when `output_features` is set
    :param work_dir: parent folder for the work directory (a subfolder named after the input
        is created in it); when not given, a temporary directory is created and removed at the end
    :param is_debug: passed to the logger and to `annotate`

    The remaining options are passed to `annotate` unchanged.
    """
    logger.init(is_debug_=is_debug)

    # `param` of click.BadParameter expects a click Parameter object; plain names go to `param_hint`
    if not genome:
        raise click.BadParameter('Error: please, specify genome build name with -g (e.g. `-g hg19`)',
                                 param_hint='genome')

    if short:
        if extended:
            raise click.BadParameter('--short and --extended can\'t be set both', param_hint='extended')
        if output_features:
            raise click.BadParameter('--short and --output-features can\'t be set both',
                                     param_hint='output_features')
    elif output_features or extended:
        extended = True
        short = False

    if not verify_file(input_bed):
        raise click.BadParameter(f'Usage: {__file__} Input_BED_file -g hg19 -o Annotated_BED_file [options]',
                                 param_hint='input_bed')
    input_bed = verify_file(input_bed, is_critical=True, description=f'Input BED file for {__file__}')

    # Remember who owns the work directory: only a temporary one may be removed at the end
    is_temp_work_dir = not work_dir
    if work_dir:
        work_dir = join(adjust_path(work_dir), os.path.splitext(basename(input_bed))[0])
        safe_mkdir(work_dir)
        info(f'Created work directory {work_dir}')
    else:
        work_dir = mkdtemp('bed_annotate')
        debug(f'Created temporary work directory {work_dir}')

    input_bed = clean_bed(input_bed, work_dir)
    input_bed = verify_bed(input_bed, is_critical=True,
                           description=f'Input BED file for {__file__} after cleaning')

    output_file = adjust_path(output_file)

    output_file = annotate(
        input_bed, output_file, work_dir, genome=genome,
        only_canonical=only_canonical, short=short, extended=extended,
        high_confidence=high_confidence, collapse_exons=collapse_exons,
        output_features=output_features,
        ambiguities_method=ambiguities_method, coding_only=coding_only,
        is_debug=is_debug)

    if is_temp_work_dir:
        # Never delete the result: keep the directory if the output ended up inside it
        work_dir_prefix = os.path.abspath(work_dir) + os.sep
        output_in_work_dir = bool(output_file) and os.path.abspath(output_file).startswith(work_dir_prefix)
        if not output_in_work_dir:
            debug(f'Removing work directory {work_dir}')
            shutil.rmtree(work_dir)

    info(f'Done, saved to {output_file}')
def _check_dir_not_empty(dirpath, description=None):
    """
    Asserts that `dirpath` is a valid directory holding at least one non-hidden entry,
    and that every entry resolving to a regular file passes a critical `verify_file`.
    """
    assert verify_dir(dirpath, description=description), dirpath

    visible = []
    for fname in os.listdir(dirpath):
        if fname.startswith('.'):
            continue  # hidden entries do not count
        visible.append(join(dirpath, fname))
    assert len(visible) >= 1, dirpath + ': ' + str(visible)

    for entry in visible:
        resolved = realpath(entry)
        if isfile(resolved):
            assert verify_file(resolved, is_critical=True), dirpath + ': ' + str(visible)
def sort_bed(input_bed_fpath, output_bed_fpath=None, work_dir=None, fai_fpath=None, chr_order=None, genome=None):
    """
    Writes a copy of the BED file with regions sorted by `Region.get_key()`.

    '#' lines are written first, in their original order, followed by the sorted regions;
    empty lines are skipped. The chromosome order comes from `chr_order` or, if it is not given,
    from the .fai index (`fai_fpath`, or the one found for `genome`); unknown chromosomes get -1.

    :return: path to the sorted file (`output_bed_fpath`, or an intermediate `sorted` file name)
    """
    input_bed_fpath = verify_bed(input_bed_fpath, is_critical=True)
    output_bed_fpath = (
        adjust_path(output_bed_fpath) if output_bed_fpath
        else intermediate_fname(work_dir, input_bed_fpath, 'sorted'))

    debug('Sorting regions in ' + str(input_bed_fpath))
    if can_reuse(output_bed_fpath, input_bed_fpath):
        debug(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    if not chr_order:
        if fai_fpath:
            fai_fpath = verify_file(fai_fpath)
        elif genome:
            fai_fpath = verify_file(ref.get_fai(genome))
        else:
            critical('Either of chr_order, fai_fpath, or genome build name must be specified')
        chr_order = get_chrom_order(fai_fpath=fai_fpath)

    def _to_region(fields):
        # chrom, start, end come first; everything after is carried along untouched
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        return Region(chrom, start, end, fields[3:], chr_order.get(chrom, -1))

    def _to_line(region):
        fields = [region.chrom, str(region.start), str(region.end)]
        fields.extend(region.other_fields)
        return '\t'.join(fields) + '\n'

    parsed = []
    with open(input_bed_fpath) as src:
        with file_transaction(work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as dst:
                for text in src:
                    content = text.strip()
                    if content.startswith('#'):
                        dst.write(text)
                    elif content:
                        parsed.append(_to_region(content.split('\t')))

                for region in sorted(parsed, key=lambda r: r.get_key()):
                    dst.write(_to_line(region))

    debug('Sorted ' + str(len(parsed)) + ' regions, saved to ' + output_bed_fpath)
    return output_bed_fpath
def set_project_level_dirs(self, bcbio_cnf, config_dir, project_name=None, final_dir=None, date_dir=None,
                           create_dirs=False, proc_name='postproc'):
    """
    Sets the project-level paths: final, work, date and log directories, plus the
    `data_versions.txt` and `programs.txt` files of the date directory (when they verify).

    :param bcbio_cnf: parsed bcbio config, passed to `set_final_dir` and `_set_date_dir`
    :param config_dir: bcbio config directory, passed to `set_final_dir`
    :param project_name: passed to `_set_project_name`
    :param final_dir: passed to `set_final_dir`
    :param date_dir: passed to `_set_date_dir`
    :param create_dirs: create the final, work and post-processing log directories
    :param proc_name: name of the subfolder of the log directory used for post-processing logs
    """
    final = self.set_final_dir(bcbio_cnf, config_dir, final_dir)
    self.final_dir = final
    if create_dirs:
        safe_mkdir(final)

    self.project_name = self._set_project_name(final, project_name)

    # The work directory is a sibling of the final one
    work = abspath(join(final, pardir, 'work'))
    self.work_dir = work
    if create_dirs:
        safe_mkdir(work)

    datestamp_dir = self._set_date_dir(bcbio_cnf, final, date_dir, create_dir=create_dirs, silent=self.silent)
    self.date_dir = datestamp_dir
    self.log_dir = join(datestamp_dir, 'log')
    self.postproc_log_dir = join(self.log_dir, proc_name)
    if create_dirs:
        safe_mkdir(self.postproc_log_dir)

    self.versions = verify_file(join(datestamp_dir, 'data_versions.txt'), silent=True)
    self.programs = verify_file(join(datestamp_dir, 'programs.txt'), silent=True)
def _find_mutation_files(base_dir, passed=True, caller=None, is_germline=False):
    """
    Returns the existing mutation files of a caller in `base_dir`.

    Three names are checked: the plain `<caller>.<ext>` file and its `single` and `paired`
    variants; with `passed` set, the `pass` suffix is added to each of them.

    :param base_dir: directory to search in
    :param passed: look for the files carrying `vf.mut_pass_suffix`
    :param caller: variant caller name (required)
    :param is_germline: not used by this function
    :return: list of the paths that verify (checked silently)
    """
    assert caller

    plain_fpath = join(base_dir, caller + '.' + vf.mut_file_ext)
    candidates = [
        plain_fpath,
        add_suffix(plain_fpath, vf.mut_single_suffix),
        add_suffix(plain_fpath, vf.mut_paired_suffix),
    ]
    if passed:
        candidates = [add_suffix(candidate, vf.mut_pass_suffix) for candidate in candidates]

    existing = []
    for candidate in candidates:
        if verify_file(candidate, silent=True):
            existing.append(candidate)
    return existing
def get_dbsnp_multi_mafs(genome_cfg):
    """
    Returns the critically verified `dbsnp_multi_mafs` path from the genome configuration,
    or None (with a warning) when the key is absent.
    """
    cfg_key = 'dbsnp_multi_mafs'
    if cfg_key in genome_cfg:
        return verify_file(genome_cfg[cfg_key], is_critical=True,
                           description='dbSNP multi mafs file in system configuration file')

    warn('Warning: dbsnp_multi_mafs not provided in the system configuration file for the genome.')
    return None
def _get(relative_path, genome=None, is_critical=False):
    """
    Resolves a data file path relative to the directory of this module.

    :param relative_path: path template; `{genome}` placeholders are filled with the genome name,
        or with an empty string when no genome is given
    :param genome: genome build name, checked with `check_genome` when provided
    :param is_critical: verify the resulting file critically and return the verification result
    :return: absolute path to the file
    """
    if genome:
        check_genome(genome)

    filled_in = relative_path.format(genome=genome or '')
    resolved = abspath(join(dirname(__file__), filled_in))
    return verify_file(resolved, is_critical=True) if is_critical else resolved
def _read_list(reason, fpath):
    """
    Reads a tab-separated genes list into a dict: gene name -> meta information.

    The meta information is the second column for lines with exactly two columns,
    and an empty string for any other number of columns.

    :param reason: used in the description of the file for error reporting
    :param fpath: path to the list (verified critically)
    """
    fpath = verify_file(fpath, description=reason + ' blacklist genes file', is_critical=True)

    meta_by_gene = {}
    for row in iter_lines(fpath):
        cols = row.split('\t')
        meta_by_gene[cols[0]] = cols[1] if len(cols) == 2 else ''
    return meta_by_gene
def _check_dir_not_empty(dirpath, description=None):
    """
    Sanity check for a directory: it must verify, contain something besides dot-files,
    and all of its entries that resolve to regular files must pass a critical `verify_file`.

    Failures surface as AssertionError carrying the directory path (and its contents).
    """
    assert verify_dir(dirpath, description=description), dirpath

    contents = [join(dirpath, fname) for fname in os.listdir(dirpath) if not fname.startswith('.')]
    assert len(contents) >= 1, dirpath + ': ' + str(contents)

    # Symlinks are followed before both the type test and the verification
    resolved_files = (realpath(fpath) for fpath in contents if isfile(realpath(fpath)))
    assert all(verify_file(fpath, is_critical=True) for fpath in resolved_files), \
        dirpath + ': ' + str(contents)
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    """
    Converts a BAM file into a gzipped BED with `bedtools bamtobed`, written next to the BAM
    as `<bam-name-without-extension>.bed.gz`.

    :param bam_fpath: path to the BAM file
    :param bedtools: bedtools executable
    :param gzip: gzip executable
    :return: the verified path to the BED file (falsy if it is missing or empty)
    """
    info('Converting the BAM to BED to save some memory.')
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = f'{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'
    info(cmdline)
    os.system(cmdline)

    bam_bed_fpath = verify_file(bam_bed_fpath)
    if not bam_bed_fpath:
        err('Error, result is non-existent or empty')
    else:
        info('Done, saved to ' + bam_bed_fpath)
    return bam_bed_fpath
def find_somatic_vcf(self, silent=False, caller=None):
    """
    Looks for the somatic VCF of this batch and, when found, stores its path in `self.somatic_vcf`.

    Candidates are tried in this order; the first existing one wins:
      1. `<tumor-dir>/<tumor-name>-<caller>.vcf.gz` (conventional bcbio starting from 1.1.6)
      2. `<date-dir>/<batch>-<caller>-annotated.vcf.gz` (bcbio < v1.1.6)
      3. `<date-dir>/<batch>-<caller>.vcf.gz` (CWL bcbio)

    :param silent: do not log info and warning messages
    :param caller: somatic variant caller name; falls back to `self.somatic_caller`
    :return: None; `self.somatic_vcf` is assigned only when one of the files exists
    """
    caller = caller or self.somatic_caller
    if not caller:
        if not silent:
            warn(f'Batch {self.name} has no variant caller info assigned, skipping finding somatic VCF')
        return

    # in datestamp. cwl-bcbio writes there
    vcf_cwl_fpath_gz = adjust_path(join(self.parent_project.date_dir, self.name + '-' + caller + '.vcf.gz'))
    # in datestamp. bcbio before 1.1.6
    vcf_old_fpath_gz = adjust_path(join(self.parent_project.date_dir, self.name + '-' + caller + '-annotated.vcf.gz'))
    # in sample dir. starting from bcbio 1.1.6, ~ Dec 2019
    vcf_fpath_gz = adjust_path(join(self.tumors[0].dirpath, self.tumors[0].name + '-' + caller + '.vcf.gz'))

    if isfile(vcf_fpath_gz):
        verify_file(vcf_fpath_gz, is_critical=True)
        if not silent:
            info(f'Found somatic VCF in <final-dir>/<tumor-name>/<tumor-name>-{caller}.vcf.gz '
                 f'(conventional bcbio): ' + vcf_fpath_gz)
        self.somatic_vcf = vcf_fpath_gz

    elif isfile(vcf_old_fpath_gz):
        verify_file(vcf_old_fpath_gz, is_critical=True)
        if not silent:
            info(f'Found somatic VCF in <date-dir>/<batch>-{caller}-annotated.vcf.gz '
                 f'(bcbio < v1.1.6): ' + vcf_old_fpath_gz)
        self.somatic_vcf = vcf_old_fpath_gz

    elif isfile(vcf_cwl_fpath_gz):
        verify_file(vcf_cwl_fpath_gz, is_critical=True)
        if not silent:
            info(f'Found somatic VCF in project/<batch>-{caller}.vcf.gz (CWL bcbio): ' + vcf_cwl_fpath_gz)
        self.somatic_vcf = vcf_cwl_fpath_gz

    elif not silent:
        warn(f'Could not find somatic variants files for batch {self.name}, caller {caller} neither as '
             f'{self.parent_project.final_dir}/<tumor-name>/<tumor-name>-{caller}.vcf.gz (conventional bcbio), nor as '
             f'{self.parent_project.date_dir}/<batch>-{caller}-annotated.vcf.gz (bcbio < v1.1.6), nor as '
             f'project/<batch>-{caller}.vcf.gz (CWL bcbio).')
def merge_overlaps(work_dir, bed_fpath, distance=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.

    :param work_dir: directory for the intermediate `merged` file and the transaction
    :param bed_fpath: BED file to merge
    :param distance: when truthy, passed to the merge as the `d` option
    :return: path to the merged BED file; an existing one that verifies against the input is reused
    """
    merged_fpath = intermediate_fname(work_dir, bed_fpath, 'merged')
    if isfile(merged_fpath) and verify_file(merged_fpath, cmp_f=bed_fpath):
        return merged_fpath

    merge_options = {'d': distance} if distance else {}
    with file_transaction(work_dir, merged_fpath) as tx:
        BedTool(bed_fpath).merge(**merge_options).saveas(tx)
    return merged_fpath
def set_project_level_dirs(self, bcbio_cnf, config_dir, project_name=None, final_dir=None, date_dir=None,
                           create_dirs=False, proc_name='postproc'):
    """
    Sets the project-level paths on the project object.

    Assigned attributes: `final_dir`, `project_name`, `work_dir` (sibling of the final directory),
    `date_dir`, `log_dir`, `postproc_log_dir`, `var_dir`, `raw_var_dir`, `expression_dir`,
    and the `versions` / `programs` files of the date directory (checked silently).

    :param bcbio_cnf: parsed bcbio config, passed to `set_final_dir` and `_set_date_dir`
    :param config_dir: bcbio config directory, passed to `set_final_dir`
    :param project_name: passed to `_set_project_name`
    :param final_dir: passed to `set_final_dir`
    :param date_dir: passed to `_set_date_dir`
    :param create_dirs: create the final, work and post-processing log directories
    :param proc_name: name of the subfolder of the log directory used for post-processing logs
    """
    def _ensure(dirpath):
        # Directories are only created on request
        if create_dirs:
            safe_mkdir(dirpath)

    self.final_dir = self.set_final_dir(bcbio_cnf, config_dir, final_dir)
    _ensure(self.final_dir)

    self.project_name = self._set_project_name(self.final_dir, project_name)

    self.work_dir = abspath(join(self.final_dir, pardir, 'work'))
    _ensure(self.work_dir)

    self.date_dir = self._set_date_dir(bcbio_cnf, self.final_dir, date_dir,
                                       create_dir=create_dirs, silent=self.silent)
    self.log_dir = join(self.date_dir, 'log')
    self.postproc_log_dir = join(self.log_dir, proc_name)
    _ensure(self.postproc_log_dir)

    self.var_dir = join(self.date_dir, BcbioProject.var_dir)
    self.raw_var_dir = join(self.var_dir, 'raw')
    self.expression_dir = join(self.date_dir, BcbioProject.expression_dir)

    for attr_name, fname in (('versions', 'data_versions.txt'), ('programs', 'programs.txt')):
        setattr(self, attr_name, verify_file(join(self.date_dir, fname), silent=True))
def main(paths, output_dir, genome, depth):
    """
    Builds a SNPs panel from BED files and bcbio projects.

    :param paths: input paths; files are taken as BED files, directories are loaded as bcbio projects
    :param output_dir: output directory (created if needed)
    :param genome: genome build name passed to `build_snps_panel`
    :param depth: not used by this function
    """
    log.init(True)

    bed_files = [verify_file(path, is_critical=True) for path in paths if isfile(path)]
    project_dirs = [verify_dir(path, is_critical=True) for path in paths if isdir(path)]

    bcbio_projs = []
    for project_dir in project_dirs:
        proj = BcbioProject()
        proj.load_from_bcbio_dir(project_dir, proc_name='clearup')
        bcbio_projs.append(proj)

    build_snps_panel(bcbio_projs, bed_files, safe_mkdir(output_dir), genome)
def merge_overlaps(work_dir, bed_fpath, distance=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.

    The result goes to an intermediate `merged` file in `work_dir`. If that file already exists
    and verifies against the input, it is returned without re-running the merge.
    A truthy `distance` is passed to the merge as the `d` option.
    """
    output_fpath = intermediate_fname(work_dir, bed_fpath, 'merged')
    up_to_date = isfile(output_fpath) and verify_file(output_fpath, cmp_f=bed_fpath)
    if not up_to_date:
        with file_transaction(work_dir, output_fpath) as tx:
            # Imported lazily, only when a merge is actually performed
            import pybedtools
            merge_kwargs = dict()
            if distance:
                merge_kwargs['d'] = distance
            merged = pybedtools.BedTool(bed_fpath).merge(**merge_kwargs)
            merged.saveas(tx)
    return output_fpath
def verify_bed(bed, description='', is_critical=False, silent=False):
    """Check that ``bed`` is a usable BED input.

    A ``BedTool`` instance is returned untouched. Anything else is treated as
    a path: it is normalised with ``adjust_path``, verified with
    ``verify_file`` and then format-checked with ``BedFile.checkformat``.

    :param bed: ``BedTool`` object or a path to a BED file
    :param description: forwarded to ``verify_file``
    :param is_critical: forwarded to ``verify_file``; also selects ``critical``
        instead of ``err`` for reporting a format error
    :param silent: forwarded to ``verify_file``
    :return: the ``BedTool``, the adjusted path, or ``None`` on failure
    """
    if isinstance(bed, BedTool):
        return bed

    fpath = adjust_path(bed)
    file_ok = verify_file(fpath, description, is_critical=is_critical, silent=silent)
    if not file_ok:
        return None

    format_error = BedFile(fpath).checkformat()
    if not format_error:
        return fpath

    report = critical if is_critical else err
    report('Error: incorrect bed file format (' + fpath + '): ' + str(format_error) + '\n')
    return None
def parse_mut_tp53(mut_fpath):
    """Collect protein changes from column 19 of a tab-separated file.

    Each non-empty line is split on tabs. When the field at index 19 is
    present, non-empty and contains ``'p.'``, it is added to the result with
    every ``'p.'`` occurrence removed.

    Rows with fewer than 20 fields are skipped. Because each line is stripped
    before splitting, a row whose trailing columns are empty ends up short;
    previously such rows raised ``IndexError`` instead of being ignored like
    any other row with an empty protein column.

    :param mut_fpath: path to the tab-separated mutations file
    :return: set of protein-change strings; empty when ``verify_file``
        rejects ``mut_fpath``
    """
    prot_col = 19  # 0-based index of the protein-change column

    mut_tp53 = set()
    if not verify_file(mut_fpath):
        return mut_tp53

    with open(mut_fpath) as f:
        for l in f:
            l = l.strip()
            if not l:
                continue
            fields = l.split('\t')
            if len(fields) <= prot_col:
                # Short row: the protein column is missing or was an empty
                # trailing column removed by strip().
                continue
            prot_change = fields[prot_col]
            if not prot_change or 'p.' not in prot_change:
                continue
            mut_tp53.add(prot_change.replace('p.', ''))
    return mut_tp53
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    """Convert a BAM file to a gzipped BED file next to it.

    Runs ``<bedtools> bamtobed -i <bam> | <gzip> > <output>`` through the
    shell, where the output path is the BAM path with its extension replaced
    by ``.bed.gz``.

    :param bam_fpath: input BAM path
    :param bedtools: bedtools executable to invoke
    :param gzip: gzip executable to invoke
    :return: the result of ``verify_file`` on the output path (falsy when the
        output did not pass verification)
    """
    info('Converting the BAM to BED to save some memory.')
    # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'

    # NOTE(review): the paths are interpolated into the shell command
    # unquoted, so paths containing spaces or shell metacharacters will not
    # work as intended.
    cmdline = '{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'.format(
        bedtools=bedtools, bam_fpath=bam_fpath, gzip=gzip, bam_bed_fpath=bam_bed_fpath)
    info(cmdline)
    os.system(cmdline)

    # The exit status is not inspected; success is judged by the output file.
    bam_bed_fpath = verify_file(bam_bed_fpath)
    if not bam_bed_fpath:
        err('Error, result is non-existent or empty')
    else:
        info('Done, saved to ' + bam_bed_fpath)
    return bam_bed_fpath
def verify_bam(fpath, description='', is_critical=False, silent=False):
    """Check that ``fpath`` passes ``verify_file`` and carries a ``.bam`` extension.

    :param fpath: path to the presumed BAM file
    :param description: forwarded to ``verify_file``
    :param is_critical: forwarded to ``verify_file``; also selects ``critical``
        instead of ``err`` for reporting a wrong extension
    :param silent: forwarded to ``verify_file``
    :return: the path normalised by ``adjust_path``, or ``None`` on failure
    """
    file_ok = verify_file(fpath, description, is_critical=is_critical, silent=silent)
    if not file_ok:
        return None

    fpath = adjust_path(fpath)
    if fpath.endswith('.bam'):
        # TODO: check if binary
        return fpath

    report = critical if is_critical else err
    report('The file ' + fpath + ' is supposed to be BAM but does not have the .bam '
           'extension. Please, make sure you pass proper file.')
    return None
def verify_bam(fpath, description='', is_critical=False, silent=False):
    """Validate a BAM path: the file must pass ``verify_file`` and end in ``.bam``.

    :param fpath: path to the presumed BAM file
    :param description: forwarded to ``verify_file``
    :param is_critical: forwarded to ``verify_file``; when true a wrong
        extension is reported through ``critical`` rather than ``err``
    :param silent: forwarded to ``verify_file``
    :return: the ``adjust_path``-normalised path, or ``None`` when either
        check fails
    """
    if not verify_file(fpath, description, is_critical=is_critical, silent=silent):
        return None

    bam_fpath = adjust_path(fpath)
    has_bam_ext = bam_fpath.endswith('.bam')
    if not has_bam_ext:
        report = critical if is_critical else err
        report('The file ' + bam_fpath + ' is supposed to be BAM but does not have the .bam '
               'extension. Please, make sure you pass proper file.')
        return None

    # TODO: check if binary
    return bam_fpath
def verify_bed(bed, description='', is_critical=False, silent=False):
    """Validate a BED input given either as a ``pybedtools.BedTool`` or a path.

    ``pybedtools`` is imported inside the function. A ``BedTool`` instance is
    returned as is; a path is normalised with ``adjust_path``, verified with
    ``verify_file`` and format-checked with ``BedFile.checkformat``.

    :param bed: ``pybedtools.BedTool`` object or a path to a BED file
    :param description: forwarded to ``verify_file``
    :param is_critical: forwarded to ``verify_file``; also selects ``critical``
        instead of ``err`` for reporting a format error
    :param silent: forwarded to ``verify_file``
    :return: the ``BedTool``, the adjusted path, or ``None`` on failure
    """
    import pybedtools

    if isinstance(bed, pybedtools.BedTool):
        return bed

    fpath = adjust_path(bed)
    if not verify_file(fpath, description, is_critical=is_critical, silent=silent):
        return None

    format_error = BedFile(fpath).checkformat()
    if not format_error:
        return fpath

    report = critical if is_critical else err
    report('Error: incorrect bed file format (' + fpath + '): ' + str(format_error) + '\n')
    return None
def detect_run_info_in_config_dir(config_dir):
    """Locate the single ``run_info*.yaml`` file inside ``config_dir``.

    :param config_dir: directory to list
    :return: the verified path of the run-info YAML, or ``None`` when the
        directory contains no matching file
    """
    def _is_run_info(fname):
        return fname.startswith('run_info') and fname.endswith('.yaml')

    candidates = [
        abspath(join(config_dir, fname))
        for fname in os.listdir(config_dir)
        if _is_run_info(fname)
    ]

    # More than one match is ambiguous and is reported through critical().
    if len(candidates) > 1:
        critical(
            'More than one YAML file containing run_info in name found in the config '
            'directory ' + config_dir + ': ' + ' '.join(candidates))
    if not candidates:
        return None

    run_cnf = verify_file(candidates[0], is_critical=True)
    info('Using run configuration from the config directory ' + run_cnf)
    return run_cnf
def read_samples(args):
    """Split the input paths into BAM inputs and FastQ pairs, grouped by sample.

    BAMs are discovered with ``find_bams``; the remaining inputs that pass
    ``verify_file`` are paired with ``find_fastq_pairs``. Having no usable
    input at all, or a sample present in both groups, is reported through
    ``critical``.

    :param args: input paths
    :return: tuple ``(fastqs_by_sample, bam_by_sample)``
    """
    bam_by_sample = find_bams(args)
    if bam_by_sample:
        bam_count = len(bam_by_sample)
        info('Found ' + str(bam_count) + ' BAM file' + ('s' if bam_count > 1 else ''))

    # NOTE(review): membership is tested against the dict itself, i.e. its
    # keys. The keys look like sample names (see the intersection below), so
    # this filter may never exclude a BAM path - confirm against find_bams.
    verified = [verify_file(fpath) for fpath in args
                if adjust_path(fpath) not in bam_by_sample]
    input_not_bam = [fpath for fpath in verified if fpath]

    fastqs_by_sample = dict()
    if not (input_not_bam or bam_by_sample):
        critical('No correct input files')

    if input_not_bam:
        info('Input ' + str(len(input_not_bam)) + ' correct input non-BAM files')
        fastqs_by_sample = find_fastq_pairs(input_not_bam)
        if fastqs_by_sample:
            info('Found ' + str(len(fastqs_by_sample)) + ' FastQ pairs')

        # A sample must not be supplied both as BAM and as FastQ.
        intersection = set(fastqs_by_sample.keys()) & set(bam_by_sample.keys())
        if intersection:
            critical('The following samples both had input BAMs and FastQ: ' + ', '.join(list(intersection)))

    return fastqs_by_sample, bam_by_sample
def find_cnvkit_filt_file(self):
    """Look up the ``filt``-suffixed CNVkit file in the CNV directory of ``date_dir``.

    :return: the result of a silent ``verify_file`` on that path
    """
    filt_fname = add_suffix(BcbioProject.cnvkit_fname, 'filt')
    filt_fpath = join(self.date_dir, BcbioProject.cnv_dir, filt_fname)
    return verify_file(filt_fpath, silent=True)
def find_ngs_report(self, silent=False):
    """Look up the HTML NGS report for this sample.

    The report is first searched for as ``<name>.html`` in the reports
    directory under the project's ``date_dir``; only when that fails is the
    per-sample report directory under ``dirpath`` checked.

    :param silent: forwarded to ``verify_file``
    :return: the result of the first successful ``verify_file`` call,
        otherwise the (falsy) result of the second one
    """
    project_level = join(self.bcbio_project.date_dir, BcbioProject.reports_dir, self.name + '.html')
    found = verify_file(project_level, silent=silent)
    if found:
        return found

    sample_level = join(self.dirpath, BcbioProject.ngs_report_name, BcbioProject.ngs_report_name + '.html')
    return verify_file(sample_level, silent=silent)
def find_cnvkit_file(self):
    """Look up the CNVkit file in the CNV directory of ``date_dir``.

    :return: the result of a silent ``verify_file`` on that path
    """
    cnvkit_fpath = join(self.date_dir, BcbioProject.cnv_dir, BcbioProject.cnvkit_fname)
    return verify_file(cnvkit_fpath, silent=True)
def find_seq2c_coverage(self):
    """Look up ``seq2c-cov.tsv`` in the CNV directory of ``date_dir``.

    :return: the result of a silent ``verify_file`` on that path
    """
    coverage_fpath = join(self.date_dir, BcbioProject.cnv_dir, 'seq2c-cov.tsv')
    return verify_file(coverage_fpath, silent=True)
def find_vcf_file(self, batch_name, silent=False, caller=None):
    """Find the VCF for a batch in the datestamp directory tree.

    Candidate locations are probed in a fixed priority order and the first
    existing file wins:

    1. annotated, gzipped, in the datestamp dir
    2. annotated, gzipped, in datestamp/var/raw
    3. gzipped, in the datestamp dir
    4. gzipped, in datestamp/var/raw
    5. uncompressed, in the datestamp dir
    6. uncompressed, in datestamp/var/raw
    7. gzipped, in datestamp/var
    8. uncompressed, in datestamp/var

    :param batch_name: batch name used as the file name prefix
    :param silent: suppress the ``info``/``warn`` messages (``debug``
        messages are always emitted)
    :param caller: variant caller name; defaults to ``self.somatic_caller``
    :return: path of the first candidate found, or ``None``
    """
    caller = caller or self.somatic_caller
    plain_fname = batch_name + '-' + caller + '.vcf'
    annot_fname = batch_name + '-' + caller + '-annotated.vcf'

    def _at(dirpath, fname):
        return adjust_path(join(dirpath, fname))

    # (path, message when found, message when missing), in priority order.
    candidates = [
        (_at(self.date_dir, annot_fname + '.gz'),
         'Found annotated VCF in the datestamp dir ',
         'Not found annotated VCF in the datestamp dir '),
        (_at(self.raw_var_dir, annot_fname + '.gz'),
         'Found annotated VCF in the datestamp/var/raw dir ',
         'Not found annotated VCF in the datestamp/var/raw dir '),
        (_at(self.date_dir, plain_fname + '.gz'),
         'Found VCF in the datestamp dir ',
         'Not found VCF in the datestamp dir '),
        (_at(self.raw_var_dir, plain_fname + '.gz'),
         'Found VCF in the datestamp/var/raw dir ',
         'Not found VCF in the datestamp/var/raw dir '),
        (_at(self.date_dir, plain_fname),
         'Found uncompressed VCF in the datestamp dir ',
         'Not found uncompressed VCF in the datestamp dir '),
        (_at(self.raw_var_dir, plain_fname),
         'Found uncompressed VCF in the datestamp/var/raw dir ',
         'Not found uncompressed VCF in the datestamp/var/raw dir '),
        (_at(self.var_dir, plain_fname + '.gz'),
         'Found VCF in the datestamp/var dir ',
         'Not found VCF in the datestamp/var dir '),
        (_at(self.var_dir, plain_fname),
         'Found uncompressed VCF in the datestamp/var dir ',
         'Not found uncompressed VCF in the datestamp/var dir '),
    ]

    for fpath, found_msg, missing_msg in candidates:
        if not isfile(fpath):
            debug(missing_msg + fpath)
            continue
        # The file exists, so a failed verification is treated as critical.
        verify_file(fpath, is_critical=True)
        if not silent:
            info(found_msg + fpath)
        return fpath

    if not silent:
        warn('Warning: no VCF found for batch ' + batch_name + ', ' + caller + ', gzip or '
             'uncompressed version in the datestamp directory.')
    return None
def find_seq2c_file(self):
    """Look up the Seq2C results file in the CNV directory of ``date_dir``.

    ``BcbioProject.seq2c_fname`` is tried first; ``Seq2C.tsv`` is only
    checked when the first lookup fails.

    :return: the result of the first successful silent ``verify_file`` call,
        otherwise the (falsy) result of the second one
    """
    cnv_dirpath = join(self.date_dir, BcbioProject.cnv_dir)
    for fname in (BcbioProject.seq2c_fname, 'Seq2C.tsv'):
        found = verify_file(join(cnv_dirpath, fname), silent=True)
        if found:
            return found
    # Mirrors `a or b`: when both fail, the second call's result is returned.
    return found
def main():
    """Write the feature BED files for a genome build.

    Command line: ``<script> GENOME [DB]``. ``GENOME`` is required. ``DB`` is
    an optional GTF (``.gtf``/``.gtf.gz``), GFF3 (``.gff3``/``.gff3.gz``) or
    knownGene-style table; when omitted the input comes from
    ``ba.get_refseq_gene``.

    Steps:

    1. parse the input into genes (parser chosen by file extension),
    2. optionally filter by HGNC-approved names (``DO_APPROVE``) and write
       the rejected names to ``not_approved.txt``,
    3. mark genes as coding/RNA and choose canonical transcripts,
    4. write all features, canonical features (both bgzipped and
       tabix-indexed) and canonical CDS.

    Fixes relative to the previous version:

    * the final message no longer references ``seq2c_output_fpath``, which
      was never assigned (the Seq2C output step is disabled) and raised
      ``NameError`` at the very end of every run;
    * the usage text now contains the script path instead of the literal
      ``' + __file__ + '``;
    * a user-supplied input path that fails ``verify_file`` is reported as
      critical instead of crashing later on a falsy path.
    """
    description = '''
The script writes all RefSeq features for requested genome build, and generates 3 files:
    all_features.{genome}.bed:
        Gene (protein_coding)
        Transcript (protein_coding and ncRNA)
        Exon (ncRNA)
        CDS (protein_coding)
    all_features.{genome}.canon.bed:
        The same, but taking canonical (or longest) transcripts only
    CDS.{genome}.bed
        CDS, canonical (or longest) transcripts only

Usage:
    ''' + __file__ + ''' hg19 [db.gtf]

And db.gtf is either of the following:
    Ensembl GTF ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz
        1 pseudogene gene 11869 14412 . + . gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene";
        1 processed_transcript transcript 11869 14409 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";
        ...

    RefSeq GTF ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/H_sapiens/GFF/ref_GRCh38.p2_top_level.gff3.gz
        NC_000001.10 RefSeq region 1 249250621 . + . ID=id0;Name=1;Dbxref=taxon:9606;chromosome=1;gbkey=Src;genome=chromosome;mol_type=genomic DNA
        NC_000001.10 BestRefSeq gene 11874 14409 . + . ID=gene0;Name=DDX11L1;Dbxref=GeneID:100287102,HGNC:37102;description=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;gbkey=Gene;gene=DDX11L1;part=1%2F1;pseudo=true
        NC_000001.10 BestRefSeq transcript 11874 14409 . + . ID=rna0;Name=NR_046018.2;Parent=gene0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2
        NC_000001.10 BestRefSeq exon 11874 12227 . + . ID=id1;Parent=rna0;Dbxref=GeneID:100287102,Genbank:NR_046018.2,HGNC:37102;gbkey=misc_RNA;gene=DDX11L1;product=DEAD%2FH %28Asp-Glu-Ala-Asp%2FHis%29 box helicase 11 like 1;transcript_id=NR_046018.2
        ...

    RefSeq_knownGene.txt or UCSC_knownGene.txt (from http://genome.ucsc.edu/cgi-bin/hgTables)
        #hg19.knownGene.name hg19.knownGene.chrom hg19.knownGene.strand hg19.knownGene.txStart hg19.knownGene.txEnd hg19.knownGene.exonCount hg19.knownGene.exonStarts hg19.knownGene.exonEnds hg19.kgXref.geneSymbol
        uc001aaa.3 chr1 + 11873 14409 3 11873,12612,13220, 12227,12721,14409, DDX11L1
        ...

See more info in http://wiki.rd.astrazeneca.net/display/NG/SOP+-+Making+the+full+list+of+UCSC+exons+with+approved+HUGO+gene+symbols'''

    options = [
        # (['--bam'], dict(dest='bam', help='path to the BAM file to analyse',)),
    ]

    parser = OptionParser(description=description)
    for opt_args, opt_kwargs in options:
        parser.add_option(*opt_args, **opt_kwargs)
    opts, args = parser.parse_args()

    if len(args) == 0:
        parser.exit(1, 'Please provide genome name as the first argument')

    genome_name = args[0]
    chrom_order = ref.get_chrom_order(genome_name)
    canonical_transcripts_ids = ref.get_canonical_transcripts_ids(genome_name)

    if len(args) > 1:
        # A bad user-supplied path must stop here; otherwise a falsy value
        # would reach open_gzipsafe()/.endswith() below.
        input_fpath = verify_file(args[1], is_critical=True)
    else:
        input_fpath = ba.get_refseq_gene(genome_name)
    output_dirpath = ba.get_refseq_dirpath()
    synonyms_fpath = ba.get_hgnc_gene_synonyms()
    not_approved_fpath = join(output_dirpath, 'not_approved.txt')

    info('Reading the features...')
    # The parser is selected by file extension; anything that is neither GTF
    # nor GFF3 is treated as a UCSC knownGene-style table.
    with open_gzipsafe(input_fpath) as inp:
        if input_fpath.endswith('.gtf') or input_fpath.endswith('.gtf.gz'):
            gene_by_name_and_chrom = _proc_ensembl_gtf(inp, output_dirpath, chrom_order)
        elif input_fpath.endswith('.gff3') or input_fpath.endswith('.gff3.gz'):
            gene_by_name_and_chrom = _proc_refseq_gff3(inp, output_dirpath, chrom_order)
        else:
            gene_by_name_and_chrom = _proc_ucsc(inp, output_dirpath, chrom_order)

    if synonyms_fpath and DO_APPROVE:
        gene_by_name_and_chrom, not_approved_gene_names = _approve(gene_by_name_and_chrom, synonyms_fpath)

        info('')
        info('Not approved by HGNC - ' + str(len(not_approved_gene_names)) + ' genes.')
        if not_approved_fpath:
            with open(not_approved_fpath, 'w') as f:
                f.write('#Searched as\tStatus\n')
                f.writelines((l + '\n' for l in not_approved_gene_names))
            info('Saved not approved to ' + not_approved_fpath)

    info('Found:')
    info(' ' + str(len(gene_by_name_and_chrom)) + ' genes')

    genes = gene_by_name_and_chrom.values()

    # A gene with at least one coding transcript counts as coding; a gene
    # with no coding transcripts counts as RNA.
    coding_genes = [g for g in genes if any(t.coding for t in g.transcripts)]
    coding_transcripts = [t for g in coding_genes for t in g.transcripts if t.coding]
    rna_genes = [g for g in genes if all(not t.coding for t in g.transcripts)]
    rna_transcripts = [t for g in genes for t in g.transcripts if not t.coding]
    mixed_genes = [g for g in genes
                   if any(not t.coding for t in g.transcripts) and any(t.coding for t in g.transcripts)]

    info(' ' + str(len(coding_genes)) + ' coding genes')
    info(' ' + str(len(coding_transcripts)) + ' coding transcripts')
    info(' ' + str(len(rna_genes)) + ' RNA genes')
    info(' ' + str(len(rna_transcripts)) + ' RNA transcripts')
    info(' ' + str(len(mixed_genes)) + ' genes with both coding and RNA transcripts')

    for g in coding_genes:
        g.coding = True
        g.biotype = 'protein_coding'
    for g in rna_genes:
        g.coding = False
        g.biotype = 'RNA'

    info()
    # info('Choosing genes with exons...')
    # genes = [g for g in genes if any(tx.exons for tx in g.transcripts)]

    info('Choosing canonical...')
    canon_genes = choose_canonical(genes, canonical_transcripts_ids)

    info()
    info('Sorting and printing all regions...')
    all_features_fpath = ba.get_all_features(genome_name)
    write_all_features(genes, all_features_fpath, canon_only=False)
    all_features_fpath = bgzip_and_tabix(all_features_fpath, tabix_parameters='-p bed')

    info()
    info('Sorting and printing canonical regions...')
    canon_output_fpath = ba.get_all_features_canonical(genome_name, gzip=False)
    write_all_features(canon_genes, canon_output_fpath, canon_only=True)
    canon_output_fpath = bgzip_and_tabix(canon_output_fpath, tabix_parameters='-p bed')

    info()
    info('Sorting and printing canonical CDS...')
    cds_output_fpath = ba.get_cds(genome_name)
    write_all_features(canon_genes, cds_output_fpath, canon_only=True, cds_only=True)

    # The Seq2C CDS output is disabled, so no Seq2C path exists to report:
    # info()
    # info('Sorting and printing CDS for Seq2C (unique transcript per gene)...')
    # seq2c_output_fpath = ga.get_seq2c_cds(genome_name)
    # write_all_features(canon_genes, seq2c_output_fpath, canon_only=True, cds_only=True, seq2c_cds=True)

    info()
    info('Saved all regions to\n ' +
         all_features_fpath + '\n ' +
         canon_output_fpath + '\n ' +
         cds_output_fpath)
def find_vcf_file_from_sample_dir(sample, silent=False, caller=None):
    """Find the VCF for a sample inside its own directory tree.

    Candidate locations are probed in a fixed priority order and the first
    existing file wins:

    1. gzipped, in the sample dir
    2. gzipped, in ``var/``
    3. gzipped, in ``var/raw/``
    4. uncompressed, in the sample dir
    5. uncompressed, in ``var/``
    6. uncompressed, in ``var/raw/``

    :param sample: sample object; ``dirpath``, ``name``, ``phenotype``,
        ``bcbio_project`` and ``get_name_for_files()`` are used
    :param silent: suppress the ``info``/``warn`` messages (``debug``
        messages are always emitted)
    :param caller: variant caller name; defaults to the project's
        ``somatic_caller``
    :return: path of the first candidate found, or ``None``
    """
    caller = caller or sample.bcbio_project.somatic_caller
    vcf_fname = sample.get_name_for_files() + '-' + caller + '.vcf'
    gz_fname = vcf_fname + '.gz'
    var_dirpath = join(sample.dirpath, 'var')
    raw_dirpath = join(var_dirpath, 'raw')

    # (path, message when found, message when missing), in priority order.
    candidates = [
        (adjust_path(join(sample.dirpath, gz_fname)),
         'Found VCF ',
         'Not found VCF '),
        (adjust_path(join(var_dirpath, gz_fname)),
         'Found VCF in the var/ dir ',
         'Not found VCF in the var/ dir '),
        (adjust_path(join(raw_dirpath, gz_fname)),
         'Found VCF in the var/raw/ dir ',
         'Not found VCF in the var/raw/ dir '),
        (adjust_path(join(sample.dirpath, vcf_fname)),
         'Found uncompressed VCF ',
         'Not found uncompressed VCF '),
        (adjust_path(join(var_dirpath, vcf_fname)),
         'Found uncompressed VCF in the var/ dir ',
         'Not found VCF in the var/ dir '),
        (adjust_path(join(raw_dirpath, vcf_fname)),
         'Found uncompressed VCF in the var/raw/ dir ',
         'Not found VCF in the var/raw/ dir '),
    ]

    for fpath, found_msg, missing_msg in candidates:
        if not isfile(fpath):
            debug(missing_msg + fpath)
            continue
        # The file exists, so a failed verification is treated as critical.
        verify_file(fpath, is_critical=True)
        if not silent:
            info(found_msg + fpath)
        return fpath

    if not silent:
        warn('Warning: no VCF found for ' + sample.name + ' (' + caller + '), gzip or uncompressed version in and outside '
             'the var directory. Phenotype is ' + str(sample.phenotype))
    return None