def iter_vcf__pysam(input_file, proc_rec=None, proc_hdr=None, output_file=None):
    import pysam
    import sys

    vcf = pysam.VariantFile(input_file)

    if output_file:
        out_ungz, out_gz = get_ungz_gz(output_file)
        w = open(out_ungz, 'w')
    else:
        w = sys.stdout

    # Header
    if proc_hdr is not None:
        proc_hdr(vcf)
    w.write(str(vcf.header))

    # Records
    for rec in vcf:
        if proc_rec:
            rec_res = proc_rec(rec)
            if rec_res is not None:
                w.write(str(rec_res))

    vcf.close()
    if output_file:
        w.close()
        run_simple(f'bgzip -f {out_ungz} && tabix -f -p vcf {out_gz}')
def iter_vcf(input_file, output_file, proc_rec, proc_hdr=None, postproc_hdr=None, **kwargs):
    """
    :param input_file: path to input VCF file
    :param output_file: path to output VCF file (can be .vcf or .vcf.gz, but it will
        always bgzip/tabix and write with a .vcf.gz extension)
    :param proc_rec: a function to process a single cyvcf2 Record object. Returns either
        a (new) Record object to write, or None to indicate that the record should be discarded
    :param proc_hdr: a function to process the cyvcf2 object once (i.e. to add values to
        the header with vcf.add_info_to_header, etc)
    :param postproc_hdr: a function to postprocess the finalized header string (vcf.raw_header),
        e.g. in order to remove values
    :param kwargs: any parameters to pass directly into proc_rec
    """
    import sys
    from cyvcf2 import VCF

    vcf = VCF(input_file, gts012=True)

    if proc_hdr is not None:
        proc_hdr(vcf)

    if output_file is not None:
        out_ungz, out_gz = get_ungz_gz(output_file)
        w = open(out_ungz, 'w')
    else:
        w = sys.stdout

    header = vcf.raw_header
    if postproc_hdr is not None:
        header = postproc_hdr(header)
    w.write(header)

    for rec in vcf:
        if proc_rec:
            rec_res = proc_rec(rec, vcf, **kwargs)
            if rec_res is not None:
                w.write(f'{rec_res}')

    sys.stderr.write(f'Finished writing {output_file}\n')
    vcf.close()

    if output_file is not None:
        w.close()
        run_simple(f'bgzip -f {out_ungz} && tabix -f -p vcf {out_gz}')
        sys.stderr.write(f'Compressed {output_file}\n')
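# Illustrative usage sketch (not part of the original module): the callbacks below
# show the proc_hdr/proc_rec contract described in the docstring above. The INFO
# field "EXAMPLE_ANN" and the QUAL threshold are made up purely for the example.
def _example_iter_vcf_usage(input_vcf, output_vcf):
    def add_header_field(vcf):
        # register a new INFO field before the header is written out
        vcf.add_info_to_header({'ID': 'EXAMPLE_ANN', 'Description': 'Example annotation',
                                'Type': 'String', 'Number': '1'})

    def process_record(rec, vcf):
        # drop low-quality records, annotate and keep the rest
        if rec.QUAL is not None and rec.QUAL < 10:
            return None
        rec.INFO['EXAMPLE_ANN'] = 'example_value'
        return rec

    iter_vcf(input_vcf, output_vcf, proc_rec=process_record, proc_hdr=add_header_field)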
def setup_tibanna(tibanna_id=None, buckets=None):
    try:
        subprocess.check_call('tibanna --version', shell=True)
    except subprocess.CalledProcessError:
        logger.err('Error: tibanna is not installed. Please run `pip install tibanna`')
        sys.exit(1)

    if not tibanna_id:
        tibanna_id = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits)
                             for _ in range(8))
        assert not check_tibanna_id_exists(tibanna_id), 'Random tibanna ID already exists: ' + tibanna_id

    step_func_name = f'tibanna_unicorn_{tibanna_id}'
    if not check_tibanna_id_exists(tibanna_id):
        buckets_str = '' if not buckets else ('-b ' + ','.join(buckets))
        run_simple(f'tibanna deploy_unicorn -g {step_func_name} {buckets_str} --no-setenv')
    return step_func_name
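# Illustrative only: deploy (or reuse) a unicorn step function that can access the
# given S3 bucket, then pass its name to snakemake via --tibanna-sfn (see
# run_snakemake below). The bucket name here is a placeholder.
#   step_func_name = setup_tibanna(buckets=['my-output-bucket'])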
def _test_pvac(bedpe_path):
    pvac_bedpe = bedpe_path.replace('.bedpe', '.pvac.bedpe')
    pvac_tsv_path = bedpe_path.replace('.bedpe', '.pvac.tsv')
    pvac_fasta_fpath = bedpe_path.replace('.bedpe', '.pvac.fasta')
    pvac_fasta_key_fpath = bedpe_path.replace('.bedpe', '.pvac.fasta_key')

    run_simple(f'grep -v ^chr {bedpe_path} > {pvac_bedpe}')

    from lib.fasta_generator import FusionFastaGenerator
    from lib.pipeline import MHCIPipeline

    class_i_arguments = {
        'input_file': pvac_bedpe,
        'input_file_type': 'bedpe',
        'sample_name': bedpe_path.replace('.bedpe', '.pvac'),
        'alleles': 'HLA-A*02:01',
        'prediction_algorithms': 'NetMHCcons',
        'output_dir': dirname(bedpe_path),
        'epitope_lengths': 11,
    }
    if isfile(pvac_tsv_path):
        os.remove(pvac_tsv_path)
    pipeline = MHCIPipeline(**class_i_arguments)
    pipeline.convert_vcf()

    generate_fasta_params = {
        'input_file': pvac_tsv_path,
        'epitope_length': 11,
        'output_file': pvac_fasta_fpath,
        'output_key_file': pvac_fasta_key_fpath,
        'downstream_sequence_length': 1000,
    }
    fasta_generator = FusionFastaGenerator(**generate_fasta_params)
    fasta_generator.execute()

    generate_fasta_params = {
        'input_file': pvac_tsv_path,
        'epitope_length': 8,
        'output_file': pvac_fasta_fpath,
        'output_key_file': pvac_fasta_key_fpath,
        'downstream_sequence_length': 1000,
    }
    fasta_generator = FusionFastaGenerator(**generate_fasta_params)
    fasta_generator.execute()
def ungzip_if_needed(cnf, fpath, silent=False):
    if fpath.endswith('.gz'):
        fpath = fpath[:-3]
    if not file_exists(fpath) and file_exists(fpath + '.gz'):
        gz_fpath = fpath + '.gz'
        cmdline = f'gunzip -c {gz_fpath} > {fpath}'
        res = run_simple(cmdline)
        if not silent:
            info()
        if not res:
            return None
    return fpath
def requanitify_pizzly(pizzly_ref_fa, fusions_fasta, work_dir, fastq):
    """
    Returns a dict fusion-fasta-id -> {length eff_length est_counts tpm}
    """
    trx_with_fusions = join(work_dir, 'transcripts_with_fusions.fasta.gz')
    kidx = join(work_dir, 'transcripts_with_fusions.kidx')

    if not isfile(trx_with_fusions):
        run_simple(f'cat {pizzly_ref_fa} {fusions_fasta} | gzip -c > {trx_with_fusions}')

    if not isfile(kidx):
        run_simple(f'kallisto index -k31 -i {kidx} {trx_with_fusions}')

    abundance = join(work_dir, 'abundance.tsv')
    if not isfile(abundance):
        run_simple(f"kallisto quant -i {kidx} -o {work_dir} {' '.join(fastq)}")

    logger.debug(f'Reading expression from {abundance}')
    expr_by_fusion = dict()
    with open(abundance) as f:
        header = f.readline().strip().split('\t')
        for row in csv.DictReader(f, delimiter='\t', fieldnames=header):
            expr_by_fusion[row['target_id']] = row
    return expr_by_fusion
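# Illustrative only: looking up the re-quantified expression of one fused transcript
# from the dict returned above. The fusion id is a made-up key; real keys are the
# FASTA ids from the pizzly fusions file.
#   expr_by_fusion = requanitify_pizzly(ref_fa, fusions_fa, work_dir, [fq1, fq2])
#   tpm = float(expr_by_fusion['GENE1_GENE2_fusion']['tpm'])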
def main(output_dir=None, tumor_bam=None, normal_bam=None, normal_name=None, tumor_name=None,
         genome=None, input_genomes_url=None, ref_fa=None, viruses_fa=None, repeat_masker_bed=None,
         breakend_pon=None, bp_pon=None, bp_hotspots=None, min_tumor_af=None, requested_cores=None,
         unlock=False, dryrun=False, maxcoverage=None, chunksize_mil=None, jvm_heap=None,
         externalaligner=None):
    conf = {}

    output_dir = output_dir or 'gridss_results'
    output_dir = safe_mkdir(abspath(output_dir))
    log_dir = safe_mkdir(join(output_dir, 'log'))
    logger.init(log_fpath_=join(log_dir, 'gridss.log'), save_previous=True)
    if isfile(join(output_dir, 'work', 'all.done')):
        run_simple('rm ' + join(output_dir, 'work', 'all.done'))
    conf['output_dir'] = adjust_path(output_dir)

    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    if normal_bam:
        normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
            .replace('-ready', '').replace('-sorted', '')
        conf['normal_bam'] = verify_file(normal_bam, 'Normal BAM, -N option')
        conf['normal_name'] = normal_name
    conf['tumor_bam'] = verify_file(tumor_bam, 'Tumor BAM, -T option')
    conf['tumor_name'] = tumor_name

    try:
        machine_cores = len(os.sched_getaffinity(0))
    except Exception:
        machine_cores = 1
    cores = min(machine_cores, 8)
    if requested_cores:
        cores = min(cores, requested_cores)
    conf['cores'] = cores

    if maxcoverage:
        conf['maxcoverage'] = maxcoverage
    if chunksize_mil:
        conf['chunksize_mil'] = chunksize_mil
    if jvm_heap:
        conf['jvm_heap'] = jvm_heap
    if externalaligner:
        conf['externalaligner'] = externalaligner

    conf['genome'] = genome
    try:
        from reference_data import api as refdata
    except ImportError:
        pass
    else:
        # check reference_data can find the genomes dir, and error out if not
        genomes_dir = refdata.find_genomes_dir(input_genomes_url)
        if genomes_dir:
            conf['genomes_dir'] = genomes_dir

    if ref_fa:
        if externalaligner != 'minimap2' and not verify_file(ref_fa + '.bwt'):
            log.critical(f'Please, index {ref_fa} using'
                         f' bwa index {ref_fa}')
        if not verify_file(ref_fa + '.fai'):
            log.critical(f'Please, index {ref_fa} using'
                         f' samtools faidx {ref_fa}')
        conf['ref_fa'] = ref_fa

    if viruses_fa:
        if externalaligner != 'minimap2' and not verify_file(viruses_fa + '.bwt'):
            log.critical(f'Please, index {viruses_fa} using: '
                         f' bwa index {viruses_fa}')
        if not verify_file(viruses_fa + '.fai'):
            log.critical(f'Please, index {viruses_fa} using '
                         f' samtools faidx {viruses_fa}')
        dict_file = viruses_fa.replace('.fa', '.dict')
        if not verify_file(dict_file):
            log.critical(f'Please, index {viruses_fa} using: '
                         f' samtools dict {viruses_fa} -o {dict_file}')
        img_file = viruses_fa + '.img'
        if not verify_file(img_file):
            log.critical(
                f'Please, create an img file for {viruses_fa} using:\n'
                f' gatk BwaMemIndexImageCreator -I {viruses_fa} -O {img_file}'
            )
        conf['viruses_fa'] = verify_file(viruses_fa)

    if repeat_masker_bed:
        conf['repeat_masker_bed'] = repeat_masker_bed
    if breakend_pon:
        conf['breakend_pon'] = breakend_pon
    if bp_pon:
        conf['bp_pon'] = bp_pon
    if bp_hotspots:
        conf['bp_hotspots'] = bp_hotspots
    if min_tumor_af:
        conf['min_tumor_af'] = min_tumor_af

    py_path = sys.executable  # e.g. /miniconda/envs/umccrise_hmf/bin/python
    env_path = dirname(dirname(py_path))  # e.g. /miniconda/envs/umccrise_hmf
    found = glob.glob(join(env_path, 'share/gridss-*/gridss.jar'))
    if not found:
        hmf_env_path = secondary_conda_env('hmf', is_critical=False)
        if hmf_env_path:
            found = glob.glob(join(hmf_env_path, 'share/gridss-*/gridss.jar'))
            if not found:
                critical('Cannot find gridss JAR. Make sure you ran `conda install -c bioconda gridss`')
            conf['gridss_env'] = hmf_env_path
    conf['gridss_jar'] = found[0]

    run_snakemake(join(package_path(), 'gridss', 'Snakefile'), conf, cores=cores,
                  output_dir=output_dir, unlock=unlock, dryrun=dryrun)
def main(output_dir=None, normal_bam=None, tumor_bam=None, snv_vcf=None, normal_name=None,
         tumor_name=None, sample=None, genome=None, genomes_dir=None, gridss_ref_dir=None,
         ref_fa=None, threads=None, jvmheap=None):
    gridss_linx_dir = abspath(join(package_path(), '..', 'gridss-purple-linx'))
    gridss_scripts_dir = abspath(join(package_path(), '..', 'gridss/scripts'))

    normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    sample = sample or tumor_name

    output_dir = safe_mkdir(abspath(output_dir or 'gridss'))
    logger.init(log_fpath_=join(output_dir, 'gridss.log'), save_previous=True)
    output_vcf = join(output_dir, f'{sample}-gridss-purple-linx.vcf')

    assert genome == 'GRCh37', 'Only GRCh37 is currently supported for GRIDSS'
    if genomes_dir:
        refdata.find_genomes_dir(genomes_dir)
    if not gridss_ref_dir:
        gridss_ref_dir = refdata.get_ref_file(genome, 'gridss_purple_linx_dir')
    if not ref_fa:
        ref_fa = refdata.get_ref_file(genome, 'fa')

    hmf_env_path = conda_utils.secondary_conda_env('hmf')
    gridss_jar = glob.glob(join(hmf_env_path, 'share/gridss-*/gridss.jar'))[0]
    amber_jar = glob.glob(join(hmf_env_path, 'share/hmftools-amber-*/amber.jar'))[0]
    cobalt_jar = glob.glob(join(hmf_env_path, 'share/hmftools-cobalt-*/cobalt.jar'))[0]
    purple_jar = glob.glob(join(hmf_env_path, 'share/hmftools-purple-*/purple.jar'))[0]
    linx_jar = glob.glob(join(hmf_env_path, 'share/hmftools-linx-*/sv-linx.jar'))[0]

    cmd = f"""
PATH={hmf_env_path}/bin:$PATH \
THREADS={threads} \
GRIDSS_JAR={gridss_jar} \
AMBER_JAR={amber_jar} \
COBALT_JAR={cobalt_jar} \
PURPLE_JAR={purple_jar} \
LINX_JAR={linx_jar} \
bash -x {join(gridss_linx_dir, 'gridss-purple-linx.sh')} \
-n {normal_bam} \
-t {tumor_bam} \
-v {output_vcf} \
-s {sample} \
--normal_sample {normal_name} \
--tumour_sample {tumor_name} \
--snvvcf {snv_vcf} \
--ref_dir {gridss_ref_dir} \
--install_dir {gridss_scripts_dir} \
--reference {ref_fa} \
--output_dir {output_dir} \
{f"--jvmheap {jvmheap}" if jvmheap else ""}
""".strip()

    try:
        run_simple(cmd)
    except subprocess.SubprocessError:
        err('--------\n')
        err('Error running GRIDSS-PURPLE-LINX.\n')
        raise
def run_snakemake(snakefile, conf, jobs=None, output_dir=None, forcerun=None,
                  unlock=False, dryrun=False, target_rules=None, cluster=None,
                  cluster_cmd=None, log_dir=None, dag=None, report=None, restart_times=None):
    conf['total_cores'] = jobs

    #########################
    #### Setting cluster ####
    #########################

    cluster_param = ''
    cluster_log_dir = ''
    if cluster or cluster_cmd:
        assert log_dir, 'For cluster run, must also specify log_dir'
        if cluster_cmd:
            cluster_param = f' --cluster "{cluster_cmd}"'
        else:
            cluster_log_dir = safe_mkdir(join(log_dir, 'cluster'))
            cluster_param = make_cluster_cmdl(cluster_log_dir, 'umccrise')

    ##########################
    #### Preparing config ####
    ##########################

    if log_dir:
        safe_mkdir(log_dir)
        conf_f = open(join(log_dir, '.conf.yaml'), 'w')
    else:
        conf_f = tempfile.NamedTemporaryFile(mode='wt', delete=False)
    yaml.dump(conf, conf_f)
    conf_f.close()

    ###############################
    #### Building command line ####
    ###############################

    if forcerun:
        forcerun = " ".join(forcerun.split(','))

    cmd = (
        f'snakemake '
        f'{" ".join(flatten([target_rules])) if target_rules else ""} '
        f'--snakefile {snakefile} '
        f'--printshellcmds '
        f'{"--dryrun " if dryrun else ""}'
        f'{"--dag " if dag else ""}'
        f'{f"--report {report} " if report else ""}'
        f'{f"--directory {output_dir} " if output_dir else ""}'
        f'{f"-j {jobs} " if jobs else ""}'
        f'--rerun-incomplete '
        f'{f"--restart-times {restart_times} " if restart_times else ""}'
        f'{cluster_param} '
        f'--configfile {conf_f.name} '
        f'{f"--forcerun {forcerun}" if forcerun else ""}'
    )

    #################
    #### Running ####
    #################

    if unlock:
        print('* Unlocking previous run... *')
        run_simple(cmd + ' --unlock')
        print('* Now rerunning *')

    try:
        run_simple(cmd)

    except subprocess.CalledProcessError:
        logger.error('--------')
        logger.error(f'Error: snakemake returned a non-zero status. Working directory: {output_dir}')
        if cluster_log_dir:
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
            logger.error(f'Review cluster job logs in {cluster_log_dir}')
        sys.exit(1)

    except KeyboardInterrupt:
        logger.error('--------')
        logger.error(f'Interrupted. Fixing logs permissions. Working directory: {output_dir}')
        if cluster_log_dir:
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
            logger.error(f'Review cluster job logs in {cluster_log_dir}')
        sys.exit(1)

    else:
        logger.info('--------')
        if cluster_log_dir:
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
        logger.info(f'Finished. Output directory: {output_dir}')
def run_snakemake(snakefile, conf, cores=None, output_dir=None, forcerun=None,
                  unlock=False, dryrun=False, target_rules=None, debug=False,
                  log_dir=None, dag=None, report=None, restart_times=1,
                  tibanna_cfg=None, resources=None, cluster_param=None,
                  cluster_log_dir=None, local_cores=None, ncpus_per_batch=None,
                  ncpus_per_sample=None, tmp_dirs: list = None):

    ##########################
    #### Preparing config ####
    ##########################

    if unlock:
        conf['unlock'] = 'yes'
    if debug:
        conf['debug'] = 'yes'

    if restart_times is None:
        restart_times = DEFAULT_RESTART_TIMES
    restart_times = int(restart_times)

    if ncpus_per_batch:
        conf['threads_per_batch'] = ncpus_per_batch
    if ncpus_per_sample:
        conf['threads_per_sample'] = ncpus_per_sample

    if log_dir:
        safe_mkdir(log_dir)
        conf_f = open(join(log_dir, '.conf.yaml'), 'w')
    else:
        conf_f = tempfile.NamedTemporaryFile(mode='wt', delete=False)
    yaml.dump(conf, conf_f)
    conf_f.close()

    ###############################
    #### Building command line ####
    ###############################

    if forcerun:
        forcerun = " ".join(forcerun.split(','))

    tibanna_opts = ''
    if tibanna_cfg:
        output_s3 = tibanna_cfg['output_s3']
        output_bucket_name = output_s3.split('/')[0]
        if ':' in output_bucket_name:
            output_bucket_name = output_bucket_name.split(':')[1]
        step_func_name = setup_tibanna(tibanna_cfg['id'], [output_bucket_name])
        tibanna_opts = f'--tibanna --default-remote-prefix {output_s3} --tibanna-sfn {step_func_name}'

    cmd = (
        f'snakemake '
        f'{" ".join(flatten([target_rules])) if target_rules else ""} '
        f'--snakefile {snakefile} '
        f'--printshellcmds '
        f'{"--dryrun " if dryrun else ""}'
        f'--rerun-incomplete '
        f'{"--dag " if dag else ""}'
        f'{f"--report {report} " if report else ""}'
        f'{f"--directory {output_dir} " if output_dir else ""}'
        f'--cores {cores} '
        f'{f"--local-cores {local_cores} " if local_cores else ""}'
        f'{f"--restart-times {restart_times - 1} " if restart_times > 1 else ""}'
        f'{cluster_param if cluster_param else ""} '
        f'--configfile {conf_f.name} '
        f'{f"--forcerun {forcerun} " if forcerun else ""}'
        f'{f"--resources {resources} " if resources else ""} '
        f'{tibanna_opts}'
    )

    #################
    #### Running ####
    #################

    if unlock:
        print('* Unlocking previous run... *')
        run_simple(cmd + ' --unlock')
        print('* Now rerunning *')

    try:
        run_simple(cmd)

    except subprocess.CalledProcessError:
        logger.error('--------')
        logger.error(f'Error: snakemake returned a non-zero status. Working directory: {output_dir}')
        if cluster_log_dir and isdir(cluster_log_dir):
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
            logger.error(f'Review cluster job logs in {cluster_log_dir}')
        for tmp_dir in tmp_dirs or []:
            tmp_dir.cleanup()
        sys.exit(1)

    except KeyboardInterrupt:
        logger.error('--------')
        logger.error(f'Interrupted. Fixing logs permissions. Working directory: {output_dir}')
        if cluster_log_dir and isdir(cluster_log_dir):
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
            logger.error(f'Review cluster job logs in {cluster_log_dir}')
        for tmp_dir in tmp_dirs or []:
            tmp_dir.cleanup()
        sys.exit(1)

    else:
        logger.info('--------')
        if cluster_log_dir and isdir(cluster_log_dir):
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
        logger.info(f'Finished. Output directory: {output_dir}')
        for tmp_dir in tmp_dirs or []:
            tmp_dir.cleanup()
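# Minimal invocation sketch (paths and config keys below are placeholders, not the
# real pipeline configuration):
#   run_snakemake(
#       snakefile=join(package_path(), 'some_pipeline', 'Snakefile'),
#       conf={'genome': 'GRCh37', 'output_dir': 'results'},
#       cores=8,
#       output_dir='results',
#       log_dir='results/log',
#   )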