def run_pcgr(pcgr_paths, config_options):
    """
    Main function to run the PCGR workflow

    The workflow is a sequence of external commands, each launched through
    check_subprocess():

      0. pcgr_validate_input.py - validate input files and options
      1. vep - consequence annotation (optionally followed by vcf2maf.pl)
      2. pcgr_vcfanno.py - annotation against variant resources
      3. pcgr_summarise.py + vcf2tsv.py - expand annotations, convert to TSV
      4. pcgrr.R - report generation (skipped when
         config_options['other']['basic'] is set)

    Steps 1-3 are only run when an input VCF has been provided.

    Args:
        pcgr_paths: dict with resolved paths. Keys read here: 'db_dir',
            'base_dir', 'output_dir', and '<name>_dir' / '<name>_basename'
            pairs for input_vcf, input_cna, input_rna_fusion,
            input_rna_expression, input_cpsr_report and panel_normal_vcf.
            A basename of 'NA' marks an input as not provided; such inputs
            are passed on to the external scripts as the string 'None'.
        config_options: nested dict with run options ('debug', 'sample_id',
            'genome_assembly', 'assay', 'tumor_purity', 'tumor_ploidy' and
            the sub-dicts 'other', 'tumor_only', 'tumor_type', 'tmb', 'msi',
            'msigs', 'cna', 'allelic_support', 'clinicaltrials').

    Returns:
        None. Output files are written below pcgr_paths['output_dir'].
    """

    debug = config_options['debug']

    report_nonfloating_toc = 1 if config_options['other']['nonfloating_toc'] else 0
    vep_regulatory_annotation = 'ON' if config_options['other']['vep_regulatory'] == 1 else 'OFF'
    clinical_trials_set = 'ON' if config_options['clinicaltrials']['run'] else 'OFF'
    msi_prediction_set = 'ON' if config_options['msi']['run'] else 'OFF'
    msig_estimation_set = 'ON' if config_options['msigs']['run'] else 'OFF'
    tmb_estimation_set = 'ON' if config_options['tmb']['run'] else 'OFF'
    vcf_validation = 0 if config_options['other']['no_vcf_validate'] else 1
    run_vcf2maf = config_options['other']['vcf2maf']
    assay_mode = 'Tumor vs. Control'
    tumor_only = 0
    cell_line = 0
    if config_options['tumor_only']['tumor_only']:
        assay_mode = 'Tumor-Only'
        tumor_only = 1
        if config_options['tumor_only']['cell_line']:
            cell_line = 1
            assay_mode = 'Tumor-Only (cell line)'

    # set basic run commands
    output_vcf = 'None'
    output_pass_vcf = 'None'
    output_pass_tsv = 'None'
    output_maf = 'None'
    GENCODE_VERSION = pcgr_vars.GENCODE_VERSION
    NCBI_BUILD_MAF = pcgr_vars.NCBI_BUILD_MAF
    VEP_ASSEMBLY = pcgr_vars.VEP_ASSEMBLY
    MAX_VARIANTS_FOR_REPORT = pcgr_vars.MAX_VARIANTS_FOR_REPORT
    if config_options['genome_assembly'] == 'grch37':
        NCBI_BUILD_MAF = 'GRCh37'
        GENCODE_VERSION = 'release 19'
        VEP_ASSEMBLY = 'GRCh37'
    logger = getlogger('pcgr-get-OS')

    vep_dir = os.path.join(str(pcgr_paths['db_dir']), '.vep')
    input_vcf = 'None'
    input_cna = 'None'
    input_rna_fusion = 'None'
    input_rna_expression = 'None'
    input_cpsr_report = 'None'
    panel_normal = 'None'
    # panel-of-normals annotation
    pon_annotation = 0

    # Specify paths for input files and directories
    if pcgr_paths['input_vcf_basename'] != 'NA':
        input_vcf = os.path.join(pcgr_paths['input_vcf_dir'], pcgr_paths['input_vcf_basename'])
    if pcgr_paths['input_cna_basename'] != 'NA':
        input_cna = os.path.join(pcgr_paths['input_cna_dir'], pcgr_paths['input_cna_basename'])
    if pcgr_paths['input_rna_fusion_basename'] != 'NA':
        input_rna_fusion = os.path.join(pcgr_paths['input_rna_fusion_dir'], pcgr_paths['input_rna_fusion_basename'])
    if pcgr_paths['input_rna_expression_basename'] != 'NA':
        input_rna_expression = os.path.join(pcgr_paths['input_rna_expression_dir'], pcgr_paths['input_rna_expression_basename'])
    if pcgr_paths['input_cpsr_report_basename'] != 'NA':
        input_cpsr_report = os.path.join(pcgr_paths['input_cpsr_report_dir'], pcgr_paths['input_cpsr_report_basename'])
    if pcgr_paths['panel_normal_vcf_basename'] != 'NA':
        panel_normal = os.path.join(pcgr_paths['panel_normal_vcf_dir'], pcgr_paths['panel_normal_vcf_basename'])

    data_dir = pcgr_paths['base_dir']
    output_dir = pcgr_paths['output_dir']

    # PCGR|validate_input - verify that VCF and CNA segment file is of appropriate format
    logger = getlogger("pcgr-validate-input-arguments")
    logger.info("PCGR - STEP 0: Validate input data and options")

    vcf_validate_command = (
        f'pcgr_validate_input.py '
        f'{data_dir} '
        f'{input_vcf} '
        f'{input_cna} '
        f'{input_rna_fusion} '
        f'{input_rna_expression} '
        f'{panel_normal} '
        f'{vcf_validation} '
        f'{tumor_only} '
        f'{config_options["genome_assembly"]} '
        f'{config_options["other"]["preserved_info_tags"]} '
        f'{config_options["allelic_support"]["tumor_dp_tag"]} {config_options["allelic_support"]["tumor_af_tag"]} '
        f'{config_options["allelic_support"]["control_dp_tag"]} {config_options["allelic_support"]["control_af_tag"]} '
        f'{config_options["allelic_support"]["call_conf_tag"]} '
        f'{config_options["tumor_only"]["exclude_likely_hom_germline"]} '
        f'{config_options["tumor_only"]["exclude_likely_het_germline"]} '
        f'--output_dir {output_dir} '
        f'{"--debug " if debug else ""}'
        f'{"--keep_uncompressed" if run_vcf2maf else ""} '
    )
    check_subprocess(logger, vcf_validate_command, debug)
    logger.info('Finished pcgr-validate-input-arguments')
    print('----')

    # PCGR|start - Log key information about sample, options and sequencing assay/design
    logger = getlogger('pcgr-start')
    logger.info('--- Personal Cancer Genome Reporter workflow ----')
    logger.info(f'Sample name: {config_options["sample_id"]}')
    if config_options['tumor_type']['type'] == 'Cancer_NOS':
        logger.info('Tumor type: Cancer_NOS (Any tumortype)')
    else:
        logger.info(f'Tumor type: {config_options["tumor_type"]["type"]}')
    logger.info(f'Sequencing assay - type: {config_options["assay"]}')
    logger.info(f'Sequencing assay - mode: {assay_mode}')
    logger.info(f'Sequencing assay - coding target size: {config_options["tmb"]["target_size_mb"]}Mb')
    logger.info(f'Genome assembly: {config_options["genome_assembly"]}')
    logger.info(f'Mutational signature estimation: {msig_estimation_set}')
    logger.info(f'MSI classification: {msi_prediction_set}')
    logger.info(f'Mutational burden estimation: {tmb_estimation_set}')
    logger.info(f'Include molecularly targeted clinical trials (beta): {clinical_trials_set}')

    if not input_vcf == 'None':
        # Define temporary output file names
        prefix = os.path.join(output_dir, f'{config_options["sample_id"]}.pcgr_acmg.{config_options["genome_assembly"]}')
        output_vcf = f'{prefix}.vcf.gz'
        output_pass_vcf = f'{prefix}.pass.vcf.gz'
        output_pass_tsv = f'{prefix}.pass.tsv'
        output_pass_raw_tsv_gz = f'{prefix}.pass.raw.tsv.gz'
        output_maf = f'{prefix}.tmp.maf'
        output_vcf2maf_log = f'{prefix}.maf.log'
        input_vcf_pcgr_ready = os.path.join(output_dir, re.sub(r"(\.vcf$|\.vcf\.gz$)", ".pcgr_ready.vcf.gz", pcgr_paths["input_vcf_basename"]))
        # needs to be uncompressed for vcf2maf
        input_vcf_pcgr_ready_uncompressed = os.path.join(output_dir, re.sub(r"(\.vcf$|\.vcf\.gz$)", ".pcgr_ready.vcf", pcgr_paths["input_vcf_basename"]))
        vep_vcf = re.sub(r"(\.vcf$|\.vcf\.gz$)", ".vep.vcf.gz", input_vcf_pcgr_ready)
        vep_vcfanno_vcf = re.sub(r"(\.vcf$|\.vcf\.gz$)", ".vep.vcfanno.vcf", input_vcf_pcgr_ready)
        vep_vcfanno_annotated_vcf = re.sub(r"\.vcfanno", ".vcfanno.annotated", vep_vcfanno_vcf) + ".gz"
        vep_vcfanno_annotated_pass_vcf = re.sub(r"\.vcfanno", ".vcfanno.annotated.pass", vep_vcfanno_vcf) + ".gz"
        fasta_assembly = os.path.join(vep_dir, 'homo_sapiens', f'{pcgr_vars.VEP_VERSION}_{VEP_ASSEMBLY}', f'Homo_sapiens.{VEP_ASSEMBLY}.dna.primary_assembly.fa.gz')

        # List all VEP flags used when calling VEP
        vep_flags = (
            f'--hgvs --af --af_1kg --af_gnomad --variant_class --domains --symbol --protein --ccds --mane '
            f'--uniprot --appris --biotype --tsl --canonical --format vcf --cache --numbers --total_length --allele_number '
            f'--no_stats --no_escape --xref_refseq --vcf --check_ref --dont_skip --flag_pick_allele --plugin NearestExonJB,max_range=50000 '
            f'--force_overwrite --species homo_sapiens --offline --compress_output bgzip'
        )
        vep_options = (
            f'--dir {vep_dir} --assembly {VEP_ASSEMBLY} --cache_version {pcgr_vars.VEP_VERSION} '
            f'--fasta {fasta_assembly} --pick_order {config_options["other"]["vep_pick_order"]} '
            f'--buffer_size {config_options["other"]["vep_buffer_size"]} '
            f'--fork {config_options["other"]["vep_n_forks"]} '
            f'{vep_flags} '
            f'{"--verbose" if debug else "--quiet"} '
        )
        gencode_set_in_use = "GENCODE - all transcripts"
        if config_options['other']['vep_no_intergenic'] == 1:
            vep_options += '--no_intergenic '
        if config_options['other']['vep_regulatory'] == 1:
            vep_options += '--regulatory '
        if config_options['other']['vep_gencode_all'] == 0:
            vep_options += '--gencode_basic '
            gencode_set_in_use = "GENCODE - basic transcript set (--gencode_basic)"

        # Compose full VEP command
        vep_main_command = f'{utils.get_perl_exports()} && vep --input_file {input_vcf_pcgr_ready} --output_file {vep_vcf} {vep_options}'
        vep_tabix_command = f'tabix -f -p vcf {vep_vcf}'

        # PCGR|VEP - run consequence annotation with Variant Effect Predictor
        print('----')
        logger = getlogger('pcgr-vep')
        logger.info(f'PCGR - STEP 1: Basic variant annotation with Variant Effect Predictor ({pcgr_vars.VEP_VERSION}, GENCODE {GENCODE_VERSION}, {config_options["genome_assembly"]})')
        logger.info(f'VEP configuration - one primary consequence block pr. alternative allele (--flag_pick_allele)')
        logger.info(f'VEP configuration - transcript pick order: {config_options["other"]["vep_pick_order"]}')
        logger.info(f'VEP configuration - transcript pick order: See more at https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick_options')
        logger.info(f'VEP configuration - GENCODE set: {gencode_set_in_use}')
        logger.info(f'VEP configuration - skip intergenic: {"TRUE" if config_options["other"]["vep_no_intergenic"] else "FALSE"}')
        logger.info(f'VEP configuration - regulatory annotation: {vep_regulatory_annotation}')
        logger.info(f'VEP configuration - buffer_size/number of forks: {config_options["other"]["vep_buffer_size"]}/{config_options["other"]["vep_n_forks"]}')

        check_subprocess(logger, vep_main_command, debug)
        check_subprocess(logger, vep_tabix_command, debug)
        logger.info('Finished pcgr-vep')
        print('----')

        # PCGR|vcf2maf - if option set, convert VCF to MAF with https://github.com/mskcc/vcf2maf
        if run_vcf2maf:
            logger.info('Converting VEP-annotated VCF to MAF with https://github.com/mskcc/vcf2maf')
            vcf2maf_command = (
                f'vcf2maf.pl --inhibit-vep --input-vcf {input_vcf_pcgr_ready_uncompressed} '
                f'--tumor-id {config_options["sample_id"]} --output-maf {output_maf} --ref-fasta {fasta_assembly} '
                f'--ncbi-build {NCBI_BUILD_MAF} > {output_vcf2maf_log} 2>&1'
            )
            check_subprocess(logger, vcf2maf_command, debug)
            utils.remove(input_vcf_pcgr_ready_uncompressed)
            utils.remove(output_vcf2maf_log)
            logger.info('Finished pcgr-vep-vcf2maf')
            print('----')

        # PCGR|vcfanno - annotate VCF against a number of variant annotation resources
        logger = getlogger("pcgr-vcfanno")
        pcgr_vcfanno_command = (
            f'pcgr_vcfanno.py {vep_vcf} {vep_vcfanno_vcf} {pcgr_paths["db_dir"]} '
            f'--num_processes {config_options["other"]["vcfanno_n_proc"]} '
            f'--chasmplus --dbnsfp --docm --clinvar --icgc --civic --cgi --tcga_pcdm --winmsk --simplerepeats '
            f'--tcga --uniprot --cancer_hotspots --pcgr_onco_xref '
            f'{"--debug " if debug else ""}'
        )
        anno_src_msg = (
            f"Annotation sources: {'Panel-of-Normals, ' if panel_normal != 'None' else ''}ClinVar, dbNSFP, "
            f"UniProtKB, cancerhotspots.org, CiVIC, CGI, DoCM, CHASMplus driver mutations, TCGA, ICGC-PCAWG"
        )
        logger.info("PCGR - STEP 2: Annotation for precision oncology with pcgr-vcfanno")
        logger.info(anno_src_msg)
        if panel_normal != "None":
            # pon_annotation is forwarded to pcgr_summarise.py below
            pon_annotation = 1
            pcgr_vcfanno_command += f'--panel_normal_vcf {panel_normal}'
        check_subprocess(logger, pcgr_vcfanno_command, debug)
        logger.info("Finished pcgr-vcfanno")
        print('----')

        # PCGR|pcgr_summarise - expand annotations in VCF file
        logger = getlogger("pcgr-summarise")
        pcgr_summarise_command = (
            f'pcgr_summarise.py {vep_vcfanno_vcf}.gz {pon_annotation} '
            f'{config_options["other"]["vep_regulatory"]} '
            f'{pcgr_paths["db_dir"]} '
            f'{"--debug" if debug else ""}'
        )
        logger.info("PCGR - STEP 3: Cancer gene annotations with pcgr-summarise")
        check_subprocess(logger, pcgr_summarise_command, debug)

        # PCGR|clean - move output files and clean up temporary files
        os.rename(vep_vcfanno_annotated_vcf, output_vcf)
        os.rename(f'{vep_vcfanno_annotated_vcf}.tbi', f'{output_vcf}.tbi')
        os.rename(vep_vcfanno_annotated_pass_vcf, output_pass_vcf)
        os.rename(f'{vep_vcfanno_annotated_pass_vcf}.tbi', f'{output_pass_vcf}.tbi')
        delete_files = (
            glob(f'{vep_vcf}*') +
            glob(f'{vep_vcfanno_annotated_vcf}') +
            glob(f'{vep_vcfanno_annotated_pass_vcf}*') +
            glob(f'{vep_vcfanno_vcf}*') +
            glob(f'{input_vcf_pcgr_ready_uncompressed}*')
        )
        # do not delete if debugging
        if not debug:
            for fn in delete_files:
                utils.remove(fn)

        logger.info('Finished pcgr-summarise main command')

        # PCGR|vcf2tsv - convert VCF to TSV with https://github.com/sigven/vcf2tsv
        pcgr_vcf2tsv_command = f'vcf2tsv.py {output_pass_vcf} --compress {output_pass_tsv}'
        logger.info("Converting VCF to TSV with https://github.com/sigven/vcf2tsv")
        check_subprocess(logger, pcgr_vcf2tsv_command, debug)
        logger.info('Finished pcgr-summarise-vcf2tsv')

        if config_options['assay'] == 'WGS' or config_options['assay'] == 'WES':
            output_pass_tsv_gz = f'{output_pass_tsv}.gz'
            # check that output file exist
            if os.path.exists(output_pass_tsv_gz):
                # get number of rows/variants annotated, using pandas
                var_data = pandas.read_csv(output_pass_tsv_gz, sep = '\t', low_memory = False, header = [1])
                num_variants_raw = len(var_data)
                if num_variants_raw > MAX_VARIANTS_FOR_REPORT:
                    logger.info(f'Number of raw variants in input VCF ({num_variants_raw}) exceeds {MAX_VARIANTS_FOR_REPORT} - intergenic/intronic variants will be excluded prior to reporting')

                    # Exclude intronic and intergenic variants prior to analysis with pcgrr (reporting and further analysis)
                    # na=False: a missing Consequence value would otherwise put NaN in the
                    # mask, and negating such a mask with '~' raises TypeError. Rows with a
                    # missing Consequence are kept.
                    var_data_filtered = var_data[
                        ~var_data.Consequence.str.contains('^intron', na=False) &
                        ~var_data.Consequence.str.contains('^intergenic', na=False)]
                    num_variants_excluded1 = num_variants_raw - len(var_data_filtered)
                    logger.info(f'Number of intergenic/intronic variants excluded: {num_variants_excluded1}')

                    # Exclude upstream_gene/downstream_gene variants if size of filtered variant set is still above MAX_VARIANTS_FOR_REPORT
                    # TODO: in this case, the TMB calculation will be an underestimate (but still likely huge)
                    var_data_filtered_final = var_data_filtered
                    if len(var_data_filtered) > MAX_VARIANTS_FOR_REPORT:
                        var_data_filtered_final = var_data_filtered[
                            ~var_data_filtered.Consequence.str.contains('^upstream_gene', na=False) &
                            ~var_data_filtered.Consequence.str.contains('^downstream_gene', na=False)]
                        num_variants_excluded2 = len(var_data_filtered) - len(var_data_filtered_final)
                        logger.info(f'Number of upstream_gene/downstream_gene variants excluded: {num_variants_excluded2}')

                    # get vcf2tsv header and pipe to output TSV file
                    # ('grep -E' replaces the obsolescent 'egrep', which emits a warning
                    # on stderr with recent GNU grep releases)
                    get_vcf2tsv_header = f'gzip -dc {output_pass_tsv_gz} | grep -E \'^#\' > {output_pass_tsv}'
                    check_subprocess(logger, get_vcf2tsv_header, debug)

                    # rename original vcf2tsv (gzipped) to 'raw' filename
                    rename_output_tsv = f'mv {output_pass_tsv_gz} {output_pass_raw_tsv_gz}'
                    check_subprocess(logger, rename_output_tsv, debug)

                    # append filtered data output to output TSV file
                    var_data_filtered_final.to_csv(output_pass_tsv, sep='\t', encoding='utf-8', mode = 'a', index = False)

                    # gzip filtered output TSV file
                    gzip_filtered_output_tsv = f'gzip -f {output_pass_tsv}'
                    check_subprocess(logger, gzip_filtered_output_tsv, debug)

        logger.info('Finished pcgr-summarise')
        print('----')

    # Generation of HTML reports for VEP/vcfanno-annotated VCF and copy number segment file
    if not config_options['other']['basic']:
        co = config_options
        # tumor type is passed as a single shell argument: spaces -> '_', '/' -> '@'
        ttype = co['tumor_type']['type'].replace(' ', '_').replace('/', '@')
        logger = getlogger('pcgr-writer')
        logger.info('PCGR - STEP 4: Generation of output files - variant interpretation report for precision oncology')

        # export PATH to R conda env Rscript
        rscript = utils.script_path('pcgrr', 'bin/Rscript')
        pcgrr_script = utils.script_path('pcgr', 'bin/pcgrr.R')
        # NOTE: arguments are positional - order must not be changed
        pcgr_report_command = (
            f"{rscript} {pcgrr_script} "
            f"{output_dir} "
            f"{output_pass_tsv}.gz "
            f"{input_cna} "
            f"{input_rna_fusion} "
            f"{input_rna_expression} "
            f"{input_cpsr_report} "
            f"{config_options['sample_id']} "
            f"{pcgr_vars.PCGR_VERSION} "
            f"{pcgr_vars.DB_VERSION} "
            f"{config_options['genome_assembly']} "
            f"{data_dir} "
            f"{co['tumor_purity']} "
            f"{co['tumor_ploidy']} "
            f"{ttype} "
            f"{co['tmb']['target_size_mb']} "
            f"{co['assay']} "
            f"{tumor_only} "
            f"{cell_line} "
            f"{co['tumor_only']['maf_onekg_afr']} "
            f"{co['tumor_only']['maf_onekg_amr']} "
            f"{co['tumor_only']['maf_onekg_eas']} "
            f"{co['tumor_only']['maf_onekg_eur']} "
            f"{co['tumor_only']['maf_onekg_sas']} "
            f"{co['tumor_only']['maf_onekg_global']} "
            f"{co['tumor_only']['maf_gnomad_afr']} "
            f"{co['tumor_only']['maf_gnomad_amr']} "
            f"{co['tumor_only']['maf_gnomad_asj']} "
            f"{co['tumor_only']['maf_gnomad_eas']} "
            f"{co['tumor_only']['maf_gnomad_fin']} "
            f"{co['tumor_only']['maf_gnomad_nfe']} "
            f"{co['tumor_only']['maf_gnomad_oth']} "
            f"{co['tumor_only']['maf_gnomad_sas']} "
            f"{co['tumor_only']['maf_gnomad_global']} "
            f"{co['tumor_only']['exclude_pon']} "
            f"{co['tumor_only']['exclude_likely_hom_germline']} "
            f"{co['tumor_only']['exclude_likely_het_germline']} "
            f"{co['tumor_only']['exclude_dbsnp_nonsomatic']} "
            f"{co['tumor_only']['exclude_nonexonic']} "
            f"{co['tmb']['run']} "
            f"{co['tmb']['algorithm']} "
            f"{co['msi']['run']} "
            f"{co['msigs']['run']} "
            f"{co['msigs']['mutation_limit']} "
            f"{co['msigs']['all_reference_signatures']} "
            f"{co['msigs']['include_artefact_signatures']} "
            f"{co['msigs']['prevalence_reference_signatures']} "
            f"{co['cna']['logR_homdel']} "
            f"{co['cna']['logR_gain']} "
            f"{co['cna']['cna_overlap_pct']} "
            f"{co['allelic_support']['tumor_af_min']} "
            f"{co['allelic_support']['tumor_dp_min']} "
            f"{co['allelic_support']['control_dp_min']} "
            f"{co['allelic_support']['control_af_max']} "
            f"{co['allelic_support']['tumor_af_tag']} "
            f"{co['allelic_support']['tumor_dp_tag']} "
            f"{co['allelic_support']['control_af_tag']} "
            f"{co['allelic_support']['control_dp_tag']} "
            f"{co['allelic_support']['call_conf_tag']} "
            f"{co['clinicaltrials']['run']} "
            f"{co['other']['vep_n_forks']} "
            f"{co['other']['vep_buffer_size']} "
            f"{co['other']['vep_no_intergenic']} "
            f"{co['other']['vep_pick_order']} "
            f"{co['other']['vep_regulatory']} "
            f"{co['other']['vep_gencode_all']} "
            f"{co['other']['vcf2maf']} "
            f"{co['other']['list_noncoding']} "
            f"{co['other']['preserved_info_tags']} "
            f"{co['other']['visual_theme']} "
            f"{report_nonfloating_toc} "
            f"{co['other']['no_vcf_validate']}"
        )

        if debug:
            print(pcgr_report_command)
        check_subprocess(logger, pcgr_report_command, debug)
        logger.info("Finished PCGR!")
        print('----')

    print()
def run_cpsr(arg_dict, cpsr_paths):
    """
    Main function to run the CPSR workflow

    The workflow is a sequence of external commands, each launched through
    check_subprocess():

      0. cpsr_validate_input.py - validate input VCF / custom gene list
      1. vep (NearestExonJB and LoF plugins), followed by bgzip + tabix
      2. pcgr_vcfanno.py - annotation against variant resources
      3. pcgr_summarise.py (--cpsr) + vcf2tsv.py
      4. cpsr.R - report generation (skipped when arg_dict['basic'] is set)

    Steps 1-3 are only run when an input VCF has been provided.

    Args:
        arg_dict: dict with run options (e.g. 'debug', 'sample_id',
            'genome_assembly', 'virtual_panel_id', 'custom_list', the
            'vep_*' settings and the report options forwarded to cpsr.R).
        cpsr_paths: dict with resolved paths. Keys read here: 'db_dir',
            'base_dir', 'output_dir', 'input_vcf_dir',
            'input_vcf_basename', 'input_customlist_dir' and
            'input_customlist_basename'. A basename of 'NA' marks an input
            as not provided; it is then passed on as the string 'None'.

    Returns:
        None. Output files are written below cpsr_paths['output_dir'].

    Raises:
        AssertionError: if the directory returned by
            utils.get_loftee_dir() does not exist.
    """

    debug = arg_dict['debug']

    # Flags are normalised to 0/1 since they are passed on as
    # command-line arguments to the external scripts
    diagnostic_grade_only = 0
    vcf_validation = 1
    virtual_panel_id = "-1"
    ignore_noncoding = 0
    gwas_findings = 0
    secondary_findings = 0
    classify_all = 0
    clinvar_ignore_noncancer = 0
    report_nonfloating_toc = 0
    vep_no_intergenic = 0
    vep_regulatory = 0
    preserved_info_tags = arg_dict['preserved_info_tags']
    diagnostic_grade_set = "OFF"
    secondary_findings_set = "OFF"
    gwas_findings_set = "OFF"

    if arg_dict['vep_regulatory']:
        vep_regulatory = 1
    if arg_dict["vep_no_intergenic"]:
        vep_no_intergenic = 1
    if arg_dict['clinvar_ignore_noncancer']:
        clinvar_ignore_noncancer = 1
    if arg_dict['classify_all']:
        classify_all = 1
    if arg_dict['gwas_findings']:
        gwas_findings = 1
        gwas_findings_set = "ON"
    if arg_dict['secondary_findings']:
        secondary_findings = 1
        secondary_findings_set = "ON"
    if arg_dict['diagnostic_grade_only']:
        diagnostic_grade_only = 1
        diagnostic_grade_set = "ON"
    if arg_dict['report_nonfloating_toc']:
        report_nonfloating_toc = 1
    if arg_dict['no_vcf_validate']:
        vcf_validation = 0
    if arg_dict['virtual_panel_id'] != "-1":
        virtual_panel_id = arg_dict['virtual_panel_id']
    # a custom gene list overrides any virtual panel selection
    if arg_dict['custom_list']:
        virtual_panel_id = "-1"
    if arg_dict['ignore_noncoding']:
        ignore_noncoding = 1

    output_vcf = 'None'
    output_pass_vcf = 'None'
    output_pass_tsv = 'None'
    GENCODE_VERSION = pcgr_vars.GENCODE_VERSION
    VEP_ASSEMBLY = pcgr_vars.VEP_ASSEMBLY
    VEP_VERSION = pcgr_vars.VEP_VERSION
    if arg_dict['genome_assembly'] == 'grch37':
        GENCODE_VERSION = '19'
        VEP_ASSEMBLY = 'GRCh37'

    vepdb_dir = os.path.join(str(cpsr_paths['db_dir']),'.vep')
    input_vcf = 'None'
    input_customlist = 'None'

    if cpsr_paths['input_vcf_basename'] != 'NA':
        input_vcf = os.path.join(cpsr_paths['input_vcf_dir'], cpsr_paths['input_vcf_basename'])
    if cpsr_paths['input_customlist_basename'] != 'NA':
        input_customlist = os.path.join(cpsr_paths['input_customlist_dir'], cpsr_paths['input_customlist_basename'])

    data_dir = cpsr_paths['base_dir']
    output_dir = cpsr_paths['output_dir']
    vep_dir = vepdb_dir

    # The custom-list BED path depends only on the output directory, sample
    # and assembly. It is defined up front because the report step below
    # reads it regardless of whether the VCF annotation steps were run.
    pcgr_model = 'cpsr'
    custom_bed = os.path.join(output_dir, str(arg_dict['sample_id']) + '.' + str(pcgr_model) + '.' + str(arg_dict['genome_assembly']) + '.custom_list.bed')

    logger = getlogger('cpsr-validate-input-arguments')
    logger.info("CPSR - STEP 0: Validate input data")
    check_subprocess(logger, f'mkdir -p {output_dir}', debug)

    ## CPSR|Validate input VCF - check formatting, non-overlap with CPSR INFO tags, and whether sample contains any variants in cancer predisposition loci
    vcf_validate_command = (
        f'cpsr_validate_input.py '
        f'{data_dir} '
        f'{input_vcf} '
        f'{input_customlist} '
        f'{preserved_info_tags} '
        f'{vcf_validation} '
        f'{arg_dict["genome_assembly"]} '
        f'{arg_dict["sample_id"]} '
        f'{virtual_panel_id} '
        f'{diagnostic_grade_only} '
        f'--output_dir {output_dir} {"--debug" if debug else ""}'
    )
    check_subprocess(logger, vcf_validate_command, debug)
    logger.info('Finished cpsr-validate-input-arguments')
    print('----')

    ## CPSR|Start - log key information about run
    logger = getlogger("cpsr-start")
    logger.info("--- Cancer Predisposition Sequencing Reporter workflow ----")
    logger.info(f"Sample name: {arg_dict['sample_id']}")
    if not input_customlist == 'None':
        logger.info(f"Virtual gene panel: custom-made list from panel 0: {input_customlist}")
    else:
        logger.info(f"Diagnostic-grade genes in virtual panels (GE PanelApp): {diagnostic_grade_set}")
    logger.info(f"Include incidental findings (ACMG recommended list v3.0): {secondary_findings_set}")
    logger.info(f"Include low to moderate cancer risk variants from genome-wide association studies: {gwas_findings_set}")
    logger.info(f"Reference population, germline variant frequencies (gnomAD): {str(arg_dict['pop_gnomad']).upper()}")
    logger.info(f"Genome assembly: {arg_dict['genome_assembly']}")

    if not input_vcf == 'None':

        ## Define input, output and temporary file names
        output_vcf = os.path.join(output_dir, str(arg_dict['sample_id']) + '.cpsr.' + str(arg_dict['genome_assembly']) + '.vcf.gz')
        output_pass_vcf = os.path.join(output_dir, str(arg_dict['sample_id']) + '.cpsr.' + str(arg_dict['genome_assembly']) + '.pass.vcf.gz')
        output_pass_tsv = os.path.join(output_dir, str(arg_dict['sample_id']) + '.cpsr.' + str(arg_dict['genome_assembly']) + '.pass.tsv')
        input_vcf_cpsr_ready = os.path.join(output_dir, re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_ready_target.vcf.gz', cpsr_paths['input_vcf_basename']))
        input_vcf_cpsr_ready_uncompressed = os.path.join(output_dir, re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_ready_target.vcf', cpsr_paths['input_vcf_basename']))
        vep_vcf = re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_vep.vcf',input_vcf_cpsr_ready)
        vep_vcfanno_vcf = re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_vep.vcfanno.vcf',input_vcf_cpsr_ready)
        vep_vcfanno_annotated_vcf = re.sub(r'\.vcfanno','.vcfanno.annotated',vep_vcfanno_vcf) + '.gz'
        vep_vcfanno_annotated_pass_vcf = re.sub(r'\.vcfanno','.vcfanno.annotated.pass',vep_vcfanno_vcf) + '.gz'

        ## File names for assembly-specific genome fasta files (VEP)
        fasta_assembly = os.path.join(vep_dir, f"homo_sapiens/{VEP_VERSION}_{VEP_ASSEMBLY}/Homo_sapiens.{VEP_ASSEMBLY}.dna.primary_assembly.fa.gz")
        ancestor_assembly = os.path.join(vep_dir, f"homo_sapiens/{VEP_VERSION}_{VEP_ASSEMBLY}/human_ancestor.fa.gz")

        ## Set all flags used in VEP run
        plugins_in_use = "NearestExonJB, LoF"
        vep_flags = (
            f"--format vcf --vcf --check_ref --flag_pick_allele_gene --hgvs --dont_skip --failed 1 --af --af_1kg --af_gnomad "
            f"--variant_class --domains --symbol --protein --ccds --uniprot --appris --biotype --canonical --cache "
            f"--numbers --total_length --no_stats --allele_number --no_escape --xref_refseq --plugin NearestExonJB,max_range=50000"
        )
        vep_options = (
            f"--pick_order {arg_dict['vep_pick_order']} --force_overwrite --buffer_size {arg_dict['vep_buffer_size']} "
            f"--species homo_sapiens --assembly {VEP_ASSEMBLY} --offline --fork {arg_dict['vep_n_forks']} {vep_flags} --dir {vep_dir} "
            f"--cache_version {VEP_VERSION}"
        )
        gencode_set_in_use = "GENCODE - all transcripts"
        if arg_dict['vep_gencode_all'] == 0:
            vep_options += ' --gencode_basic'
            gencode_set_in_use = "GENCODE - basic transcript set (--gencode_basic)"
        if arg_dict['vep_no_intergenic'] == 1:
            vep_options = vep_options + " --no_intergenic"
        if arg_dict['vep_regulatory'] == 1:
            vep_options = vep_options + " --regulatory"
        if arg_dict['genome_assembly'] == "grch38":
            vep_options = vep_options + " --mane"

        loftee_dir = utils.get_loftee_dir()
        # Explicit raise instead of an assert statement: asserts are removed
        # when Python runs with -O, which would skip this check. The
        # exception type (AssertionError) is kept for backward compatibility.
        if not os.path.isdir(loftee_dir):
            raise AssertionError(f'LoF VEP plugin is not found in {loftee_dir}. Please make sure you installed pcgr conda package and have corresponding conda environment active.')
        vep_options += f" --plugin LoF,loftee_path:{loftee_dir},human_ancestor_fa:{ancestor_assembly},use_gerp_end_trunc:0 --dir_plugins {loftee_dir}"
        if not debug:
            vep_options += " --quiet"

        ## Compose full VEP command
        vep_main_command = f'{utils.get_perl_exports()} && vep --input_file {input_vcf_cpsr_ready} --output_file {vep_vcf} {vep_options} --fasta {fasta_assembly}'
        vep_bgzip_command = f'bgzip -f {vep_vcf}'
        vep_tabix_command = f'tabix -f -p vcf {vep_vcf}.gz'
        logger = getlogger('cpsr-vep')

        ## CPSR|VEP - run Variant Effect Predictor on query VCF with LoF and NearestExonJB plugins
        logger.info(f"CPSR - STEP 1: Basic variant annotation with Variant Effect Predictor ({VEP_VERSION}, GENCODE {GENCODE_VERSION}, {arg_dict['genome_assembly']})")
        # message names the flag actually present in vep_flags (--flag_pick_allele_gene)
        logger.info("VEP configuration - one primary consequence block pr. alternative allele and gene (--flag_pick_allele_gene)")
        logger.info(f"VEP configuration - transcript pick order: {arg_dict['vep_pick_order']}")
        logger.info(f"VEP configuration - transcript pick order: See more at https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick_options")
        logger.info(f"VEP configuration - GENCODE set: {gencode_set_in_use}")
        logger.info(f"VEP configuration - skip intergenic: {arg_dict['vep_no_intergenic']}")
        logger.info(f"VEP configuration - look for overlap with regulatory regions: {vep_regulatory}")
        logger.info(f"VEP configuration - plugins in use: {plugins_in_use}")
        logger.info(f"VEP configuration - buffer_size/number of forks: {arg_dict['vep_buffer_size']}/{arg_dict['vep_n_forks']}")
        check_subprocess(logger, vep_main_command, debug)
        check_subprocess(logger, vep_bgzip_command, debug)
        check_subprocess(logger, vep_tabix_command, debug)
        logger.info("Finished cpsr-vep")
        print('----')

        ## CPSR|vcfanno - run vcfanno on query VCF with a number of relevant annotated VCFs
        logger = getlogger('cpsr-vcfanno')
        logger.info("CPSR - STEP 2: Annotation for cancer predisposition with cpsr-vcfanno")
        logger.info("(ClinVar, CIViC, dbNSFP, dbMTS, UniProtKB, cancerhotspots.org, ncER, GERP RS scores, GWAS catalog, gnomAD non-cancer subset)")
        pcgr_vcfanno_command = (
            f"pcgr_vcfanno.py --num_processes {arg_dict['vcfanno_n_proc']} --dbnsfp --clinvar "
            f"--cancer_hotspots --dbmts --ncer --gerp --civic --uniprot --gnomad_cpsr --pcgr_onco_xref "
            f"--gwas --rmsk {vep_vcf}.gz {vep_vcfanno_vcf} {os.path.join(data_dir, 'data', str(arg_dict['genome_assembly']))}"
        )
        check_subprocess(logger, pcgr_vcfanno_command, debug)
        logger.info("Finished cpsr-vcfanno")
        print('----')

        ## CPSR|summarise - expand annotations with separate VCF INFO tags
        logger = getlogger("cpsr-summarise")
        pcgr_summarise_command = (
            f'pcgr_summarise.py {vep_vcfanno_vcf}.gz 0 {vep_regulatory} '
            f'{os.path.join(data_dir, "data", arg_dict["genome_assembly"])} '
            f'--cpsr {"--debug" if debug else ""}'
        )
        logger.info("CPSR - STEP 3: Cancer gene annotations with cpsr-summarise")
        check_subprocess(logger, pcgr_summarise_command, debug)

        ## CPSR|clean - rename output files, remove temporary files
        os.rename(vep_vcfanno_annotated_vcf, output_vcf)
        os.rename(f'{vep_vcfanno_annotated_vcf}.tbi', f'{output_vcf}.tbi')
        os.rename(vep_vcfanno_annotated_pass_vcf, output_pass_vcf)
        os.rename(f'{vep_vcfanno_annotated_pass_vcf}.tbi', f'{output_pass_vcf}.tbi')
        delete_files = (
            glob(f'{vep_vcf}*') +
            glob(f'{vep_vcfanno_annotated_vcf}') +
            glob(f'{vep_vcfanno_annotated_pass_vcf}*') +
            glob(f'{vep_vcfanno_vcf}*') +
            glob(f'{input_vcf_cpsr_ready_uncompressed}*')
        )
        # do not delete if debugging
        if not debug:
            for fn in delete_files:
                utils.remove(fn)

        logger.info('Finished cpsr-summarise main command')

        ## CPSR|vcf2tsv - perform vcf2tsv conversion on the final annotated VCF file
        cpsr_vcf2tsv_command = f"vcf2tsv.py {output_pass_vcf} --compress {output_pass_tsv}"
        logger.info("Converting VCF to TSV with https://github.com/sigven/vcf2tsv")
        check_subprocess(logger, cpsr_vcf2tsv_command, debug)
        logger.info('Finished cpsr-summarise-vcf2tsv')
        logger.info('Finished cpsr-summarise')
        print('----')

    ## Generation of HTML reports for VEP/vcfanno-annotated VCF file
    if not arg_dict['basic']:
        logger = getlogger('cpsr-writer')
        logger.info("CPSR - STEP 4: Generation of output files - Cancer predisposition sequencing report")

        # export PATH to R conda env Rscript
        rscript = utils.script_path("pcgrr", "bin/Rscript")
        cpsrr_script = utils.script_path('pcgr', 'bin/cpsr.R')
        # NOTE: arguments are positional - order must not be changed
        cpsr_report_command = (
            f"{rscript} {cpsrr_script} "
            f"{output_dir} "
            f"{output_pass_tsv}.gz "
            f"{arg_dict['sample_id']} "
            f"{pcgr_vars.PCGR_VERSION} "
            f"{pcgr_vars.DB_VERSION} "
            f"{arg_dict['genome_assembly']} "
            f"{data_dir} "
            f"{virtual_panel_id} "
            f"{preserved_info_tags} "
            f"{custom_bed} "
            f"{arg_dict['custom_list_name']} "
            f"{arg_dict['report_theme']} "
            f"{arg_dict['report_table_display']} "
            f"{report_nonfloating_toc} "
            f"{gwas_findings} "
            f"{arg_dict['gwas_p_value']} "
            f"{arg_dict['pop_gnomad']} "
            f"{arg_dict['maf_upper_threshold']} "
            f"{arg_dict['vep_pick_order']} "
            f"{arg_dict['vep_n_forks']} "
            f"{arg_dict['vep_buffer_size']} "
            f"{arg_dict['vep_gencode_all']} "
            f"{vep_no_intergenic} "
            f"{vep_regulatory} "
            f"{secondary_findings} "
            f"{classify_all} "
            f"{ignore_noncoding} "
            f"{clinvar_ignore_noncancer} "
            f"{diagnostic_grade_only}"
        )

        if debug:
            print(cpsr_report_command)
        check_subprocess(logger, cpsr_report_command, debug)
        logger.info("Finished CPSR!")
        print('----')

    print()
def run_vcfanno(num_processes, query_vcf, panel_normal_vcf, query_info_tags, vcfheader_file, pcgr_db_directory, conf_fname,
                output_vcf, docm, clinvar, ncer, dbmts, gerp, tcga, tcga_pcdm, chasmplus, dbnsfp, civic, cgi, icgc, uniprot,
                cancer_hotspots, pcgr_onco_xref, gwas, rmsk, simplerepeats, winmsk, gnomad_cpsr, keep_logs, debug, logger):
    """
    Function that annotates a VCF file with vcfanno against a user-defined set of germline and somatic VCF files

    For every annotation track whose flag is exactly ``True``,
    ``prepare_vcfanno_configuration`` is called with that track's INFO tags
    (in a fixed order). If a panel-of-normals VCF is given, an
    ``[[annotation]]`` entry for it is appended to ``conf_fname``. vcfanno is
    then run on ``query_vcf``; the header in ``vcfheader_file`` plus the
    non-header lines of the vcfanno output are written to ``output_vcf``, which
    is finally bgzipped and tabix-indexed (``{output_vcf}.gz``).

    :param num_processes: value passed to vcfanno's ``-p`` option
    :param query_vcf: VCF to annotate (vcfanno log goes next to it as ``<prefix>.vcfanno.log``)
    :param panel_normal_vcf: panel-of-normals VCF, or None to skip that track
    :param query_info_tags: INFO tags present in the query VCF
    :param vcfheader_file: file holding the VCF header of the annotated output
    :param pcgr_db_directory: data directory handed to the configuration helpers
    :param conf_fname: vcfanno configuration file (appended to)
    :param output_vcf: uncompressed output path; the ``.gz`` version is the final product
    :param keep_logs: when falsy, ``{output_vcf}.tmp*`` files are removed at the end
    :param debug: forwarded to ``check_subprocess``
    :param logger: logger used for warnings and by the helpers

    The remaining parameters (``docm`` ... ``gnomad_cpsr``) are per-track on/off flags.
    """
    panel_normal_tags = ["PANEL_OF_NORMALS"]

    # gnomAD non-cancer tags: global counts first, then four tags per population
    gnomad_cpsr_tags = [
        'NON_CANCER_AC_GLOBAL',
        'NON_CANCER_NHOMALT_GLOBAL',
        'NON_CANCER_AN_GLOBAL',
        'NON_CANCER_AF_GLOBAL',
    ]
    for pop in ['ASJ', 'NFE', 'SAS', 'FIN', 'EAS', 'AMR', 'AFR', 'OTH']:
        for stat in ('AC', 'AN', 'AF', 'NHOMALT'):
            gnomad_cpsr_tags.append('NON_CANCER_' + stat + '_' + str(pop))

    # (flag, track name, INFO tags) - list order is the order in which the
    # configuration helper is invoked, so do not reorder
    annotation_tracks = [
        (icgc, "icgc", ["ICGC_PCAWG_OCCURRENCE", "ICGC_PCAWG_AFFECTED_DONORS"]),
        (clinvar, "clinvar", [
            "CLINVAR_MSID", "CLINVAR_PMID", "CLINVAR_CLNSIG", "CLINVAR_VARIANT_ORIGIN", "CLINVAR_CONFLICTED",
            "CLINVAR_UMLS_CUI", "CLINVAR_HGVSP", "CLINVAR_UMLS_CUI_SOMATIC", "CLINVAR_CLNSIG_SOMATIC",
            "CLINVAR_PMID_SOMATIC", "CLINVAR_ALLELE_ID", "CLINVAR_MOLECULAR_EFFECT",
            "CLINVAR_REVIEW_STATUS_STARS", "CLINVAR_CLASSIFICATION", "CLINVAR_ENTREZGENE",
        ]),
        (ncer, "ncer", ["NCER_PERCENTILE"]),
        (gerp, "gerp", ['GERP_SCORE']),
        (dbmts, "dbmts", ["DBMTS"]),
        (dbnsfp, "dbnsfp", ["DBNSFP"]),
        (cgi, "cgi", ["CGI_ID", "CGI_ID_SEGMENT"]),
        (tcga, "tcga", ["TCGA_FREQUENCY", "TCGA_PANCANCER_COUNT"]),
        (tcga_pcdm, "tcga_pcdm", ["PUTATIVE_DRIVER_MUTATION"]),
        (chasmplus, "chasmplus", ["CHASMPLUS_DRIVER", "CHASMPLUS_TTYPE", "CHASMPLUS_PANCAN"]),
        (civic, "civic", ["CIVIC_ID", "CIVIC_ID_SEGMENT"]),
        (cancer_hotspots, "cancer_hotspots",
         ["MUTATION_HOTSPOT", "MUTATION_HOTSPOT_TRANSCRIPT", "MUTATION_HOTSPOT_CANCERTYPE"]),
        (uniprot, "uniprot", ["UNIPROT_FEATURE"]),
        (docm, "docm", ["DOCM_PMID"]),
        (pcgr_onco_xref, "pcgr_onco_xref", ["PCGR_ONCO_XREF"]),
        (gwas, "gwas", ["GWAS_HIT"]),
        (rmsk, "rmsk", ["RMSK_HIT"]),
        (simplerepeats, "simplerepeats", ["SIMPLEREPEATS_HIT"]),
        (winmsk, "winmsk", ["WINMASKER_HIT"]),
        (gnomad_cpsr, "gnomad_cpsr", gnomad_cpsr_tags),
    ]
    for enabled, track_name, info_tags in annotation_tracks:
        # strict identity check kept on purpose: only a literal True enables a track
        if enabled is True:
            prepare_vcfanno_configuration(pcgr_db_directory, conf_fname, vcfheader_file, logger,
                                          info_tags, query_info_tags, track_name)

    if panel_normal_vcf is not None:
        if "PANEL_OF_NORMALS" in query_info_tags:
            logger.warning("Query VCF has INFO tag \"PANEL_OF_NORMALS\" - this is also present in the panel of normal VCF file. This tag will be overwritten if not renamed in the query VCF")
        append_to_vcf_header(pcgr_db_directory, "panel_of_normals", vcfheader_file, logger)
        fields_string = 'fields = ["' + '","'.join(panel_normal_tags) + '"]'
        ops_string = 'ops=["' + '","'.join(['self'] * len(panel_normal_tags)) + '"]'
        # context manager guarantees the handle is closed even if a write fails
        with open(conf_fname, 'a') as fh:
            fh.write('[[annotation]]\n')
            fh.write('file="' + str(panel_normal_vcf) + '"\n')
            fh.write(fields_string + '\n')
            fh.write(ops_string + '\n\n')

    out_vcf_vcfanno_unsorted1 = output_vcf + '.tmp.unsorted.1'
    # both dots escaped so that only a literal '.vcf.gz' suffix is stripped
    query_prefix = re.sub(r'\.vcf\.gz$', '', query_vcf)

    print_vcf_header(query_vcf, vcfheader_file, logger, chromline_only=True)
    command1 = f"vcfanno -p={num_processes} {conf_fname} {query_vcf} > {out_vcf_vcfanno_unsorted1} 2> {query_prefix}.vcfanno.log"
    check_subprocess(logger, command1, debug)

    # final VCF = prepared header + annotated records (vcfanno's own header dropped)
    check_subprocess(logger, f'cat {vcfheader_file} > {output_vcf}', debug=False)
    check_subprocess(logger, f"cat {out_vcf_vcfanno_unsorted1} | grep -v '^#' >> {output_vcf}", debug=False)
    check_subprocess(logger, f'bgzip -f {output_vcf}', debug)
    check_subprocess(logger, f'tabix -f -p vcf {output_vcf}.gz', debug)

    if not keep_logs:
        for tmpf in glob.glob(f"{output_vcf}.tmp*"):
            utils.remove(tmpf)
def get_valid_custom_genelist(genelist_fname, genelist_bed_fname, pcgr_dir, genome_assembly, logger, debug):
    """
    Function that checks whether the custom genelist contains valid entries from the complete exploratory track

    Reads one Ensembl gene identifier per line from ``genelist_fname``, keeps
    those present in the CPSR superpanel TSV of the data bundle, and builds a
    sorted BED file (``genelist_bed_fname``) from the superpanel BED track
    containing secondary-findings regions (``ACMG_SF30``), GWAS hits
    (``rs<digits>``) and the regions of the valid custom genes.

    :param genelist_fname: text file with one Ensembl gene identifier (ENSG...) per line
    :param genelist_bed_fname: path of the sorted BED file to create
    :param pcgr_dir: base directory containing ``data/<genome_assembly>/virtual_panels``
    :param genome_assembly: assembly name used in the data bundle paths
    :param logger: logger for progress, warnings and errors
    :param debug: forwarded to ``check_subprocess``; when true the unsorted temporary BED is kept
    :return: 0 on success, or the result of ``error_message`` for a malformed identifier
    :raises SystemExit: with code 1 when no identifier is found in the superpanel
    """
    panel_dir = os.path.join(pcgr_dir, "data", genome_assembly, "virtual_panels")
    superpanel_track_bed = os.path.join(panel_dir, "0." + genome_assembly + ".bed.gz")
    superpanel_track_tsv = os.path.join(panel_dir, "cpsr_superpanel." + genome_assembly + ".tsv")
    genelist_bed_fname_unsorted = genelist_bed_fname + '.tmp_unsorted'

    # dict used as an insertion-ordered set of the identifiers supplied by the user
    customlist_identifiers = {}
    # NOTE(review): this used to go through csv.DictReader(delimiter='\n'); recent
    # CPython releases appear to reject newline delimiters (verify), and the file is
    # a plain one-column list anyway, so it is read line by line. Blank lines are
    # skipped, as the csv reader did.
    with open(genelist_fname, 'r') as genelist_fh:
        for line in genelist_fh:
            gene_id = line.rstrip('\n')
            if not gene_id:
                continue
            if not re.match(r'^ENSG[0-9]{1,}$', gene_id.rstrip()):
                err_msg = "Custom list of genes from CPSR superpanel (panel 0) should be provided as Ensembl gene identifiers, '" + gene_id + "' is not a valid identifier"
                return error_message(err_msg, logger)
            customlist_identifiers[gene_id.strip()] = 1

    # Ensembl gene id -> gene symbol for every gene in the superpanel
    with open(superpanel_track_tsv, 'r') as superpanel_fh:
        superpanel_identifiers_all = {
            row['ensembl_gene_id']: row['symbol']
            for row in csv.DictReader(superpanel_fh, delimiter='\t')
        }

    valid_custom_identifiers = []
    valid_custom_symbols = []
    for g in customlist_identifiers:
        if g in superpanel_identifiers_all:
            valid_custom_identifiers.append(g)
            valid_custom_symbols.append(superpanel_identifiers_all[g])
        else:
            logger.warning("Ignoring custom-provided gene identifier (" + str(g) + ") NOT found in CPSR superpanel (panel 0)")
            logger.warning("Choose only Ensembl gene identifiers from this set in data bundle: data/" + str(genome_assembly) + "/virtual_panels/cpsr_superpanel." + str(genome_assembly) + ".tsv")

    all_valid_custom_geneset = ', '.join(sorted(valid_custom_symbols))
    logger.info('Detected n = ' + str(len(valid_custom_identifiers)) + ' valid targets in custom-provided gene list file (--custom_list)):')
    logger.info(all_valid_custom_geneset)

    if not valid_custom_identifiers:
        logger.info('')
        logger.info("NO valid gene identifiers from panel 0 in custom-provided genelist - exiting")
        logger.info('')
        # same effect as exit(1) without depending on the interactive 'exit' helper
        raise SystemExit(1)

    # The egrep patterns below need a literal backslash before '|'; it is written
    # as '\\|' because '\|' is not a valid Python escape sequence.

    ## Add secondary findings genes to target BED
    cmd_secondary_regions_bed = f"bgzip -dc {superpanel_track_bed} | egrep '\\|ACMG_SF30\\|' > {genelist_bed_fname_unsorted}"
    check_subprocess(logger, cmd_secondary_regions_bed, debug)

    ## Add GWAS hits to target BED
    cmd_gwas_regions_bed = f"bgzip -dc {superpanel_track_bed} | egrep 'rs[0-9]{{3,}}\\|' >> {genelist_bed_fname_unsorted}"
    check_subprocess(logger, cmd_gwas_regions_bed, debug)

    ## Add custom set genes to target BED
    logger.info('Creating BED file with custom target genes: ' + str(genelist_bed_fname))
    for g in valid_custom_identifiers:
        cmd_target_regions_bed = f"bgzip -dc {superpanel_track_bed} | egrep '\\|{g}\\|' >> {genelist_bed_fname_unsorted}"
        check_subprocess(logger, cmd_target_regions_bed, debug)

    ## Sort regions in target BED (numeric chromosomes first, then X/Y/M)
    if os.path.exists(genelist_bed_fname_unsorted) and os.stat(genelist_bed_fname_unsorted).st_size != 0:
        cmd_sort_custom_bed1 = f"egrep '^[0-9]' {genelist_bed_fname_unsorted} | sort -k1,1n -k2,2n -k3,3n > {genelist_bed_fname}"
        cmd_sort_custom_bed2 = f"egrep -v '^[0-9]' {genelist_bed_fname_unsorted} | egrep '^[XYM]' | sort -k1,1 -k2,2n -k3,3n >> {genelist_bed_fname}"
        check_subprocess(logger, cmd_sort_custom_bed1, debug)
        check_subprocess(logger, cmd_sort_custom_bed2, debug)
        if not debug:
            utils.remove(str(genelist_bed_fname_unsorted))

    return 0
def simplify_vcf(input_vcf, vcf, custom_bed, pcgr_directory, genome_assembly, virtual_panel_id, sample_id, diagnostic_grade_only, output_dir, logger, debug):
    """
    Function that performs the following checks/filters on the validated input VCF:
    1. Header and variant lines are copied with any 'chr' prefix removed and variants sorted
       (numeric chromosomes, then X/Y/M, then the rest); all columns are kept as they are
    2. If VCF have variants with multiple alternative alleles ("multiallelic", e.g. 'A,T'),
       these are decomposed into variants with a single alternative allele
    3. Filters against predisposition loci (virtual panel id or custom target)
    4. Final VCF file is compressed and indexed (bgzip + tabix)

    :param input_vcf: path to the input VCF (plain or '.gz')
    :param vcf: iterable of VCF records for ``input_vcf`` (records expose CHROM, start, REF, ALT)
    :param custom_bed: custom target BED file, or the string 'None' to use virtual panels
    :param pcgr_directory: base directory containing ``data/<genome_assembly>/virtual_panels``
    :param genome_assembly: assembly name used in the data bundle paths
    :param virtual_panel_id: comma-separated virtual panel identifier(s)
    :param sample_id: sample identifier, used in temporary BED file names
    :param diagnostic_grade_only: 1 to use the '.GREEN' panel BED files
    :param output_dir: directory for all intermediate and final files
    :param logger: logger for progress messages
    :param debug: forwarded to ``check_subprocess``; when true intermediate files are kept
    :raises SystemExit: with code 1 when the final target VCF holds no variants
    """
    vcf_basename = os.path.basename(input_vcf)
    vcf_suffix_pattern = r'(\.vcf$|\.vcf\.gz$)'
    input_vcf_cpsr_ready = os.path.join(output_dir, re.sub(vcf_suffix_pattern, '.cpsr_ready.tmp.vcf', vcf_basename))
    input_vcf_cpsr_ready_decomposed = os.path.join(output_dir, re.sub(vcf_suffix_pattern, '.cpsr_ready.vcf', vcf_basename))
    input_vcf_cpsr_ready_decomposed_target = os.path.join(output_dir, re.sub(vcf_suffix_pattern, '.cpsr_ready_target.vcf', vcf_basename))
    virtual_panels_tmp_bed = os.path.join(output_dir, "virtual_panels_all." + str(sample_id) + ".tmp.bed")
    virtual_panels_bed = os.path.join(output_dir, "virtual_panels_all." + str(sample_id) + ".bed")
    decompose_log = os.path.join(output_dir, "decompose.log")

    # collect multiallelic sites; their presence decides whether 'vt decompose' is run
    multiallelic_list = []
    for rec in vcf:
        if len(rec.ALT) > 1:
            alt = ",".join(str(n) for n in rec.ALT)
            multiallelic_list.append(f"{rec.CHROM}:{rec.start + 1}_{rec.REF}->{alt}")

    # the 'cat' branch previously lacked the f-prefix, so uncompressed input ran the
    # literal command 'cat {input_vcf}'
    cat_vcf = f"bgzip -dc {input_vcf}" if input_vcf.endswith('.gz') else f"cat {input_vcf}"
    variant_rows = f"{cat_vcf} | egrep -v '^#' | sed 's/^chr//'"
    rebuild_commands = [
        f"{cat_vcf} | egrep '^##' > {input_vcf_cpsr_ready}",
        f"{cat_vcf} | egrep '^#CHROM' >> {input_vcf_cpsr_ready}",
        f"{variant_rows} | egrep '^[0-9]' | sort -k1,1n -k2,2n -k4,4 -k5,5 >> {input_vcf_cpsr_ready}",
        f"{variant_rows} | egrep -v '^[0-9]' | egrep '^[XYM]' | sort -k1,1 -k2,2n -k4,4 -k5,5 >> {input_vcf_cpsr_ready}",
        f"{variant_rows} | egrep -v '^[0-9]' | egrep -v '^[XYM]' | sort -k1,1 -k2,2n -k4,4 -k5,5 >> {input_vcf_cpsr_ready}",
    ]
    for cmd in rebuild_commands:
        check_subprocess(logger, cmd, debug)

    if multiallelic_list:
        logger.warning(f"There were {len(multiallelic_list)} multiallelic sites detected. Showing (up to) the first 100:")
        print('----')
        print(', '.join(multiallelic_list[:100]))
        print('----')
        logger.info('Decomposing multi-allelic sites in input VCF file using \'vt decompose\'')
        command_decompose = f'vt decompose -s {input_vcf_cpsr_ready} > {input_vcf_cpsr_ready_decomposed} 2> {decompose_log}'
        check_subprocess(logger, command_decompose, debug)
    else:
        check_subprocess(logger, f'cp {input_vcf_cpsr_ready} {input_vcf_cpsr_ready_decomposed}', debug)

    if custom_bed != 'None':
        logger.info('Limiting variant set to user-defined screening loci (custom list from panel 0)')
        if os.path.exists(custom_bed) and os.stat(custom_bed).st_size != 0:
            target_variants_intersect_cmd = f"bedtools intersect -wa -u -header -a {input_vcf_cpsr_ready_decomposed} -b {custom_bed} > {input_vcf_cpsr_ready_decomposed_target}"
            check_subprocess(logger, target_variants_intersect_cmd, debug)
        else:
            logger.info('Custom BED file has a filesize of zero or does not exist')
    else:
        logger.info('Limiting variant set to cancer predisposition loci - virtual panel id(s): ' + str(virtual_panel_id))

        ## Concatenate all panel BEDs to one big virtual panel BED, sort and make unique
        for pid in str(virtual_panel_id).split(','):
            target_bed_gz = os.path.join(pcgr_directory, 'data', genome_assembly, 'virtual_panels', str(pid) + "." + genome_assembly + ".bed.gz")
            # Panel 0 is excluded from the GREEN lookup. The former check compared the
            # whole (possibly string) argument against the integer 0, which is always
            # unequal for strings; comparing each panel id as a string covers both cases.
            if diagnostic_grade_only == 1 and str(pid) != '0':
                logger.info('Considering diagnostic-grade only genes in panel ' + str(pid) + ' - (GREEN status in Genomics England PanelApp)')
                target_bed_gz = os.path.join(pcgr_directory, 'data', genome_assembly, 'virtual_panels', str(pid) + "." + genome_assembly + ".GREEN.bed.gz")
            check_subprocess(logger, f'bgzip -dc {target_bed_gz} >> {virtual_panels_tmp_bed}', debug)

        ## sort the collection of virtual panels
        if os.path.exists(virtual_panels_tmp_bed) and os.stat(virtual_panels_tmp_bed).st_size != 0:
            cmd_sort_virtual_panel_bed1 = f"egrep '^[0-9]' {virtual_panels_tmp_bed} | sort -k1,1n -k2,2n -k3,3n | uniq > {virtual_panels_bed}"
            cmd_sort_virtual_panel_bed2 = f"egrep -v '^[0-9]' {virtual_panels_tmp_bed} | egrep '^[XYM]' | sort -k1,1 -k2,2n -k3,3n | uniq >> {virtual_panels_bed}"
            check_subprocess(logger, cmd_sort_virtual_panel_bed1, debug)
            check_subprocess(logger, cmd_sort_virtual_panel_bed2, debug)
            if not debug:
                utils.remove(str(virtual_panels_tmp_bed))

            if os.path.exists(virtual_panels_bed):
                target_variants_intersect_cmd = f'bedtools intersect -wa -u -header -a {input_vcf_cpsr_ready_decomposed} -b {virtual_panels_bed} > {input_vcf_cpsr_ready_decomposed_target}'
                check_subprocess(logger, target_variants_intersect_cmd, debug)

    target_vcf_gz = input_vcf_cpsr_ready_decomposed_target + '.gz'
    check_subprocess(logger, f'bgzip -cf {input_vcf_cpsr_ready_decomposed_target} > {target_vcf_gz}', debug)
    check_subprocess(logger, f'tabix -p vcf {target_vcf_gz}', debug)
    if not debug:
        for fn in [input_vcf_cpsr_ready, virtual_panels_bed, input_vcf_cpsr_ready_decomposed, decompose_log]:
            utils.remove(fn)

    if os.path.exists(target_vcf_gz) and os.path.getsize(target_vcf_gz) > 0:
        vcf = VCF(target_vcf_gz)
        n_variants = sum(1 for _ in vcf)
        if len(vcf.seqnames) == 0 or n_variants == 0:
            logger.info('')
            logger.info("Query VCF contains NO variants within the selected cancer predisposition geneset or ACMG-recommended genes for secondary findings - quitting workflow")
            logger.info('')
            # same effect as exit(1) without depending on the interactive 'exit' helper
            raise SystemExit(1)
def simplify_vcf(input_vcf, vcf, output_dir, keep_uncompressed, logger, debug):
    """
    Function that performs the following on the validated input VCF:
    1. Strip of any genotype data (##FORMAT header lines are dropped and only the
       first eight columns, CHROM-INFO, are kept)
    2. If VCF has variants with multiple alternative alleles ("multiallelic", e.g. 'A,T'),
       these are decomposed into variants with a single alternative allele
    3. Final VCF file is sorted and indexed (bgzip + tabix)

    :param input_vcf: path to the input VCF (plain or '.gz')
    :param vcf: iterable of VCF records for ``input_vcf`` (records expose CHROM, start, REF, ALT)
    :param output_dir: directory for the intermediate and final files
    :param keep_uncompressed: when true the uncompressed '.pcgr_ready.vcf' is kept next to the '.gz'
    :param logger: logger for progress messages
    :param debug: forwarded to ``check_subprocess``
    :raises SystemExit: with code 1 when the cleaned VCF holds no variants
    """
    vcf_basename = os.path.basename(input_vcf)
    vcf_suffix_pattern = r'(\.vcf$|\.vcf\.gz$)'
    input_vcf_pcgr_ready = os.path.join(output_dir, re.sub(vcf_suffix_pattern, '.pcgr_ready.tmp.vcf', vcf_basename))
    input_vcf_pcgr_ready_decomposed = os.path.join(output_dir, re.sub(vcf_suffix_pattern, '.pcgr_ready.vcf', vcf_basename))
    decompose_log = os.path.join(output_dir, "decompose.log")

    # collect multiallelic sites; their presence decides whether 'vt decompose' is run
    multiallelic_list = []
    for rec in vcf:
        if len(rec.ALT) > 1:
            alt = ",".join(str(n) for n in rec.ALT)
            multiallelic_list.append(f"{rec.CHROM}:{rec.start + 1}_{rec.REF}->{alt}")

    # the 'cat' branch previously lacked the f-prefix, so uncompressed input ran the
    # literal command 'cat {input_vcf}'
    cat_vcf = f"bgzip -dc {input_vcf}" if input_vcf.endswith('.gz') else f"cat {input_vcf}"
    # variant rows: remove chr prefix, keep CHROM-INFO only
    variant_rows = f"{cat_vcf} | egrep -v '^#' | sed 's/^chr//' | cut -f1-8"
    rebuild_commands = [
        # Remove FORMAT metadata lines
        f"{cat_vcf} | egrep '^##' | egrep -v '^##FORMAT=' > {input_vcf_pcgr_ready}",
        # Output first 8 column names (CHROM-INFO, so ignore FORMAT + sample columns)
        f"{cat_vcf} | egrep '^#CHROM' | cut -f1-8 >> {input_vcf_pcgr_ready}",
        # Sort separately for auto/XYM/rest chrom by cols 1+2 (CHROM+POS) then cols 4+5 (REF+ALT)
        f"{variant_rows} | egrep '^[0-9]' | sort -k1,1n -k2,2n -k4,4 -k5,5 >> {input_vcf_pcgr_ready}",
        f"{variant_rows} | egrep -v '^[0-9]' | egrep '^[XYM]' | sort -k1,1 -k2,2n -k4,4 -k5,5 >> {input_vcf_pcgr_ready}",
        f"{variant_rows} | egrep -v '^[0-9]' | egrep -v '^[XYM]' | sort -k1,1 -k2,2n -k4,4 -k5,5 >> {input_vcf_pcgr_ready}",
    ]
    for cmd in rebuild_commands:
        check_subprocess(logger, cmd, debug)

    if multiallelic_list:
        logger.warning(f"There were {len(multiallelic_list)} multiallelic sites detected. Showing (up to) the first 100:")
        print('----')
        print(', '.join(multiallelic_list[:100]))
        print('----')
        logger.info('Decomposing multi-allelic sites in input VCF file using \'vt decompose\'')
        command_decompose = f'vt decompose -s {input_vcf_pcgr_ready} > {input_vcf_pcgr_ready_decomposed} 2> {decompose_log}'
        check_subprocess(logger, command_decompose, debug)
    else:
        logger.info('All sites seem to be decomposed - skipping decomposition!')
        check_subprocess(logger, f'cp {input_vcf_pcgr_ready} {input_vcf_pcgr_ready_decomposed}', debug)

    # need to keep uncompressed copy for vcf2maf.pl if selected
    decomposed_vcf_gz = f'{input_vcf_pcgr_ready_decomposed}.gz'
    if keep_uncompressed:
        bgzip_cmd = f"bgzip -cf {input_vcf_pcgr_ready_decomposed} > {decomposed_vcf_gz}"
    else:
        bgzip_cmd = f"bgzip -f {input_vcf_pcgr_ready_decomposed}"
    check_subprocess(logger, bgzip_cmd, debug)
    check_subprocess(logger, f'tabix -p vcf {decomposed_vcf_gz}', debug)

    if os.path.exists(decomposed_vcf_gz) and os.path.getsize(decomposed_vcf_gz) > 0:
        vcf = VCF(decomposed_vcf_gz)
        n_variants = sum(1 for _ in vcf)
        if len(vcf.seqnames) == 0 or n_variants == 0:
            logger.info('')
            logger.info("Input VCF contains NO valid variants after VCF cleaning - quitting workflow")
            logger.info('')
            # same effect as exit(1) without depending on the interactive 'exit' helper
            raise SystemExit(1)

    utils.remove(input_vcf_pcgr_ready)
    utils.remove(decompose_log)