def __main__():
    """Command-line entry point for the cancer gene annotation step.

    Parses four positional arguments (VCF file, panel-of-normals switch,
    regulatory-annotation switch, PCGR data directory) plus the optional
    ``--cpsr`` and ``--debug`` flags, picks a logger name matching the
    workflow (PCGR by default, CPSR when ``--cpsr`` is given) and passes
    everything on to ``extend_vcf_annotations``.
    """
    cli = argparse.ArgumentParser(description='Cancer gene annotations from PCGR pipeline (SNVs/InDels)')

    # Positional arguments, registered in the order they are expected on the command line.
    positional_specs = (
        ('vcf_file', {'help': 'VCF file with VEP-annotated query variants (SNVs/InDels)'}),
        ('pon_annotation', {'default': 0, 'type': int, 'help': 'Include Panel of Normals annotation'}),
        ('regulatory_annotation', {'default': 0, 'type': int, 'help': 'Inclusion of VEP regulatory annotations (0/1)'}),
        ('pcgr_db_dir', {'help': 'PCGR data directory'}),
    )
    for arg_name, arg_kwargs in positional_specs:
        cli.add_argument(arg_name, **arg_kwargs)

    # Optional switches.
    cli.add_argument(
        '--cpsr',
        action="store_true",
        help="Aggregate cancer gene annotations for Cancer Predisposition Sequencing Reporter (CPSR)",
    )
    cli.add_argument(
        "--debug",
        action="store_true",
        default=False,
        help="Print full commands to log, default: %(default)s",
    )
    opts = cli.parse_args()

    # The PCGR logger is set up first and replaced when running in CPSR mode.
    annotate_logger = utils.getlogger('pcgr-gene-annotate')
    if opts.cpsr:
        annotate_logger = utils.getlogger('cpsr-gene-annotate')

    extend_vcf_annotations(
        opts.vcf_file,
        opts.pcgr_db_dir,
        annotate_logger,
        opts.pon_annotation,
        opts.regulatory_annotation,
        opts.cpsr,
        opts.debug,
    )
def __main__():
    """Command-line entry point for the vcfanno annotation step.

    Builds the argument parser (query VCF, output VCF, PCGR data directory,
    number of processes and one on/off switch per annotation source), reads
    the INFO tags of the query VCF, writes the VCF header to a temporary
    file next to the output VCF, and delegates the actual annotation to
    ``run_vcfanno``.
    """
    cli = argparse.ArgumentParser(
        description='Run brentp/vcfanno - annotate a VCF file against multiple VCF files in parallel',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument('query_vcf', help='Bgzipped input VCF file with query variants (SNVs/InDels)')
    cli.add_argument('out_vcf', help='Output VCF file with appended annotations from multiple VCF files')
    cli.add_argument('pcgr_db_dir', help='PCGR assembly-specific data directory')
    cli.add_argument('--num_processes', help="Number of processes vcfanno can use during annotation", default=4)

    # One boolean switch per annotation track; the order is kept so that the
    # generated --help output lists the options in the same sequence.
    annotation_switches = (
        ("--docm", "Annotate VCF with annotations from Database of Curated Mutations"),
        ("--clinvar", "Annotate VCF with annotations from ClinVar"),
        ("--ncer", "Annotate VCF with ranking of variant deleteriousness in non-coding regions (ncER)"),
        ('--dbmts', "Annotate VCF file with variants predicted to cause loss/gain of miRNA target sites in 3'UTR regions"),
        ('--gerp', "Annotate VCF file with GERP RS scores (cancer predisposition gene/SF/GWAS loci only)"),
        ("--dbnsfp", "Annotate VCF with annotations from database of non-synonymous functional predictions"),
        ("--tcga", "Annotate VCF with variant frequencies from the The Cancer Genome Atlas"),
        ("--tcga_pcdm", "Annotate VCF with putative cancer driver mutations from The Cancer Genome Atlas"),
        ("--chasmplus", "Annotate VCF with putative cancer driver mutations from CHASMplus algorithm"),
        ("--civic", "Annotate VCF with annotations from the Clinical Interpretation of Variants in Cancer database"),
        ("--cgi", "Annotate VCF with annotations from the Cancer bioMarkers database"),
        ("--icgc", "Annotate VCF with known variants found in the ICGC-PCAWG sequencing project"),
        ("--cancer_hotspots", "Annotate VCF with mutation hotspots from cancerhotspots.org"),
        ("--uniprot", "Annotate VCF with protein functional features from the UniProt Knowledgebase"),
        ("--pcgr_onco_xref", "Annotate VCF with transcript annotations from PCGR (targeted drugs, protein complexes, cancer gene associations, etc)"),
        ("--gwas", "Annotate VCF against known loci associated with cancer, as identified from genome-wide association studies (GWAS)"),
        ("--rmsk", "Annotate VCF against known sequence repeats, as identified by RepeatMasker (rmsk)"),
        ("--simplerepeats", "Annotate VCF against known sequence repeats, as identified by Tandem Repeats Finder (simplerepeats)"),
        ("--winmsk", "Annotate VCF against known sequence repeats, as identified by Windowmasker (winmsk)"),
        ("--gnomad_cpsr", "Annotate VCF with population-specific allelic counts and frequencies in cancer predisposition genes (gnomAD non-cancer subset)"),
    )
    for switch, description in annotation_switches:
        cli.add_argument(switch, action="store_true", help=description)

    cli.add_argument("--panel_normal_vcf", dest="panel_normal_vcf", help="Annotate VCF with germline calls from panel of normals")
    cli.add_argument("--keep_logs", action="store_true")
    cli.add_argument("--debug", action="store_true", default=False, help="Print full commands to log, default: %(default)s")
    opts = cli.parse_args()

    vcfanno_logger = utils.getlogger('pcgr-vcfanno')
    info_tags_in_query = get_vcf_info_tags(opts.query_vcf)

    # Temporary helper files live next to the output VCF; the header file
    # name carries a random number, the TOML config name does not.
    random_suffix = str(random.randrange(0, 10000000))
    header_path = opts.out_vcf + '.tmp.' + random_suffix + '.header.txt'
    toml_path = opts.out_vcf + '.tmp.conf.toml'

    print_vcf_header(opts.query_vcf, header_path, vcfanno_logger, chromline_only=False)
    run_vcfanno(
        opts.num_processes,
        opts.query_vcf,
        opts.panel_normal_vcf,
        info_tags_in_query,
        header_path,
        opts.pcgr_db_dir,
        toml_path,
        opts.out_vcf,
        opts.docm,
        opts.clinvar,
        opts.ncer,
        opts.dbmts,
        opts.gerp,
        opts.tcga,
        opts.tcga_pcdm,
        opts.chasmplus,
        opts.dbnsfp,
        opts.civic,
        opts.cgi,
        opts.icgc,
        opts.uniprot,
        opts.cancer_hotspots,
        opts.pcgr_onco_xref,
        opts.gwas,
        opts.rmsk,
        opts.simplerepeats,
        opts.winmsk,
        opts.gnomad_cpsr,
        opts.keep_logs,
        opts.debug,
        vcfanno_logger,
    )
def validate_cpsr_input(pcgr_directory, input_vcf, custom_list_fname, preserved_info_tags, vcf_validation, genome_assembly, sample_id, virtual_panel_id, diagnostic_grade_only, output_dir, debug):
    """
    Check the inputs to CPSR and prepare the query VCF for downstream steps.

    Steps performed, in order:
    1. If a custom gene list is given (``custom_list_fname`` != 'None'), build a
       BED track for it in ``output_dir`` via ``get_valid_custom_genelist``
    2. If no VCF is given (``input_vcf`` == 'None'), stop here
    3. Log that VCF validation is skipped (the message depends on ``vcf_validation``)
    4. Check the INFO tags of the query VCF with ``check_existing_vcf_info_tags``
    5. If ``preserved_info_tags`` != "None", check them with
       ``check_preserved_vcf_info_tags``
    6. Reject VCFs with more than one sample column
    7. Hand the VCF over to ``simplify_vcf``

    Returns 0 on success, -1 when one of the tag checks returns -1, and the
    return value of ``error_message`` when the VCF has multiple sample columns.
    """
    log = utils.getlogger('cpsr-validate-input-arguments')

    # Optional BED track for a user-supplied gene list.
    bed_track = 'None'
    if custom_list_fname != 'None':
        log.info('Establishing BED track with custom list of genes from panel 0')
        bed_basename = sample_id + '.cpsr.' + genome_assembly + '.custom_list.bed'
        bed_track = os.path.join(output_dir, bed_basename)
        get_valid_custom_genelist(custom_list_fname, bed_track, pcgr_directory, genome_assembly, log, debug)

    # Nothing more to do without a query VCF.
    if input_vcf == 'None':
        return 0

    # Validation itself is no longer run; only the reason differs in the log.
    skip_reason = (
        'Skipping validation of VCF file (deprecated as of Dec 2021)'
        if vcf_validation == 1
        else 'Skipping validation of VCF file as provided by option --no_vcf_validate'
    )
    log.info(skip_reason)

    if check_existing_vcf_info_tags(input_vcf, pcgr_directory, genome_assembly, log) == -1:
        return -1

    if preserved_info_tags != "None":
        if check_preserved_vcf_info_tags(input_vcf, preserved_info_tags, log) == -1:
            return -1

    query = VCF(input_vcf)
    sample_columns = query.samples
    if len(sample_columns) > 1:
        joined_samples = ', '.join(sample_columns)
        multi_sample_msg = "Query VCF contains more than one sample column (" + joined_samples + ") - CPSR expects a germline VCF with a single sample column - exiting"
        return error_message(multi_sample_msg, log)

    simplify_vcf(
        input_vcf,
        query,
        bed_track,
        pcgr_directory,
        genome_assembly,
        virtual_panel_id,
        sample_id,
        diagnostic_grade_only,
        output_dir,
        log,
        debug,
    )
    return 0
def run_pcgr(pcgr_paths, config_options):
    """
    Main function to run the PCGR workflow

    The workflow is a fixed sequence of external commands, each launched
    through ``check_subprocess``:

    - STEP 0: ``pcgr_validate_input.py`` (always run)
    - STEP 1: VEP + tabix, optionally followed by ``vcf2maf.pl``
    - STEP 2: ``pcgr_vcfanno.py``
    - STEP 3: ``pcgr_summarise.py``, renaming of result files, clean-up of
      temporary files, ``vcf2tsv.py`` and (WGS/WES only) down-filtering of
      very large variant sets
    - STEP 4: the ``pcgrr.R`` report script (skipped when
      ``config_options['other']['basic']`` is truthy)

    Steps 1-3 are only run when an input VCF was provided.

    Args:
        pcgr_paths: mapping with the keys read below: ``db_dir``,
            ``base_dir``, ``output_dir`` and, for each of ``input_vcf``,
            ``input_cna``, ``input_rna_fusion``, ``input_rna_expression``,
            ``input_cpsr_report`` and ``panel_normal_vcf``, a ``*_dir`` and
            ``*_basename`` entry (basename ``'NA'`` means "not provided").
        config_options: nested mapping of run options; the sections read
            here are ``debug``, ``sample_id``, ``genome_assembly``,
            ``assay``, ``tumor_purity``, ``tumor_ploidy``, ``tumor_type``,
            ``tumor_only``, ``allelic_support``, ``tmb``, ``msi``,
            ``msigs``, ``cna``, ``clinicaltrials`` and ``other``.

    Returns:
        None.

    NOTE(review): the indentation of this function was reconstructed from
    whitespace-collapsed source; block boundaries marked with NOTE(review)
    below should be confirmed against the upstream file.
    """

    debug = config_options['debug']

    # Translate option values into the 0/1 integers and ON/OFF labels used
    # on command lines and in log messages further down.
    report_nonfloating_toc = 1 if config_options['other']['nonfloating_toc'] else 0
    vep_regulatory_annotation = 'ON' if config_options['other']['vep_regulatory'] == 1 else 'OFF'
    clinical_trials_set = 'ON' if config_options['clinicaltrials']['run'] else 'OFF'
    msi_prediction_set = 'ON' if config_options['msi']['run'] else 'OFF'
    msig_estimation_set = 'ON' if config_options['msigs']['run'] else 'OFF'
    tmb_estimation_set = 'ON' if config_options['tmb']['run'] else 'OFF'
    vcf_validation = 0 if config_options['other']['no_vcf_validate'] else 1
    run_vcf2maf = config_options['other']['vcf2maf']
    assay_mode = 'Tumor vs. Control'
    tumor_only = 0
    cell_line = 0
    if config_options['tumor_only']['tumor_only']:
        assay_mode = 'Tumor-Only'
        tumor_only = 1
        if config_options['tumor_only']['cell_line']:
            cell_line = 1
            assay_mode = 'Tumor-Only (cell line)'
    # set basic run commands
    output_vcf = 'None'
    output_pass_vcf = 'None'
    output_pass_tsv = 'None'
    output_maf = 'None'
    # Defaults come from pcgr_vars; the grch37 branch below overrides three of them.
    GENCODE_VERSION = pcgr_vars.GENCODE_VERSION
    NCBI_BUILD_MAF = pcgr_vars.NCBI_BUILD_MAF
    VEP_ASSEMBLY = pcgr_vars.VEP_ASSEMBLY
    MAX_VARIANTS_FOR_REPORT = pcgr_vars.MAX_VARIANTS_FOR_REPORT
    if config_options['genome_assembly'] == 'grch37':
        NCBI_BUILD_MAF = 'GRCh37'
        GENCODE_VERSION = 'release 19'
        VEP_ASSEMBLY = 'GRCh37'
    # NOTE(review): this logger is replaced before any message is emitted through it.
    logger = getlogger('pcgr-get-OS')

    vep_dir = os.path.join(str(pcgr_paths['db_dir']), '.vep')
    # The string 'None' (not the None object) marks an input that was not
    # provided; it is passed verbatim on the command lines below.
    input_vcf = 'None'
    input_cna = 'None'
    input_rna_fusion = 'None'
    input_rna_expression = 'None'
    input_cpsr_report = 'None'
    panel_normal = 'None'
    # panel-of-normals annotation
    pon_annotation = 0

    # Specify paths for input files and directories
    if pcgr_paths['input_vcf_basename'] != 'NA':
        input_vcf = os.path.join(pcgr_paths['input_vcf_dir'], pcgr_paths['input_vcf_basename'])
    if pcgr_paths['input_cna_basename'] != 'NA':
        input_cna = os.path.join(pcgr_paths['input_cna_dir'], pcgr_paths['input_cna_basename'])
    if pcgr_paths['input_rna_fusion_basename'] != 'NA':
        input_rna_fusion = os.path.join(pcgr_paths['input_rna_fusion_dir'], pcgr_paths['input_rna_fusion_basename'])
    if pcgr_paths['input_rna_expression_basename'] != 'NA':
        input_rna_expression = os.path.join(pcgr_paths['input_rna_expression_dir'], pcgr_paths['input_rna_expression_basename'])
    if pcgr_paths['input_cpsr_report_basename'] != 'NA':
        input_cpsr_report = os.path.join(pcgr_paths['input_cpsr_report_dir'], pcgr_paths['input_cpsr_report_basename'])
    if pcgr_paths['panel_normal_vcf_basename'] != 'NA':
        panel_normal = os.path.join(pcgr_paths['panel_normal_vcf_dir'], pcgr_paths['panel_normal_vcf_basename'])

    data_dir = pcgr_paths['base_dir']
    output_dir = pcgr_paths['output_dir']

    # PCGR|validate_input - verify that VCF and CNA segment file is of appropriate format
    logger = getlogger("pcgr-validate-input-arguments")
    logger.info("PCGR - STEP 0: Validate input data and options")

    # Arguments are positional and space-separated, so their order matters.
    vcf_validate_command = (
        f'pcgr_validate_input.py '
        f'{data_dir} '
        f'{input_vcf} '
        f'{input_cna} '
        f'{input_rna_fusion} '
        f'{input_rna_expression} '
        f'{panel_normal} '
        f'{vcf_validation} '
        f'{tumor_only} '
        f'{config_options["genome_assembly"]} '
        f'{config_options["other"]["preserved_info_tags"]} '
        f'{config_options["allelic_support"]["tumor_dp_tag"]} {config_options["allelic_support"]["tumor_af_tag"]} '
        f'{config_options["allelic_support"]["control_dp_tag"]} {config_options["allelic_support"]["control_af_tag"]} '
        f'{config_options["allelic_support"]["call_conf_tag"]} '
        f'{config_options["tumor_only"]["exclude_likely_hom_germline"]} '
        f'{config_options["tumor_only"]["exclude_likely_het_germline"]} '
        f'--output_dir {output_dir} '
        f'{"--debug " if debug else ""}'
        f'{"--keep_uncompressed" if run_vcf2maf else ""} '
    )
    check_subprocess(logger, vcf_validate_command, debug)
    logger.info('Finished pcgr-validate-input-arguments')
    print('----')

    # PCGR|start - Log key information about sample, options and sequencing assay/design
    logger = getlogger('pcgr-start')
    logger.info('--- Personal Cancer Genome Reporter workflow ----')
    logger.info(f'Sample name: {config_options["sample_id"]}')
    if config_options['tumor_type']['type'] == 'Cancer_NOS':
        logger.info('Tumor type: Cancer_NOS (Any tumortype)')
    else:
        logger.info(f'Tumor type: {config_options["tumor_type"]["type"]}')
    logger.info(f'Sequencing assay - type: {config_options["assay"]}')
    logger.info(f'Sequencing assay - mode: {assay_mode}')
    logger.info(f'Sequencing assay - coding target size: {config_options["tmb"]["target_size_mb"]}Mb')
    logger.info(f'Genome assembly: {config_options["genome_assembly"]}')
    logger.info(f'Mutational signature estimation: {msig_estimation_set}')
    logger.info(f'MSI classification: {msi_prediction_set}')
    logger.info(f'Mutational burden estimation: {tmb_estimation_set}')
    logger.info(f'Include molecularly targeted clinical trials (beta): {clinical_trials_set}')

    # Steps 1-3 only apply when a query VCF was provided.
    if not input_vcf == 'None':
        # Define temporary output file names
        prefix = os.path.join(output_dir, f'{config_options["sample_id"]}.pcgr_acmg.{config_options["genome_assembly"]}')
        output_vcf = f'{prefix}.vcf.gz'
        output_pass_vcf = f'{prefix}.pass.vcf.gz'
        output_pass_tsv = f'{prefix}.pass.tsv'
        output_pass_raw_tsv_gz = f'{prefix}.pass.raw.tsv.gz'
        output_maf = f'{prefix}.tmp.maf'
        output_vcf2maf_log = f'{prefix}.maf.log'
        # Intermediate file names are derived from the input VCF basename by
        # swapping its .vcf / .vcf.gz suffix.
        # presumably the *.pcgr_ready.* files are produced by STEP 0 - confirm
        input_vcf_pcgr_ready = os.path.join(output_dir, re.sub(r"(\.vcf$|\.vcf\.gz$)", ".pcgr_ready.vcf.gz", pcgr_paths["input_vcf_basename"]))
        # needs to be uncompressed for vcf2maf
        input_vcf_pcgr_ready_uncompressed = os.path.join(output_dir, re.sub(r"(\.vcf$|\.vcf\.gz$)", ".pcgr_ready.vcf", pcgr_paths["input_vcf_basename"]))
        vep_vcf = re.sub(r"(\.vcf$|\.vcf\.gz$)", ".vep.vcf.gz", input_vcf_pcgr_ready)
        vep_vcfanno_vcf = re.sub(r"(\.vcf$|\.vcf\.gz$)", ".vep.vcfanno.vcf", input_vcf_pcgr_ready)
        vep_vcfanno_annotated_vcf = re.sub(r"\.vcfanno", ".vcfanno.annotated", vep_vcfanno_vcf) + ".gz"
        vep_vcfanno_annotated_pass_vcf = re.sub(r"\.vcfanno", ".vcfanno.annotated.pass", vep_vcfanno_vcf) + ".gz"
        fasta_assembly = os.path.join(vep_dir, 'homo_sapiens', f'{pcgr_vars.VEP_VERSION}_{VEP_ASSEMBLY}', f'Homo_sapiens.{VEP_ASSEMBLY}.dna.primary_assembly.fa.gz')
        # List all VEP flags used when calling VEP
        vep_flags = (
            f'--hgvs --af --af_1kg --af_gnomad --variant_class --domains --symbol --protein --ccds --mane '
            f'--uniprot --appris --biotype --tsl --canonical --format vcf --cache --numbers --total_length --allele_number '
            f'--no_stats --no_escape --xref_refseq --vcf --check_ref --dont_skip --flag_pick_allele --plugin NearestExonJB,max_range=50000 '
            f'--force_overwrite --species homo_sapiens --offline --compress_output bgzip'
        )
        # Every fragment ends with a space so that the optional flags
        # appended below can be concatenated directly.
        vep_options = (
            f'--dir {vep_dir} --assembly {VEP_ASSEMBLY} --cache_version {pcgr_vars.VEP_VERSION} '
            f'--fasta {fasta_assembly} --pick_order {config_options["other"]["vep_pick_order"]} '
            f'--buffer_size {config_options["other"]["vep_buffer_size"]} '
            f'--fork {config_options["other"]["vep_n_forks"]} '
            f'{vep_flags} '
            f'{"--verbose" if debug else "--quiet"} '
        )
        gencode_set_in_use = "GENCODE - all transcripts"
        if config_options['other']['vep_no_intergenic'] == 1:
            vep_options += '--no_intergenic '
        if config_options['other']['vep_regulatory'] == 1:
            vep_options += '--regulatory '
        if config_options['other']['vep_gencode_all'] == 0:
            vep_options += '--gencode_basic '
            gencode_set_in_use = "GENCODE - basic transcript set (--gencode_basic)"

        # Compose full VEP command
        vep_main_command = f'{utils.get_perl_exports()} && vep --input_file {input_vcf_pcgr_ready} --output_file {vep_vcf} {vep_options}'
        vep_tabix_command = f'tabix -f -p vcf {vep_vcf}'

        # PCGR|VEP - run consequence annotation with Variant Effect Predictor
        print('----')
        logger = getlogger('pcgr-vep')
        logger.info(f'PCGR - STEP 1: Basic variant annotation with Variant Effect Predictor ({pcgr_vars.VEP_VERSION}, GENCODE {GENCODE_VERSION}, {config_options["genome_assembly"]})')
        logger.info(f'VEP configuration - one primary consequence block pr. alternative allele (--flag_pick_allele)')
        logger.info(f'VEP configuration - transcript pick order: {config_options["other"]["vep_pick_order"]}')
        logger.info(f'VEP configuration - transcript pick order: See more at https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick_options')
        logger.info(f'VEP configuration - GENCODE set: {gencode_set_in_use}')
        logger.info(f'VEP configuration - skip intergenic: {"TRUE" if config_options["other"]["vep_no_intergenic"] else "FALSE"}')
        logger.info(f'VEP configuration - regulatory annotation: {vep_regulatory_annotation}')
        logger.info(f'VEP configuration - buffer_size/number of forks: {config_options["other"]["vep_buffer_size"]}/{config_options["other"]["vep_n_forks"]}')

        check_subprocess(logger, vep_main_command, debug)
        check_subprocess(logger, vep_tabix_command, debug)
        logger.info('Finished pcgr-vep')
        print('----')

        # PCGR|vcf2maf - if option set, convert VCF to MAF with https://github.com/mskcc/vcf2maf
        if run_vcf2maf:
            logger.info('Converting VEP-annotated VCF to MAF with https://github.com/mskcc/vcf2maf')
            # The command's own stdout/stderr are redirected into the log file,
            # which is removed again right after a successful run.
            vcf2maf_command = (
                f'vcf2maf.pl --inhibit-vep --input-vcf {input_vcf_pcgr_ready_uncompressed} '
                f'--tumor-id {config_options["sample_id"]} --output-maf {output_maf} --ref-fasta {fasta_assembly} '
                f'--ncbi-build {NCBI_BUILD_MAF} > {output_vcf2maf_log} 2>&1'
            )
            check_subprocess(logger, vcf2maf_command, debug)
            utils.remove(input_vcf_pcgr_ready_uncompressed)
            utils.remove(output_vcf2maf_log)
            logger.info('Finished pcgr-vep-vcf2maf')
            # NOTE(review): separator assumed to belong to the vcf2maf branch - confirm
            print('----')

        # PCGR|vcfanno - annotate VCF against a number of variant annotation resources
        logger = getlogger("pcgr-vcfanno")
        pcgr_vcfanno_command = (
            f'pcgr_vcfanno.py {vep_vcf} {vep_vcfanno_vcf} {pcgr_paths["db_dir"]} '
            f'--num_processes {config_options["other"]["vcfanno_n_proc"]} '
            f'--chasmplus --dbnsfp --docm --clinvar --icgc --civic --cgi --tcga_pcdm --winmsk --simplerepeats '
            f'--tcga --uniprot --cancer_hotspots --pcgr_onco_xref '
            f'{"--debug " if debug else ""}'
        )
        anno_src_msg = (
            f"Annotation sources: {'Panel-of-Normals, ' if panel_normal != 'None' else ''}ClinVar, dbNSFP, "
            f"UniProtKB, cancerhotspots.org, CiVIC, CGI, DoCM, CHASMplus driver mutations, TCGA, ICGC-PCAWG"
        )
        logger.info("PCGR - STEP 2: Annotation for precision oncology with pcgr-vcfanno")
        logger.info(anno_src_msg)
        # With a panel of normals, the flag passed to pcgr_summarise.py below
        # is switched on and the panel VCF is added to the vcfanno call.
        if panel_normal != "None":
            pon_annotation = 1
            pcgr_vcfanno_command += f'--panel_normal_vcf {panel_normal}'
        check_subprocess(logger, pcgr_vcfanno_command, debug)
        logger.info("Finished pcgr-vcfanno")
        print('----')

        # PCGR|pcgr_summarise - expand annotations in VCF file
        logger = getlogger("pcgr-summarise")
        pcgr_summarise_command = (
            f'pcgr_summarise.py {vep_vcfanno_vcf}.gz {pon_annotation} '
            f'{config_options["other"]["vep_regulatory"]} '
            f'{pcgr_paths["db_dir"]} '
            f'{"--debug" if debug else ""}'
        )
        logger.info("PCGR - STEP 3: Cancer gene annotations with pcgr-summarise")
        check_subprocess(logger, pcgr_summarise_command, debug)

        # PCGR|clean - move output files and clean up temporary files
        os.rename(vep_vcfanno_annotated_vcf, output_vcf)
        os.rename(f'{vep_vcfanno_annotated_vcf}.tbi', f'{output_vcf}.tbi')
        os.rename(vep_vcfanno_annotated_pass_vcf, output_pass_vcf)
        os.rename(f'{vep_vcfanno_annotated_pass_vcf}.tbi', f'{output_pass_vcf}.tbi')
        # The globs are evaluated after the renames above, so only files that
        # still exist under the temporary names are collected.
        delete_files = (
            glob(f'{vep_vcf}*') +
            glob(f'{vep_vcfanno_annotated_vcf}') +
            glob(f'{vep_vcfanno_annotated_pass_vcf}*') +
            glob(f'{vep_vcfanno_vcf}*') +
            glob(f'{input_vcf_pcgr_ready_uncompressed}*')
        )
        # do not delete if debugging
        if not debug:
            for fn in delete_files:
                #print(f"Deleting {fn}")
                utils.remove(fn)

        logger.info('Finished pcgr-summarise main command')

        # PCGR|vcf2tsv - convert VCF to TSV with https://github.com/sigven/vcf2tsv
        pcgr_vcf2tsv_command = f'vcf2tsv.py {output_pass_vcf} --compress {output_pass_tsv}'
        logger.info("Converting VCF to TSV with https://github.com/sigven/vcf2tsv")
        check_subprocess(logger, pcgr_vcf2tsv_command, debug)
        logger.info('Finished pcgr-summarise-vcf2tsv')

        # For WGS/WES assays, cap the number of variants handed to the report
        # step at MAX_VARIANTS_FOR_REPORT by dropping variants by consequence type.
        if config_options['assay'] == 'WGS' or config_options['assay'] == 'WES':
            output_pass_tsv_gz = f'{output_pass_tsv}.gz'
            # check that output file exist
            if os.path.exists(output_pass_tsv_gz):
                # get number of rows/variants annotated, using pandas
                var_data = pandas.read_csv(output_pass_tsv_gz, sep = '\t', low_memory = False, header = [1])
                num_variants_raw = len(var_data)
                if num_variants_raw > MAX_VARIANTS_FOR_REPORT:
                    logger.info(f'Number of raw variants in input VCF ({num_variants_raw}) exceeds {MAX_VARIANTS_FOR_REPORT} - intergenic/intronic variants will be excluded prior to reporting')

                    # Exclude intronic and intergenic variants prior to analysis with pcgrr (reporting and further analysis)
                    var_data_filtered = var_data[~var_data.Consequence.str.contains('^intron') & ~var_data.Consequence.str.contains('^intergenic')]
                    num_variants_excluded1 = num_variants_raw - len(var_data_filtered)
                    logger.info(f'Number of intergenic/intronic variants excluded: {num_variants_excluded1}')

                    # Exclude upstream_gene/downstream_gene variants if size of filtered variant set is still above MAX_VARIANTS_FOR_REPORT
                    # TODO: in this case, the TMB calculation will be an underestimate (but still likely huge)
                    var_data_filtered_final = var_data_filtered
                    if len(var_data_filtered) > MAX_VARIANTS_FOR_REPORT:
                        var_data_filtered_final = var_data_filtered[~var_data_filtered.Consequence.str.contains('^upstream_gene') & ~var_data_filtered.Consequence.str.contains('^downstream_gene')]
                        num_variants_excluded2 = len(var_data_filtered) - len(var_data_filtered_final)
                        logger.info(f'Number of upstream_gene/downstream_gene variants excluded: {num_variants_excluded2}')

                    # get vcf2tsv header and pipe to output TSV file
                    get_vcf2tsv_header = f'gzip -dc {output_pass_tsv_gz} | egrep \'^#\' > {output_pass_tsv}'
                    check_subprocess(logger, get_vcf2tsv_header, debug)

                    # rename original vcf2tsv (gzipped) to 'raw' filename
                    rename_output_tsv = f'mv {output_pass_tsv_gz} {output_pass_raw_tsv_gz}'
                    check_subprocess(logger, rename_output_tsv, debug)

                    # append filtered data output to output TSV file
                    var_data_filtered_final.to_csv(output_pass_tsv, sep='\t', encoding='utf-8', mode = 'a', index = False)

                    # gzip filtered output TSV file
                    gzip_filtered_output_tsv = f'gzip -f {output_pass_tsv}'
                    check_subprocess(logger, gzip_filtered_output_tsv, debug)

        logger.info('Finished pcgr-summarise')
        print('----')

    # Generation of HTML reports for VEP/vcfanno-annotated VCF and copy number segment file
    if not config_options['other']['basic']:
        co = config_options
        # Spaces and slashes in the tumor type are replaced so that it forms
        # a single token on the command line.
        ttype = co['tumor_type']['type'].replace(' ', '_').replace('/', '@')
        logger = getlogger('pcgr-writer')
        logger.info('PCGR - STEP 4: Generation of output files - variant interpretation report for precision oncology')

        # export PATH to R conda env Rscript
        rscript = utils.script_path('pcgrr', 'bin/Rscript')
        pcgrr_script = utils.script_path('pcgr', 'bin/pcgrr.R')
        # All report settings are passed as positional, space-separated
        # arguments, so their order matters.
        pcgr_report_command = (
            f"{rscript} {pcgrr_script} "
            f"{output_dir} "
            f"{output_pass_tsv}.gz "
            f"{input_cna} "
            f"{input_rna_fusion} "
            f"{input_rna_expression} "
            f"{input_cpsr_report} "
            f"{config_options['sample_id']} "
            f"{pcgr_vars.PCGR_VERSION} "
            f"{pcgr_vars.DB_VERSION} "
            f"{config_options['genome_assembly']} "
            f"{data_dir} "
            f"{co['tumor_purity']} "
            f"{co['tumor_ploidy']} "
            f"{ttype} "
            f"{co['tmb']['target_size_mb']} "
            f"{co['assay']} "
            f"{tumor_only} "
            f"{cell_line} "
            f"{co['tumor_only']['maf_onekg_afr']} "
            f"{co['tumor_only']['maf_onekg_amr']} "
            f"{co['tumor_only']['maf_onekg_eas']} "
            f"{co['tumor_only']['maf_onekg_eur']} "
            f"{co['tumor_only']['maf_onekg_sas']} "
            f"{co['tumor_only']['maf_onekg_global']} "
            f"{co['tumor_only']['maf_gnomad_afr']} "
            f"{co['tumor_only']['maf_gnomad_amr']} "
            f"{co['tumor_only']['maf_gnomad_asj']} "
            f"{co['tumor_only']['maf_gnomad_eas']} "
            f"{co['tumor_only']['maf_gnomad_fin']} "
            f"{co['tumor_only']['maf_gnomad_nfe']} "
            f"{co['tumor_only']['maf_gnomad_oth']} "
            f"{co['tumor_only']['maf_gnomad_sas']} "
            f"{co['tumor_only']['maf_gnomad_global']} "
            f"{co['tumor_only']['exclude_pon']} "
            f"{co['tumor_only']['exclude_likely_hom_germline']} "
            f"{co['tumor_only']['exclude_likely_het_germline']} "
            f"{co['tumor_only']['exclude_dbsnp_nonsomatic']} "
            f"{co['tumor_only']['exclude_nonexonic']} "
            f"{co['tmb']['run']} "
            f"{co['tmb']['algorithm']} "
            f"{co['msi']['run']} "
            f"{co['msigs']['run']} "
            f"{co['msigs']['mutation_limit']} "
            f"{co['msigs']['all_reference_signatures']} "
            f"{co['msigs']['include_artefact_signatures']} "
            f"{co['msigs']['prevalence_reference_signatures']} "
            f"{co['cna']['logR_homdel']} "
            f"{co['cna']['logR_gain']} "
            f"{co['cna']['cna_overlap_pct']} "
            f"{co['allelic_support']['tumor_af_min']} "
            f"{co['allelic_support']['tumor_dp_min']} "
            f"{co['allelic_support']['control_dp_min']} "
            f"{co['allelic_support']['control_af_max']} "
            f"{co['allelic_support']['tumor_af_tag']} "
            f"{co['allelic_support']['tumor_dp_tag']} "
            f"{co['allelic_support']['control_af_tag']} "
            f"{co['allelic_support']['control_dp_tag']} "
            f"{co['allelic_support']['call_conf_tag']} "
            f"{co['clinicaltrials']['run']} "
            f"{co['other']['vep_n_forks']} "
            f"{co['other']['vep_buffer_size']} "
            f"{co['other']['vep_no_intergenic']} "
            f"{co['other']['vep_pick_order']} "
            f"{co['other']['vep_regulatory']} "
            f"{co['other']['vep_gencode_all']} "
            f"{co['other']['vcf2maf']} "
            f"{co['other']['list_noncoding']} "
            f"{co['other']['preserved_info_tags']} "
            f"{co['other']['visual_theme']} "
            f"{report_nonfloating_toc} "
            f"{co['other']['no_vcf_validate']}"
        )

        if debug:
            print(pcgr_report_command)
        check_subprocess(logger, pcgr_report_command, debug)
        logger.info("Finished PCGR!")
        # NOTE(review): separator assumed to belong to the report branch - confirm
        print('----')

    print()
def run_cpsr(arg_dict, cpsr_paths): """ Main function to run the CPSR workflow """ debug = arg_dict['debug'] diagnostic_grade_only = 0 vcf_validation = 1 virtual_panel_id = "-1" ignore_noncoding = 0 gwas_findings = 0 secondary_findings = 0 classify_all = 0 clinvar_ignore_noncancer = 0 report_nonfloating_toc = 0 vep_no_intergenic = 0 vep_regulatory = 0 preserved_info_tags = arg_dict['preserved_info_tags'] diagnostic_grade_set = "OFF" secondary_findings_set = "OFF" gwas_findings_set = "OFF" if arg_dict['vep_regulatory']: vep_regulatory = 1 if arg_dict["vep_no_intergenic"]: vep_no_intergenic = 1 if arg_dict['clinvar_ignore_noncancer']: clinvar_ignore_noncancer = 1 if arg_dict['classify_all']: classify_all = 1 if arg_dict['gwas_findings']: gwas_findings = 1 gwas_findings_set = "ON" if arg_dict['secondary_findings']: secondary_findings = 1 secondary_findings_set = "ON" if arg_dict['diagnostic_grade_only']: diagnostic_grade_only = 1 diagnostic_grade_set = "ON" if arg_dict['report_nonfloating_toc']: report_nonfloating_toc = 1 if arg_dict['no_vcf_validate']: vcf_validation = 0 if arg_dict['virtual_panel_id'] != "-1": virtual_panel_id = arg_dict['virtual_panel_id'] if arg_dict['custom_list']: virtual_panel_id = "-1" if arg_dict['ignore_noncoding']: ignore_noncoding = 1 output_vcf = 'None' output_pass_vcf = 'None' output_pass_tsv = 'None' uid = '' GENCODE_VERSION = pcgr_vars.GENCODE_VERSION VEP_ASSEMBLY = pcgr_vars.VEP_ASSEMBLY VEP_VERSION = pcgr_vars.VEP_VERSION if arg_dict['genome_assembly'] == 'grch37': GENCODE_VERSION = '19' VEP_ASSEMBLY = 'GRCh37' vepdb_dir = os.path.join(str(cpsr_paths['db_dir']),'.vep') input_vcf = 'None' input_customlist = 'None' if cpsr_paths['input_vcf_basename'] != 'NA': input_vcf = os.path.join(cpsr_paths['input_vcf_dir'], cpsr_paths['input_vcf_basename']) if cpsr_paths['input_customlist_basename'] != 'NA': input_customlist = os.path.join(cpsr_paths['input_customlist_dir'], cpsr_paths['input_customlist_basename']) data_dir = 
cpsr_paths['base_dir'] output_dir = cpsr_paths['output_dir'] vep_dir = vepdb_dir logger = getlogger('cpsr-validate-input-arguments') logger.info("CPSR - STEP 0: Validate input data") check_subprocess(logger, f'mkdir -p {output_dir}', debug) ## CPSR|Validate input VCF - check formatting, non-overlap with CPSR INFO tags, and whether sample contains any variants in cancer predisposition loci vcf_validate_command = ( f'cpsr_validate_input.py ' f'{data_dir} ' f'{input_vcf} ' f'{input_customlist} ' f'{preserved_info_tags} ' f'{vcf_validation} ' f'{arg_dict["genome_assembly"]} ' f'{arg_dict["sample_id"]} ' f'{virtual_panel_id} ' f'{diagnostic_grade_only} ' f'--output_dir {output_dir} {"--debug" if debug else ""}' ) check_subprocess(logger, vcf_validate_command, debug) logger.info('Finished cpsr-validate-input-arguments') print('----') ## CPSR|Start - log key information about run logger = getlogger("cpsr-start") logger.info("--- Cancer Predisposition Sequencing Reporter workflow ----") logger.info(f"Sample name: {arg_dict['sample_id']}") if not input_customlist == 'None': logger.info(f"Virtual gene panel: custom-made list from panel 0: {input_customlist}") else: #logger.info("Virtual gene panel(s): " + str(pcgr_vars.GE_panels[virtual_panel_id])) logger.info(f"Diagnostic-grade genes in virtual panels (GE PanelApp): {diagnostic_grade_set}") logger.info(f"Include incidental findings (ACMG recommended list v3.0): {secondary_findings_set}") logger.info(f"Include low to moderate cancer risk variants from genome-wide association studies: {gwas_findings_set}") logger.info(f"Reference population, germline variant frequencies (gnomAD): {str(arg_dict['pop_gnomad']).upper()}") logger.info(f"Genome assembly: {arg_dict['genome_assembly']}") if not input_vcf == 'None': ## Define input, output and temporary file names pcgr_model = 'cpsr' output_vcf = os.path.join(output_dir, str(arg_dict['sample_id']) + '.cpsr.' 
+ str(arg_dict['genome_assembly']) + '.vcf.gz') output_pass_vcf = os.path.join(output_dir, str(arg_dict['sample_id']) + '.cpsr.' + str(arg_dict['genome_assembly']) + '.pass.vcf.gz') output_pass_tsv = os.path.join(output_dir, str(arg_dict['sample_id']) + '.cpsr.' + str(arg_dict['genome_assembly']) + '.pass.tsv') input_vcf_cpsr_ready = os.path.join(output_dir, re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_ready_target.vcf.gz', cpsr_paths['input_vcf_basename'])) input_vcf_cpsr_ready_uncompressed = os.path.join(output_dir, re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_ready_target.vcf', cpsr_paths['input_vcf_basename'])) vep_vcf = re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_vep.vcf',input_vcf_cpsr_ready) vep_vcfanno_vcf = re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_vep.vcfanno.vcf',input_vcf_cpsr_ready) vep_vcfanno_annotated_vcf = re.sub(r'\.vcfanno','.vcfanno.annotated',vep_vcfanno_vcf) + '.gz' vep_vcfanno_annotated_pass_vcf = re.sub(r'\.vcfanno','.vcfanno.annotated.pass',vep_vcfanno_vcf) + '.gz' custom_bed = os.path.join(output_dir, str(arg_dict['sample_id']) + '.' + str(pcgr_model) + '.' 
+ str(arg_dict['genome_assembly']) + '.custom_list.bed') ## File names for assembly-specific genome fasta files (VEP) fasta_assembly = os.path.join(vep_dir, f"homo_sapiens/{VEP_VERSION}_{VEP_ASSEMBLY}/Homo_sapiens.{VEP_ASSEMBLY}.dna.primary_assembly.fa.gz") ancestor_assembly = os.path.join(vep_dir, f"homo_sapiens/{VEP_VERSION}_{VEP_ASSEMBLY}/human_ancestor.fa.gz") ## Set all flags used in VEP run plugins_in_use = "NearestExonJB, LoF" vep_flags = ( f"--format vcf --vcf --check_ref --flag_pick_allele_gene --hgvs --dont_skip --failed 1 --af --af_1kg --af_gnomad " f"--variant_class --domains --symbol --protein --ccds --uniprot --appris --biotype --canonical --cache " f"--numbers --total_length --no_stats --allele_number --no_escape --xref_refseq --plugin NearestExonJB,max_range=50000" ) vep_options = ( f"--pick_order {arg_dict['vep_pick_order']} --force_overwrite --buffer_size {arg_dict['vep_buffer_size']} " f"--species homo_sapiens --assembly {VEP_ASSEMBLY} --offline --fork {arg_dict['vep_n_forks']} {vep_flags} --dir {vep_dir} " f"--cache_version {VEP_VERSION}" ) gencode_set_in_use = "GENCODE - all transcripts" if arg_dict['vep_gencode_all'] == 0: vep_options += ' --gencode_basic' gencode_set_in_use = "GENCODE - basic transcript set (--gencode_basic)" if arg_dict['vep_no_intergenic'] == 1: vep_options = vep_options + " --no_intergenic" if arg_dict['vep_regulatory'] == 1: vep_options = vep_options + " --regulatory" if arg_dict['genome_assembly'] == "grch38": vep_options = vep_options + " --mane" loftee_dir = utils.get_loftee_dir() assert os.path.isdir(loftee_dir), f'LoF VEP plugin is not found in {loftee_dir}. Please make sure you installed pcgr conda package and have corresponding conda environment active.' 
vep_options += f" --plugin LoF,loftee_path:{loftee_dir},human_ancestor_fa:{ancestor_assembly},use_gerp_end_trunc:0 --dir_plugins {loftee_dir}" if not debug: vep_options += " --quiet" ## Compose full VEP command vep_main_command = f'{utils.get_perl_exports()} && vep --input_file {input_vcf_cpsr_ready} --output_file {vep_vcf} {vep_options} --fasta {fasta_assembly}' vep_bgzip_command = f'bgzip -f {vep_vcf}' vep_tabix_command = f'tabix -f -p vcf {vep_vcf}.gz' logger = getlogger('cpsr-vep') ## CPSR|VEP - run Variant Effect Predictor on query VCF with LoF and NearestExonJB plugins logger.info(f"CPSR - STEP 1: Basic variant annotation with Variant Effect Predictor ({VEP_VERSION}, GENCODE {GENCODE_VERSION}, {arg_dict['genome_assembly']})") logger.info(f"VEP configuration - one primary consequence block pr. alternative allele (--flag_pick_allele)") logger.info(f"VEP configuration - transcript pick order: {arg_dict['vep_pick_order']}") logger.info(f"VEP configuration - transcript pick order: See more at https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick_options") logger.info(f"VEP configuration - GENCODE set: {gencode_set_in_use}") logger.info(f"VEP configuration - skip intergenic: {arg_dict['vep_no_intergenic']}") logger.info(f"VEP configuration - look for overlap with regulatory regions: {vep_regulatory}") logger.info(f"VEP configuration - plugins in use: {plugins_in_use}") logger.info(f"VEP configuration - buffer_size/number of forks: {arg_dict['vep_buffer_size']}/{arg_dict['vep_n_forks']}") check_subprocess(logger, vep_main_command, debug) check_subprocess(logger, vep_bgzip_command, debug) check_subprocess(logger, vep_tabix_command, debug) logger.info("Finished cpsr-vep") print('----') ## CPSR|vcfanno - run vcfanno on query VCF with a number of relevant annotated VCFs logger = getlogger('cpsr-vcfanno') logger.info("CPSR - STEP 2: Annotation for cancer predisposition with cpsr-vcfanno") logger.info("(ClinVar, CIViC, dbNSFP, dbMTS, UniProtKB, 
cancerhotspots.org, ncER, GERP RS scores, GWAS catalog, gnomAD non-cancer subset)") pcgr_vcfanno_command = ( f"pcgr_vcfanno.py --num_processes {arg_dict['vcfanno_n_proc']} --dbnsfp --clinvar " f"--cancer_hotspots --dbmts --ncer --gerp --civic --uniprot --gnomad_cpsr --pcgr_onco_xref " f"--gwas --rmsk {vep_vcf}.gz {vep_vcfanno_vcf} {os.path.join(data_dir, 'data', str(arg_dict['genome_assembly']))}" ) check_subprocess(logger, pcgr_vcfanno_command, debug) logger.info("Finished cpsr-vcfanno") print('----') ## CPSR|summarise - expand annotations with separate VCF INFO tags logger = getlogger("cpsr-summarise") pcgr_summarise_command = ( f'pcgr_summarise.py {vep_vcfanno_vcf}.gz 0 {vep_regulatory} ' f'{os.path.join(data_dir, "data", arg_dict["genome_assembly"])} ' f'--cpsr {"--debug" if debug else ""}' ) logger.info("CPSR - STEP 3: Cancer gene annotations with cpsr-summarise") check_subprocess(logger, pcgr_summarise_command, debug) ## CPSR|clean - rename output files, remove temporary files os.rename(vep_vcfanno_annotated_vcf, output_vcf) os.rename(f'{vep_vcfanno_annotated_vcf}.tbi', f'{output_vcf}.tbi') os.rename(vep_vcfanno_annotated_pass_vcf, output_pass_vcf) os.rename(f'{vep_vcfanno_annotated_pass_vcf}.tbi', f'{output_pass_vcf}.tbi') delete_files = ( glob(f'{vep_vcf}*') + glob(f'{vep_vcfanno_annotated_vcf}') + glob(f'{vep_vcfanno_annotated_pass_vcf}*') + glob(f'{vep_vcfanno_vcf}*') + glob(f'{input_vcf_cpsr_ready_uncompressed}*') ) # do not delete if debugging if not debug: for fn in delete_files: #print(f"Deleting {fn}") utils.remove(fn) logger.info('Finished cpsr-summarise main command') ## CPSR|vcf2tsv - perform vcf2tsv conversion on the final annotated VCF file cpsr_vcf2tsv_command = f"vcf2tsv.py {output_pass_vcf} --compress {output_pass_tsv}" logger.info("Converting VCF to TSV with https://github.com/sigven/vcf2tsv") check_subprocess(logger, cpsr_vcf2tsv_command, debug) logger.info('Finished cpsr-summarise-vcf2tsv') logger.info('Finished cpsr-summarise') 
print('----') ## Generation of HTML reports for VEP/vcfanno-annotated VCF file if not arg_dict['basic']: logger = getlogger('cpsr-writer') logger.info("CPSR - STEP 4: Generation of output files - Cancer predisposition sequencing report") # export PATH to R conda env Rscript rscript = utils.script_path("pcgrr", "bin/Rscript") cpsrr_script = utils.script_path('pcgr', 'bin/cpsr.R') cpsr_report_command = ( f"{rscript} {cpsrr_script} " f"{output_dir} " f"{output_pass_tsv}.gz " f"{arg_dict['sample_id']} " f"{pcgr_vars.PCGR_VERSION} " f"{pcgr_vars.DB_VERSION} " f"{arg_dict['genome_assembly']} " f"{data_dir} " f"{virtual_panel_id} " f"{preserved_info_tags} " f"{custom_bed} " f"{arg_dict['custom_list_name']} " f"{arg_dict['report_theme']} " f"{arg_dict['report_table_display']} " f"{report_nonfloating_toc} " f"{gwas_findings} " f"{arg_dict['gwas_p_value']} " f"{arg_dict['pop_gnomad']} " f"{arg_dict['maf_upper_threshold']} " f"{arg_dict['vep_pick_order']} " f"{arg_dict['vep_n_forks']} " f"{arg_dict['vep_buffer_size']} " f"{arg_dict['vep_gencode_all']} " f"{vep_no_intergenic} " f"{vep_regulatory} " f"{secondary_findings} " f"{classify_all} " f"{ignore_noncoding} " f"{clinvar_ignore_noncancer} " f"{diagnostic_grade_only}" ) if debug: print(cpsr_report_command) check_subprocess(logger, cpsr_report_command, debug) logger.info("Finished CPSR!") print('----') print()
def check_args(arg_dict):
    """Validate the PCGR command-line arguments held in ``arg_dict``.

    Every violated constraint is reported through ``error_message`` and every
    non-fatal concern through ``warn_message``.

    NOTE(review): the checks assume ``error_message`` aborts the run (e.g.
    ``len(arg_dict['sample_id'])`` is evaluated right after the ``None``
    check) - confirm against its definition.

    Side effects on ``arg_dict``:
      * ``estimate_msi_status`` is set to ``0`` when MSI prediction is
        requested for a TARGETED or tumor-only assay.
      * ``exclude_pon`` is set to ``False`` in tumor-only mode when no
        panel-of-normals VCF is provided.

    :param arg_dict: dictionary of parsed command-line options
    :return: None
    """
    logger = getlogger("pcgr-validate-arguments-input-a")

    # --- Required arguments ---
    if arg_dict['pcgr_dir'] is None or not os.path.exists(arg_dict['pcgr_dir']):
        err_msg = f"Required argument '--pcgr_dir' does not exist ({arg_dict['pcgr_dir']})."
        error_message(err_msg, logger)

    if arg_dict['genome_assembly'] is None:
        err_msg = f"Required argument '--genome_assembly' has no/undefined value ({arg_dict['genome_assembly']})."
        error_message(err_msg, logger)

    if arg_dict['input_vcf'] is None:
        err_msg = f"Required argument '--input_vcf' does not exist ({arg_dict['input_vcf']})."
        error_message(err_msg, logger)

    if arg_dict['sample_id'] is None:
        err_msg = f"Required argument '--sample_id' has no/undefined value ({arg_dict['sample_id']})."
        error_message(err_msg, logger)

    # Identifiers of length <= 2 are rejected, i.e. the accepted range is 3-35
    if len(arg_dict['sample_id']) <= 2 or len(arg_dict['sample_id']) > 35:
        err_msg = f"Sample name identifier ('--sample_id' = {arg_dict['sample_id']}) must be between 3 and 35 characters long"
        error_message(err_msg, logger)

    # --- Optional arguments ---
    # Analysis of a cancer cell line requires --tumor_only
    if arg_dict['cell_line'] and not arg_dict['tumor_only']:
        err_msg = 'Analysis of cell line (--cell_line) needs option --tumor_only'
        error_message(err_msg, logger)

    # Tumor primary site must be a valid code (0 up to the largest key of pcgr_vars.tsites)
    max_tsite = max(pcgr_vars.tsites.keys())
    if arg_dict['tsite'] > max_tsite or arg_dict['tsite'] < 0:
        err_msg = f"Tumor type code ('--tumor_site' = {arg_dict['tsite']}) must be within [0, {max_tsite}]"
        error_message(err_msg, logger)

    # Tumor purity and ploidy (only checked when provided)
    if arg_dict['tumor_purity'] is not None:
        if not (arg_dict['tumor_purity'] > 0 and arg_dict['tumor_purity'] <= 1):
            err_msg = f"Tumor purity value ('--tumor_purity' = {arg_dict['tumor_purity']}) must be within (0, 1]"
            error_message(err_msg, logger)

    if arg_dict['tumor_ploidy'] is not None:
        if not arg_dict['tumor_ploidy'] > 0:
            err_msg = f"Tumor ploidy value ('--tumor_ploidy' = {arg_dict['tumor_ploidy']}) must be > 0"
            error_message(err_msg, logger)

    # Minimum/maximum depth and allelic fraction thresholds
    if arg_dict['tumor_dp_min'] < 0:
        err_msg = f"Minimum depth tumor ('tumor_dp_min' = {arg_dict['tumor_dp_min']}) must be >= 0"
        error_message(err_msg, logger)

    if arg_dict['tumor_af_min'] < 0 or arg_dict['tumor_af_min'] > 1:
        err_msg = f"Minimum AF tumor ('tumor_af_min' = {arg_dict['tumor_af_min']}) must be within [0, 1]"
        error_message(err_msg, logger)

    if arg_dict['control_dp_min'] < 0:
        err_msg = f"Minimum depth control ('control_dp_min' = {arg_dict['control_dp_min']}) must be >= 0"
        error_message(err_msg, logger)

    if arg_dict['control_af_max'] < 0 or arg_dict['control_af_max'] > 1:
        err_msg = f"Maximum AF control ('control_af_max' = {arg_dict['control_af_max']}) must be within [0, 1]"
        error_message(err_msg, logger)

    # Coding target size of the sequencing assay
    if arg_dict['target_size_mb'] < 0 or arg_dict['target_size_mb'] > 34:
        err_msg = f"Coding target size region in Mb ('--target_size_mb' = {arg_dict['target_size_mb']}) is not positive or larger than the likely maximum size of the coding human genome (34 Mb))"
        error_message(err_msg, logger)
    if arg_dict['target_size_mb'] < 1:
        warn_msg = f"Coding target size region in Mb ('--target_size_mb' = {arg_dict['target_size_mb']}) must be greater than 1 Mb for mutational burden estimate to be robust"
        warn_message(warn_msg, logger)
    if arg_dict['target_size_mb'] < 34 and arg_dict['assay'] != 'TARGETED':
        warn_msg = f"Coding target size region in Mb ('--target_size_mb' = {arg_dict['target_size_mb']}) is less than default for WES/WGS (34Mb), assay must be set to 'TARGETED'"
        warn_message(warn_msg, logger)

    # MSI prediction is switched off for targeted assays and tumor-only mode
    assay_type = 'Tumor-Control'
    if arg_dict['estimate_msi_status'] is True and (
            arg_dict['assay'] == 'TARGETED' or arg_dict['tumor_only'] is True):
        if arg_dict['tumor_only'] is True:
            assay_type = 'Tumor-Only'
        warn_msg = f"MSI status prediction can be applied for WGS/WES tumor-control assays only (query type: {arg_dict['assay']}|{assay_type}) - analysis will be omitted"
        warn_message(warn_msg, logger)
        arg_dict['estimate_msi_status'] = 0

    # Mutational signature reconstruction: warn below 200 mutations
    # (recommended value), fail below 100 (somewhat arbitrary lower threshold)
    if arg_dict['min_mutations_signatures'] < 200:
        warn_msg = f"Minimum number of mutations required for mutational signature analysis ('--min_mutations_signatures' = {arg_dict['min_mutations_signatures']}) is less than the recommended number (n = 200)"
        warn_message(warn_msg, logger)
        if arg_dict['min_mutations_signatures'] < 100:
            err_msg = f"Minimum number of mutations required for mutational signature analysis ('--min_mutations_signatures' = {arg_dict['min_mutations_signatures']}) must be >= 100"
            error_message(err_msg, logger)

    # MSI status estimation requires mutational burden analysis
    if arg_dict['estimate_msi_status'] is True and arg_dict['estimate_tmb'] is False:
        err_msg = "Prediction of MSI status ('--estimate_msi_status') requires mutational burden analysis ('--estimate_tmb')"
        error_message(err_msg, logger)

    # --- Tumor-only specific checks ---
    if arg_dict['tumor_only'] is True:
        # Germline exclusion filters rely on the tumor allelic fraction tag
        for t in ['exclude_likely_het_germline', 'exclude_likely_hom_germline']:
            if arg_dict[t] and arg_dict['tumor_af_tag'] == "_NA_":
                err_msg = f"Option '--{t}' requires '--tumor_af_tag' option to be set"
                error_message(err_msg, logger)

        # Panel-of-normals exclusion is ignored when no PoN VCF is provided
        if arg_dict['pon_vcf'] is None and arg_dict['exclude_pon'] is True:
            warn_msg = "Panel-of-normals VCF is NOT provided ('--pon_vcf') - exclusion of calls found in panel-of-normals ('--exclude_pon') will be ignored"
            warn_message(warn_msg, logger)
            arg_dict['exclude_pon'] = False

        # Mutational burden and signatures are less accurate for tumor-only data
        if arg_dict['estimate_tmb'] is True:
            warn_msg = "Estimation of mutational burden in tumor-only mode is suboptimal - results must be interpreted with caution"
            warn_message(warn_msg, logger)
        if arg_dict['estimate_signatures'] is True:
            warn_msg = "Estimation of mutational signatures in tumor-only mode is suboptimal - results must be interpreted with caution"
            warn_message(warn_msg, logger)

        # Tumor-only germline filter thresholds must be valid fractions
        for pop in ['eur', 'afr', 'amr', 'eas', 'sas', 'global']:
            tag = f'maf_onekg_{pop}'
            if arg_dict[tag]:
                if float(arg_dict[tag]) < 0 or float(arg_dict[tag]) > 1:
                    err_msg = f"MAF threshold (tumor-only germline filter) for 1000 Genomes Project (pop '{pop.upper()}') must be within the [0, 1] range, current value is {arg_dict[tag]}"
                    error_message(err_msg, logger)

        for pop in ['nfe', 'fin', 'amr', 'eas', 'sas', 'asj', 'oth', 'afr', 'global']:
            tag = f'maf_gnomad_{pop}'
            if arg_dict[tag]:
                if float(arg_dict[tag]) < 0 or float(arg_dict[tag]) > 1:
                    err_msg = f"MAF threshold (tumor-only germline filter) for gnomAD (pop '{pop.upper()}') must be within the [0, 1] range, current value is {arg_dict[tag]}"
                    error_message(err_msg, logger)
    # NOTE: when --tumor_only is not set, tumor-only filter options
    # (exclude_pon, exclude_likely_*_germline, ...) are accepted silently;
    # the warning for that case is currently disabled.

    # Mutational signature estimation is (likely) not optimal for targeted assays
    if arg_dict['estimate_signatures'] is True and arg_dict['assay'] == 'TARGETED':
        warn_msg = "Estimation of mutational signatures ('--estimate_signatures') is not optimal for TARGETED sequencing assays - results must be interpreted with caution"
        warn_message(warn_msg, logger)

    # Log-ratio thresholds and segment/transcript overlap for copy number analysis
    if arg_dict['logr_homdel'] >= 0:
        err_msg = f"Log ratio for homozygous deletions ('--logr_homdel' = {arg_dict['logr_homdel']}) should be < 0"
        error_message(err_msg, logger)

    if arg_dict['logr_gain'] <= 0:
        err_msg = f"Log ratio for copy number gains/amplifications ('--logr_gain' = {arg_dict['logr_gain']}) should be > 0"
        error_message(err_msg, logger)

    if arg_dict['cna_overlap_pct'] > 100 or arg_dict['cna_overlap_pct'] <= 0:
        err_msg = f"Minimum percent overlap between copy number segment and gene transcript ('--cna_overlap_pct' = {arg_dict['cna_overlap_pct']}) must be within (0, 100]"
        error_message(err_msg, logger)

    # --- VEP options ---
    if arg_dict['vep_n_forks'] <= 0 or arg_dict['vep_n_forks'] > 4:
        err_msg = f"Number of forks that VEP can use during annotation ('--vep_n_forks' = {arg_dict['vep_n_forks']}) must be within (0, 4]"
        error_message(err_msg, logger)

    if arg_dict['vep_buffer_size'] <= 0 or arg_dict['vep_buffer_size'] > 30000:
        err_msg = f"Internal VEP buffer size, corresponding to the number of variants that are read in to memory simultaneously ('--vep_buffer_size' = {arg_dict['vep_buffer_size']}), must be within (0, 30000]"
        error_message(err_msg, logger)

    # VEP pick order must list each of the eight criteria exactly once;
    # comparing sorted lists rejects duplicates as well as unknown extra entries
    if arg_dict['vep_pick_order'] is not None:
        values = str(arg_dict['vep_pick_order']).split(',')
        permitted_sources = [
            'canonical', 'appris', 'tsl', 'biotype', 'ccds', 'rank', 'length', 'mane'
        ]
        if sorted(values) != sorted(permitted_sources):
            err_msg = (
                f"'--vep_pick_order' = {arg_dict['vep_pick_order']} is formatted incorrectly, should be "
                "a comma-separated string of the following values: canonical,appris,tsl,biotype,ccds,rank,length,mane"
            )
            error_message(err_msg, logger)

    return
def verify_input_files_cpsr(arg_dict):
    """Check that CPSR input files and the data bundle exist.

    Steps performed:
      1. Create the output directory (via ``utils.safe_makedir``).
      2. Check the optional custom gene list and the query VCF (existence,
         file extension, tabix index for bgzipped input).
      3. Refuse to overwrite existing output unless ``force_overwrite`` is set.
      4. Check the PCGR base/data/assembly directories and that
         ``RELEASE_NOTES`` mentions ``pcgr_vars.DB_VERSION``.

    Problems are reported through ``error_message``.
    NOTE(review): presumably ``error_message`` aborts the run - confirm.

    :param arg_dict: dictionary of parsed command-line options
    :return: dictionary with directory names and basenames of the inputs;
             entries for inputs that were not provided hold ``"NA"``
    """
    logger = getlogger('cpsr-validate-input-arguments-b')

    # Placeholders reported for inputs that are not provided
    input_vcf_dir = "NA"
    input_vcf_basename = "NA"
    input_customlist_basename = "NA"
    input_customlist_dir = "NA"

    # create output folder (if not already exists)
    output_dir_full = utils.safe_makedir(os.path.abspath(arg_dict['output_dir']))

    ## check if input BED exist
    if arg_dict['custom_list'] is not None:
        custom_list_path = os.path.abspath(arg_dict['custom_list'])
        if not os.path.exists(custom_list_path):
            err_msg = f"Input file ({arg_dict['custom_list']}) does not exist"
            error_message(err_msg, logger)

        input_customlist_basename = os.path.basename(str(arg_dict['custom_list']))
        input_customlist_dir = os.path.dirname(custom_list_path)

    ## check if input vcf exist
    if arg_dict['input_vcf'] is not None:
        input_vcf_path = os.path.abspath(arg_dict['input_vcf'])
        if not os.path.exists(input_vcf_path):
            err_msg = f"Input file ({arg_dict['input_vcf']}) does not exist"
            error_message(err_msg, logger)

        if not input_vcf_path.endswith(('.vcf', '.vcf.gz')):
            err_msg = f"VCF input file ({input_vcf_path}) does not have the correct file extension (.vcf or .vcf.gz)"
            error_message(err_msg, logger)

        ## check that tabix file exist if bgzipped files is given
        if input_vcf_path.endswith('.vcf.gz'):
            tabix_file = arg_dict['input_vcf'] + '.tbi'
            if not os.path.exists(os.path.abspath(tabix_file)):
                err_msg = (
                    f"Tabix file (i.e. '.gz.tbi') is not present for the bgzipped VCF input file ({input_vcf_path}). "
                    "Please make sure your input VCF is properly compressed and indexed (bgzip + tabix)"
                )
                error_message(err_msg, logger)

        input_vcf_basename = os.path.basename(str(arg_dict['input_vcf']))
        input_vcf_dir = os.path.dirname(input_vcf_path)

        ## if output vcf exist and overwrite not set
        output_vcf = os.path.join(
            str(output_dir_full),
            f"{arg_dict['sample_id']}.cpsr.{arg_dict['genome_assembly']}.vcf.gz")
        if os.path.exists(output_vcf) and arg_dict['force_overwrite'] is False:
            err_msg = f"Output files (e.g. {output_vcf}) already exist - please specify different sample_id or add option --force_overwrite"
            error_message(err_msg, logger)

    ## check the existence of base folder
    base_dir = os.path.abspath(arg_dict['pcgr_dir'])
    if not os.path.isdir(base_dir):
        err_msg = f"Base directory ({base_dir}) does not exist"
        error_message(err_msg, logger)

    ## check the existence of data folder within the base folder
    db_dir = os.path.join(base_dir, 'data')
    if not os.path.isdir(db_dir):
        err_msg = f"Data directory ({db_dir}) does not exist"
        error_message(err_msg, logger)

    ## check the existence of specified assembly data folder within the base folder
    db_assembly_dir = os.path.join(base_dir, 'data', arg_dict['genome_assembly'])
    if not os.path.isdir(db_assembly_dir):
        err_msg = f"Data directory for the specified genome assembly ({db_assembly_dir}) does not exist"
        error_message(err_msg, logger)

    ## check the existence of RELEASE_NOTES
    rel_notes_file = os.path.join(db_assembly_dir, 'RELEASE_NOTES')
    if not os.path.exists(rel_notes_file):
        err_msg = 'The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/cpsr for instructions)'
        error_message(err_msg, logger)

    # The bundle is compliant when any line of RELEASE_NOTES mentions the
    # expected database version; the context manager closes the handle even
    # if reading fails.
    with open(rel_notes_file, 'r') as f_rel_not:
        compliant_data_bundle = any(
            pcgr_vars.DB_VERSION in line for line in f_rel_not)

    if not compliant_data_bundle:
        err_msg = 'The PCGR data bundle is not compliant with the software version - please download the latest software and data bundle (see https://github.com/sigven/cpsr for instructions)'
        error_message(err_msg, logger)

    cpsr_paths = {
        "input_vcf_dir": input_vcf_dir,
        "input_customlist_dir": input_customlist_dir,
        "db_dir": db_assembly_dir,
        "base_dir": base_dir,
        "output_dir": output_dir_full,
        "input_vcf_basename": input_vcf_basename,
        "input_customlist_basename": input_customlist_basename,
    }

    return cpsr_paths
def check_args_cpsr(arg_dict):
    """Validate the CPSR command-line arguments held in ``arg_dict``.

    Violations are reported through ``error_message``, non-fatal concerns
    through ``warn_message``.

    NOTE(review): the checks assume ``error_message`` aborts the run (e.g.
    ``len(arg_dict['sample_id'])`` is evaluated right after the ``None``
    check) - confirm against its definition.

    Side effect: ``arg_dict['vep_regulatory']`` is unconditionally set to
    ``True``.

    :param arg_dict: dictionary of parsed command-line options
    :return: None
    """
    logger = getlogger('cpsr-validate-input-arguments-a')
    arg_dict['vep_regulatory'] = True

    ## Required arguments
    ## Check that query VCF is set and exists
    if arg_dict['input_vcf'] is None or not os.path.exists(arg_dict['input_vcf']):
        err_msg = f"Required argument '--input_vcf' does not exist ({arg_dict['input_vcf']})."
        error_message(err_msg, logger)

    ## Check that PCGR directory (with data bundle) is provided and exists
    if arg_dict['pcgr_dir'] is None or not os.path.exists(arg_dict['pcgr_dir']):
        err_msg = f"Required argument '--pcgr_dir' does not exist ({arg_dict['pcgr_dir']})."
        error_message(err_msg, logger)

    ## Check that genome assembly is set
    if arg_dict['genome_assembly'] is None:
        err_msg = f"Required argument '--genome_assembly' has no/undefined value ({arg_dict['genome_assembly']})."
        error_message(err_msg, logger)

    ## Check that sample identifier is set and has more than two characters
    if arg_dict['sample_id'] is None:
        err_msg = f"Required argument '--sample_id' has no/undefined value ({arg_dict['sample_id']})."
        error_message(err_msg, logger)

    if len(arg_dict['sample_id']) <= 2:
        err_msg = f"Sample name identifier ('--sample_id') requires a name with more than two characters ({arg_dict['sample_id']})."
        error_message(err_msg, logger)

    ### Optional arguments
    ## Provide virtual_panel_id or a custom list from panel 0 (mutually exclusive)
    if arg_dict['virtual_panel_id'] == "-1" and not arg_dict['custom_list']:
        err_msg = 'Provide valid virtual panel identifier(s) through --panel_id (0 - 42) or provide custom list of panel 0 genes (single column text file) through --custom_list'
        error_message(err_msg, logger)

    if arg_dict['custom_list'] and arg_dict['virtual_panel_id'] != "-1":
        err_msg = "Option --panel_id cannot be used in conjunction with --custom_list"
        error_message(err_msg, logger)

    if arg_dict['maf_upper_threshold'] <= 0 or arg_dict['maf_upper_threshold'] > 1:
        err_msg = f"MAF upper threshold must be greater than 0 and below 1, current value is {arg_dict['maf_upper_threshold']}"
        error_message(err_msg, logger)

    if arg_dict['vcfanno_n_proc'] <= 0 or arg_dict['vcfanno_n_proc'] > 15:
        err_msg = f"Number of processes that vcfanno can use during annotation must be above 0 and not more than 15, current value is {arg_dict['vcfanno_n_proc']}."
        error_message(err_msg, logger)

    ## Check that panel identifier(s) are set appropriately
    if arg_dict['virtual_panel_id'] != "-1" and not arg_dict['custom_list']:
        if ',' not in arg_dict['virtual_panel_id']:
            # Single panel: any integer in 0 - 42
            if str(arg_dict['virtual_panel_id']).isdigit():
                panel_id = int(arg_dict['virtual_panel_id'])
                if not (panel_id >= 0 and panel_id <= 42):
                    err_msg = "A single panel chosen with '--panel_id' must be in the range 0 - 42"
                    error_message(err_msg, logger)
            else:
                err_msg = f"A single panel chosen with '--panel_id' must be a proper integer - not '{arg_dict['virtual_panel_id']}'"
                error_message(err_msg, logger)
        else:
            # Multiple panels: each entry must be an integer in 1 - 42
            # (panel 0 cannot be part of a comma-separated selection)
            panels = str(arg_dict['virtual_panel_id']).split(',')
            for p in panels:
                if str(p).isdigit():
                    panel_id = int(p)
                    if panel_id < 1 or panel_id > 42:
                        err_msg = "Multiple panels submitted as comma-separated string with '--panel_id' must take values in the range 1 - 42"
                        error_message(err_msg, logger)
                else:
                    err_msg = f"Multiple panels submitted as comma-separated string with '--panel_id' must contain proper integer values only - '{arg_dict['virtual_panel_id']}' contains non-integer entries."
                    error_message(err_msg, logger)

    if (arg_dict['custom_list'] or arg_dict['virtual_panel_id'] == "0") and arg_dict['diagnostic_grade_only']:
        warn_msg = "Option '--diagnostic_grade_only' applies ONLY to panel identifiers from Genomics England PanelApp - will be ignored"
        warn_message(warn_msg, logger)

    ## VEP options
    if arg_dict['vep_n_forks'] <= 0 or arg_dict['vep_n_forks'] > 4:
        err_msg = f"Number of forks that VEP can use during annotation must be above 0 and not more than 4, current value is {arg_dict['vep_n_forks']}"
        error_message(err_msg, logger)

    if arg_dict['vep_buffer_size'] <= 0 or arg_dict['vep_buffer_size'] > 30000:
        # f-string so that the offending value is actually interpolated
        err_msg = f"Internal VEP buffer size, corresponding to the number of variants that are read in to memory simultaneously, must be above 0 and not more than 30,000, current value is {arg_dict['vep_buffer_size']}"
        error_message(err_msg, logger)

    ## VEP pick order must list each of the eight criteria exactly once;
    ## comparing sorted lists rejects duplicates as well as unknown extra entries
    if arg_dict['vep_pick_order'] is not None:
        values = str(arg_dict['vep_pick_order']).split(',')
        permitted_sources = [
            'canonical', 'appris', 'tsl', 'biotype', 'ccds', 'rank', 'length', 'mane'
        ]
        if sorted(values) != sorted(permitted_sources):
            err_msg = (
                f"Option 'vep_pick_order' = {arg_dict['vep_pick_order']} is formatted incorrectly, should be "
                "a comma-separated string of the following values: canonical,appris,tsl,biotype,ccds,rank,length,mane"
            )
            error_message(err_msg, logger)

    return
def verify_input_files(arg_dict):
    """Check that PCGR input files and the data bundle exist.

    1. Checks existence (and file extensions) of input files/dirs in
       ``arg_dict``; problems are reported through ``error_message``.
    2. Checks that the data bundle matches ``pcgr_vars.DB_VERSION``.

    NOTE(review): presumably ``error_message`` aborts the run - confirm.

    Side effects: ``arg_dict['rna_fusion_tumor']`` and
    ``arg_dict['rna_exp_tumor']`` are reset to ``None`` on entry, so the RNA
    input checks below never run; the output directory is created via
    ``utils.safe_makedir``.

    :param arg_dict: dictionary of parsed command-line options
    :return: dictionary with directory names and basenames of the inputs;
             entries for inputs that were not provided hold ``'NA'``
    """
    logger = getlogger("pcgr-validate-arguments-input-b")

    # Placeholders reported for inputs that are not provided
    input_vcf_dir = 'NA'
    input_cna_dir = 'NA'
    input_rna_fusion_dir = 'NA'
    input_cpsr_report_dir = 'NA'
    input_rna_expression_dir = 'NA'
    input_cna_plot_dir = 'NA'
    panel_normal_vcf_dir = 'NA'
    panel_normal_vcf_basename = 'NA'
    input_vcf_basename = 'NA'
    input_cna_basename = 'NA'
    input_rna_fusion_basename = 'NA'
    input_rna_expression_basename = 'NA'
    input_cpsr_report_basename = 'NA'
    input_cna_plot_basename = 'NA'

    # RNA inputs are forcibly unset here
    arg_dict['rna_fusion_tumor'] = None
    arg_dict['rna_exp_tumor'] = None

    # check that either input vcf or cna segments exist
    if arg_dict['input_vcf'] is None and arg_dict['input_cna'] is None:
        err_msg = 'Please specifiy either a VCF input file (--input_vcf) or a copy number segment file (--input_cna)'
        error_message(err_msg, logger)

    # create output folder (if not already exists)
    output_dir_full = utils.safe_makedir(os.path.abspath(arg_dict['output_dir']))

    # check if panel of normal VCF exist
    if arg_dict["pon_vcf"] is not None:
        pon_vcf_path = os.path.abspath(arg_dict["pon_vcf"])
        if not os.path.exists(pon_vcf_path):
            err_msg = f'Input file ({arg_dict["pon_vcf"]}) does not exist'
            error_message(err_msg, logger)

        if not pon_vcf_path.endswith(".vcf.gz"):
            err_msg = f"Panel of normals VCF file ({pon_vcf_path}) does not have the correct file extension (.vcf.gz)"
            error_message(err_msg, logger)

        # check that tabix file exist if bgzipped files is given
        if pon_vcf_path.endswith(".vcf.gz"):
            tabix_file = arg_dict["pon_vcf"] + ".tbi"
            if not os.path.exists(os.path.abspath(tabix_file)):
                err_msg = (
                    f"Tabix file (i.e. '.gz.tbi') is not present for the bgzipped panel of normal VCF file ({pon_vcf_path}). "
                    "Please make sure your the VCF is properly compressed and indexed (bgzip + tabix)"
                )
                error_message(err_msg, logger)

        if arg_dict["input_vcf"] is None:
            warn_msg = "Ignoring panel of normal VCF file, --input_vcf missing"
            warn_message(warn_msg, logger)
        else:
            panel_normal_vcf_basename = os.path.basename(str(arg_dict["pon_vcf"]))
            panel_normal_vcf_dir = os.path.dirname(pon_vcf_path)

    # check if input vcf exists
    if arg_dict["input_vcf"] is not None:
        input_vcf_path = os.path.abspath(arg_dict["input_vcf"])
        if not os.path.exists(input_vcf_path):
            err_msg = f'Input file ({arg_dict["input_vcf"]}) does not exist'
            error_message(err_msg, logger)

        if not input_vcf_path.endswith((".vcf", ".vcf.gz")):
            err_msg = f'VCF input file ({input_vcf_path}) does not have the correct file extension (.vcf or .vcf.gz)'
            error_message(err_msg, logger)

        # check that tabix file exists if bgzipped file is given
        if input_vcf_path.endswith(".vcf.gz"):
            tabix_file = arg_dict["input_vcf"] + ".tbi"
            if not os.path.exists(os.path.abspath(tabix_file)):
                err_msg = (
                    f"Tabix file (i.e. '.gz.tbi') is not present for the bgzipped VCF input file ({input_vcf_path}). "
                    "Please make sure your input VCF is properly compressed and indexed (bgzip + tabix)"
                )
                error_message(err_msg, logger)

        input_vcf_basename = os.path.basename(str(arg_dict["input_vcf"]))
        input_vcf_dir = os.path.dirname(input_vcf_path)

        # if output vcf exist and overwrite not set
        output_vcf = os.path.join(
            str(output_dir_full),
            f"{arg_dict['sample_id']}.pcgr_acmg.{arg_dict['genome_assembly']}.vcf.gz")
        if os.path.exists(output_vcf) and arg_dict["force_overwrite"] is False:
            err_msg = f"Output files (e.g. {output_vcf}) already exist - please specify different sample_id or add option --force_overwrite"
            error_message(err_msg, logger)

    # NOTE: validation of a CNA plot file (--input_cna_plot) is currently
    # disabled; the 'NA' placeholders for it are returned unchanged.

    # check if input cna segments exist
    if arg_dict["input_cna"] is not None:
        input_cna_path = os.path.abspath(arg_dict["input_cna"])
        if not os.path.exists(input_cna_path):
            err_msg = f'Input file ({arg_dict["input_cna"]}) does not exist'
            error_message(err_msg, logger)

        if not input_cna_path.endswith((".tsv", ".txt")):
            err_msg = f"CNA segment input file ({input_cna_path}) does not have the correct file extension (.tsv or .txt)"
            error_message(err_msg, logger)

        input_cna_basename = os.path.basename(str(arg_dict["input_cna"]))
        input_cna_dir = os.path.dirname(input_cna_path)

        # if output cna segments exist and overwrite not set
        output_cna_segments = os.path.join(
            str(output_dir_full),
            f"{arg_dict['sample_id']}.pcgr_acmg.{arg_dict['genome_assembly']}.cna_segments.tsv.gz")
        if os.path.exists(output_cna_segments) and arg_dict["force_overwrite"] is False:
            err_msg = f"Output files (e.g. {output_cna_segments}) already exist - please specify different sample_id or add option --force_overwrite"
            error_message(err_msg, logger)

    # check if input rna fusion variants exist
    # (not reached while 'rna_fusion_tumor' is reset to None above)
    if arg_dict["rna_fusion_tumor"] is not None:
        rna_fusion_path = os.path.abspath(arg_dict["rna_fusion_tumor"])
        if not os.path.exists(rna_fusion_path):
            err_msg = f'Input file ({arg_dict["rna_fusion_tumor"]}) does not exist'
            error_message(err_msg, logger)

        if not rna_fusion_path.endswith((".tsv", ".txt")):
            err_msg = f"RNA fusion variants file ({rna_fusion_path}) does not have the correct file extension (.tsv or .txt)"
            error_message(err_msg, logger)

        input_rna_fusion_basename = os.path.basename(str(arg_dict["rna_fusion_tumor"]))
        input_rna_fusion_dir = os.path.dirname(rna_fusion_path)

    # check if input rna expression exist
    # (not reached while 'rna_exp_tumor' is reset to None above)
    if arg_dict["rna_exp_tumor"] is not None:
        rna_exp_path = os.path.abspath(arg_dict["rna_exp_tumor"])
        if not os.path.exists(rna_exp_path):
            err_msg = f'Input file ({arg_dict["rna_exp_tumor"]}) does not exist'
            error_message(err_msg, logger)

        if not rna_exp_path.endswith((".tsv", ".txt")):
            err_msg = f"RNA gene expression file ({rna_exp_path}) does not have the correct file extension (.tsv or .txt)"
            error_message(err_msg, logger)

        input_rna_expression_basename = os.path.basename(str(arg_dict["rna_exp_tumor"]))
        input_rna_expression_dir = os.path.dirname(rna_exp_path)

    # check if CPSR report exists
    if arg_dict["cpsr_report"] is not None:
        cpsr_report_path = os.path.abspath(arg_dict["cpsr_report"])
        if not os.path.exists(cpsr_report_path):
            err_msg = f'Input file ({arg_dict["cpsr_report"]}) does not exist'
            error_message(err_msg, logger)

        if not cpsr_report_path.endswith(".json.gz"):
            err_msg = f"CPSR report file ({cpsr_report_path}) does not have the correct file extension (.json.gz)"
            error_message(err_msg, logger)

        input_cpsr_report_basename = os.path.basename(str(arg_dict["cpsr_report"]))
        input_cpsr_report_dir = os.path.dirname(cpsr_report_path)

    # check the existence of base folder
    base_dir = os.path.abspath(arg_dict["pcgr_dir"])
    if not os.path.isdir(base_dir):
        err_msg = f"Base directory ({base_dir}) does not exist"
        error_message(err_msg, logger)

    # check the existence of data folder within the base folder
    db_dir = os.path.join(base_dir, "data")
    if not os.path.isdir(db_dir):
        err_msg = f"Data directory ({db_dir}) does not exist"
        error_message(err_msg, logger)

    # check the existence of specified assembly data folder within the base folder
    db_assembly_dir = os.path.join(base_dir, "data", arg_dict["genome_assembly"])
    if not os.path.isdir(db_assembly_dir):
        err_msg = f"Data directory for the specified genome assembly ({db_assembly_dir}) does not exist"
        error_message(err_msg, logger)

    # check the existence of RELEASE_NOTES (starting from 0.4.0)
    rel_notes_file = os.path.join(db_assembly_dir, "RELEASE_NOTES")
    if not os.path.exists(rel_notes_file):
        err_msg = "The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/pcgr for instructions)"
        error_message(err_msg, logger)

    # The bundle is compliant when any line of RELEASE_NOTES mentions the
    # expected database version; the context manager closes the handle even
    # if reading fails.
    with open(rel_notes_file, "r") as f_rel_not:
        compliant_data_bundle = any(
            pcgr_vars.DB_VERSION in line for line in f_rel_not)

    if not compliant_data_bundle:
        err_msg = "The PCGR data bundle is not compliant with the software version - please download the latest software and data bundle (see https://github.com/sigven/pcgr for instructions)"
        error_message(err_msg, logger)

    pcgr_paths = {
        "input_vcf_dir": input_vcf_dir,
        "input_cna_dir": input_cna_dir,
        "input_rna_fusion_dir": input_rna_fusion_dir,
        "input_rna_expression_dir": input_rna_expression_dir,
        "input_cpsr_report_dir": input_cpsr_report_dir,
        "input_cna_plot_dir": input_cna_plot_dir,
        "panel_normal_vcf_dir": panel_normal_vcf_dir,
        "db_dir": db_assembly_dir,
        "base_dir": base_dir,
        "output_dir": output_dir_full,
        "panel_normal_vcf_basename": panel_normal_vcf_basename,
        "input_vcf_basename": input_vcf_basename,
        "input_cna_basename": input_cna_basename,
        "input_rna_fusion_basename": input_rna_fusion_basename,
        "input_rna_expression_basename": input_rna_expression_basename,
        "input_cpsr_report_basename": input_cpsr_report_basename,
        "input_cna_plot_basename": input_cna_plot_basename,
    }

    return pcgr_paths
def validate_pcgr_input(pcgr_directory,
                        input_vcf,
                        input_cna,
                        input_rna_fusion,
                        input_rna_expression,
                        tumor_dp_tag,
                        tumor_af_tag,
                        control_dp_tag,
                        control_af_tag,
                        call_conf_tag,
                        exclude_hom_germline,
                        exclude_het_germline,
                        panel_normal_vcf,
                        preserved_info_tags,
                        vcf_validation,
                        tumor_only,
                        genome_assembly,
                        keep_uncompressed,
                        output_dir,
                        debug):
    """
    Run the PCGR input checks in a fixed order and stop at the first failure.

    An input given as the string 'None' is treated as "not provided" and its
    check is skipped. The checks are delegated to helper functions, called in
    this order:

    1. Query VCF (only when `input_vcf` is provided):
       a. `check_existing_vcf_info_tags` - INFO tags in the query VCF must not
          coincide with those appended by PCGR
       b. `check_preserved_vcf_info_tags` - only when `preserved_info_tags`
          is provided
       c. `check_format_ad_dp_tags` - tags given for depth / allelic fraction
          / call confidence must be properly defined in the VCF
       d. `simplify_vcf` - simplifies the VCF (per the original notes:
          multiallelic handling, genotype stripping, sorting and indexing;
          details live in that helper)
    2. Panel-of-normals VCF (`validate_panel_normal_vcf`)
    3. Copy number segment file (`is_valid_cna`)
    4. RNA fusion file (`is_valid_rna_fusion`)
    5. RNA expression file (`is_valid_rna_expression`)

    `vcf_validation` no longer triggers any validation; it only selects which
    "skipping validation" message is logged.

    Returns:
        0 if every applicable check passed, -1 as soon as any helper
        reports -1.
    """
    logger = utils.getlogger('pcgr-validate-input-arguments')

    # NOTE: a warning about a missing panel-of-normals VCF in tumor-only mode
    # used to be issued here; it is currently disabled.

    if input_vcf != 'None':
        # VCF validation itself is no longer performed - only report why.
        skip_message = (
            'Skipping validation of VCF file (deprecated as of Dec 2021)'
            if vcf_validation == 1
            else 'Skipping validation of VCF file as provided by option --no_vcf_validate'
        )
        logger.info(skip_message)

        # INFO tags already present in the VCF must not clash with the tags
        # that PCGR appends during annotation.
        if check_existing_vcf_info_tags(
                input_vcf, pcgr_directory, genome_assembly, logger) == -1:
            return -1

        # User-requested INFO tags to preserve are only checked when given.
        if (preserved_info_tags != "None"
                and check_preserved_vcf_info_tags(
                    input_vcf, preserved_info_tags, logger) == -1):
            return -1

        # Tags for depth / allelic fraction must be properly defined in the VCF.
        query_vcf = VCF(input_vcf)
        allelic_support_status = check_format_ad_dp_tags(
            query_vcf,
            tumor_dp_tag,
            tumor_af_tag,
            control_dp_tag,
            control_af_tag,
            call_conf_tag,
            exclude_hom_germline,
            exclude_het_germline,
            tumor_only,
            logger)
        if allelic_support_status == -1:
            return -1

        # Simplify VCF - remove multiallelic variants.
        simplify_vcf(input_vcf, query_vcf, output_dir,
                     keep_uncompressed, logger, debug)

    # Remaining optional inputs, each paired with its format checker.
    # Order matters: the first failing check decides the return value.
    optional_input_checks = (
        (panel_normal_vcf, validate_panel_normal_vcf),   # panel-of-normals VCF
        (input_cna, is_valid_cna),                       # copy number segments
        (input_rna_fusion, is_valid_rna_fusion),         # RNA fusion variants
        (input_rna_expression, is_valid_rna_expression), # RNA expression
    )
    for input_file, format_checker in optional_input_checks:
        if input_file == 'None':
            continue
        if format_checker(input_file, logger) == -1:
            return -1

    return 0