예제 #1
0
def __main__():
    """Command-line entry point for the gene annotation step.

    Reads four positional arguments (query VCF, panel-of-normals flag,
    regulatory-annotation flag, PCGR data directory) and the optional
    ``--cpsr`` / ``--debug`` switches from the command line, picks a logger,
    and hands everything to ``extend_vcf_annotations``.
    """
    cli = argparse.ArgumentParser(
        description='Cancer gene annotations from PCGR pipeline (SNVs/InDels)',
    )

    # Positional arguments, registered in command-line order.
    positionals = (
        ('vcf_file', {'help': 'VCF file with VEP-annotated query variants (SNVs/InDels)'}),
        ('pon_annotation', {'default': 0, 'type': int, 'help': 'Include Panel of Normals annotation'}),
        ('regulatory_annotation', {'default': 0, 'type': int, 'help': 'Inclusion of VEP regulatory annotations (0/1)'}),
        ('pcgr_db_dir', {'help': 'PCGR data directory'}),
    )
    for arg_name, arg_kwargs in positionals:
        cli.add_argument(arg_name, **arg_kwargs)

    # Optional switches.
    cli.add_argument(
        '--cpsr',
        action="store_true",
        help="Aggregate cancer gene annotations for Cancer Predisposition Sequencing Reporter (CPSR)",
    )
    cli.add_argument(
        "--debug",
        action="store_true",
        default=False,
        help="Print full commands to log, default: %(default)s",
    )
    opts = cli.parse_args()

    # The PCGR logger is set up first; a CPSR run then switches to its own logger.
    log = utils.getlogger('pcgr-gene-annotate')
    if opts.cpsr:
        log = utils.getlogger('cpsr-gene-annotate')

    extend_vcf_annotations(
        opts.vcf_file,
        opts.pcgr_db_dir,
        log,
        opts.pon_annotation,
        opts.regulatory_annotation,
        opts.cpsr,
        opts.debug,
    )
예제 #2
0
def __main__():
    """Command-line entry point for the vcfanno annotation step.

    Parses the query VCF, output VCF and PCGR data directory plus one on/off
    switch per annotation source, writes the query VCF header to a temporary
    file via ``print_vcf_header``, and passes all settings on to
    ``run_vcfanno``.
    """
    cli = argparse.ArgumentParser(
        description='Run brentp/vcfanno - annotate a VCF file against multiple VCF files in parallel',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument('query_vcf', help='Bgzipped input VCF file with query variants (SNVs/InDels)')
    cli.add_argument('out_vcf', help='Output VCF file with appended annotations from multiple VCF files')
    cli.add_argument('pcgr_db_dir', help='PCGR assembly-specific data directory')
    cli.add_argument('--num_processes', help="Number of processes vcfanno can use during annotation", default=4)

    # Boolean switches, one per annotation source (registration order is the
    # order they appear in --help).
    source_switches = (
        ("--docm", "Annotate VCF with annotations from Database of Curated Mutations"),
        ("--clinvar", "Annotate VCF with annotations from ClinVar"),
        ("--ncer", "Annotate VCF with ranking of variant deleteriousness in non-coding regions (ncER)"),
        ("--dbmts", "Annotate VCF file with variants predicted to cause loss/gain of miRNA target sites in 3'UTR regions"),
        ("--gerp", "Annotate VCF file with GERP RS scores (cancer predisposition gene/SF/GWAS loci only)"),
        ("--dbnsfp", "Annotate VCF with annotations from database of non-synonymous functional predictions"),
        ("--tcga", "Annotate VCF with variant frequencies from the The Cancer Genome Atlas"),
        ("--tcga_pcdm", "Annotate VCF with putative cancer driver mutations from The Cancer Genome Atlas"),
        ("--chasmplus", "Annotate VCF with putative cancer driver mutations from CHASMplus algorithm"),
        ("--civic", "Annotate VCF with annotations from the Clinical Interpretation of Variants in Cancer database"),
        ("--cgi", "Annotate VCF with annotations from the Cancer bioMarkers database"),
        ("--icgc", "Annotate VCF with known variants found in the ICGC-PCAWG sequencing project"),
        ("--cancer_hotspots", "Annotate VCF with mutation hotspots from cancerhotspots.org"),
        ("--uniprot", "Annotate VCF with protein functional features from the UniProt Knowledgebase"),
        ("--pcgr_onco_xref", "Annotate VCF with transcript annotations from PCGR (targeted drugs, protein complexes, cancer gene associations, etc)"),
        ("--gwas", "Annotate VCF against known loci associated with cancer, as identified from genome-wide association studies (GWAS)"),
        ("--rmsk", "Annotate VCF against known sequence repeats, as identified by RepeatMasker (rmsk)"),
        ("--simplerepeats", "Annotate VCF against known sequence repeats, as identified by Tandem Repeats Finder (simplerepeats)"),
        ("--winmsk", "Annotate VCF against known sequence repeats, as identified by Windowmasker (winmsk)"),
        ("--gnomad_cpsr", "Annotate VCF with population-specific allelic counts and frequencies in cancer predisposition genes (gnomAD non-cancer subset)"),
    )
    for switch, description in source_switches:
        cli.add_argument(switch, action="store_true", help=description)

    cli.add_argument("--panel_normal_vcf", dest="panel_normal_vcf", help="Annotate VCF with germline calls from panel of normals")
    cli.add_argument("--keep_logs", action="store_true")
    cli.add_argument("--debug", action="store_true", default=False, help="Print full commands to log, default: %(default)s")

    opts = cli.parse_args()

    log = utils.getlogger('pcgr-vcfanno')

    query_info_tags = get_vcf_info_tags(opts.query_vcf)

    # Temporary files are placed next to the output VCF; the header file name
    # carries a random number, the vcfanno config file name does not.
    header_path = f'{opts.out_vcf}.tmp.{random.randrange(0, 10000000)}.header.txt'
    conf_path = f'{opts.out_vcf}.tmp.conf.toml'
    print_vcf_header(opts.query_vcf, header_path, log, chromline_only=False)

    # Arguments are positional: keep this order in sync with run_vcfanno.
    run_vcfanno(
        opts.num_processes,
        opts.query_vcf,
        opts.panel_normal_vcf,
        query_info_tags,
        header_path,
        opts.pcgr_db_dir,
        conf_path,
        opts.out_vcf,
        opts.docm,
        opts.clinvar,
        opts.ncer,
        opts.dbmts,
        opts.gerp,
        opts.tcga,
        opts.tcga_pcdm,
        opts.chasmplus,
        opts.dbnsfp,
        opts.civic,
        opts.cgi,
        opts.icgc,
        opts.uniprot,
        opts.cancer_hotspots,
        opts.pcgr_onco_xref,
        opts.gwas,
        opts.rmsk,
        opts.simplerepeats,
        opts.winmsk,
        opts.gnomad_cpsr,
        opts.keep_logs,
        opts.debug,
        log,
    )
예제 #3
0
def validate_cpsr_input(pcgr_directory, input_vcf, custom_list_fname, preserved_info_tags, vcf_validation, genome_assembly, sample_id, virtual_panel_id, diagnostic_grade_only, output_dir, debug):
    """
    Run the pre-flight checks on the CPSR query VCF and prepare it for annotation.

    Steps, in order:
    1. If a custom gene list is given (``custom_list_fname`` is not the string
       'None'), build a BED track for it in ``output_dir``.
    2. Log that format validation of the VCF is skipped (the message depends on
       ``vcf_validation``; no validation is run in either case).
    3. Check that INFO tags in the query VCF do not clash with existing ones
       (``check_existing_vcf_info_tags``).
    4. If ``preserved_info_tags`` is not the string 'None', check those tags
       (``check_preserved_vcf_info_tags``).
    5. Reject VCFs with more than one sample column.
    6. Hand the VCF to ``simplify_vcf``.

    Steps 2-6 are skipped when ``input_vcf`` is the string 'None'.

    Returns 0 on success, -1 when a tag check fails, or whatever
    ``error_message`` returns for a multi-sample VCF.
    """
    logger = utils.getlogger('cpsr-validate-input-arguments')

    # The literal string 'None' marks "not provided" throughout.
    custom_list_bed_fname = 'None'
    if custom_list_fname != 'None':
        logger.info('Establishing BED track with custom list of genes from panel 0')
        bed_basename = ''.join((sample_id, '.cpsr.', genome_assembly, '.custom_list.bed'))
        custom_list_bed_fname = os.path.join(output_dir, bed_basename)
        get_valid_custom_genelist(custom_list_fname, custom_list_bed_fname, pcgr_directory, genome_assembly, logger, debug)

    # Nothing further to check without a query VCF.
    if input_vcf == 'None':
        return 0

    # Validation is skipped either way; only the logged reason differs.
    skip_reason = (
        'Skipping validation of VCF file (deprecated as of Dec 2021)'
        if vcf_validation == 1
        else 'Skipping validation of VCF file as provided by option --no_vcf_validate'
    )
    logger.info(skip_reason)

    if check_existing_vcf_info_tags(input_vcf, pcgr_directory, genome_assembly, logger) == -1:
        return -1

    if preserved_info_tags != "None":
        if check_preserved_vcf_info_tags(input_vcf, preserved_info_tags, logger) == -1:
            return -1

    vcf = VCF(input_vcf)
    sample_names = vcf.samples
    if len(sample_names) > 1:
        err_msg = f"Query VCF contains more than one sample column ({', '.join(sample_names)}) - CPSR expects a germline VCF with a single sample column - exiting"
        return error_message(err_msg, logger)

    simplify_vcf(input_vcf, vcf, custom_list_bed_fname, pcgr_directory, genome_assembly, virtual_panel_id, sample_id, diagnostic_grade_only, output_dir, logger, debug)
    return 0
예제 #4
0
def run_pcgr(pcgr_paths, config_options):
    """
    Main function to run the PCGR workflow

    Assembles shell command strings for each workflow step and runs them one
    after another through check_subprocess:

      STEP 0  pcgr_validate_input.py      - validate input data and options
      STEP 1  vep (+ tabix, + optional vcf2maf.pl)
      STEP 2  pcgr_vcfanno.py             - annotation against variant resources
      STEP 3  pcgr_summarise.py (+ vcf2tsv.py, + optional variant filtering)
      STEP 4  Rscript pcgrr.R             - report generation

    Steps 1-3 only run when an input VCF is provided; step 4 only runs when
    config_options['other']['basic'] is falsy.

    Args:
        pcgr_paths: mapping with file/directory locations. Keys read here:
            'db_dir', 'base_dir', 'output_dir', and '<name>_dir' /
            '<name>_basename' pairs for input_vcf, input_cna,
            input_rna_fusion, input_rna_expression, input_cpsr_report and
            panel_normal_vcf. A basename of 'NA' means "not provided".
        config_options: nested mapping with run settings. Top-level keys read
            here: 'debug', 'sample_id', 'genome_assembly', 'assay',
            'tumor_purity', 'tumor_ploidy', and the sections 'other',
            'tumor_only', 'tumor_type', 'tmb', 'msi', 'msigs', 'cna',
            'allelic_support', 'clinicaltrials'.

    Returns:
        None. Output is produced as files in pcgr_paths['output_dir'] by the
        commands that are run.
    """

    debug = config_options['debug']

    # Integer flags are what gets interpolated into command lines;
    # the 'ON'/'OFF' strings are only used for log messages.
    report_nonfloating_toc = 1 if config_options['other']['nonfloating_toc'] else 0
    vep_regulatory_annotation = 'ON' if config_options['other']['vep_regulatory'] == 1 else 'OFF'
    clinical_trials_set = 'ON' if config_options['clinicaltrials']['run'] else 'OFF'
    msi_prediction_set = 'ON' if config_options['msi']['run'] else 'OFF'
    msig_estimation_set = 'ON' if config_options['msigs']['run'] else 'OFF'
    tmb_estimation_set = 'ON' if config_options['tmb']['run'] else 'OFF'
    vcf_validation = 0 if config_options['other']['no_vcf_validate'] else 1
    run_vcf2maf = config_options['other']['vcf2maf']
    assay_mode = 'Tumor vs. Control'
    tumor_only = 0
    cell_line = 0
    if config_options['tumor_only']['tumor_only']:
        assay_mode = 'Tumor-Only'
        tumor_only = 1
        # cell_line is only honoured in tumor-only mode
        if config_options['tumor_only']['cell_line']:
            cell_line = 1
            assay_mode = 'Tumor-Only (cell line)'
    # set basic run commands
    # The literal string 'None' is the "not available" placeholder; these values
    # are interpolated into command lines further down.
    output_vcf = 'None'
    output_pass_vcf = 'None'
    output_pass_tsv = 'None'
    output_maf = 'None'
    # Defaults come from pcgr_vars; the grch37 assembly overrides them below.
    GENCODE_VERSION = pcgr_vars.GENCODE_VERSION
    NCBI_BUILD_MAF = pcgr_vars.NCBI_BUILD_MAF
    VEP_ASSEMBLY = pcgr_vars.VEP_ASSEMBLY
    MAX_VARIANTS_FOR_REPORT = pcgr_vars.MAX_VARIANTS_FOR_REPORT
    if config_options['genome_assembly'] == 'grch37':
        NCBI_BUILD_MAF = 'GRCh37'
        GENCODE_VERSION = 'release 19'
        VEP_ASSEMBLY = 'GRCh37'
    # NOTE(review): this logger is replaced by 'pcgr-validate-input-arguments'
    # below before any message is logged through it.
    logger = getlogger('pcgr-get-OS')

    vep_dir = os.path.join(str(pcgr_paths['db_dir']), '.vep')
    input_vcf = 'None'
    input_cna = 'None'
    input_rna_fusion = 'None'
    input_rna_expression = 'None'
    input_cpsr_report = 'None'
    panel_normal = 'None'
    # panel-of-normals annotation
    pon_annotation = 0

    # Specify paths for input files and directories
    # (a basename of 'NA' leaves the corresponding path at 'None')
    if pcgr_paths['input_vcf_basename'] != 'NA':
        input_vcf = os.path.join(pcgr_paths['input_vcf_dir'], pcgr_paths['input_vcf_basename'])
    if pcgr_paths['input_cna_basename'] != 'NA':
        input_cna = os.path.join(pcgr_paths['input_cna_dir'], pcgr_paths['input_cna_basename'])
    if pcgr_paths['input_rna_fusion_basename'] != 'NA':
        input_rna_fusion = os.path.join(pcgr_paths['input_rna_fusion_dir'], pcgr_paths['input_rna_fusion_basename'])
    if pcgr_paths['input_rna_expression_basename'] != 'NA':
        input_rna_expression = os.path.join(pcgr_paths['input_rna_expression_dir'], pcgr_paths['input_rna_expression_basename'])
    if pcgr_paths['input_cpsr_report_basename'] != 'NA':
        input_cpsr_report = os.path.join(pcgr_paths['input_cpsr_report_dir'], pcgr_paths['input_cpsr_report_basename'])
    if pcgr_paths['panel_normal_vcf_basename'] != 'NA':
        panel_normal = os.path.join(pcgr_paths['panel_normal_vcf_dir'], pcgr_paths['panel_normal_vcf_basename'])

    data_dir = pcgr_paths['base_dir']
    output_dir = pcgr_paths['output_dir']

    # PCGR|validate_input - verify that VCF and CNA segment file is of appropriate format
    logger = getlogger("pcgr-validate-input-arguments")
    logger.info("PCGR - STEP 0: Validate input data and options")

    # Arguments are passed positionally, so their order must match what
    # pcgr_validate_input.py expects; only --output_dir, --debug and
    # --keep_uncompressed are named options.
    vcf_validate_command = (
            f'pcgr_validate_input.py '
            f'{data_dir} '
            f'{input_vcf} '
            f'{input_cna} '
            f'{input_rna_fusion} '
            f'{input_rna_expression} '
            f'{panel_normal} '
            f'{vcf_validation} '
            f'{tumor_only} '
            f'{config_options["genome_assembly"]} '
            f'{config_options["other"]["preserved_info_tags"]} '
            f'{config_options["allelic_support"]["tumor_dp_tag"]} {config_options["allelic_support"]["tumor_af_tag"]} '
            f'{config_options["allelic_support"]["control_dp_tag"]} {config_options["allelic_support"]["control_af_tag"]} '
            f'{config_options["allelic_support"]["call_conf_tag"]} '
            f'{config_options["tumor_only"]["exclude_likely_hom_germline"]} '
            f'{config_options["tumor_only"]["exclude_likely_het_germline"]} '
            f'--output_dir {output_dir} '
            f'{"--debug " if debug else ""}'
            f'{"--keep_uncompressed" if run_vcf2maf else ""} '
            )
    check_subprocess(logger, vcf_validate_command, debug)
    logger.info('Finished pcgr-validate-input-arguments')
    print('----')

    # PCGR|start - Log key information about sample, options and sequencing assay/design
    logger = getlogger('pcgr-start')
    logger.info('--- Personal Cancer Genome Reporter workflow ----')
    logger.info(f'Sample name: {config_options["sample_id"]}')
    if config_options['tumor_type']['type'] == 'Cancer_NOS':
        logger.info('Tumor type: Cancer_NOS (Any tumortype)')
    else:
        logger.info(f'Tumor type: {config_options["tumor_type"]["type"]}')
    logger.info(f'Sequencing assay - type: {config_options["assay"]}')
    logger.info(f'Sequencing assay - mode: {assay_mode}')
    logger.info(f'Sequencing assay - coding target size: {config_options["tmb"]["target_size_mb"]}Mb')
    logger.info(f'Genome assembly: {config_options["genome_assembly"]}')
    logger.info(f'Mutational signature estimation: {msig_estimation_set}')
    logger.info(f'MSI classification: {msi_prediction_set}')
    logger.info(f'Mutational burden estimation: {tmb_estimation_set}')
    logger.info(f'Include molecularly targeted clinical trials (beta): {clinical_trials_set}')

    # Steps 1-3 (VEP, vcfanno, summarise) require an input VCF
    if not input_vcf == 'None':
        # Define temporary output file names
        prefix = os.path.join(output_dir, f'{config_options["sample_id"]}.pcgr_acmg.{config_options["genome_assembly"]}')
        output_vcf =             f'{prefix}.vcf.gz'
        output_pass_vcf =        f'{prefix}.pass.vcf.gz'
        output_pass_tsv =        f'{prefix}.pass.tsv'
        output_pass_raw_tsv_gz = f'{prefix}.pass.raw.tsv.gz'
        output_maf =             f'{prefix}.tmp.maf'
        output_vcf2maf_log =     f'{prefix}.maf.log'
        # Intermediate file names are derived from the input VCF basename by
        # replacing its trailing '.vcf' / '.vcf.gz' suffix.
        # presumably the '.pcgr_ready' files are written by the validation step above - verify
        input_vcf_pcgr_ready =   os.path.join(output_dir, re.sub(r"(\.vcf$|\.vcf\.gz$)", ".pcgr_ready.vcf.gz", pcgr_paths["input_vcf_basename"]))
        # needs to be uncompressed for vcf2maf
        input_vcf_pcgr_ready_uncompressed = os.path.join(output_dir, re.sub(r"(\.vcf$|\.vcf\.gz$)", ".pcgr_ready.vcf", pcgr_paths["input_vcf_basename"]))
        vep_vcf = re.sub(r"(\.vcf$|\.vcf\.gz$)", ".vep.vcf.gz", input_vcf_pcgr_ready)
        # note: vep_vcfanno_vcf has no '.gz' suffix; '.gz' is appended where the
        # compressed variants of this name are needed
        vep_vcfanno_vcf = re.sub(r"(\.vcf$|\.vcf\.gz$)", ".vep.vcfanno.vcf", input_vcf_pcgr_ready)
        vep_vcfanno_annotated_vcf = re.sub(r"\.vcfanno", ".vcfanno.annotated", vep_vcfanno_vcf) + ".gz"
        vep_vcfanno_annotated_pass_vcf = re.sub(r"\.vcfanno", ".vcfanno.annotated.pass", vep_vcfanno_vcf) + ".gz"
        fasta_assembly = os.path.join(vep_dir, 'homo_sapiens', f'{pcgr_vars.VEP_VERSION}_{VEP_ASSEMBLY}', f'Homo_sapiens.{VEP_ASSEMBLY}.dna.primary_assembly.fa.gz')
        # List all VEP flags used when calling VEP
        vep_flags = (
                f'--hgvs --af --af_1kg --af_gnomad --variant_class --domains --symbol --protein --ccds --mane '
                f'--uniprot --appris --biotype --tsl --canonical --format vcf --cache --numbers --total_length --allele_number '
                f'--no_stats --no_escape --xref_refseq --vcf --check_ref --dont_skip --flag_pick_allele --plugin NearestExonJB,max_range=50000 '
                f'--force_overwrite --species homo_sapiens --offline --compress_output bgzip'
                )
        # Options whose values depend on the run configuration
        vep_options = (
                f'--dir {vep_dir} --assembly {VEP_ASSEMBLY} --cache_version {pcgr_vars.VEP_VERSION} '
                f'--fasta {fasta_assembly} --pick_order {config_options["other"]["vep_pick_order"]} '
                f'--buffer_size {config_options["other"]["vep_buffer_size"]} '
                f'--fork {config_options["other"]["vep_n_forks"]} '
                f'{vep_flags} '
                f'{"--verbose" if debug else "--quiet"} '
                )
        # Optional VEP switches; gencode_set_in_use is only used for logging
        gencode_set_in_use = "GENCODE - all transcripts"
        if config_options['other']['vep_no_intergenic'] == 1:
            vep_options += '--no_intergenic '
        if config_options['other']['vep_regulatory'] == 1:
            vep_options += '--regulatory '
        if config_options['other']['vep_gencode_all'] == 0:
            vep_options += '--gencode_basic '
            gencode_set_in_use = "GENCODE - basic transcript set (--gencode_basic)"

        # Compose full VEP command
        vep_main_command = f'{utils.get_perl_exports()} && vep --input_file {input_vcf_pcgr_ready} --output_file {vep_vcf} {vep_options}'
        vep_tabix_command = f'tabix -f -p vcf {vep_vcf}'

        # PCGR|VEP - run consequence annotation with Variant Effect Predictor
        print('----')
        logger = getlogger('pcgr-vep')
        logger.info(f'PCGR - STEP 1: Basic variant annotation with Variant Effect Predictor ({pcgr_vars.VEP_VERSION}, GENCODE {GENCODE_VERSION}, {config_options["genome_assembly"]})')
        logger.info(f'VEP configuration - one primary consequence block pr. alternative allele (--flag_pick_allele)')
        logger.info(f'VEP configuration - transcript pick order: {config_options["other"]["vep_pick_order"]}')
        logger.info(f'VEP configuration - transcript pick order: See more at https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick_options')
        logger.info(f'VEP configuration - GENCODE set: {gencode_set_in_use}')
        logger.info(f'VEP configuration - skip intergenic: {"TRUE" if config_options["other"]["vep_no_intergenic"] else "FALSE"}')
        logger.info(f'VEP configuration - regulatory annotation: {vep_regulatory_annotation}')
        logger.info(f'VEP configuration - buffer_size/number of forks: {config_options["other"]["vep_buffer_size"]}/{config_options["other"]["vep_n_forks"]}')

        check_subprocess(logger, vep_main_command, debug)
        check_subprocess(logger, vep_tabix_command, debug)
        logger.info('Finished pcgr-vep')
        print('----')

        # PCGR|vcf2maf - if option set, convert VCF to MAF with https://github.com/mskcc/vcf2maf
        if run_vcf2maf:
            logger.info('Converting VEP-annotated VCF to MAF with https://github.com/mskcc/vcf2maf')
            # stdout and stderr of vcf2maf.pl are redirected to a log file
            vcf2maf_command = (
                    f'vcf2maf.pl --inhibit-vep --input-vcf {input_vcf_pcgr_ready_uncompressed} '
                    f'--tumor-id {config_options["sample_id"]} --output-maf {output_maf} --ref-fasta {fasta_assembly} '
                    f'--ncbi-build {NCBI_BUILD_MAF} > {output_vcf2maf_log} 2>&1'
                    )
            check_subprocess(logger, vcf2maf_command, debug)
            # the uncompressed VCF and the vcf2maf log are removed once conversion is done
            utils.remove(input_vcf_pcgr_ready_uncompressed)
            utils.remove(output_vcf2maf_log)
            logger.info('Finished pcgr-vep-vcf2maf')
            print('----')

        # PCGR|vcfanno - annotate VCF against a number of variant annotation resources
        logger = getlogger("pcgr-vcfanno")
        # The command string always ends with a trailing space, so that
        # '--panel_normal_vcf' can be appended directly below.
        pcgr_vcfanno_command = (
                f'pcgr_vcfanno.py {vep_vcf} {vep_vcfanno_vcf} {pcgr_paths["db_dir"]} '
                f'--num_processes {config_options["other"]["vcfanno_n_proc"]} '
                f'--chasmplus --dbnsfp --docm --clinvar --icgc --civic --cgi --tcga_pcdm --winmsk --simplerepeats '
                f'--tcga --uniprot --cancer_hotspots --pcgr_onco_xref '
                f'{"--debug " if debug else ""}'
                )
        anno_src_msg = (
                f"Annotation sources: {'Panel-of-Normals, ' if panel_normal != 'None' else ''}ClinVar, dbNSFP, "
                f"UniProtKB, cancerhotspots.org, CiVIC, CGI, DoCM, CHASMplus driver mutations, TCGA, ICGC-PCAWG"
                )
        logger.info("PCGR - STEP 2: Annotation for precision oncology with pcgr-vcfanno")
        logger.info(anno_src_msg)
        if panel_normal != "None":
            # pon_annotation is forwarded to pcgr_summarise.py in step 3
            pon_annotation = 1
            pcgr_vcfanno_command += f'--panel_normal_vcf {panel_normal}'
        check_subprocess(logger, pcgr_vcfanno_command, debug)
        logger.info("Finished pcgr-vcfanno")
        print('----')

        # PCGR|pcgr_summarise - expand annotations in VCF file
        logger = getlogger("pcgr-summarise")
        # presumably pcgr_vcfanno.py leaves a compressed '<out_vcf>.gz' behind - verify
        pcgr_summarise_command = (
                f'pcgr_summarise.py {vep_vcfanno_vcf}.gz {pon_annotation} '
                f'{config_options["other"]["vep_regulatory"]} '
                f'{pcgr_paths["db_dir"]} '
                f'{"--debug" if debug else ""}'
                )
        logger.info("PCGR - STEP 3: Cancer gene annotations with pcgr-summarise")
        check_subprocess(logger, pcgr_summarise_command, debug)

        # PCGR|clean - move output files and clean up temporary files
        # presumably the '.annotated' / '.annotated.pass' files (and their .tbi
        # indexes) are produced by pcgr_summarise.py - verify
        os.rename(vep_vcfanno_annotated_vcf, output_vcf)
        os.rename(f'{vep_vcfanno_annotated_vcf}.tbi', f'{output_vcf}.tbi')
        os.rename(vep_vcfanno_annotated_pass_vcf, output_pass_vcf)
        os.rename(f'{vep_vcfanno_annotated_pass_vcf}.tbi', f'{output_pass_vcf}.tbi')
        # The globs are evaluated after the renames above, so files that were
        # already moved to their final names are no longer matched here.
        delete_files = (
                glob(f'{vep_vcf}*') +
                glob(f'{vep_vcfanno_annotated_vcf}') +
                glob(f'{vep_vcfanno_annotated_pass_vcf}*') +
                glob(f'{vep_vcfanno_vcf}*') +
                glob(f'{input_vcf_pcgr_ready_uncompressed}*')
                )
        # do not delete if debugging
        if not debug:
            for fn in delete_files:
                #print(f"Deleting {fn}")
                utils.remove(fn)

        logger.info('Finished pcgr-summarise main command')

        # PCGR|vcf2tsv - convert VCF to TSV with https://github.com/sigven/vcf2tsv
        pcgr_vcf2tsv_command = f'vcf2tsv.py {output_pass_vcf} --compress {output_pass_tsv}'
        logger.info("Converting VCF to TSV with https://github.com/sigven/vcf2tsv")
        check_subprocess(logger, pcgr_vcf2tsv_command, debug)
        logger.info('Finished pcgr-summarise-vcf2tsv')

        # For WGS/WES assays only: shrink very large variant sets before reporting
        if config_options['assay'] == 'WGS' or config_options['assay'] == 'WES':
            output_pass_tsv_gz = f'{output_pass_tsv}.gz'
            # check that output file exist
            if os.path.exists(output_pass_tsv_gz):
                # get number of rows/variants annotated, using pandas
                # header = [1]: column names are taken from the second line of the file
                # (presumably the first line is a '#'-prefixed vcf2tsv header line - verify)
                var_data = pandas.read_csv(output_pass_tsv_gz, sep = '\t', low_memory = False, header = [1])
                num_variants_raw = len(var_data)
                if num_variants_raw > MAX_VARIANTS_FOR_REPORT:
                    logger.info(f'Number of raw variants in input VCF ({num_variants_raw}) exceeds {MAX_VARIANTS_FOR_REPORT} - intergenic/intronic variants will be excluded prior to reporting')

                    # Exclude intronic and intergenic variants prior to analysis with pcgrr (reporting and further analysis)
                    # (matches on the start of the Consequence value only)
                    var_data_filtered = var_data[~var_data.Consequence.str.contains('^intron') & ~var_data.Consequence.str.contains('^intergenic')]
                    num_variants_excluded1 = num_variants_raw - len(var_data_filtered)
                    logger.info(f'Number of intergenic/intronic variants excluded: {num_variants_excluded1}')

                    # Exclude upstream_gene/downstream_gene variants if size of filtered variant set is still above MAX_VARIANTS_FOR_REPORT
                    # TODO: in this case, the TMB calculation will be an underestimate (but still likely huge)
                    var_data_filtered_final = var_data_filtered
                    if len(var_data_filtered) > MAX_VARIANTS_FOR_REPORT:
                        var_data_filtered_final = var_data_filtered[~var_data_filtered.Consequence.str.contains('^upstream_gene') & ~var_data_filtered.Consequence.str.contains('^downstream_gene')]
                        num_variants_excluded2 = len(var_data_filtered) - len(var_data_filtered_final)
                        logger.info(f'Number of upstream_gene/downstream_gene variants excluded: {num_variants_excluded2}')


                    # The filtered TSV is rebuilt in four steps: write the '#' header
                    # lines, move the original aside as 'raw', append the filtered
                    # table (with its column header row), then gzip the result.

                    # get vcf2tsv header and pipe to output TSV file
                    get_vcf2tsv_header = f'gzip -dc {output_pass_tsv_gz} | egrep \'^#\' > {output_pass_tsv}'
                    check_subprocess(logger, get_vcf2tsv_header, debug)

                    # rename original vcf2tsv (gzipped) to 'raw' filename
                    rename_output_tsv = f'mv {output_pass_tsv_gz} {output_pass_raw_tsv_gz}'
                    check_subprocess(logger, rename_output_tsv, debug)

                    # append filtered data output to output TSV file
                    var_data_filtered_final.to_csv(output_pass_tsv, sep='\t', encoding='utf-8', mode = 'a', index = False)

                    # gzip filtered output TSV file
                    gzip_filtered_output_tsv = f'gzip -f {output_pass_tsv}'
                    check_subprocess(logger, gzip_filtered_output_tsv, debug)


        logger.info('Finished pcgr-summarise')
        print('----')

    # Generation of HTML reports for VEP/vcfanno-annotated VCF and copy number segment file
    if not config_options['other']['basic']:
        co = config_options
        # spaces and slashes in the tumor type are replaced before it is put on the
        # command line (presumably so it stays a single shell argument - verify)
        ttype = co['tumor_type']['type'].replace(' ', '_').replace('/', '@')
        logger = getlogger('pcgr-writer')
        logger.info('PCGR - STEP 4: Generation of output files - variant interpretation report for precision oncology')

        # export PATH to R conda env Rscript
        rscript = utils.script_path('pcgrr', 'bin/Rscript')
        pcgrr_script = utils.script_path('pcgr', 'bin/pcgrr.R')
        # All arguments are positional, so the order below must match what
        # pcgrr.R expects.
        # NOTE(review): when no input VCF was provided, output_pass_tsv is still
        # the placeholder 'None', so the second argument becomes 'None.gz' -
        # confirm that pcgrr.R handles this value.
        pcgr_report_command = (
                f"{rscript} {pcgrr_script} "
                f"{output_dir} "
                f"{output_pass_tsv}.gz "
                f"{input_cna} "
                f"{input_rna_fusion} "
                f"{input_rna_expression} "
                f"{input_cpsr_report} "
                f"{config_options['sample_id']} "
                f"{pcgr_vars.PCGR_VERSION} "
                f"{pcgr_vars.DB_VERSION} "
                f"{config_options['genome_assembly']} "
                f"{data_dir} "
                f"{co['tumor_purity']} "
                f"{co['tumor_ploidy']} "
                f"{ttype} "
                f"{co['tmb']['target_size_mb']} "
                f"{co['assay']} "
                f"{tumor_only} "
                f"{cell_line} "
                f"{co['tumor_only']['maf_onekg_afr']} "
                f"{co['tumor_only']['maf_onekg_amr']} "
                f"{co['tumor_only']['maf_onekg_eas']} "
                f"{co['tumor_only']['maf_onekg_eur']} "
                f"{co['tumor_only']['maf_onekg_sas']} "
                f"{co['tumor_only']['maf_onekg_global']} "
                f"{co['tumor_only']['maf_gnomad_afr']} "
                f"{co['tumor_only']['maf_gnomad_amr']} "
                f"{co['tumor_only']['maf_gnomad_asj']} "
                f"{co['tumor_only']['maf_gnomad_eas']} "
                f"{co['tumor_only']['maf_gnomad_fin']} "
                f"{co['tumor_only']['maf_gnomad_nfe']} "
                f"{co['tumor_only']['maf_gnomad_oth']} "
                f"{co['tumor_only']['maf_gnomad_sas']} "
                f"{co['tumor_only']['maf_gnomad_global']} "
                f"{co['tumor_only']['exclude_pon']} "
                f"{co['tumor_only']['exclude_likely_hom_germline']} "
                f"{co['tumor_only']['exclude_likely_het_germline']} "
                f"{co['tumor_only']['exclude_dbsnp_nonsomatic']} "
                f"{co['tumor_only']['exclude_nonexonic']} "
                f"{co['tmb']['run']} "
                f"{co['tmb']['algorithm']} "
                f"{co['msi']['run']} "
                f"{co['msigs']['run']} "
                f"{co['msigs']['mutation_limit']} "
                f"{co['msigs']['all_reference_signatures']} "
                f"{co['msigs']['include_artefact_signatures']} "
                f"{co['msigs']['prevalence_reference_signatures']} "
                f"{co['cna']['logR_homdel']} "
                f"{co['cna']['logR_gain']} "
                f"{co['cna']['cna_overlap_pct']} "
                f"{co['allelic_support']['tumor_af_min']} "
                f"{co['allelic_support']['tumor_dp_min']} "
                f"{co['allelic_support']['control_dp_min']} "
                f"{co['allelic_support']['control_af_max']} "
                f"{co['allelic_support']['tumor_af_tag']} "
                f"{co['allelic_support']['tumor_dp_tag']} "
                f"{co['allelic_support']['control_af_tag']} "
                f"{co['allelic_support']['control_dp_tag']} "
                f"{co['allelic_support']['call_conf_tag']} "
                f"{co['clinicaltrials']['run']} "
                f"{co['other']['vep_n_forks']} "
                f"{co['other']['vep_buffer_size']} "
                f"{co['other']['vep_no_intergenic']} "
                f"{co['other']['vep_pick_order']} "
                f"{co['other']['vep_regulatory']} "
                f"{co['other']['vep_gencode_all']} "
                f"{co['other']['vcf2maf']} "
                f"{co['other']['list_noncoding']} "
                f"{co['other']['preserved_info_tags']} "
                f"{co['other']['visual_theme']} "
                f"{report_nonfloating_toc} "
                f"{co['other']['no_vcf_validate']}"
                )

        if debug:
            print(pcgr_report_command)
        check_subprocess(logger, pcgr_report_command, debug)
        logger.info("Finished PCGR!")
        print('----')

    print()
예제 #5
0
파일: cpsr.py 프로젝트: sigven/pcgr
def run_cpsr(arg_dict, cpsr_paths):
    """
    Main function to run the CPSR workflow

    Builds shell command lines and runs them one after another through
    check_subprocess():

      STEP 0  cpsr_validate_input.py        (always)
      STEP 1  vep, bgzip, tabix             (only when an input VCF is given)
      STEP 2  pcgr_vcfanno.py               (only when an input VCF is given)
      STEP 3  pcgr_summarise.py, vcf2tsv.py (only when an input VCF is given)
      STEP 4  Rscript cpsr.R                (skipped when arg_dict['basic'] is truthy)

    Args:
        arg_dict: dict of run options (e.g. 'sample_id', 'genome_assembly',
            'debug', 'basic', the 'vep_*' settings and the report settings
            read below).
        cpsr_paths: dict with the keys 'db_dir', 'base_dir', 'output_dir',
            'input_vcf_dir', 'input_vcf_basename', 'input_customlist_dir' and
            'input_customlist_basename'; a basename equal to 'NA' is treated
            as "not provided".

    Returns:
        None
    """
    debug = arg_dict['debug']
    preserved_info_tags = arg_dict['preserved_info_tags']

    ## Truthy options are passed on the command lines as 0/1 flags
    vep_regulatory = 1 if arg_dict['vep_regulatory'] else 0
    vep_no_intergenic = 1 if arg_dict["vep_no_intergenic"] else 0
    clinvar_ignore_noncancer = 1 if arg_dict['clinvar_ignore_noncancer'] else 0
    classify_all = 1 if arg_dict['classify_all'] else 0
    gwas_findings = 1 if arg_dict['gwas_findings'] else 0
    secondary_findings = 1 if arg_dict['secondary_findings'] else 0
    diagnostic_grade_only = 1 if arg_dict['diagnostic_grade_only'] else 0
    report_nonfloating_toc = 1 if arg_dict['report_nonfloating_toc'] else 0
    ## Validation is on by default, switched off by 'no_vcf_validate'
    vcf_validation = 0 if arg_dict['no_vcf_validate'] else 1

    ## ON/OFF labels used only in the log output
    gwas_findings_set = "ON" if gwas_findings else "OFF"
    secondary_findings_set = "ON" if secondary_findings else "OFF"
    diagnostic_grade_set = "ON" if diagnostic_grade_only else "OFF"

    ## A custom gene list takes precedence over any virtual panel identifier
    virtual_panel_id = "-1"
    if arg_dict['virtual_panel_id'] != "-1":
        virtual_panel_id = arg_dict['virtual_panel_id']
    if arg_dict['custom_list']:
        virtual_panel_id = "-1"
    ignore_noncoding = 1 if arg_dict['ignore_noncoding'] else 0

    ## Placeholders that remain in use when no input VCF is processed
    output_vcf = 'None'
    output_pass_vcf = 'None'
    output_pass_tsv = 'None'

    GENCODE_VERSION = pcgr_vars.GENCODE_VERSION
    VEP_ASSEMBLY = pcgr_vars.VEP_ASSEMBLY
    VEP_VERSION = pcgr_vars.VEP_VERSION
    if arg_dict['genome_assembly'] == 'grch37':
        GENCODE_VERSION = '19'
        VEP_ASSEMBLY = 'GRCh37'

    vep_dir = os.path.join(str(cpsr_paths['db_dir']), '.vep')
    input_vcf = 'None'
    input_customlist = 'None'

    if cpsr_paths['input_vcf_basename'] != 'NA':
        input_vcf = os.path.join(cpsr_paths['input_vcf_dir'], cpsr_paths['input_vcf_basename'])
    if cpsr_paths['input_customlist_basename'] != 'NA':
        input_customlist = os.path.join(cpsr_paths['input_customlist_dir'], cpsr_paths['input_customlist_basename'])

    data_dir = cpsr_paths['base_dir']
    output_dir = cpsr_paths['output_dir']

    ## The custom-list BED path is passed to the report script in STEP 4, which
    ## runs regardless of whether an input VCF was processed - so it must be
    ## defined here, not inside the VCF-only branch below (otherwise NameError)
    pcgr_model = 'cpsr'
    custom_bed = os.path.join(output_dir, str(arg_dict['sample_id']) + '.' + str(pcgr_model) + '.' + str(arg_dict['genome_assembly']) + '.custom_list.bed')

    logger = getlogger('cpsr-validate-input-arguments')
    logger.info("CPSR - STEP 0: Validate input data")
    check_subprocess(logger, f'mkdir -p {output_dir}', debug)

    ## CPSR|Validate input VCF - check formatting, non-overlap with CPSR INFO tags, and whether sample contains any variants in cancer predisposition loci
    vcf_validate_command = (
            'cpsr_validate_input.py '
            f'{data_dir} '
            f'{input_vcf} '
            f'{input_customlist} '
            f'{preserved_info_tags} '
            f'{vcf_validation} '
            f'{arg_dict["genome_assembly"]} '
            f'{arg_dict["sample_id"]} '
            f'{virtual_panel_id} '
            f'{diagnostic_grade_only} '
            f'--output_dir {output_dir} {"--debug" if debug else ""}'
            )
    check_subprocess(logger, vcf_validate_command, debug)
    logger.info('Finished cpsr-validate-input-arguments')
    print('----')

    ## CPSR|Start - log key information about run
    logger = getlogger("cpsr-start")
    logger.info("--- Cancer Predisposition Sequencing Reporter workflow ----")
    logger.info(f"Sample name: {arg_dict['sample_id']}")
    if input_customlist != 'None':
        logger.info(f"Virtual gene panel: custom-made list from panel 0: {input_customlist}")
    else:
        #logger.info("Virtual gene panel(s): " + str(pcgr_vars.GE_panels[virtual_panel_id]))
        logger.info(f"Diagnostic-grade genes in virtual panels (GE PanelApp): {diagnostic_grade_set}")
    logger.info(f"Include incidental findings (ACMG recommended list v3.0): {secondary_findings_set}")
    logger.info(f"Include low to moderate cancer risk variants from genome-wide association studies: {gwas_findings_set}")
    logger.info(f"Reference population, germline variant frequencies (gnomAD): {str(arg_dict['pop_gnomad']).upper()}")
    logger.info(f"Genome assembly: {arg_dict['genome_assembly']}")

    if input_vcf != 'None':

        ## Define input, output and temporary file names
        sample_prefix = os.path.join(output_dir, str(arg_dict['sample_id']) + '.cpsr.' + str(arg_dict['genome_assembly']))
        output_vcf = sample_prefix + '.vcf.gz'
        output_pass_vcf = sample_prefix + '.pass.vcf.gz'
        output_pass_tsv = sample_prefix + '.pass.tsv'
        input_vcf_cpsr_ready = os.path.join(output_dir, re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_ready_target.vcf.gz', cpsr_paths['input_vcf_basename']))
        input_vcf_cpsr_ready_uncompressed = os.path.join(output_dir, re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_ready_target.vcf', cpsr_paths['input_vcf_basename']))
        vep_vcf = re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_vep.vcf',input_vcf_cpsr_ready)
        vep_vcfanno_vcf = re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_vep.vcfanno.vcf',input_vcf_cpsr_ready)
        vep_vcfanno_annotated_vcf = re.sub(r'\.vcfanno','.vcfanno.annotated',vep_vcfanno_vcf) + '.gz'
        vep_vcfanno_annotated_pass_vcf = re.sub(r'\.vcfanno','.vcfanno.annotated.pass',vep_vcfanno_vcf) + '.gz'

        ## File names for assembly-specific genome fasta files (VEP)
        fasta_assembly = os.path.join(vep_dir, f"homo_sapiens/{VEP_VERSION}_{VEP_ASSEMBLY}/Homo_sapiens.{VEP_ASSEMBLY}.dna.primary_assembly.fa.gz")
        ancestor_assembly = os.path.join(vep_dir, f"homo_sapiens/{VEP_VERSION}_{VEP_ASSEMBLY}/human_ancestor.fa.gz")

        ## Set all flags used in VEP run
        plugins_in_use = "NearestExonJB, LoF"
        vep_flags = (
            "--format vcf --vcf --check_ref --flag_pick_allele_gene --hgvs --dont_skip --failed 1 --af --af_1kg --af_gnomad "
            "--variant_class --domains --symbol --protein --ccds --uniprot --appris --biotype --canonical --cache "
            "--numbers --total_length --no_stats --allele_number --no_escape --xref_refseq --plugin NearestExonJB,max_range=50000"
            )
        vep_options = (
            f"--pick_order {arg_dict['vep_pick_order']} --force_overwrite --buffer_size {arg_dict['vep_buffer_size']} "
            f"--species homo_sapiens --assembly {VEP_ASSEMBLY} --offline --fork {arg_dict['vep_n_forks']} {vep_flags} --dir {vep_dir} "
            f"--cache_version {VEP_VERSION}"
            )
        gencode_set_in_use = "GENCODE - all transcripts"
        if arg_dict['vep_gencode_all'] == 0:
            vep_options += ' --gencode_basic'
            gencode_set_in_use = "GENCODE - basic transcript set (--gencode_basic)"
        if arg_dict['vep_no_intergenic'] == 1:
            vep_options += " --no_intergenic"
        if arg_dict['vep_regulatory'] == 1:
            vep_options += " --regulatory"
        if arg_dict['genome_assembly'] == "grch38":
            vep_options += " --mane"
        loftee_dir = utils.get_loftee_dir()
        # NOTE(review): assert is stripped under `python -O`; kept so that the
        # exception type raised on a missing plugin directory stays the same
        assert os.path.isdir(loftee_dir), f'LoF VEP plugin is not found in {loftee_dir}. Please make sure you installed pcgr conda package and have corresponding conda environment active.'
        vep_options += f" --plugin LoF,loftee_path:{loftee_dir},human_ancestor_fa:{ancestor_assembly},use_gerp_end_trunc:0 --dir_plugins {loftee_dir}"
        if not debug:
            vep_options += " --quiet"

        ## Compose full VEP command
        vep_main_command = f'{utils.get_perl_exports()} && vep --input_file {input_vcf_cpsr_ready} --output_file {vep_vcf} {vep_options} --fasta {fasta_assembly}'
        vep_bgzip_command = f'bgzip -f {vep_vcf}'
        vep_tabix_command = f'tabix -f -p vcf {vep_vcf}.gz'
        logger = getlogger('cpsr-vep')

        ## CPSR|VEP - run Variant Effect Predictor on query VCF with LoF and NearestExonJB plugins
        logger.info(f"CPSR - STEP 1: Basic variant annotation with Variant Effect Predictor ({VEP_VERSION}, GENCODE {GENCODE_VERSION}, {arg_dict['genome_assembly']})")
        logger.info("VEP configuration - one primary consequence block pr. alternative allele (--flag_pick_allele)")
        logger.info(f"VEP configuration - transcript pick order: {arg_dict['vep_pick_order']}")
        logger.info("VEP configuration - transcript pick order: See more at https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick_options")
        logger.info(f"VEP configuration - GENCODE set: {gencode_set_in_use}")
        logger.info(f"VEP configuration - skip intergenic: {arg_dict['vep_no_intergenic']}")
        logger.info(f"VEP configuration - look for overlap with regulatory regions: {vep_regulatory}")
        logger.info(f"VEP configuration - plugins in use: {plugins_in_use}")
        logger.info(f"VEP configuration - buffer_size/number of forks: {arg_dict['vep_buffer_size']}/{arg_dict['vep_n_forks']}")
        check_subprocess(logger, vep_main_command, debug)
        check_subprocess(logger, vep_bgzip_command, debug)
        check_subprocess(logger, vep_tabix_command, debug)
        logger.info("Finished cpsr-vep")
        print('----')

        ## CPSR|vcfanno - run vcfanno on query VCF with a number of relevant annotated VCFs
        logger = getlogger('cpsr-vcfanno')
        logger.info("CPSR - STEP 2: Annotation for cancer predisposition with cpsr-vcfanno")
        logger.info("(ClinVar, CIViC, dbNSFP, dbMTS, UniProtKB, cancerhotspots.org, ncER, GERP RS scores, GWAS catalog, gnomAD non-cancer subset)")
        pcgr_vcfanno_command = (
                f"pcgr_vcfanno.py --num_processes {arg_dict['vcfanno_n_proc']} --dbnsfp --clinvar "
                "--cancer_hotspots --dbmts --ncer --gerp --civic --uniprot --gnomad_cpsr --pcgr_onco_xref "
                f"--gwas --rmsk {vep_vcf}.gz {vep_vcfanno_vcf} {os.path.join(data_dir, 'data', str(arg_dict['genome_assembly']))}"
                )
        check_subprocess(logger, pcgr_vcfanno_command, debug)
        logger.info("Finished cpsr-vcfanno")
        print('----')

        ## CPSR|summarise - expand annotations with separate VCF INFO tags
        logger = getlogger("cpsr-summarise")
        pcgr_summarise_command = (
                f'pcgr_summarise.py {vep_vcfanno_vcf}.gz 0 {vep_regulatory} '
                f'{os.path.join(data_dir, "data", arg_dict["genome_assembly"])} '
                f'--cpsr {"--debug" if debug else ""}'
                )
        logger.info("CPSR - STEP 3: Cancer gene annotations with cpsr-summarise")
        check_subprocess(logger, pcgr_summarise_command, debug)

        ## CPSR|clean - rename output files, remove temporary files
        os.rename(vep_vcfanno_annotated_vcf, output_vcf)
        os.rename(f'{vep_vcfanno_annotated_vcf}.tbi', f'{output_vcf}.tbi')
        os.rename(vep_vcfanno_annotated_pass_vcf, output_pass_vcf)
        os.rename(f'{vep_vcfanno_annotated_pass_vcf}.tbi', f'{output_pass_vcf}.tbi')
        delete_files = (
                glob(f'{vep_vcf}*') +
                glob(f'{vep_vcfanno_annotated_vcf}') +
                glob(f'{vep_vcfanno_annotated_pass_vcf}*') +
                glob(f'{vep_vcfanno_vcf}*') +
                glob(f'{input_vcf_cpsr_ready_uncompressed}*')
                )
        # do not delete if debugging
        if not debug:
            for fn in delete_files:
                utils.remove(fn)
        logger.info('Finished cpsr-summarise main command')
        ## CPSR|vcf2tsv - perform vcf2tsv conversion on the final annotated VCF file
        cpsr_vcf2tsv_command = f"vcf2tsv.py {output_pass_vcf} --compress {output_pass_tsv}"
        logger.info("Converting VCF to TSV with https://github.com/sigven/vcf2tsv")
        check_subprocess(logger, cpsr_vcf2tsv_command, debug)
        logger.info('Finished cpsr-summarise-vcf2tsv')
    logger.info('Finished cpsr-summarise')
    print('----')

    ## Generation of HTML reports for VEP/vcfanno-annotated VCF file
    if not arg_dict['basic']:
        logger = getlogger('cpsr-writer')
        logger.info("CPSR - STEP 4: Generation of output files - Cancer predisposition sequencing report")

        # export PATH to R conda env Rscript
        rscript = utils.script_path("pcgrr", "bin/Rscript")
        cpsrr_script = utils.script_path('pcgr', 'bin/cpsr.R')
        ## Arguments are positional - the order below must not be changed
        cpsr_report_command = (
                f"{rscript} {cpsrr_script} "
                f"{output_dir} "
                f"{output_pass_tsv}.gz "
                f"{arg_dict['sample_id']} "
                f"{pcgr_vars.PCGR_VERSION} "
                f"{pcgr_vars.DB_VERSION} "
                f"{arg_dict['genome_assembly']} "
                f"{data_dir} "
                f"{virtual_panel_id} "
                f"{preserved_info_tags} "
                f"{custom_bed} "
                f"{arg_dict['custom_list_name']} "
                f"{arg_dict['report_theme']} "
                f"{arg_dict['report_table_display']} "
                f"{report_nonfloating_toc} "
                f"{gwas_findings} "
                f"{arg_dict['gwas_p_value']} "
                f"{arg_dict['pop_gnomad']} "
                f"{arg_dict['maf_upper_threshold']} "
                f"{arg_dict['vep_pick_order']} "
                f"{arg_dict['vep_n_forks']} "
                f"{arg_dict['vep_buffer_size']} "
                f"{arg_dict['vep_gencode_all']} "
                f"{vep_no_intergenic} "
                f"{vep_regulatory} "
                f"{secondary_findings} "
                f"{classify_all} "
                f"{ignore_noncoding} "
                f"{clinvar_ignore_noncancer} "
                f"{diagnostic_grade_only}"
           )

        if debug:
            print(cpsr_report_command)
        check_subprocess(logger, cpsr_report_command, debug)
        logger.info("Finished CPSR!")
        print('----')
    print()
예제 #6
0
파일: arg_checker.py 프로젝트: sigven/pcgr
def check_args(arg_dict):
    """
    Validate the option values in arg_dict for a PCGR run.

    Hard problems are reported through error_message() and soft problems
    through warn_message(), both with a dedicated logger. Two entries of
    arg_dict may be modified in place:

      - 'estimate_msi_status' is set to 0 when MSI prediction is requested for
        a TARGETED assay or in tumor-only mode
      - 'exclude_pon' is set to False in tumor-only mode when no
        panel-of-normals VCF ('pon_vcf') is provided

    Args:
        arg_dict: dict of option values, keyed by option name.

    Returns:
        None
    """
    logger = getlogger("pcgr-validate-arguments-input-a")

    # Check the existence of required arguments
    if arg_dict['pcgr_dir'] is None or not os.path.exists(
            arg_dict['pcgr_dir']):
        err_msg = f"Required argument '--pcgr_dir' does not exist ({arg_dict['pcgr_dir']})."
        error_message(err_msg, logger)

    if arg_dict['genome_assembly'] is None:
        err_msg = f"Required argument '--genome_assembly' has no/undefined value ({arg_dict['genome_assembly']})."
        error_message(err_msg, logger)

    if arg_dict['input_vcf'] is None:
        err_msg = f"Required argument '--input_vcf' does not exist ({arg_dict['input_vcf']})."
        error_message(err_msg, logger)

    if arg_dict['sample_id'] is None:
        err_msg = f"Required argument '--sample_id' has no/undefined value ({arg_dict['sample_id']})."
        error_message(err_msg, logger)

    # NOTE(review): the condition rejects identifiers of length 2, while the
    # message says "between 2 and 35" - confirm which of the two is intended
    if len(arg_dict['sample_id']) <= 2 or len(arg_dict['sample_id']) > 35:
        err_msg = f"Sample name identifier ('--sample_id' = {arg_dict['sample_id']}) must be between 2 and 35 characters long"
        error_message(err_msg, logger)

    # Optional arguments

    # check if input is cancer cell line, requires --tumor_only
    if arg_dict['cell_line'] and not arg_dict['tumor_only']:
        err_msg = 'Analysis of cell line (--cell_line) needs option --tumor_only'
        error_message(err_msg, logger)

    # check that tumor primary site/type is set correctly (integer between 0
    # and the largest key of pcgr_vars.tsites)
    max_tsite = max(pcgr_vars.tsites.keys())
    if arg_dict['tsite'] > max_tsite or arg_dict['tsite'] < 0:
        err_msg = f"Tumor type code ('--tumor_site' = {arg_dict['tsite']}) must be within [0, {max_tsite}]"
        error_message(err_msg, logger)

    # check that tumor purity and tumor ploidy is set correctly
    if arg_dict['tumor_purity'] is not None:
        if not (arg_dict['tumor_purity'] > 0
                and arg_dict['tumor_purity'] <= 1):
            err_msg = f"Tumor purity value ('--tumor_purity' = {arg_dict['tumor_purity']}) must be within (0, 1]"
            error_message(err_msg, logger)

    if arg_dict['tumor_ploidy'] is not None:
        if not arg_dict['tumor_ploidy'] > 0:
            err_msg = f"Tumor ploidy value ('--tumor_ploidy' = {arg_dict['tumor_ploidy']}) must be > 0"
            error_message(err_msg, logger)

    # check that minimum/maximum depth/allelic fractions are set correctly
    if arg_dict['tumor_dp_min'] < 0:
        err_msg = f"Minimum depth tumor ('tumor_dp_min' = {arg_dict['tumor_dp_min']}) must be >= 0"
        error_message(err_msg, logger)

    if arg_dict['tumor_af_min'] < 0 or arg_dict['tumor_af_min'] > 1:
        err_msg = f"Minimum AF tumor ('tumor_af_min' = {arg_dict['tumor_af_min']}) must be within [0, 1]"
        error_message(err_msg, logger)

    if arg_dict['control_dp_min'] < 0:
        err_msg = f"Minimum depth control ('control_dp_min' = {arg_dict['control_dp_min']}) must be >= 0"
        error_message(err_msg, logger)

    if arg_dict['control_af_max'] < 0 or arg_dict['control_af_max'] > 1:
        err_msg = f"Maximum AF control ('control_af_max' = {arg_dict['control_af_max']}) must be within [0, 1]"
        error_message(err_msg, logger)

    # Check that coding target size region of sequencing assay is set correctly
    if arg_dict['target_size_mb'] < 0 or arg_dict['target_size_mb'] > 34:
        err_msg = f"Coding target size region in Mb ('--target_size_mb' = {arg_dict['target_size_mb']}) is not positive or larger than the likely maximum size of the coding human genome (34 Mb))"
        error_message(err_msg, logger)
    if arg_dict['target_size_mb'] < 1:
        warn_msg = f"Coding target size region in Mb ('--target_size_mb' = {arg_dict['target_size_mb']}) must be greater than 1 Mb for mutational burden estimate to be robust"
        warn_message(warn_msg, logger)
    if arg_dict['target_size_mb'] < 34 and arg_dict['assay'] != 'TARGETED':
        warn_msg = f"Coding target size region in Mb ('--target_size_mb' = {arg_dict['target_size_mb']}) is less than default for WES/WGS (34Mb), assay must be set to 'TARGETED'"
        warn_message(warn_msg, logger)

    # if assay is targeted or mode is Tumor-Only, MSI prediction will not be performed/switched off
    assay_type = 'Tumor-Control'
    if arg_dict['estimate_msi_status'] is True and (
            arg_dict['assay'] == 'TARGETED' or arg_dict['tumor_only'] is True):
        if arg_dict['tumor_only'] is True:
            assay_type = 'Tumor-Only'
        warn_msg = f"MSI status prediction can be applied for WGS/WES tumor-control assays only (query type: {arg_dict['assay']}|{assay_type}) - analysis will be omitted"
        warn_message(warn_msg, logger)
        arg_dict['estimate_msi_status'] = 0

    # minimum number of mutations required for mutational signature reconstruction cannot be less than 100 (somewhat arbitrary lower threshold, recommended value is 200)
    if arg_dict['min_mutations_signatures'] < 200:
        warn_msg = f"Minimum number of mutations required for mutational signature analysis ('--min_mutations_signatures' = {arg_dict['min_mutations_signatures']}) is less than the recommended number (n = 200)"
        warn_message(warn_msg, logger)
        if arg_dict['min_mutations_signatures'] < 100:
            err_msg = f"Minimum number of mutations required for mutational signature analysis ('--min_mutations_signatures' = {arg_dict['min_mutations_signatures']}) must be >= 100"
            error_message(err_msg, logger)

    # if MSI status is to be estimated, mutational burden must be turned on
    if arg_dict['estimate_msi_status'] is True and arg_dict[
            'estimate_tmb'] is False:
        err_msg = "Prediction of MSI status ('--estimate_msi_status') requires mutational burden analysis ('--estimate_tmb')"
        error_message(err_msg, logger)

    if arg_dict['tumor_only'] is True:
        for t in [
                'exclude_likely_het_germline', 'exclude_likely_hom_germline'
        ]:
            if arg_dict[t]:
                if arg_dict['tumor_af_tag'] == "_NA_":
                    err_msg = f"Option '--{t}' requires '--tumor_af_tag' option to be set"
                    error_message(err_msg, logger)

        # Emit warning if panel-of-normals VCF is not present and exclude_pon is set
        if arg_dict['pon_vcf'] is None and arg_dict['exclude_pon'] is True:
            warn_msg = "Panel-of-normals VCF is NOT provided ('--pon_vcf') - exclusion of calls found in panel-of-normals ('--exclude_pon') will be ignored"
            warn_message(warn_msg, logger)
            arg_dict['exclude_pon'] = False

        # Emit warnings that mutational burden and mutational signatures are less accurate for assays with tumor-only data
        if arg_dict['estimate_tmb'] is True:
            warn_msg = "Estimation of mutational burden in tumor-only mode is suboptimal - results must be interpreted with caution"
            warn_message(warn_msg, logger)
        if arg_dict['estimate_signatures'] is True:
            warn_msg = "Estimation of mutational signatures in tumor-only mode is suboptimal - results must be interpreted with caution"
            warn_message(warn_msg, logger)

        # Emit errors when tumor-only filtering thresholds are not properly set
        for pop in ['eur', 'afr', 'amr', 'eas', 'sas', 'global']:
            tag = f'maf_onekg_{pop}'
            if arg_dict[tag]:
                if float(arg_dict[tag]) < 0 or float(arg_dict[tag]) > 1:
                    err_msg = f"MAF threshold (tumor-only germline filter) for 1000 Genomes Project (pop '{pop.upper()}') must be within the [0, 1] range, current value is {arg_dict[tag]}"
                    error_message(err_msg, logger)

        for pop in [
                'nfe', 'fin', 'amr', 'eas', 'sas', 'asj', 'oth', 'afr',
                'global'
        ]:
            tag = f'maf_gnomad_{pop}'
            if arg_dict[tag]:
                if float(arg_dict[tag]) < 0 or float(arg_dict[tag]) > 1:
                    err_msg = f"MAF threshold (tumor-only germline filter) for gnomAD (pop '{pop.upper()}') must be within the [0, 1] range, current value is {arg_dict[tag]}"
                    error_message(err_msg, logger)

    ## tumor-only is False
    # else:
    #    for t in ["exclude_pon","exclude_likely_het_germline","exclude_likely_hom_germline","exclude_dbsnp_nonsomatic","exclude_nonexonic"]:
    #       if arg_dict[t] is True:
    #          warn_msg = "Option "--" + str(t) + "" requires "--tumor_only" option (not currently set)"
    #          warn_message(warn_msg, logger)

    # Emit warning that mutational signature estimation is (likely) not optimal for small/targeted sequencing assays
    if arg_dict['estimate_signatures'] is True and arg_dict[
            'assay'] == 'TARGETED':
        warn_msg = "Estimation of mutational signatures ('--estimate_signatures') is not optimal for TARGETED sequencing assays - results must be interpreted with caution"
        warn_message(warn_msg, logger)

    # Check that log ratio thresholds for homozygous deletions and amplifications are properly set, and that segment overlap with transcripts are set appropriately
    if arg_dict['logr_homdel'] >= 0:
        err_msg = f"Log ratio for homozygous deletions ('--logr_homdel' = {arg_dict['logr_homdel']}) should be < 0"
        error_message(err_msg, logger)
    if arg_dict['logr_gain'] <= 0:
        err_msg = f"Log ratio for copy number gains/amplifications ('--logr_gain' = {arg_dict['logr_gain']}) should be > 0"
        error_message(err_msg, logger)
    if arg_dict['cna_overlap_pct'] > 100 or arg_dict['cna_overlap_pct'] <= 0:
        err_msg = f"Minimum percent overlap between copy number segment and gene transcript ('--cna_overlap_pct' = {arg_dict['cna_overlap_pct']}) must be within (0, 100]"
        error_message(err_msg, logger)

    # VEP options
    if arg_dict['vep_n_forks'] <= 0 or arg_dict['vep_n_forks'] > 4:
        err_msg = f"Number of forks that VEP can use during annotation ('--vep_n_forks' = {arg_dict['vep_n_forks']}) must be within (0, 4]"
        error_message(err_msg, logger)

    if arg_dict['vep_buffer_size'] <= 0 or arg_dict['vep_buffer_size'] > 30000:
        err_msg = f"Internal VEP buffer size, corresponding to the number of variants that are read in to memory simultaneously ('--vep_buffer_size' = {arg_dict['vep_buffer_size']}),  must be within (0, 30000]"
        error_message(err_msg, logger)

    # Check that VEP pick criteria is formatted correctly
    if arg_dict['vep_pick_order'] is not None:
        values = str(arg_dict['vep_pick_order']).split(',')
        permitted_sources = [
            'canonical', 'appris', 'tsl', 'biotype', 'ccds', 'rank', 'length',
            'mane'
        ]
        # Every criterion must occur exactly once, in any order. Merely
        # counting permitted tokens would accept lists with duplicates (and
        # hence a missing criterion) or with additional unknown tokens.
        if sorted(values) != sorted(permitted_sources):
            err_msg = (
                f"'--vep_pick_order' = {arg_dict['vep_pick_order']} is formatted incorrectly, should be "
                "a comma-separated string of the following values: canonical,appris,tsl,biotype,ccds,rank,length,mane"
            )
            error_message(err_msg, logger)
    return
예제 #7
0
파일: arg_checker.py 프로젝트: sigven/pcgr
def verify_input_files_cpsr(arg_dict):
    """
    Check the input files and data bundle directories of a CPSR run and
    collect the resolved paths.

    Creates the output directory through utils.safe_makedir(), then checks
    the custom gene list, the input VCF (existence, '.vcf'/'.vcf.gz'
    extension, '.tbi' index for bgzipped input, pre-existing output), and the
    data bundle (base directory, 'data' directory, assembly directory,
    RELEASE_NOTES mentioning pcgr_vars.DB_VERSION). Every problem is reported
    through error_message().

    Args:
        arg_dict: dict of option values; the keys 'output_dir', 'custom_list',
            'input_vcf', 'sample_id', 'genome_assembly', 'force_overwrite'
            and 'pcgr_dir' are read.

    Returns:
        dict with the keys 'input_vcf_dir', 'input_customlist_dir', 'db_dir'
        (the assembly-specific data directory), 'base_dir', 'output_dir',
        'input_vcf_basename' and 'input_customlist_basename'. Entries for an
        input that was not provided hold the string "NA".
    """
    logger = getlogger('cpsr-validate-input-arguments-b')

    ## "NA" marks an input that was not provided
    input_vcf_dir = "NA"
    input_vcf_basename = "NA"
    input_customlist_basename = "NA"
    input_customlist_dir = "NA"

    # create output folder (if not already exists)
    output_dir_full = utils.safe_makedir(
        os.path.abspath(arg_dict['output_dir']))

    ## check if input BED exist
    if arg_dict['custom_list'] is not None:
        custom_list_abs = os.path.abspath(arg_dict['custom_list'])
        if not os.path.exists(custom_list_abs):
            err_msg = f"Input file ({arg_dict['custom_list']}) does not exist"
            error_message(err_msg, logger)

        input_customlist_basename = os.path.basename(
            str(arg_dict['custom_list']))
        input_customlist_dir = os.path.dirname(custom_list_abs)

    ## check if input vcf exist
    if arg_dict['input_vcf'] is not None:
        input_vcf_abs = os.path.abspath(arg_dict['input_vcf'])
        if not os.path.exists(input_vcf_abs):
            err_msg = f"Input file ({arg_dict['input_vcf']}) does not exist"
            error_message(err_msg, logger)

        if not input_vcf_abs.endswith(('.vcf', '.vcf.gz')):
            err_msg = f"VCF input file ({input_vcf_abs}) does not have the correct file extension (.vcf or .vcf.gz)"
            error_message(err_msg, logger)

        ## check that tabix file exist if bgzipped files is given
        if input_vcf_abs.endswith('.vcf.gz'):
            tabix_file = arg_dict['input_vcf'] + '.tbi'
            if not os.path.exists(os.path.abspath(tabix_file)):
                err_msg = (
                    "Tabix file (i.e. '.gz.tbi') is not present for the bgzipped VCF input file ("
                    f"{input_vcf_abs}"
                    "). Please make sure your input VCF is properly compressed and indexed (bgzip + tabix)"
                )
                error_message(err_msg, logger)

        input_vcf_basename = os.path.basename(str(arg_dict['input_vcf']))
        input_vcf_dir = os.path.dirname(input_vcf_abs)

        ## if output vcf exist and overwrite not set
        output_vcf = os.path.join(str(output_dir_full),
                                  str(arg_dict['sample_id'])) + '.cpsr.' + str(
                                      arg_dict['genome_assembly']) + '.vcf.gz'
        if os.path.exists(output_vcf) and arg_dict['force_overwrite'] is False:
            err_msg = f"Output files (e.g. {output_vcf}) already exist - please specify different sample_id or add option --force_overwrite"
            error_message(err_msg, logger)

    ## check the existence of base folder
    base_dir = os.path.abspath(arg_dict['pcgr_dir'])
    if not os.path.isdir(base_dir):
        err_msg = f"Base directory ({base_dir}) does not exist"
        error_message(err_msg, logger)

    ## check the existence of data folder within the base folder
    db_dir = os.path.join(os.path.abspath(arg_dict['pcgr_dir']), 'data')
    if not os.path.isdir(db_dir):
        err_msg = f"Data directory ({db_dir}) does not exist"
        error_message(err_msg, logger)

    ## check the existence of specified assembly data folder within the base folder
    db_assembly_dir = os.path.join(os.path.abspath(arg_dict['pcgr_dir']),
                                   'data', arg_dict['genome_assembly'])
    if not os.path.isdir(db_assembly_dir):
        err_msg = f"Data directory for the specified genome assembly ({db_assembly_dir}) does not exist"
        error_message(err_msg, logger)

    ## check the existence of RELEASE_NOTES
    rel_notes_file = os.path.join(os.path.abspath(arg_dict['pcgr_dir']),
                                  'data', arg_dict['genome_assembly'],
                                  'RELEASE_NOTES')
    if not os.path.exists(rel_notes_file):
        err_msg = 'The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/cpsr for instructions)'
        error_message(err_msg, logger)

    ## The data bundle is compliant when any line of RELEASE_NOTES mentions the
    ## expected database version; the context manager guarantees that the file
    ## handle is released even if reading fails
    with open(rel_notes_file, 'r') as f_rel_not:
        compliant_data_bundle = any(
            pcgr_vars.DB_VERSION in line for line in f_rel_not)

    if not compliant_data_bundle:
        err_msg = 'The PCGR data bundle is not compliant with the software version - please download the latest software and data bundle (see https://github.com/sigven/cpsr for instructions)'
        error_message(err_msg, logger)

    cpsr_paths = {
        "input_vcf_dir": input_vcf_dir,
        "input_customlist_dir": input_customlist_dir,
        "db_dir": db_assembly_dir,
        "base_dir": base_dir,
        "output_dir": output_dir_full,
        "input_vcf_basename": input_vcf_basename,
        "input_customlist_basename": input_customlist_basename,
    }

    return cpsr_paths
예제 #8
0
파일: arg_checker.py 프로젝트: sigven/pcgr
def check_args_cpsr(arg_dict):
    """
    Validate the parsed command-line arguments of a CPSR run.

    Every violation is reported through error_message() and ignorable option
    combinations through warn_message(). Both are defined elsewhere in this
    module; NOTE(review): error_message() presumably logs and terminates the
    run - confirm.

    Side effect: arg_dict['vep_regulatory'] is unconditionally set to True.

    Keys read from arg_dict: input_vcf, pcgr_dir, genome_assembly, sample_id,
    virtual_panel_id, custom_list, maf_upper_threshold, vcfanno_n_proc,
    diagnostic_grade_only, vep_n_forks, vep_buffer_size, vep_pick_order.

    Returns None.
    """

    logger = getlogger('cpsr-validate-input-arguments-a')
    arg_dict['vep_regulatory'] = True

    ## Required arguments
    ## Check that query VCF is set and exists
    if arg_dict['input_vcf'] is None or not os.path.exists(
            arg_dict['input_vcf']):
        err_msg = f"Required argument '--input_vcf' does not exist ({arg_dict['input_vcf']})."
        error_message(err_msg, logger)
    ## Check that PCGR directory (with data bundle) is provided and exists
    if arg_dict['pcgr_dir'] is None or not os.path.exists(
            arg_dict['pcgr_dir']):
        err_msg = f"Required argument '--pcgr_dir' does not exist ({arg_dict['pcgr_dir']})."
        error_message(err_msg, logger)
    ## Check that genome assembly is set
    if arg_dict['genome_assembly'] is None:
        err_msg = f"Required argument '--genome_assembly' has no/undefined value ({arg_dict['genome_assembly']})."
        error_message(err_msg, logger)
    ## Check that sample identifier is set and is of appropriate length (more than two characters)
    if arg_dict['sample_id'] is None:
        err_msg = f"Required argument '--sample_id' has no/undefined value ({arg_dict['sample_id']})."
        error_message(err_msg, logger)
    elif len(arg_dict['sample_id']) <= 2:
        # 'elif' keeps len() away from a missing (None) identifier
        err_msg = f"Sample name identifier ('--sample_id') requires a name with more than two characters ({arg_dict['sample_id']})."
        error_message(err_msg, logger)

    ### Optional arguments
    ## Provide virtual_panel_id or a custom list from panel 0
    ## ("-1" is the value that marks 'no panel chosen')
    if arg_dict['virtual_panel_id'] == "-1" and not arg_dict['custom_list']:
        err_msg = 'Provide valid virtual panel identifier(s) through --panel_id (0 - 42) or provide custom list of panel 0 genes (single column text file) through --custom_list'
        error_message(err_msg, logger)
    if arg_dict['custom_list'] and arg_dict['virtual_panel_id'] != "-1":
        err_msg = "Option --panel_id cannot be used in conjunction with --custom_list"
        error_message(err_msg, logger)
    if arg_dict['maf_upper_threshold'] <= 0 or arg_dict[
            'maf_upper_threshold'] > 1:
        err_msg = f"MAF upper threshold must be greater than 0 and below 1, current value is {arg_dict['maf_upper_threshold']}"
        error_message(err_msg, logger)
    if arg_dict['vcfanno_n_proc'] <= 0 or arg_dict['vcfanno_n_proc'] > 15:
        err_msg = f"Number of processes that vcfanno can use during annotation must be above 0 and not more than 15, current value is {arg_dict['vcfanno_n_proc']}."
        error_message(err_msg, logger)

    ## Check that panel identifier(s) are set appropriately
    if arg_dict['virtual_panel_id'] != "-1" and not arg_dict['custom_list']:
        if ',' not in arg_dict['virtual_panel_id']:
            ## A single panel: any integer in 0 - 42
            if str(arg_dict['virtual_panel_id']).isdigit():
                panel_id = int(arg_dict['virtual_panel_id'])
                if not 0 <= panel_id <= 42:
                    err_msg = 'A single panel chosen with \'--panel_id\' must be in the range 0 - 42'
                    error_message(err_msg, logger)
            else:
                err_msg = 'A single panel chosen with \'--panel_id\' must be a proper integer - not \'' + str(
                    arg_dict['virtual_panel_id']) + '\''
                error_message(err_msg, logger)
        else:
            ## Multiple panels: every entry must be an integer in 1 - 42
            ## (panel 0 is not accepted in a comma-separated list)
            panels = str(arg_dict['virtual_panel_id']).split(',')
            for p in panels:
                if str(p).isdigit():
                    panel_id = int(p)
                    if panel_id < 1 or panel_id > 42:
                        err_msg = 'Multiple panels submitted as comma-separated string with \'--panel_id\' must take values in the range 1 - 42'
                        error_message(err_msg, logger)
                else:
                    err_msg = f"Multiple panels submitted as comma-separated string with '--panel_id' must contain proper integer values only - \'{arg_dict['virtual_panel_id']}\' contains non-integer entries."
                    error_message(err_msg, logger)

    if (arg_dict['custom_list'] or arg_dict['virtual_panel_id']
            == "0") and arg_dict['diagnostic_grade_only']:
        warn_msg = 'Option \'--diagnostic_grade_only\' applies ONLY to panel identifiers from Genomics England PanelApp - will be ignored'
        warn_message(warn_msg, logger)

    ## VEP options
    if arg_dict['vep_n_forks'] <= 0 or arg_dict['vep_n_forks'] > 4:
        err_msg = f"Number of forks that VEP can use during annotation must be above 0 and not more than 4, current value is {arg_dict['vep_n_forks']}"
        error_message(err_msg, logger)

    if arg_dict['vep_buffer_size'] <= 0 or arg_dict['vep_buffer_size'] > 30000:
        ## f-string, so that the offending value is actually shown to the user
        err_msg = f"Internal VEP buffer size, corresponding to the number of variants that are read in to memory simultaneously, must be above 0 and not more than 30,000, current value is {arg_dict['vep_buffer_size']}"
        error_message(err_msg, logger)

    ## Check that VEP pick criteria is formatted correctly
    if arg_dict['vep_pick_order'] is not None:
        values = str(arg_dict['vep_pick_order']).split(',')
        permitted_sources = [
            'canonical', 'appris', 'tsl', 'biotype', 'ccds', 'rank', 'length',
            'mane'
        ]
        ## Exactly eight entries of the submitted list must be permitted criteria
        ## NOTE(review): duplicated or additional unknown entries are not
        ## rejected by this count - confirm whether a strict permutation
        ## check is wanted
        num_permitted_sources = sum(
            1 for v in values if v in permitted_sources)

        if num_permitted_sources != 8:
            err_msg = "Option 'vep_pick_order' = " + str(arg_dict['vep_pick_order']) + " is formatted incorrectly, should be " + \
               "a comma-separated string of the following values: canonical,appris,tsl,biotype,ccds,rank,length,mane"
            error_message(err_msg, logger)
예제 #9
0
파일: arg_checker.py 프로젝트: sigven/pcgr
def verify_input_files(arg_dict):
    """
    1. Checks existence of input files/dirs (arg_dict)
    2. Checks that the data bundle is of correct date

    Problems are reported through error_message() / warn_message(), both
    defined elsewhere in this module. NOTE(review): error_message()
    presumably logs and terminates the run - confirm.

    Side effects:
    - arg_dict['rna_fusion_tumor'] and arg_dict['rna_exp_tumor'] are reset to
      None, so the RNA checks further down are currently never triggered
    - the output directory is created through utils.safe_makedir()

    Returns a dictionary with the directory and basename of every input file
    ('NA' for inputs that are not provided), plus the keys 'db_dir' (the
    assembly-specific data directory), 'base_dir' and 'output_dir'.
    """
    logger = getlogger("pcgr-validate-arguments-input-b")

    def _checked_location(path, label, extensions):
        """Report a missing file or a wrong extension; return (basename, dirname) of path."""
        abs_path = os.path.abspath(path)
        if not os.path.exists(abs_path):
            error_message(f"Input file ({path}) does not exist", logger)
        if not abs_path.endswith(extensions):
            permitted = " or ".join(extensions)
            error_message(
                f"{label} ({abs_path}) does not have the correct file extension ({permitted})",
                logger)
        return os.path.basename(str(path)), os.path.dirname(abs_path)

    # 'NA' marks an input that is not provided
    input_vcf_dir = 'NA'
    input_cna_dir = 'NA'
    input_rna_fusion_dir = 'NA'
    input_cpsr_report_dir = 'NA'
    input_rna_expression_dir = 'NA'
    input_cna_plot_dir = 'NA'
    panel_normal_vcf_dir = 'NA'
    panel_normal_vcf_basename = 'NA'
    input_vcf_basename = 'NA'
    input_cna_basename = 'NA'
    input_rna_fusion_basename = 'NA'
    input_rna_expression_basename = 'NA'
    input_cpsr_report_basename = 'NA'
    input_cna_plot_basename = 'NA'

    arg_dict['rna_fusion_tumor'] = None
    arg_dict['rna_exp_tumor'] = None

    # check that either input vcf or cna segments exist
    if arg_dict['input_vcf'] is None and arg_dict['input_cna'] is None:
        err_msg = 'Please specifiy either a VCF input file (--input_vcf) or a copy number segment file (--input_cna)'
        error_message(err_msg, logger)

    # create output folder (if not already exists)
    output_dir_full = utils.safe_makedir(
        os.path.abspath(arg_dict['output_dir']))

    # check if panel of normal VCF exist
    if arg_dict["pon_vcf"] is not None:
        pon_basename, pon_dir = _checked_location(
            arg_dict["pon_vcf"], "Panel of normals VCF file", (".vcf.gz",))

        # check that tabix file exist if bgzipped files is given
        if os.path.abspath(arg_dict["pon_vcf"]).endswith(".vcf.gz"):
            tabix_file = arg_dict["pon_vcf"] + ".tbi"
            if not os.path.exists(os.path.abspath(tabix_file)):
                err_msg = "Tabix file (i.e. '.gz.tbi') is not present for the bgzipped panel of normal VCF file (" + os.path.abspath(arg_dict["pon_vcf"]) + \
                    "). Please make sure your the VCF is properly compressed and indexed (bgzip + tabix)"
                error_message(err_msg, logger)

        # the panel of normals is only recorded together with a query VCF
        if arg_dict["input_vcf"] is None:
            warn_msg = "Ignoring panel of normal VCF file, --input_vcf missing"
            warn_message(warn_msg, logger)
        else:
            panel_normal_vcf_basename = pon_basename
            panel_normal_vcf_dir = pon_dir

    # check if input vcf exists
    if arg_dict["input_vcf"] is not None:
        input_vcf_basename, input_vcf_dir = _checked_location(
            arg_dict["input_vcf"], "VCF input file", (".vcf", ".vcf.gz"))

        # check that tabix file exists if bgzipped file is given
        if os.path.abspath(arg_dict["input_vcf"]).endswith(".vcf.gz"):
            tabix_file = arg_dict["input_vcf"] + ".tbi"
            if not os.path.exists(os.path.abspath(tabix_file)):
                err_msg = "Tabix file (i.e. '.gz.tbi') is not present for the bgzipped VCF input file (" + os.path.abspath(arg_dict["input_vcf"]) + \
                    "). Please make sure your input VCF is properly compressed and indexed (bgzip + tabix)"
                error_message(err_msg, logger)

        # if output vcf exist and overwrite not set
        output_vcf = os.path.join(
            str(output_dir_full),
            f"{arg_dict['sample_id']}.pcgr_acmg.{arg_dict['genome_assembly']}.vcf.gz"
        )
        if os.path.exists(output_vcf) and arg_dict["force_overwrite"] is False:
            err_msg = f"Output files (e.g. {output_vcf}) already exist - please specify different sample_id or add option --force_overwrite"
            error_message(err_msg, logger)

    # Validation of a CNA plot file (arg_dict["input_cna_plot"]) is currently
    # disabled - input_cna_plot_dir/input_cna_plot_basename stay 'NA'

    # check if input cna segments exist
    if arg_dict["input_cna"] is not None:
        input_cna_basename, input_cna_dir = _checked_location(
            arg_dict["input_cna"], "CNA segment input file", (".tsv", ".txt"))

        # if output cna segments exist and overwrite not set
        output_cna_segments = os.path.join(
            str(output_dir_full),
            f"{arg_dict['sample_id']}.pcgr_acmg.{arg_dict['genome_assembly']}.cna_segments.tsv.gz"
        )
        if os.path.exists(
                output_cna_segments) and arg_dict["force_overwrite"] is False:
            err_msg = f"Output files (e.g. {output_cna_segments}) already exist - please specify different sample_id or add option --force_overwrite"
            error_message(err_msg, logger)

    # check if input rna fusion variants exist
    if arg_dict["rna_fusion_tumor"] is not None:
        input_rna_fusion_basename, input_rna_fusion_dir = _checked_location(
            arg_dict["rna_fusion_tumor"], "RNA fusion variants file",
            (".tsv", ".txt"))

    # check if input rna expression exist
    if arg_dict["rna_exp_tumor"] is not None:
        input_rna_expression_basename, input_rna_expression_dir = _checked_location(
            arg_dict["rna_exp_tumor"], "RNA gene expression file",
            (".tsv", ".txt"))

    # check if input cpsr report exist
    if arg_dict["cpsr_report"] is not None:
        input_cpsr_report_basename, input_cpsr_report_dir = _checked_location(
            arg_dict["cpsr_report"], "CPSR report file", (".json.gz",))

    # check the existence of base folder
    base_dir = os.path.abspath(arg_dict["pcgr_dir"])
    if not os.path.isdir(base_dir):
        err_msg = f"Base directory ({base_dir}) does not exist"
        error_message(err_msg, logger)

    # check the existence of data folder within the base folder
    db_dir = os.path.join(base_dir, "data")
    if not os.path.isdir(db_dir):
        err_msg = f"Data directory ({db_dir}) does not exist"
        error_message(err_msg, logger)

    # check the existence of specified assembly data folder within the base folder
    db_assembly_dir = os.path.join(db_dir, arg_dict["genome_assembly"])
    if not os.path.isdir(db_assembly_dir):
        err_msg = f"Data directory for the specified genome assembly ({db_assembly_dir}) does not exist"
        error_message(err_msg, logger)

    # check the existence of RELEASE_NOTES (starting from 0.4.0)
    rel_notes_file = os.path.join(db_assembly_dir, "RELEASE_NOTES")
    if not os.path.exists(rel_notes_file):
        err_msg = "The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/pcgr for instructions)"
        error_message(err_msg, logger)

    # the bundle is compliant when any line of RELEASE_NOTES mentions the
    # expected database version; 'with' closes the file even on a read error
    with open(rel_notes_file, "r") as f_rel_not:
        compliant_data_bundle = any(
            pcgr_vars.DB_VERSION in line for line in f_rel_not)

    if not compliant_data_bundle:
        err_msg = "The PCGR data bundle is not compliant with the software version - please download the latest software and data bundle (see https://github.com/sigven/pcgr for instructions)"
        error_message(err_msg, logger)

    pcgr_paths = {
        "input_vcf_dir": input_vcf_dir,
        "input_cna_dir": input_cna_dir,
        "input_rna_fusion_dir": input_rna_fusion_dir,
        "input_rna_expression_dir": input_rna_expression_dir,
        "input_cpsr_report_dir": input_cpsr_report_dir,
        "input_cna_plot_dir": input_cna_plot_dir,
        "panel_normal_vcf_dir": panel_normal_vcf_dir,
        "db_dir": db_assembly_dir,
        "base_dir": base_dir,
        "output_dir": output_dir_full,
        "panel_normal_vcf_basename": panel_normal_vcf_basename,
        "input_vcf_basename": input_vcf_basename,
        "input_cna_basename": input_cna_basename,
        "input_rna_fusion_basename": input_rna_fusion_basename,
        "input_rna_expression_basename": input_rna_expression_basename,
        "input_cpsr_report_basename": input_cpsr_report_basename,
        "input_cna_plot_basename": input_cna_plot_basename,
    }

    return pcgr_paths
예제 #10
0
def validate_pcgr_input(pcgr_directory,
                        input_vcf,
                        input_cna,
                        input_rna_fusion,
                        input_rna_expression,
                        tumor_dp_tag,
                        tumor_af_tag,
                        control_dp_tag,
                        control_af_tag,
                        call_conf_tag,
                        exclude_hom_germline,
                        exclude_het_germline,
                        panel_normal_vcf,
                        preserved_info_tags,
                        vcf_validation,
                        tumor_only,
                        genome_assembly,
                        keep_uncompressed,
                        output_dir,
                        debug):
    """
    Run the input checks of PCGR on the query VCF and its optional companion files.

    An input that is not provided is passed in as the string 'None' and is skipped.
    The individual checks are delegated to helpers defined elsewhere in this file,
    in the following order:
    1. check_existing_vcf_info_tags - INFO tags of the query VCF versus those appended by PCGR
    2. check_preserved_vcf_info_tags - only when preserved_info_tags is not 'None'
    3. check_format_ad_dp_tags - tags for tumor/control depth and allelic fraction
    4. simplify_vcf - called on the query VCF once checks 1-3 have passed
    5. validate_panel_normal_vcf - panel-of-normals VCF
    6. is_valid_cna - copy number segment file
    7. is_valid_rna_fusion - RNA fusion variant file
    8. is_valid_rna_expression - RNA expression file

    Returns -1 as soon as one of the checks returns -1, and 0 otherwise.
    """
    logger = utils.getlogger('pcgr-validate-input-arguments')

    if input_vcf != 'None':

        ## VCF validation itself is never run - only the logged reason differs
        if vcf_validation == 1:
            skip_reason = 'Skipping validation of VCF file (deprecated as of Dec 2021)'
        else:
            skip_reason = 'Skipping validation of VCF file as provided by option --no_vcf_validate'
        logger.info(skip_reason)

        ## Query VCF must not carry INFO tags that PCGR appends itself
        if check_existing_vcf_info_tags(input_vcf, pcgr_directory, genome_assembly, logger) == -1:
            return -1

        ## INFO tags requested to be preserved
        if preserved_info_tags != "None" and \
                check_preserved_vcf_info_tags(input_vcf, preserved_info_tags, logger) == -1:
            return -1

        ## Tags specified for depth/allelic fraction must be properly defined in VCF
        vcf = VCF(input_vcf)
        if check_format_ad_dp_tags(vcf, tumor_dp_tag, tumor_af_tag, control_dp_tag,
                                   control_af_tag, call_conf_tag, exclude_hom_germline,
                                   exclude_het_germline, tumor_only, logger) == -1:
            return -1

        ## Simplify VCF - remove multiallelic variants
        simplify_vcf(input_vcf, vcf, output_dir, keep_uncompressed, logger, debug)

    ## Panel-of-normals VCF (if provided)
    if panel_normal_vcf != "None" and validate_panel_normal_vcf(panel_normal_vcf, logger) == -1:
        return -1

    ## File with copy number aberration segments (if provided)
    if input_cna != 'None' and is_valid_cna(input_cna, logger) == -1:
        return -1

    ## File with RNA fusion variants (if provided)
    if input_rna_fusion != 'None' and is_valid_rna_fusion(input_rna_fusion, logger) == -1:
        return -1

    ## File with RNA expression (if provided)
    if input_rna_expression != 'None' and is_valid_rna_expression(input_rna_expression, logger) == -1:
        return -1

    return 0