Пример #1
0
def run_pcgr(pcgr_paths, config_options):
    """
    Main function to run the PCGR workflow.

    Builds shell command strings from the supplied paths and options and
    executes them one after another through ``check_subprocess``:

    * STEP 0 - ``pcgr_validate_input.py`` (always run)
    * STEP 1 - VEP annotation followed by ``tabix`` indexing
    * optional ``vcf2maf.pl`` conversion (``config_options['other']['vcf2maf']``)
    * STEP 2 - ``pcgr_vcfanno.py``
    * STEP 3 - ``pcgr_summarise.py``, renaming of outputs, removal of
      temporary files (skipped when debugging) and ``vcf2tsv.py``
    * STEP 4 - report generation with the ``pcgrr.R`` script, unless
      ``config_options['other']['basic']`` is set

    Steps 1-3 only run when an input VCF was provided, i.e. when
    ``pcgr_paths['input_vcf_basename']`` is not ``'NA'``.

    Args:
        pcgr_paths: mapping with the keys ``'db_dir'``, ``'base_dir'``,
            ``'output_dir'`` and, for each of ``input_vcf``, ``input_cna``,
            ``input_rna_fusion``, ``input_rna_expression``,
            ``input_cpsr_report`` and ``panel_normal_vcf``, a ``'<name>_dir'``
            and ``'<name>_basename'`` entry. A basename of ``'NA'`` marks the
            input as not provided.
        config_options: nested mapping of run options. Top-level keys read
            here are ``'debug'``, ``'sample_id'``, ``'genome_assembly'``,
            ``'assay'``, ``'tumor_purity'``, ``'tumor_ploidy'`` plus the
            sub-mappings ``'other'``, ``'clinicaltrials'``, ``'msi'``,
            ``'msigs'``, ``'tmb'``, ``'tumor_only'``, ``'tumor_type'``,
            ``'allelic_support'`` and ``'cna'``.

    Returns:
        None. Results are written as files below ``pcgr_paths['output_dir']``.
    """

    debug = config_options['debug']

    # Translate option values into the 0/1 and ON/OFF forms used on the
    # command lines and in the log messages below
    report_nonfloating_toc = 1 if config_options['other']['nonfloating_toc'] else 0
    vep_regulatory_annotation = 'ON' if config_options['other']['vep_regulatory'] == 1 else 'OFF'
    clinical_trials_set = 'ON' if config_options['clinicaltrials']['run'] else 'OFF'
    msi_prediction_set = 'ON' if config_options['msi']['run'] else 'OFF'
    msig_estimation_set = 'ON' if config_options['msigs']['run'] else 'OFF'
    tmb_estimation_set = 'ON' if config_options['tmb']['run'] else 'OFF'
    vcf_validation = 0 if config_options['other']['no_vcf_validate'] else 1
    run_vcf2maf = config_options['other']['vcf2maf']
    assay_mode = 'Tumor vs. Control'
    tumor_only = 0
    cell_line = 0
    if config_options['tumor_only']['tumor_only']:
        assay_mode = 'Tumor-Only'
        tumor_only = 1
        # the cell line setting is only considered in tumor-only mode
        if config_options['tumor_only']['cell_line']:
            cell_line = 1
            assay_mode = 'Tumor-Only (cell line)'
    # set basic run commands
    # (the string 'None' is used as placeholder for files that are not available)
    output_vcf = 'None'
    output_pass_vcf = 'None'
    output_pass_tsv = 'None'
    output_maf = 'None'
    GENCODE_VERSION = pcgr_vars.GENCODE_VERSION
    NCBI_BUILD_MAF = pcgr_vars.NCBI_BUILD_MAF
    VEP_ASSEMBLY = pcgr_vars.VEP_ASSEMBLY
    MAX_VARIANTS_FOR_REPORT = pcgr_vars.MAX_VARIANTS_FOR_REPORT
    # override the module-level defaults when running on grch37
    if config_options['genome_assembly'] == 'grch37':
        NCBI_BUILD_MAF = 'GRCh37'
        GENCODE_VERSION = 'release 19'
        VEP_ASSEMBLY = 'GRCh37'
    # NOTE(review): this logger is re-bound at STEP 0 below before any message
    # is logged through it
    logger = getlogger('pcgr-get-OS')

    vep_dir = os.path.join(str(pcgr_paths['db_dir']), '.vep')
    input_vcf = 'None'
    input_cna = 'None'
    input_rna_fusion = 'None'
    input_rna_expression = 'None'
    input_cpsr_report = 'None'
    panel_normal = 'None'
    # panel-of-normals annotation
    pon_annotation = 0

    # Specify paths for input files and directories
    # (a basename of 'NA' means that the input was not provided)
    if pcgr_paths['input_vcf_basename'] != 'NA':
        input_vcf = os.path.join(pcgr_paths['input_vcf_dir'], pcgr_paths['input_vcf_basename'])
    if pcgr_paths['input_cna_basename'] != 'NA':
        input_cna = os.path.join(pcgr_paths['input_cna_dir'], pcgr_paths['input_cna_basename'])
    if pcgr_paths['input_rna_fusion_basename'] != 'NA':
        input_rna_fusion = os.path.join(pcgr_paths['input_rna_fusion_dir'], pcgr_paths['input_rna_fusion_basename'])
    if pcgr_paths['input_rna_expression_basename'] != 'NA':
        input_rna_expression = os.path.join(pcgr_paths['input_rna_expression_dir'], pcgr_paths['input_rna_expression_basename'])
    if pcgr_paths['input_cpsr_report_basename'] != 'NA':
        input_cpsr_report = os.path.join(pcgr_paths['input_cpsr_report_dir'], pcgr_paths['input_cpsr_report_basename'])
    if pcgr_paths['panel_normal_vcf_basename'] != 'NA':
        panel_normal = os.path.join(pcgr_paths['panel_normal_vcf_dir'], pcgr_paths['panel_normal_vcf_basename'])

    data_dir = pcgr_paths['base_dir']
    output_dir = pcgr_paths['output_dir']

    # PCGR|validate_input - verify that VCF and CNA segment file is of appropriate format
    logger = getlogger("pcgr-validate-input-arguments")
    logger.info("PCGR - STEP 0: Validate input data and options")

    # The leading arguments are passed positionally, so their order matters
    vcf_validate_command = (
            f'pcgr_validate_input.py '
            f'{data_dir} '
            f'{input_vcf} '
            f'{input_cna} '
            f'{input_rna_fusion} '
            f'{input_rna_expression} '
            f'{panel_normal} '
            f'{vcf_validation} '
            f'{tumor_only} '
            f'{config_options["genome_assembly"]} '
            f'{config_options["other"]["preserved_info_tags"]} '
            f'{config_options["allelic_support"]["tumor_dp_tag"]} {config_options["allelic_support"]["tumor_af_tag"]} '
            f'{config_options["allelic_support"]["control_dp_tag"]} {config_options["allelic_support"]["control_af_tag"]} '
            f'{config_options["allelic_support"]["call_conf_tag"]} '
            f'{config_options["tumor_only"]["exclude_likely_hom_germline"]} '
            f'{config_options["tumor_only"]["exclude_likely_het_germline"]} '
            f'--output_dir {output_dir} '
            f'{"--debug " if debug else ""}'
            f'{"--keep_uncompressed" if run_vcf2maf else ""} '
            )
    check_subprocess(logger, vcf_validate_command, debug)
    logger.info('Finished pcgr-validate-input-arguments')
    print('----')

    # PCGR|start - Log key information about sample, options and sequencing assay/design
    logger = getlogger('pcgr-start')
    logger.info('--- Personal Cancer Genome Reporter workflow ----')
    logger.info(f'Sample name: {config_options["sample_id"]}')
    if config_options['tumor_type']['type'] == 'Cancer_NOS':
        logger.info('Tumor type: Cancer_NOS (Any tumortype)')
    else:
        logger.info(f'Tumor type: {config_options["tumor_type"]["type"]}')
    logger.info(f'Sequencing assay - type: {config_options["assay"]}')
    logger.info(f'Sequencing assay - mode: {assay_mode}')
    logger.info(f'Sequencing assay - coding target size: {config_options["tmb"]["target_size_mb"]}Mb')
    logger.info(f'Genome assembly: {config_options["genome_assembly"]}')
    logger.info(f'Mutational signature estimation: {msig_estimation_set}')
    logger.info(f'MSI classification: {msi_prediction_set}')
    logger.info(f'Mutational burden estimation: {tmb_estimation_set}')
    logger.info(f'Include molecularly targeted clinical trials (beta): {clinical_trials_set}')

    # Annotation steps (1-3) are only run when an input VCF was provided
    if not input_vcf == 'None':
        # Define temporary output file names
        prefix = os.path.join(output_dir, f'{config_options["sample_id"]}.pcgr_acmg.{config_options["genome_assembly"]}')
        output_vcf =             f'{prefix}.vcf.gz'
        output_pass_vcf =        f'{prefix}.pass.vcf.gz'
        output_pass_tsv =        f'{prefix}.pass.tsv'
        output_pass_raw_tsv_gz = f'{prefix}.pass.raw.tsv.gz'
        output_maf =             f'{prefix}.tmp.maf'
        output_vcf2maf_log =     f'{prefix}.maf.log'
        input_vcf_pcgr_ready =   os.path.join(output_dir, re.sub(r"(\.vcf$|\.vcf\.gz$)", ".pcgr_ready.vcf.gz", pcgr_paths["input_vcf_basename"]))
        # needs to be uncompressed for vcf2maf
        input_vcf_pcgr_ready_uncompressed = os.path.join(output_dir, re.sub(r"(\.vcf$|\.vcf\.gz$)", ".pcgr_ready.vcf", pcgr_paths["input_vcf_basename"]))
        # intermediate file names are derived from the '.pcgr_ready.vcf.gz' name
        vep_vcf = re.sub(r"(\.vcf$|\.vcf\.gz$)", ".vep.vcf.gz", input_vcf_pcgr_ready)
        vep_vcfanno_vcf = re.sub(r"(\.vcf$|\.vcf\.gz$)", ".vep.vcfanno.vcf", input_vcf_pcgr_ready)
        vep_vcfanno_annotated_vcf = re.sub(r"\.vcfanno", ".vcfanno.annotated", vep_vcfanno_vcf) + ".gz"
        vep_vcfanno_annotated_pass_vcf = re.sub(r"\.vcfanno", ".vcfanno.annotated.pass", vep_vcfanno_vcf) + ".gz"
        fasta_assembly = os.path.join(vep_dir, 'homo_sapiens', f'{pcgr_vars.VEP_VERSION}_{VEP_ASSEMBLY}', f'Homo_sapiens.{VEP_ASSEMBLY}.dna.primary_assembly.fa.gz')
        # List all VEP flags used when calling VEP
        vep_flags = (
                f'--hgvs --af --af_1kg --af_gnomad --variant_class --domains --symbol --protein --ccds --mane '
                f'--uniprot --appris --biotype --tsl --canonical --format vcf --cache --numbers --total_length --allele_number '
                f'--no_stats --no_escape --xref_refseq --vcf --check_ref --dont_skip --flag_pick_allele --plugin NearestExonJB,max_range=50000 '
                f'--force_overwrite --species homo_sapiens --offline --compress_output bgzip'
                )
        vep_options = (
                f'--dir {vep_dir} --assembly {VEP_ASSEMBLY} --cache_version {pcgr_vars.VEP_VERSION} '
                f'--fasta {fasta_assembly} --pick_order {config_options["other"]["vep_pick_order"]} '
                f'--buffer_size {config_options["other"]["vep_buffer_size"]} '
                f'--fork {config_options["other"]["vep_n_forks"]} '
                f'{vep_flags} '
                f'{"--verbose" if debug else "--quiet"} '
                )
        # Optional VEP flags; each appended flag carries its own trailing space
        gencode_set_in_use = "GENCODE - all transcripts"
        if config_options['other']['vep_no_intergenic'] == 1:
            vep_options += '--no_intergenic '
        if config_options['other']['vep_regulatory'] == 1:
            vep_options += '--regulatory '
        if config_options['other']['vep_gencode_all'] == 0:
            vep_options += '--gencode_basic '
            gencode_set_in_use = "GENCODE - basic transcript set (--gencode_basic)"

        # Compose full VEP command
        vep_main_command = f'{utils.get_perl_exports()} && vep --input_file {input_vcf_pcgr_ready} --output_file {vep_vcf} {vep_options}'
        vep_tabix_command = f'tabix -f -p vcf {vep_vcf}'

        # PCGR|VEP - run consequence annotation with Variant Effect Predictor
        print('----')
        logger = getlogger('pcgr-vep')
        logger.info(f'PCGR - STEP 1: Basic variant annotation with Variant Effect Predictor ({pcgr_vars.VEP_VERSION}, GENCODE {GENCODE_VERSION}, {config_options["genome_assembly"]})')
        logger.info(f'VEP configuration - one primary consequence block pr. alternative allele (--flag_pick_allele)')
        logger.info(f'VEP configuration - transcript pick order: {config_options["other"]["vep_pick_order"]}')
        logger.info(f'VEP configuration - transcript pick order: See more at https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick_options')
        logger.info(f'VEP configuration - GENCODE set: {gencode_set_in_use}')
        logger.info(f'VEP configuration - skip intergenic: {"TRUE" if config_options["other"]["vep_no_intergenic"] else "FALSE"}')
        logger.info(f'VEP configuration - regulatory annotation: {vep_regulatory_annotation}')
        logger.info(f'VEP configuration - buffer_size/number of forks: {config_options["other"]["vep_buffer_size"]}/{config_options["other"]["vep_n_forks"]}')

        check_subprocess(logger, vep_main_command, debug)
        check_subprocess(logger, vep_tabix_command, debug)
        logger.info('Finished pcgr-vep')
        print('----')

        # PCGR|vcf2maf - if option set, convert VCF to MAF with https://github.com/mskcc/vcf2maf
        if run_vcf2maf:
            logger.info('Converting VEP-annotated VCF to MAF with https://github.com/mskcc/vcf2maf')
            vcf2maf_command = (
                    f'vcf2maf.pl --inhibit-vep --input-vcf {input_vcf_pcgr_ready_uncompressed} '
                    f'--tumor-id {config_options["sample_id"]} --output-maf {output_maf} --ref-fasta {fasta_assembly} '
                    f'--ncbi-build {NCBI_BUILD_MAF} > {output_vcf2maf_log} 2>&1'
                    )
            check_subprocess(logger, vcf2maf_command, debug)
            # the uncompressed VCF and the vcf2maf log are removed regardless of debug mode
            utils.remove(input_vcf_pcgr_ready_uncompressed)
            utils.remove(output_vcf2maf_log)
            logger.info('Finished pcgr-vep-vcf2maf')
            print('----')

        # PCGR|vcfanno - annotate VCF against a number of variant annotation resources
        logger = getlogger("pcgr-vcfanno")
        pcgr_vcfanno_command = (
                f'pcgr_vcfanno.py {vep_vcf} {vep_vcfanno_vcf} {pcgr_paths["db_dir"]} '
                f'--num_processes {config_options["other"]["vcfanno_n_proc"]} '
                f'--chasmplus --dbnsfp --docm --clinvar --icgc --civic --cgi --tcga_pcdm --winmsk --simplerepeats '
                f'--tcga --uniprot --cancer_hotspots --pcgr_onco_xref '
                f'{"--debug " if debug else ""}'
                )
        anno_src_msg = (
                f"Annotation sources: {'Panel-of-Normals, ' if panel_normal != 'None' else ''}ClinVar, dbNSFP, "
                f"UniProtKB, cancerhotspots.org, CiVIC, CGI, DoCM, CHASMplus driver mutations, TCGA, ICGC-PCAWG"
                )
        logger.info("PCGR - STEP 2: Annotation for precision oncology with pcgr-vcfanno")
        logger.info(anno_src_msg)
        # a panel-of-normals VCF also switches on the PoN flag handed to pcgr_summarise.py
        if panel_normal != "None":
            pon_annotation = 1
            pcgr_vcfanno_command += f'--panel_normal_vcf {panel_normal}'
        check_subprocess(logger, pcgr_vcfanno_command, debug)
        logger.info("Finished pcgr-vcfanno")
        print('----')

        # PCGR|pcgr_summarise - expand annotations in VCF file
        logger = getlogger("pcgr-summarise")
        pcgr_summarise_command = (
                f'pcgr_summarise.py {vep_vcfanno_vcf}.gz {pon_annotation} '
                f'{config_options["other"]["vep_regulatory"]} '
                f'{pcgr_paths["db_dir"]} '
                f'{"--debug" if debug else ""}'
                )
        logger.info("PCGR - STEP 3: Cancer gene annotations with pcgr-summarise")
        check_subprocess(logger, pcgr_summarise_command, debug)

        # PCGR|clean - move output files and clean up temporary files
        os.rename(vep_vcfanno_annotated_vcf, output_vcf)
        os.rename(f'{vep_vcfanno_annotated_vcf}.tbi', f'{output_vcf}.tbi')
        os.rename(vep_vcfanno_annotated_pass_vcf, output_pass_vcf)
        os.rename(f'{vep_vcfanno_annotated_pass_vcf}.tbi', f'{output_pass_vcf}.tbi')
        # the globs are evaluated after the renames above, so the renamed
        # files themselves are no longer matched
        delete_files = (
                glob(f'{vep_vcf}*') +
                glob(f'{vep_vcfanno_annotated_vcf}') +
                glob(f'{vep_vcfanno_annotated_pass_vcf}*') +
                glob(f'{vep_vcfanno_vcf}*') +
                glob(f'{input_vcf_pcgr_ready_uncompressed}*')
                )
        # do not delete if debugging
        if not debug:
            for fn in delete_files:
                #print(f"Deleting {fn}")
                utils.remove(fn)

        logger.info('Finished pcgr-summarise main command')

        # PCGR|vcf2tsv - convert VCF to TSV with https://github.com/sigven/vcf2tsv
        pcgr_vcf2tsv_command = f'vcf2tsv.py {output_pass_vcf} --compress {output_pass_tsv}'
        logger.info("Converting VCF to TSV with https://github.com/sigven/vcf2tsv")
        check_subprocess(logger, pcgr_vcf2tsv_command, debug)
        logger.info('Finished pcgr-summarise-vcf2tsv')

        # For WGS/WES assays, reduce the variant set handed to the report step
        # when it exceeds MAX_VARIANTS_FOR_REPORT
        if config_options['assay'] == 'WGS' or config_options['assay'] == 'WES':
            output_pass_tsv_gz = f'{output_pass_tsv}.gz'
            # check that output file exist
            if os.path.exists(output_pass_tsv_gz):
                # get number of rows/variants annotated, using pandas
                # (column names are read from the second line of the file)
                var_data = pandas.read_csv(output_pass_tsv_gz, sep = '\t', low_memory = False, header = [1])
                num_variants_raw = len(var_data)
                if num_variants_raw > MAX_VARIANTS_FOR_REPORT:
                    logger.info(f'Number of raw variants in input VCF ({num_variants_raw}) exceeds {MAX_VARIANTS_FOR_REPORT} - intergenic/intronic variants will be excluded prior to reporting')

                    # Exclude intronic and intergenic variants prior to analysis with pcgrr (reporting and further analysis)
                    var_data_filtered = var_data[~var_data.Consequence.str.contains('^intron') & ~var_data.Consequence.str.contains('^intergenic')]
                    num_variants_excluded1 = num_variants_raw - len(var_data_filtered)
                    logger.info(f'Number of intergenic/intronic variants excluded: {num_variants_excluded1}')

                    # Exclude upstream_gene/downstream_gene variants if size of filtered variant set is still above MAX_VARIANTS_FOR_REPORT
                    # TODO: in this case, the TMB calculation will be an underestimate (but still likely huge)
                    var_data_filtered_final = var_data_filtered
                    if len(var_data_filtered) > MAX_VARIANTS_FOR_REPORT:
                        var_data_filtered_final = var_data_filtered[~var_data_filtered.Consequence.str.contains('^upstream_gene') & ~var_data_filtered.Consequence.str.contains('^downstream_gene')]
                        num_variants_excluded2 = len(var_data_filtered) - len(var_data_filtered_final)
                        logger.info(f'Number of upstream_gene/downstream_gene variants excluded: {num_variants_excluded2}')


                    # get vcf2tsv header and pipe to output TSV file
                    get_vcf2tsv_header = f'gzip -dc {output_pass_tsv_gz} | egrep \'^#\' > {output_pass_tsv}'
                    check_subprocess(logger, get_vcf2tsv_header, debug)

                    # rename original vcf2tsv (gzipped) to 'raw' filename
                    rename_output_tsv = f'mv {output_pass_tsv_gz} {output_pass_raw_tsv_gz}'
                    check_subprocess(logger, rename_output_tsv, debug)

                    # append filtered data output to output TSV file
                    var_data_filtered_final.to_csv(output_pass_tsv, sep='\t', encoding='utf-8', mode = 'a', index = False)

                    # gzip filtered output TSV file
                    gzip_filtered_output_tsv = f'gzip -f {output_pass_tsv}'
                    check_subprocess(logger, gzip_filtered_output_tsv, debug)


        logger.info('Finished pcgr-summarise')
        print('----')

    # Generation of HTML reports for VEP/vcfanno-annotated VCF and copy number segment file
    if not config_options['other']['basic']:
        co = config_options
        # make the tumor type safe to pass as a single command-line token
        ttype = co['tumor_type']['type'].replace(' ', '_').replace('/', '@')
        logger = getlogger('pcgr-writer')
        logger.info('PCGR - STEP 4: Generation of output files - variant interpretation report for precision oncology')

        # export PATH to R conda env Rscript
        rscript = utils.script_path('pcgrr', 'bin/Rscript')
        pcgrr_script = utils.script_path('pcgr', 'bin/pcgrr.R')
        # All arguments are passed positionally, so their order matters.
        # NOTE(review): without an input VCF, output_pass_tsv is still the
        # placeholder 'None' and the second argument becomes 'None.gz' -
        # confirm that the R script handles this
        pcgr_report_command = (
                f"{rscript} {pcgrr_script} "
                f"{output_dir} "
                f"{output_pass_tsv}.gz "
                f"{input_cna} "
                f"{input_rna_fusion} "
                f"{input_rna_expression} "
                f"{input_cpsr_report} "
                f"{config_options['sample_id']} "
                f"{pcgr_vars.PCGR_VERSION} "
                f"{pcgr_vars.DB_VERSION} "
                f"{config_options['genome_assembly']} "
                f"{data_dir} "
                f"{co['tumor_purity']} "
                f"{co['tumor_ploidy']} "
                f"{ttype} "
                f"{co['tmb']['target_size_mb']} "
                f"{co['assay']} "
                f"{tumor_only} "
                f"{cell_line} "
                f"{co['tumor_only']['maf_onekg_afr']} "
                f"{co['tumor_only']['maf_onekg_amr']} "
                f"{co['tumor_only']['maf_onekg_eas']} "
                f"{co['tumor_only']['maf_onekg_eur']} "
                f"{co['tumor_only']['maf_onekg_sas']} "
                f"{co['tumor_only']['maf_onekg_global']} "
                f"{co['tumor_only']['maf_gnomad_afr']} "
                f"{co['tumor_only']['maf_gnomad_amr']} "
                f"{co['tumor_only']['maf_gnomad_asj']} "
                f"{co['tumor_only']['maf_gnomad_eas']} "
                f"{co['tumor_only']['maf_gnomad_fin']} "
                f"{co['tumor_only']['maf_gnomad_nfe']} "
                f"{co['tumor_only']['maf_gnomad_oth']} "
                f"{co['tumor_only']['maf_gnomad_sas']} "
                f"{co['tumor_only']['maf_gnomad_global']} "
                f"{co['tumor_only']['exclude_pon']} "
                f"{co['tumor_only']['exclude_likely_hom_germline']} "
                f"{co['tumor_only']['exclude_likely_het_germline']} "
                f"{co['tumor_only']['exclude_dbsnp_nonsomatic']} "
                f"{co['tumor_only']['exclude_nonexonic']} "
                f"{co['tmb']['run']} "
                f"{co['tmb']['algorithm']} "
                f"{co['msi']['run']} "
                f"{co['msigs']['run']} "
                f"{co['msigs']['mutation_limit']} "
                f"{co['msigs']['all_reference_signatures']} "
                f"{co['msigs']['include_artefact_signatures']} "
                f"{co['msigs']['prevalence_reference_signatures']} "
                f"{co['cna']['logR_homdel']} "
                f"{co['cna']['logR_gain']} "
                f"{co['cna']['cna_overlap_pct']} "
                f"{co['allelic_support']['tumor_af_min']} "
                f"{co['allelic_support']['tumor_dp_min']} "
                f"{co['allelic_support']['control_dp_min']} "
                f"{co['allelic_support']['control_af_max']} "
                f"{co['allelic_support']['tumor_af_tag']} "
                f"{co['allelic_support']['tumor_dp_tag']} "
                f"{co['allelic_support']['control_af_tag']} "
                f"{co['allelic_support']['control_dp_tag']} "
                f"{co['allelic_support']['call_conf_tag']} "
                f"{co['clinicaltrials']['run']} "
                f"{co['other']['vep_n_forks']} "
                f"{co['other']['vep_buffer_size']} "
                f"{co['other']['vep_no_intergenic']} "
                f"{co['other']['vep_pick_order']} "
                f"{co['other']['vep_regulatory']} "
                f"{co['other']['vep_gencode_all']} "
                f"{co['other']['vcf2maf']} "
                f"{co['other']['list_noncoding']} "
                f"{co['other']['preserved_info_tags']} "
                f"{co['other']['visual_theme']} "
                f"{report_nonfloating_toc} "
                f"{co['other']['no_vcf_validate']}"
                )

        if debug:
            print(pcgr_report_command)
        check_subprocess(logger, pcgr_report_command, debug)
        logger.info("Finished PCGR!")
        print('----')

    print()
Пример #2
0
def run_cpsr(arg_dict, cpsr_paths):
    """
    Main function to run the CPSR workflow.

    Builds shell command strings from the supplied arguments and paths and
    executes them one after another through ``check_subprocess``:

    * STEP 0 - create the output directory and run ``cpsr_validate_input.py``
    * STEP 1 - VEP annotation (NearestExonJB and LoF plugins), ``bgzip``
      compression and ``tabix`` indexing
    * STEP 2 - ``pcgr_vcfanno.py``
    * STEP 3 - ``pcgr_summarise.py --cpsr``, renaming of outputs, removal of
      temporary files (skipped when debugging) and ``vcf2tsv.py``
    * STEP 4 - report generation with the ``cpsr.R`` script, unless
      ``arg_dict['basic']`` is set

    Steps 1-3 only run when an input VCF was provided, i.e. when
    ``cpsr_paths['input_vcf_basename']`` is not ``'NA'``.

    Args:
        arg_dict: mapping of run options (e.g. ``'debug'``, ``'sample_id'``,
            ``'genome_assembly'``, ``'basic'``, the ``'vep_*'`` settings and
            the report settings passed on to the R script).
        cpsr_paths: mapping with the keys ``'db_dir'``, ``'base_dir'``,
            ``'output_dir'`` and the ``'<name>_dir'``/``'<name>_basename'``
            pairs for ``input_vcf`` and ``input_customlist``. A basename of
            ``'NA'`` marks the input as not provided.

    Returns:
        None. Results are written as files below ``cpsr_paths['output_dir']``.

    Raises:
        AssertionError: if the LoF VEP plugin directory reported by
            ``utils.get_loftee_dir()`` does not exist.
    """
    debug = arg_dict['debug']

    # Defaults for the 0/1 flags passed on the command lines, and for the
    # ON/OFF strings used in log messages
    diagnostic_grade_only = 0
    vcf_validation = 1
    virtual_panel_id = "-1"
    ignore_noncoding = 0
    gwas_findings = 0
    secondary_findings = 0
    classify_all = 0
    clinvar_ignore_noncancer = 0
    report_nonfloating_toc = 0
    vep_no_intergenic = 0
    vep_regulatory = 0
    preserved_info_tags = arg_dict['preserved_info_tags']
    diagnostic_grade_set = "OFF"
    secondary_findings_set = "OFF"
    gwas_findings_set = "OFF"

    if arg_dict['vep_regulatory']:
        vep_regulatory = 1
    if arg_dict["vep_no_intergenic"]:
        vep_no_intergenic = 1
    if arg_dict['clinvar_ignore_noncancer']:
        clinvar_ignore_noncancer = 1
    if arg_dict['classify_all']:
        classify_all = 1
    if arg_dict['gwas_findings']:
        gwas_findings = 1
        gwas_findings_set = "ON"
    if arg_dict['secondary_findings']:
        secondary_findings = 1
        secondary_findings_set = "ON"
    if arg_dict['diagnostic_grade_only']:
        diagnostic_grade_only = 1
        diagnostic_grade_set = "ON"
    if arg_dict['report_nonfloating_toc']:
        report_nonfloating_toc = 1
    if arg_dict['no_vcf_validate']:
        vcf_validation = 0
    if arg_dict['virtual_panel_id'] != "-1":
        virtual_panel_id = arg_dict['virtual_panel_id']
    # a custom gene list takes precedence over any virtual panel selection
    if arg_dict['custom_list']:
        virtual_panel_id = "-1"
    if arg_dict['ignore_noncoding']:
        ignore_noncoding = 1

    # the string 'None' is used as placeholder for files that are not available
    output_vcf = 'None'
    output_pass_vcf = 'None'
    output_pass_tsv = 'None'
    GENCODE_VERSION = pcgr_vars.GENCODE_VERSION
    VEP_ASSEMBLY = pcgr_vars.VEP_ASSEMBLY
    VEP_VERSION = pcgr_vars.VEP_VERSION
    # override the module-level defaults when running on grch37
    if arg_dict['genome_assembly'] == 'grch37':
        GENCODE_VERSION = '19'
        VEP_ASSEMBLY = 'GRCh37'

    vepdb_dir = os.path.join(str(cpsr_paths['db_dir']),'.vep')
    input_vcf = 'None'
    input_customlist = 'None'

    # a basename of 'NA' means that the input was not provided
    if cpsr_paths['input_vcf_basename'] != 'NA':
        input_vcf = os.path.join(cpsr_paths['input_vcf_dir'], cpsr_paths['input_vcf_basename'])
    if cpsr_paths['input_customlist_basename'] != 'NA':
        input_customlist = os.path.join(cpsr_paths['input_customlist_dir'], cpsr_paths['input_customlist_basename'])

    data_dir = cpsr_paths['base_dir']
    output_dir = cpsr_paths['output_dir']
    vep_dir = vepdb_dir

    # The custom-list BED path is an argument of the report command (STEP 4),
    # which runs even when no input VCF is configured. It only depends on the
    # output directory, sample and assembly, so it is defined here rather than
    # inside the VCF-only branch, where it would be unbound at STEP 4.
    pcgr_model = 'cpsr'
    custom_bed = os.path.join(output_dir, str(arg_dict['sample_id']) + '.' + str(pcgr_model) + '.' + str(arg_dict['genome_assembly']) + '.custom_list.bed')

    logger = getlogger('cpsr-validate-input-arguments')
    logger.info("CPSR - STEP 0: Validate input data")
    check_subprocess(logger, f'mkdir -p {output_dir}', debug)

    ## CPSR|Validate input VCF - check formatting, non-overlap with CPSR INFO tags, and whether sample contains any variants in cancer predisposition loci
    # The leading arguments are passed positionally, so their order matters
    vcf_validate_command = (
            f'cpsr_validate_input.py '
            f'{data_dir} '
            f'{input_vcf} '
            f'{input_customlist} '
            f'{preserved_info_tags} '
            f'{vcf_validation} '
            f'{arg_dict["genome_assembly"]} '
            f'{arg_dict["sample_id"]} '
            f'{virtual_panel_id} '
            f'{diagnostic_grade_only} '
            f'--output_dir {output_dir} {"--debug" if debug else ""}'
            )
    check_subprocess(logger, vcf_validate_command, debug)
    logger.info('Finished cpsr-validate-input-arguments')
    print('----')

    ## CPSR|Start - log key information about run
    logger = getlogger("cpsr-start")
    logger.info("--- Cancer Predisposition Sequencing Reporter workflow ----")
    logger.info(f"Sample name: {arg_dict['sample_id']}")
    if input_customlist != 'None':
        logger.info(f"Virtual gene panel: custom-made list from panel 0: {input_customlist}")
    else:
        logger.info(f"Diagnostic-grade genes in virtual panels (GE PanelApp): {diagnostic_grade_set}")
    logger.info(f"Include incidental findings (ACMG recommended list v3.0): {secondary_findings_set}")
    logger.info(f"Include low to moderate cancer risk variants from genome-wide association studies: {gwas_findings_set}")
    logger.info(f"Reference population, germline variant frequencies (gnomAD): {str(arg_dict['pop_gnomad']).upper()}")
    logger.info(f"Genome assembly: {arg_dict['genome_assembly']}")

    # Annotation steps (1-3) are only run when an input VCF was provided
    if input_vcf != 'None':

        ## Define input, output and temporary file names
        output_vcf = os.path.join(output_dir, str(arg_dict['sample_id']) + '.cpsr.' + str(arg_dict['genome_assembly']) + '.vcf.gz')
        output_pass_vcf = os.path.join(output_dir, str(arg_dict['sample_id']) + '.cpsr.' + str(arg_dict['genome_assembly']) + '.pass.vcf.gz')
        output_pass_tsv = os.path.join(output_dir, str(arg_dict['sample_id']) + '.cpsr.' + str(arg_dict['genome_assembly']) + '.pass.tsv')
        input_vcf_cpsr_ready = os.path.join(output_dir, re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_ready_target.vcf.gz', cpsr_paths['input_vcf_basename']))
        input_vcf_cpsr_ready_uncompressed = os.path.join(output_dir, re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_ready_target.vcf', cpsr_paths['input_vcf_basename']))
        # intermediate file names are derived from the '.cpsr_ready_target.vcf.gz' name
        vep_vcf = re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_vep.vcf',input_vcf_cpsr_ready)
        vep_vcfanno_vcf = re.sub(r'(\.vcf$|\.vcf\.gz$)','.cpsr_vep.vcfanno.vcf',input_vcf_cpsr_ready)
        vep_vcfanno_annotated_vcf = re.sub(r'\.vcfanno','.vcfanno.annotated',vep_vcfanno_vcf) + '.gz'
        vep_vcfanno_annotated_pass_vcf = re.sub(r'\.vcfanno','.vcfanno.annotated.pass',vep_vcfanno_vcf) + '.gz'

        ## File names for assembly-specific genome fasta files (VEP)
        fasta_assembly = os.path.join(vep_dir, f"homo_sapiens/{VEP_VERSION}_{VEP_ASSEMBLY}/Homo_sapiens.{VEP_ASSEMBLY}.dna.primary_assembly.fa.gz")
        ancestor_assembly = os.path.join(vep_dir, f"homo_sapiens/{VEP_VERSION}_{VEP_ASSEMBLY}/human_ancestor.fa.gz")

        ## Set all flags used in VEP run
        plugins_in_use = "NearestExonJB, LoF"
        vep_flags = (
            f"--format vcf --vcf --check_ref --flag_pick_allele_gene --hgvs --dont_skip --failed 1 --af --af_1kg --af_gnomad "
            f"--variant_class --domains --symbol --protein --ccds --uniprot --appris --biotype --canonical --cache "
            f"--numbers --total_length --no_stats --allele_number --no_escape --xref_refseq --plugin NearestExonJB,max_range=50000"
            )
        vep_options = (
            f"--pick_order {arg_dict['vep_pick_order']} --force_overwrite --buffer_size {arg_dict['vep_buffer_size']} "
            f"--species homo_sapiens --assembly {VEP_ASSEMBLY} --offline --fork {arg_dict['vep_n_forks']} {vep_flags} --dir {vep_dir} "
            f"--cache_version {VEP_VERSION}"
            )
        # Optional VEP flags; each appended flag carries its own leading space
        gencode_set_in_use = "GENCODE - all transcripts"
        if arg_dict['vep_gencode_all'] == 0:
            vep_options += ' --gencode_basic'
            gencode_set_in_use = "GENCODE - basic transcript set (--gencode_basic)"
        if arg_dict['vep_no_intergenic'] == 1:
            vep_options += " --no_intergenic"
        if arg_dict['vep_regulatory'] == 1:
            vep_options += " --regulatory"
        if arg_dict['genome_assembly'] == "grch38":
            vep_options += " --mane"
        loftee_dir = utils.get_loftee_dir()
        assert os.path.isdir(loftee_dir), f'LoF VEP plugin is not found in {loftee_dir}. Please make sure you installed pcgr conda package and have corresponding conda environment active.'
        vep_options += f" --plugin LoF,loftee_path:{loftee_dir},human_ancestor_fa:{ancestor_assembly},use_gerp_end_trunc:0 --dir_plugins {loftee_dir}"
        if not debug:
            vep_options += " --quiet"

        ## Compose full VEP command
        vep_main_command = f'{utils.get_perl_exports()} && vep --input_file {input_vcf_cpsr_ready} --output_file {vep_vcf} {vep_options} --fasta {fasta_assembly}'
        vep_bgzip_command = f'bgzip -f {vep_vcf}'
        vep_tabix_command = f'tabix -f -p vcf {vep_vcf}.gz'
        logger = getlogger('cpsr-vep')

        ## CPSR|VEP - run Variant Effect Predictor on query VCF with LoF and NearestExonJB plugins
        logger.info(f"CPSR - STEP 1: Basic variant annotation with Variant Effect Predictor ({VEP_VERSION}, GENCODE {GENCODE_VERSION}, {arg_dict['genome_assembly']})")
        # report the pick flag that is actually part of vep_flags above
        logger.info("VEP configuration - one primary consequence block pr. alternative allele and gene (--flag_pick_allele_gene)")
        logger.info(f"VEP configuration - transcript pick order: {arg_dict['vep_pick_order']}")
        logger.info("VEP configuration - transcript pick order: See more at https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick_options")
        logger.info(f"VEP configuration - GENCODE set: {gencode_set_in_use}")
        logger.info(f"VEP configuration - skip intergenic: {arg_dict['vep_no_intergenic']}")
        logger.info(f"VEP configuration - look for overlap with regulatory regions: {vep_regulatory}")
        logger.info(f"VEP configuration - plugins in use: {plugins_in_use}")
        logger.info(f"VEP configuration - buffer_size/number of forks: {arg_dict['vep_buffer_size']}/{arg_dict['vep_n_forks']}")
        check_subprocess(logger, vep_main_command, debug)
        check_subprocess(logger, vep_bgzip_command, debug)
        check_subprocess(logger, vep_tabix_command, debug)
        logger.info("Finished cpsr-vep")
        print('----')

        ## CPSR|vcfanno - run vcfanno on query VCF with a number of relevant annotated VCFs
        logger = getlogger('cpsr-vcfanno')
        logger.info("CPSR - STEP 2: Annotation for cancer predisposition with cpsr-vcfanno")
        logger.info("(ClinVar, CIViC, dbNSFP, dbMTS, UniProtKB, cancerhotspots.org, ncER, GERP RS scores, GWAS catalog, gnomAD non-cancer subset)")
        pcgr_vcfanno_command = (
                f"pcgr_vcfanno.py --num_processes {arg_dict['vcfanno_n_proc']} --dbnsfp --clinvar "
                f"--cancer_hotspots --dbmts --ncer --gerp --civic --uniprot --gnomad_cpsr --pcgr_onco_xref "
                f"--gwas --rmsk {vep_vcf}.gz {vep_vcfanno_vcf} {os.path.join(data_dir, 'data', str(arg_dict['genome_assembly']))}"
                )
        check_subprocess(logger, pcgr_vcfanno_command, debug)
        logger.info("Finished cpsr-vcfanno")
        print('----')

        ## CPSR|summarise - expand annotations with separate VCF INFO tags
        logger = getlogger("cpsr-summarise")
        pcgr_summarise_command = (
                f'pcgr_summarise.py {vep_vcfanno_vcf}.gz 0 {vep_regulatory} '
                f'{os.path.join(data_dir, "data", arg_dict["genome_assembly"])} '
                f'--cpsr {"--debug" if debug else ""}'
                )
        logger.info("CPSR - STEP 3: Cancer gene annotations with cpsr-summarise")
        check_subprocess(logger, pcgr_summarise_command, debug)

        ## CPSR|clean - rename output files, remove temporary files
        os.rename(vep_vcfanno_annotated_vcf, output_vcf)
        os.rename(f'{vep_vcfanno_annotated_vcf}.tbi', f'{output_vcf}.tbi')
        os.rename(vep_vcfanno_annotated_pass_vcf, output_pass_vcf)
        os.rename(f'{vep_vcfanno_annotated_pass_vcf}.tbi', f'{output_pass_vcf}.tbi')
        # the globs are evaluated after the renames above, so the renamed
        # files themselves are no longer matched
        delete_files = (
                glob(f'{vep_vcf}*') +
                glob(f'{vep_vcfanno_annotated_vcf}') +
                glob(f'{vep_vcfanno_annotated_pass_vcf}*') +
                glob(f'{vep_vcfanno_vcf}*') +
                glob(f'{input_vcf_cpsr_ready_uncompressed}*')
                )
        # do not delete if debugging
        if not debug:
            for fn in delete_files:
                utils.remove(fn)
        logger.info('Finished cpsr-summarise main command')
        ## CPSR|vcf2tsv - perform vcf2tsv conversion on the final annotated VCF file
        cpsr_vcf2tsv_command = f"vcf2tsv.py {output_pass_vcf} --compress {output_pass_tsv}"
        logger.info("Converting VCF to TSV with https://github.com/sigven/vcf2tsv")
        check_subprocess(logger, cpsr_vcf2tsv_command, debug)
        logger.info('Finished cpsr-summarise-vcf2tsv')
    logger.info('Finished cpsr-summarise')
    print('----')

    ## Generation of HTML reports for VEP/vcfanno-annotated VCF file
    if not arg_dict['basic']:
        logger = getlogger('cpsr-writer')
        logger.info("CPSR - STEP 4: Generation of output files - Cancer predisposition sequencing report")

        # export PATH to R conda env Rscript
        rscript = utils.script_path("pcgrr", "bin/Rscript")
        cpsrr_script = utils.script_path('pcgr', 'bin/cpsr.R')
        # All arguments are passed positionally, so their order matters.
        # NOTE(review): without an input VCF, output_pass_tsv is still the
        # placeholder 'None' and the second argument becomes 'None.gz' -
        # confirm that the R script handles this
        cpsr_report_command = (
                f"{rscript} {cpsrr_script} "
                f"{output_dir} "
                f"{output_pass_tsv}.gz "
                f"{arg_dict['sample_id']} "
                f"{pcgr_vars.PCGR_VERSION} "
                f"{pcgr_vars.DB_VERSION} "
                f"{arg_dict['genome_assembly']} "
                f"{data_dir} "
                f"{virtual_panel_id} "
                f"{preserved_info_tags} "
                f"{custom_bed} "
                f"{arg_dict['custom_list_name']} "
                f"{arg_dict['report_theme']} "
                f"{arg_dict['report_table_display']} "
                f"{report_nonfloating_toc} "
                f"{gwas_findings} "
                f"{arg_dict['gwas_p_value']} "
                f"{arg_dict['pop_gnomad']} "
                f"{arg_dict['maf_upper_threshold']} "
                f"{arg_dict['vep_pick_order']} "
                f"{arg_dict['vep_n_forks']} "
                f"{arg_dict['vep_buffer_size']} "
                f"{arg_dict['vep_gencode_all']} "
                f"{vep_no_intergenic} "
                f"{vep_regulatory} "
                f"{secondary_findings} "
                f"{classify_all} "
                f"{ignore_noncoding} "
                f"{clinvar_ignore_noncancer} "
                f"{diagnostic_grade_only}"
           )

        if debug:
            print(cpsr_report_command)
        check_subprocess(logger, cpsr_report_command, debug)
        logger.info("Finished CPSR!")
        print('----')
    print()
Пример #3
0
def run_vcfanno(num_processes, query_vcf, panel_normal_vcf, query_info_tags, vcfheader_file, pcgr_db_directory, conf_fname,
                output_vcf, docm, clinvar, ncer, dbmts, gerp, tcga, tcga_pcdm, chasmplus, dbnsfp, civic, cgi, icgc, uniprot, cancer_hotspots,
                pcgr_onco_xref, gwas, rmsk, simplerepeats, winmsk, gnomad_cpsr, keep_logs, debug, logger):
    """
    Function that annotates a VCF file with vcfanno against a user-defined set of germline and somatic VCF files

    The function (1) registers every enabled annotation track in the vcfanno
    configuration file, (2) optionally appends a panel-of-normals annotation
    block to that configuration, (3) runs vcfanno on the query VCF, and
    (4) assembles, bgzips and tabix-indexes the final annotated VCF.

    Parameters:
    num_processes     -- value passed to vcfanno's '-p' option
    query_vcf         -- path of the VCF file to annotate
    panel_normal_vcf  -- path of a panel-of-normals VCF, or None to skip that annotation
    query_info_tags   -- INFO tags of the query VCF; forwarded to
                         prepare_vcfanno_configuration and checked for a clash
                         with 'PANEL_OF_NORMALS'
    vcfheader_file    -- header file that is written first into the output VCF
    pcgr_db_directory -- data directory forwarded to the configuration helpers
    conf_fname        -- vcfanno configuration file (appended to, then passed to vcfanno)
    output_vcf        -- path of the uncompressed output VCF; the final products
                         are '<output_vcf>.gz' and its tabix index
    docm ... gnomad_cpsr -- one flag per annotation track; a track is configured
                         only when its flag is exactly True (identity check)
    keep_logs         -- when falsy, files matching '<output_vcf>.tmp*' are removed
    debug             -- forwarded to check_subprocess for the main commands
    logger            -- logger used for warnings and forwarded to helpers
    """

    # gnomAD non-cancer subset tags: global counts first, then four tags per population
    gnomad_cpsr_tags = [
        'NON_CANCER_AC_GLOBAL',
        'NON_CANCER_NHOMALT_GLOBAL',
        'NON_CANCER_AN_GLOBAL',
        'NON_CANCER_AF_GLOBAL',
    ]
    for pop in ('ASJ', 'NFE', 'SAS', 'FIN', 'EAS', 'AMR', 'AFR', 'OTH'):
        gnomad_cpsr_tags.extend(f'NON_CANCER_{stat}_{pop}' for stat in ('AC', 'AN', 'AF', 'NHOMALT'))

    panel_normal_tags = ["PANEL_OF_NORMALS"]

    # (flag, track name, INFO tags) - the order of this table is the order in which
    # tracks are handed to prepare_vcfanno_configuration, so it must not be reshuffled
    annotation_tracks = (
        (icgc, "icgc", ["ICGC_PCAWG_OCCURRENCE", "ICGC_PCAWG_AFFECTED_DONORS"]),
        (clinvar, "clinvar", ["CLINVAR_MSID", "CLINVAR_PMID", "CLINVAR_CLNSIG", "CLINVAR_VARIANT_ORIGIN", "CLINVAR_CONFLICTED",
                              "CLINVAR_UMLS_CUI", "CLINVAR_HGVSP", "CLINVAR_UMLS_CUI_SOMATIC", "CLINVAR_CLNSIG_SOMATIC",
                              "CLINVAR_PMID_SOMATIC", "CLINVAR_ALLELE_ID", "CLINVAR_MOLECULAR_EFFECT",
                              "CLINVAR_REVIEW_STATUS_STARS", "CLINVAR_CLASSIFICATION", "CLINVAR_ENTREZGENE"]),
        (ncer, "ncer", ["NCER_PERCENTILE"]),
        (gerp, "gerp", ['GERP_SCORE']),
        (dbmts, "dbmts", ["DBMTS"]),
        (dbnsfp, "dbnsfp", ["DBNSFP"]),
        (cgi, "cgi", ["CGI_ID", "CGI_ID_SEGMENT"]),
        (tcga, "tcga", ["TCGA_FREQUENCY", "TCGA_PANCANCER_COUNT"]),
        (tcga_pcdm, "tcga_pcdm", ["PUTATIVE_DRIVER_MUTATION"]),
        (chasmplus, "chasmplus", ["CHASMPLUS_DRIVER", "CHASMPLUS_TTYPE", "CHASMPLUS_PANCAN"]),
        (civic, "civic", ["CIVIC_ID", "CIVIC_ID_SEGMENT"]),
        (cancer_hotspots, "cancer_hotspots", ["MUTATION_HOTSPOT", "MUTATION_HOTSPOT_TRANSCRIPT", "MUTATION_HOTSPOT_CANCERTYPE"]),
        (uniprot, "uniprot", ["UNIPROT_FEATURE"]),
        (docm, "docm", ["DOCM_PMID"]),
        (pcgr_onco_xref, "pcgr_onco_xref", ["PCGR_ONCO_XREF"]),
        (gwas, "gwas", ["GWAS_HIT"]),
        (rmsk, "rmsk", ["RMSK_HIT"]),
        (simplerepeats, "simplerepeats", ["SIMPLEREPEATS_HIT"]),
        (winmsk, "winmsk", ["WINMASKER_HIT"]),
        (gnomad_cpsr, "gnomad_cpsr", gnomad_cpsr_tags),
    )
    for enabled, track_name, info_tags in annotation_tracks:
        # strict identity test kept on purpose: truthy non-bool values do not enable a track
        if enabled is True:
            prepare_vcfanno_configuration(pcgr_db_directory, conf_fname, vcfheader_file, logger, info_tags, query_info_tags, track_name)

    if panel_normal_vcf is not None:
        if "PANEL_OF_NORMALS" in query_info_tags:
            logger.warning("Query VCF has INFO tag \"PANEL_OF_NORMALS\" - this is also present in the panel of normal VCF file. This tag will be overwritten if not renamed in the query VCF")
        append_to_vcf_header(pcgr_db_directory, "panel_of_normals", vcfheader_file, logger)
        fields_string = 'fields = ["' + '","'.join(panel_normal_tags) + '"]'
        ops_string = 'ops=["' + '","'.join(['self'] * len(panel_normal_tags)) + '"]'
        # context manager guarantees the configuration file is closed even if a write fails
        with open(conf_fname, 'a') as fh:
            fh.write('[[annotation]]\n')
            fh.write('file="' + str(panel_normal_vcf) + '"\n')
            fh.write(fields_string + '\n')
            fh.write(ops_string + '\n\n')

    out_vcf_vcfanno_unsorted1 = output_vcf + '.tmp.unsorted.1'
    # both dots escaped so that only a literal '.vcf.gz' suffix is stripped
    query_prefix = re.sub(r'\.vcf\.gz$', '', query_vcf)
    # NOTE(review): presumably adds the #CHROM line of the query VCF to the header file - confirm in helper
    print_vcf_header(query_vcf, vcfheader_file, logger, chromline_only = True)
    command1 = f"vcfanno -p={num_processes} {conf_fname} {query_vcf} > {out_vcf_vcfanno_unsorted1} 2> {query_prefix}.vcfanno.log"
    check_subprocess(logger, command1, debug)

    # final VCF = prepared header file + all non-header lines emitted by vcfanno
    check_subprocess(logger, f'cat {vcfheader_file} > {output_vcf}', debug=False)
    check_subprocess(logger, f'cat {out_vcf_vcfanno_unsorted1} | grep -v \'^#\' >> {output_vcf}', debug=False)
    check_subprocess(logger, f'bgzip -f {output_vcf}', debug)
    check_subprocess(logger, f'tabix -f -p vcf {output_vcf}.gz', debug)
    if not keep_logs:
        for tmpf in glob.glob(f"{output_vcf}.tmp*"):
            utils.remove(tmpf)
Пример #4
0
def get_valid_custom_genelist(genelist_fname, genelist_bed_fname, pcgr_dir, genome_assembly, logger, debug):
    """
    Function that checks whether the custom genelist contains valid entries from the complete exploratory track

    Every entry of the gene list file must look like an Ensembl gene identifier
    (regex '^ENSG[0-9]{1,}$'); the first entry that does not is reported through
    error_message() and its result is returned. Identifiers that are well-formed
    but absent from the superpanel TSV are skipped with a warning. If no valid
    identifier remains, the process exits with status 1.

    For the valid identifiers a target BED file is assembled from the superpanel
    BED track: records matching '|ACMG_SF30|', records matching an rs-identifier
    followed by '|', and records matching '|<gene id>|' for each valid gene. The
    collected records are then sorted into genelist_bed_fname (numeric
    chromosomes first, then X/Y/M).

    Parameters:
    genelist_fname     -- text file with one Ensembl gene identifier per line
    genelist_bed_fname -- path of the sorted BED file to create
    pcgr_dir           -- directory that contains 'data/<genome_assembly>/virtual_panels'
    genome_assembly    -- assembly name used in directory and file names
    logger             -- logger for info/warning messages, forwarded to helpers
    debug              -- forwarded to check_subprocess; when truthy the unsorted
                          temporary BED file is kept

    Returns 0 on success.
    """
    panel_dir = os.path.join(pcgr_dir, "data", genome_assembly, "virtual_panels")
    superpanel_track_bed = os.path.join(panel_dir, "0." + genome_assembly + ".bed.gz")
    superpanel_track_tsv = os.path.join(panel_dir, "cpsr_superpanel." + genome_assembly + ".tsv")
    genelist_bed_fname_unsorted = genelist_bed_fname + '.tmp_unsorted'

    # dict used as an insertion-ordered set of the user-provided identifiers
    customlist_identifiers = {}
    with open(genelist_fname, 'r') as genelist_fh:
        # NOTE(review): '\n' is an unusual csv delimiter - confirm it is accepted by the targeted Python version
        genelist_reader = csv.DictReader(genelist_fh, delimiter='\n', fieldnames=['ensembl_gene_id'])
        for row in genelist_reader:
            gene_id = str(row['ensembl_gene_id'])
            if not re.match(r'^ENSG[0-9]{1,}$', gene_id.rstrip()):
                err_msg = "Custom list of genes from CPSR superpanel (panel 0) should be provided as Ensembl gene identifiers, '" + gene_id + "' is not a valid identifier"
                return error_message(err_msg, logger)
            customlist_identifiers[gene_id.strip()] = 1

    # map Ensembl gene identifier -> gene symbol for every gene in the superpanel
    with open(superpanel_track_tsv, 'r') as superpanel_fh:
        superpanel_identifiers_all = {
            row['ensembl_gene_id']: row['symbol']
            for row in csv.DictReader(superpanel_fh, delimiter='\t')
        }

    valid_custom_identifiers = []
    valid_custom_symbols = []
    for g in customlist_identifiers:
        if g in superpanel_identifiers_all:
            valid_custom_identifiers.append(g)
            valid_custom_symbols.append(superpanel_identifiers_all[g])
        else:
            logger.warning("Ignoring custom-provided gene identifier (" + str(g) + ") NOT found in CPSR superpanel (panel 0)")
            logger.warning("Choose only Ensembl gene identifiers from this set in data bundle: data/" + str(genome_assembly) + "/virtual_panels/cpsr_superpanel." + str(genome_assembly) + ".tsv")
    all_valid_custom_geneset = ', '.join(sorted(valid_custom_symbols))

    logger.info('Detected n = ' + str(len(valid_custom_identifiers)) + ' valid targets in custom-provided gene list file (--custom_list)):')
    logger.info(all_valid_custom_geneset)

    if not valid_custom_identifiers:
        logger.info('')
        logger.info("NO valid gene identifiers from panel 0 in custom-provided genelist - exiting")
        logger.info('')
        exit(1)

    # Raw f-strings below: '\|' must reach egrep as backslash + pipe (a literal '|'),
    # and must not be parsed by Python as an (invalid) escape sequence.

    ## Add secondary findings genes to target BED
    cmd_secondary_regions_bed = rf"bgzip -dc {superpanel_track_bed} | egrep '\|ACMG_SF30\|' > {genelist_bed_fname_unsorted}"
    check_subprocess(logger, cmd_secondary_regions_bed, debug)

    ## Add GWAS hits to target BED
    cmd_gwas_regions_bed = rf"bgzip -dc {superpanel_track_bed} | egrep 'rs[0-9]{{3,}}\|' >> {genelist_bed_fname_unsorted}"
    check_subprocess(logger, cmd_gwas_regions_bed, debug)

    ## Add custom set genes to target BED
    logger.info('Creating BED file with custom target genes: ' + str(genelist_bed_fname))
    for g in valid_custom_identifiers:
        cmd_target_regions_bed = rf"bgzip -dc {superpanel_track_bed} | egrep '\|{g}\|' >> {genelist_bed_fname_unsorted}"
        check_subprocess(logger, cmd_target_regions_bed, debug)

    ## Sort regions in target BED
    if os.path.exists(genelist_bed_fname_unsorted) and os.stat(genelist_bed_fname_unsorted).st_size != 0:
        # numeric chromosomes are sorted numerically, X/Y/M lexically and appended afterwards
        cmd_sort_custom_bed1 = f"egrep '^[0-9]' {genelist_bed_fname_unsorted} | sort -k1,1n -k2,2n -k3,3n > {genelist_bed_fname}"
        cmd_sort_custom_bed2 = f"egrep -v '^[0-9]' {genelist_bed_fname_unsorted} | egrep '^[XYM]' | sort -k1,1 -k2,2n -k3,3n >> {genelist_bed_fname}"

        check_subprocess(logger, cmd_sort_custom_bed1, debug)
        check_subprocess(logger, cmd_sort_custom_bed2, debug)
        if not debug:
            utils.remove(str(genelist_bed_fname_unsorted))

    return 0
Пример #5
0
def simplify_vcf(input_vcf, vcf, custom_bed, pcgr_directory, genome_assembly, virtual_panel_id, sample_id, diagnostic_grade_only, output_dir, logger, debug):
    """
    Function that prepares the validated input VCF for the CPSR workflow:
    1. The VCF is rewritten (uncompressed) into output_dir: meta and #CHROM header lines are kept,
       a leading 'chr' is stripped from variant rows, and rows are sorted
       (numeric chromosomes first, then X/Y/M, then all remaining contigs)
    2. If VCF have variants with multiple alternative alleles ("multiallelic", e.g. 'A,T'), these are decomposed
       into variants with a single alternative allele ('vt decompose'); otherwise the file is copied as is
    3. Filters against predisposition loci (virtual panel id or custom target) with 'bedtools intersect'
    4. Final VCF file is compressed and indexed (bgzip + tabix)

    Parameters:
    input_vcf             -- path of the input VCF ('.vcf' or '.vcf.gz')
    vcf                   -- iterable of variant records (CHROM, start, REF, ALT attributes are read)
                             used to detect multiallelic sites
    custom_bed            -- path of a custom target BED file, or the string 'None'
    pcgr_directory        -- directory that contains 'data/<genome_assembly>/virtual_panels'
    genome_assembly       -- assembly name used in directory and file names
    virtual_panel_id      -- one or several comma-separated virtual panel identifiers
    sample_id             -- sample name, used in names of temporary BED files
    diagnostic_grade_only -- when equal to 1, the '.GREEN' panel BED files are used
    output_dir            -- directory for all files written by this function
    logger                -- logger for progress messages, forwarded to check_subprocess
    debug                 -- forwarded to check_subprocess; when truthy intermediate files are kept

    The process exits with status 1 if the final target VCF holds no variant records.
    """

    vcf_basename = os.path.basename(input_vcf)
    vcf_suffix_regex = r'(\.vcf$|\.vcf\.gz$)'
    input_vcf_cpsr_ready = os.path.join(output_dir, re.sub(vcf_suffix_regex, '.cpsr_ready.tmp.vcf', vcf_basename))
    input_vcf_cpsr_ready_decomposed = os.path.join(output_dir, re.sub(vcf_suffix_regex, '.cpsr_ready.vcf', vcf_basename))
    input_vcf_cpsr_ready_decomposed_target = os.path.join(output_dir, re.sub(vcf_suffix_regex, '.cpsr_ready_target.vcf', vcf_basename))
    virtual_panels_tmp_bed = os.path.join(output_dir, "virtual_panels_all." + str(sample_id) + ".tmp.bed")
    virtual_panels_bed = os.path.join(output_dir, "virtual_panels_all." + str(sample_id) + ".bed")
    decompose_log = os.path.join(output_dir, "decompose.log")

    # collect a printable identifier for every record with more than one ALT allele
    multiallelic_list = []
    for rec in vcf:
        if len(rec.ALT) > 1:
            alt = ",".join(str(n) for n in rec.ALT)
            multiallelic_list.append(f"{rec.CHROM}:{rec.start + 1}_{rec.REF}->{alt}")

    # Both branches must be f-strings: without the prefix the shell would receive
    # the literal text 'cat {input_vcf}' for uncompressed input.
    is_gzipped = input_vcf.endswith('.gz')
    cat_vcf = f"bgzip -dc {input_vcf}" if is_gzipped else f"cat {input_vcf}"

    # header lines first, then variant rows in three separately sorted groups
    sample_free_commands = (
        f'{cat_vcf} | egrep \'^##\' > {input_vcf_cpsr_ready}',
        f'{cat_vcf} | egrep \'^#CHROM\' >> {input_vcf_cpsr_ready}',
        f'{cat_vcf} | egrep -v \'^#\' | sed \'s/^chr//\' | egrep \'^[0-9]\' | sort -k1,1n -k2,2n -k4,4 -k5,5 >> {input_vcf_cpsr_ready}',
        f'{cat_vcf} | egrep -v \'^#\' | sed \'s/^chr//\' | egrep -v \'^[0-9]\' | egrep \'^[XYM]\' | sort -k1,1 -k2,2n -k4,4 -k5,5 >> {input_vcf_cpsr_ready}',
        f'{cat_vcf} | egrep -v \'^#\' | sed \'s/^chr//\' | egrep -v \'^[0-9]\' | egrep -v \'^[XYM]\' | sort -k1,1 -k2,2n -k4,4 -k5,5 >> {input_vcf_cpsr_ready}',
    )
    for command in sample_free_commands:
        check_subprocess(logger, command, debug)

    if multiallelic_list:
        logger.warning(f"There were {len(multiallelic_list)} multiallelic sites detected. Showing (up to) the first 100:")
        print('----')
        print(', '.join(multiallelic_list[:100]))
        print('----')
        logger.info('Decomposing multi-allelic sites in input VCF file using \'vt decompose\'')
        command_decompose = f'vt decompose -s {input_vcf_cpsr_ready} > {input_vcf_cpsr_ready_decomposed}  2> {decompose_log}'
        check_subprocess(logger, command_decompose, debug)
    else:
        command_copy = f'cp {input_vcf_cpsr_ready} {input_vcf_cpsr_ready_decomposed}'
        check_subprocess(logger, command_copy, debug)

    if custom_bed != 'None':
        logger.info('Limiting variant set to user-defined screening loci (custom list from panel 0)')
        if os.path.exists(custom_bed) and os.stat(custom_bed).st_size != 0:
            target_variants_intersect_cmd = f"bedtools intersect -wa -u -header -a {input_vcf_cpsr_ready_decomposed} -b {custom_bed} > {input_vcf_cpsr_ready_decomposed_target}"
            check_subprocess(logger, target_variants_intersect_cmd, debug)
        else:
            logger.info('Custom BED file has a filesize of zero or does not exist')
    else:
        logger.info('Limiting variant set to cancer predisposition loci - virtual panel id(s): ' + str(virtual_panel_id))

        ## Concatenate all panel BEDs to one big virtual panel BED, sort and make unique
        virtual_panels_dir = os.path.join(pcgr_directory, 'data', genome_assembly, 'virtual_panels')
        for pid in str(virtual_panel_id).split(','):
            target_bed_gz = os.path.join(virtual_panels_dir, str(pid) + "." + genome_assembly + ".bed.gz")
            # NOTE(review): this compares the complete virtual_panel_id value (not pid) with the
            # integer 0; if the id arrives as a string the test is always true - confirm intent
            if diagnostic_grade_only == 1 and virtual_panel_id != 0:
                logger.info('Considering diagnostic-grade only genes in panel ' + str(pid) + ' - (GREEN status in Genomics England PanelApp)')
                target_bed_gz = os.path.join(virtual_panels_dir, str(pid) + "." + genome_assembly + ".GREEN.bed.gz")
            check_subprocess(logger, f'bgzip -dc {target_bed_gz} >> {virtual_panels_tmp_bed}', debug)

        ## sort the collection of virtual panels
        if os.path.exists(virtual_panels_tmp_bed) and os.stat(virtual_panels_tmp_bed).st_size != 0:
            cmd_sort_virtual_panel_bed1 = f"egrep '^[0-9]' {virtual_panels_tmp_bed} | sort -k1,1n -k2,2n -k3,3n | uniq > {virtual_panels_bed}"
            cmd_sort_virtual_panel_bed2 = f"egrep -v '^[0-9]' {virtual_panels_tmp_bed} | egrep '^[XYM]' | sort -k1,1 -k2,2n -k3,3n | uniq >> {virtual_panels_bed}"
            check_subprocess(logger, cmd_sort_virtual_panel_bed1, debug)
            check_subprocess(logger, cmd_sort_virtual_panel_bed2, debug)
            if not debug:
                utils.remove(str(virtual_panels_tmp_bed))

            if os.path.exists(virtual_panels_bed):
                target_variants_intersect_cmd = f'bedtools intersect -wa -u -header -a {input_vcf_cpsr_ready_decomposed} -b {virtual_panels_bed} > {input_vcf_cpsr_ready_decomposed_target}'
                check_subprocess(logger, target_variants_intersect_cmd, debug)

    target_vcf_gz = input_vcf_cpsr_ready_decomposed_target + '.gz'
    check_subprocess(logger, f'bgzip -cf {input_vcf_cpsr_ready_decomposed_target} > {target_vcf_gz}', debug)
    check_subprocess(logger, f'tabix -p vcf {target_vcf_gz}', debug)
    if not debug:
        for fn in (input_vcf_cpsr_ready, virtual_panels_bed, input_vcf_cpsr_ready_decomposed, decompose_log):
            utils.remove(fn)

    if os.path.exists(target_vcf_gz) and os.path.getsize(target_vcf_gz) > 0:
        vcf = VCF(target_vcf_gz)
        # iterate the whole file once just to count the records
        n_records = sum(1 for _ in vcf)
        if len(vcf.seqnames) == 0 or n_records == 0:
            logger.info('')
            logger.info("Query VCF contains NO variants within the selected cancer predisposition geneset or ACMG-recommended genes for secondary findings - quitting workflow")
            logger.info('')
            exit(1)
Пример #6
0
def simplify_vcf(input_vcf, vcf, output_dir, keep_uncompressed, logger, debug):
    """
    Function that performs the following on the validated input VCF:
    1. Strip of any genotype data (##FORMAT header lines are dropped and only the first 8 columns are kept)
    2. If VCF has variants with multiple alternative alleles ("multiallelic", e.g. 'A,T'), these are decomposed into variants with a single alternative allele
    3. Final VCF file is sorted and indexed (bgzip + tabix)

    Parameters:
    input_vcf         -- path of the input VCF ('.vcf' or '.vcf.gz')
    vcf               -- iterable of variant records (CHROM, start, REF, ALT attributes are read)
                         used to detect multiallelic sites
    output_dir        -- directory for all files written by this function
    keep_uncompressed -- when truthy the uncompressed '.pcgr_ready.vcf' is kept next to the
                         bgzipped copy; otherwise bgzip replaces it
    logger            -- logger for progress messages, forwarded to check_subprocess
    debug             -- forwarded to check_subprocess

    The process exits with status 1 if the cleaned VCF holds no variant records.
    """

    vcf_basename = os.path.basename(input_vcf)
    vcf_suffix_regex = r'(\.vcf$|\.vcf\.gz$)'
    input_vcf_pcgr_ready = os.path.join(output_dir, re.sub(vcf_suffix_regex, '.pcgr_ready.tmp.vcf', vcf_basename))
    input_vcf_pcgr_ready_decomposed = os.path.join(output_dir, re.sub(vcf_suffix_regex, '.pcgr_ready.vcf', vcf_basename))
    decompose_log = os.path.join(output_dir, "decompose.log")

    # collect a printable identifier for every record with more than one ALT allele
    multiallelic_list = []
    for rec in vcf:
        if len(rec.ALT) > 1:
            alt = ",".join(str(n) for n in rec.ALT)
            multiallelic_list.append(f"{rec.CHROM}:{rec.start + 1}_{rec.REF}->{alt}")

    # Both branches must be f-strings: without the prefix the shell would receive
    # the literal text 'cat {input_vcf}' for uncompressed input.
    is_gzipped = input_vcf.endswith('.gz')
    cat_vcf = f"bgzip -dc {input_vcf}" if is_gzipped else f"cat {input_vcf}"

    sample_free_commands = (
        # Remove FORMAT metadata lines
        f'{cat_vcf} | egrep \'^##\' | egrep -v \'^##FORMAT=\' > {input_vcf_pcgr_ready}',
        # Output first 8 column names (CHROM-INFO, so ignore FORMAT + sample columns)
        f'{cat_vcf} | egrep \'^#CHROM\' | cut -f1-8 >> {input_vcf_pcgr_ready}',
        # Looking at variant rows, remove chr prefix, grab CHROM-INFO, sort separately for auto/XYM/rest chrom by cols 1+2 (CHROM+POS) then cols 4+5 (REF+ALT)
        f'{cat_vcf} | egrep -v \'^#\' | sed \'s/^chr//\' | cut -f1-8 | egrep \'^[0-9]\' | sort -k1,1n -k2,2n -k4,4 -k5,5 >> {input_vcf_pcgr_ready}',
        f'{cat_vcf} | egrep -v \'^#\' | sed \'s/^chr//\' | cut -f1-8 | egrep -v \'^[0-9]\' | egrep \'^[XYM]\' | sort -k1,1 -k2,2n -k4,4 -k5,5 >> {input_vcf_pcgr_ready}',
        f'{cat_vcf} | egrep -v \'^#\' | sed \'s/^chr//\' | cut -f1-8 | egrep -v \'^[0-9]\' | egrep -v \'^[XYM]\' | sort -k1,1 -k2,2n -k4,4 -k5,5 >> {input_vcf_pcgr_ready}',
    )
    for command in sample_free_commands:
        check_subprocess(logger, command, debug)

    if multiallelic_list:
        logger.warning(f"There were {len(multiallelic_list)} multiallelic sites detected. Showing (up to) the first 100:")
        print('----')
        print(', '.join(multiallelic_list[:100]))
        print('----')
        logger.info('Decomposing multi-allelic sites in input VCF file using \'vt decompose\'')
        command_decompose = f'vt decompose -s {input_vcf_pcgr_ready} > {input_vcf_pcgr_ready_decomposed} 2> {decompose_log}'
        check_subprocess(logger, command_decompose, debug)
    else:
        logger.info('All sites seem to be decomposed - skipping decomposition!')
        check_subprocess(logger, f'cp {input_vcf_pcgr_ready} {input_vcf_pcgr_ready_decomposed}', debug)

    # need to keep uncompressed copy for vcf2maf.pl if selected
    decomposed_vcf_gz = f'{input_vcf_pcgr_ready_decomposed}.gz'
    if keep_uncompressed:
        bgzip_cmd = f"bgzip -cf {input_vcf_pcgr_ready_decomposed} > {decomposed_vcf_gz}"
    else:
        bgzip_cmd = f"bgzip -f {input_vcf_pcgr_ready_decomposed}"
    check_subprocess(logger, bgzip_cmd, debug)
    check_subprocess(logger, f'tabix -p vcf {decomposed_vcf_gz}', debug)

    if os.path.exists(decomposed_vcf_gz) and os.path.getsize(decomposed_vcf_gz) > 0:
        vcf = VCF(decomposed_vcf_gz)
        # iterate the whole file once just to count the records
        n_records = sum(1 for _ in vcf)
        if len(vcf.seqnames) == 0 or n_records == 0:
            logger.info('')
            logger.info("Input VCF contains NO valid variants after VCF cleaning - quitting workflow")
            logger.info('')
            exit(1)

    # intermediate files are removed regardless of the debug flag
    utils.remove(input_vcf_pcgr_ready)
    utils.remove(decompose_log)