示例#1
0
def check_existing_vcf_info_tags(input_vcf, pcgr_directory, genome_assembly, logger):
    """
    Verify that no INFO tag declared in the header of the query VCF clashes
    with an annotation tag produced by PCGR.

    The PCGR tags are obtained through annoutils.read_infotag_file() from
    '<pcgr_directory>/data/<genome_assembly>/pcgr_infotags.tsv', and are
    complemented by a fixed set of sample-level tags (DP_TUMOR, AF_TUMOR,
    AF_NORMAL, DP_NORMAL, CALL_CONFIDENCE).

    Returns the result of error_message() for the first clashing tag,
    otherwise 1.
    """
    infotag_file = os.path.join(pcgr_directory, 'data', genome_assembly, 'pcgr_infotags.tsv')
    pcgr_tags = annoutils.read_infotag_file(infotag_file)
    sample_level_tags = ('DP_TUMOR', 'AF_TUMOR', 'AF_NORMAL', 'DP_NORMAL', 'CALL_CONFIDENCE')

    query_vcf = VCF(input_vcf)
    logger.info('Checking if existing INFO tags of query VCF file coincide with PCGR INFO tags')

    for header_line in query_vcf.header_iter():
        fields = header_line.info()
        # Only header lines that declare an INFO tag are of interest
        if 'ID' not in fields.keys() or 'HeaderType' not in fields.keys():
            continue
        if fields['HeaderType'] != 'INFO':
            continue

        tag_id = fields['ID']
        if tag_id in pcgr_tags.keys() or tag_id in sample_level_tags:
            err_msg = f'INFO tag {tag_id} in the query VCF coincides with a VCF annotation tag produced by PCGR - please remove or rename this tag in your query VCF'
            return error_message(err_msg, logger)

    logger.info('No query VCF INFO tags coincide with PCGR INFO tags')
    return 1
示例#2
0
文件: annoutils.py 项目: sigven/pcgr
def detect_reserved_info_tag(tag, tag_name, logger):
    """
    Report an error if a custom INFO tag collides with a field name reserved
    by the VCF specification.

    :param tag: tag identifier to check
    :param tag_name: descriptive name of the custom tag, used in the message
    :param logger: logger handed to utils.error_message()
    :return: result of utils.error_message() if the tag is reserved,
             otherwise None
    """
    # INFO is checked before FORMAT, so tags present in both lists
    # ('DP', 'MQ') are reported as reserved INFO fields.
    reserved_by_section = (
        ('INFO', [
            'AA', 'AC', 'AF', 'AN', 'BQ', 'CIGAR', 'DB', 'DP', 'END', 'H2', 'H3',
            'MQ', 'MQ0', 'NS', 'SB', 'SOMATIC', 'VALIDATED', '1000G'
        ]),
        ('FORMAT', [
            'GT', 'DP', 'FT', 'GL', 'GLE', 'GQ', 'PL', 'HQ', 'PS', 'PQ', 'EC', 'MQ'
        ]),
    )

    for section, reserved_tags in reserved_by_section:
        if tag in reserved_tags:
            # Both messages are now proper f-strings (the FORMAT one used to
            # print the literal placeholders '{tag_name}' and '{tag}').
            err_msg = f"Custom INFO tag ({tag_name}) needs another name - '{tag}' is a reserved field in the VCF specification ({section})"
            return utils.error_message(err_msg, logger)

    return None
示例#3
0
def check_preserved_vcf_info_tags(input_vcf, preserved_info_tags, logger):
    """
    Verify that every INFO tag the user wants preserved in the CPSR output
    is declared as an INFO tag in the header of the query VCF.

    preserved_info_tags is a comma-separated string of tag identifiers.
    Returns the result of error_message() for the first tag that is not
    found, otherwise 1.
    """
    requested_tags = str(preserved_info_tags).split(',')

    query_vcf = VCF(input_vcf)
    logger.info('Checking if existing INFO tags of query VCF file matches preserved INFO tags set by the user')

    # Collect the identifiers of all INFO declarations in the VCF header
    header_fields = (header_line.info() for header_line in query_vcf.header_iter())
    info_ids_in_query = [
        fields['ID']
        for fields in header_fields
        if 'ID' in fields.keys()
        and 'HeaderType' in fields.keys()
        and fields['HeaderType'] == 'INFO'
    ]

    for tag in requested_tags:
        if tag in info_ids_in_query:
            logger.info("Preserved INFO tag '" + str(tag) + "' detected among INFO tags in query VCF")
            continue
        err_msg = "Preserved INFO tag '" + str(tag) + "' not found among INFO tags in query VCF - make sure preserved VCF INFO tags are set correctly"
        return error_message(err_msg, logger)

    return 1
示例#4
0
def is_valid_cna(cna_segment_file, logger):
    """
    Check whether a tab-separated copy number segment file adheres to the
    expected format.

    Checks performed, in order:
      1. Columns 'Chromosome', 'Start', 'End' and 'Segment_Mean' are present
      2. The file contains at least one segment
      3. 'Start' and 'End' are integer columns, 'Segment_Mean' is numeric
      4. For every segment, 'End' >= 'Start' and both are >= 1

    :param cna_segment_file: path to the copy number segment file
    :param logger: logger used for error/info messages
    :return: result of error_message() on the first violation, otherwise 0
    """
    required_columns = ('Chromosome', 'Segment_Mean', 'Start', 'End')

    # The context manager guarantees the file handle is closed on every
    # return path (it used to be left open).
    with open(cna_segment_file, 'r') as cna_handle:
        cna_reader = csv.DictReader(cna_handle, delimiter='\t')

        ## check that required columns are present
        # fieldnames is None for a completely empty file - treat that as
        # "no columns" instead of failing with a TypeError
        present_columns = cna_reader.fieldnames or []
        if not all(column in present_columns for column in required_columns):
            err_msg = "Copy number segment file (" + str(cna_segment_file) + ") is missing required column(s): 'Chromosome', 'Start', 'End', or  'Segment_Mean'\n. Column names present in file: " + str(cna_reader.fieldnames)
            return error_message(err_msg, logger)

        # NOTE(review): read_csv is a pandas function, so `np` is presumably
        # bound to pandas by the file-level imports - confirm.
        cna_dataframe = np.read_csv(cna_segment_file, sep="\t")
        if cna_dataframe.empty is True:
            err_msg = 'Copy number segment file is empty - contains NO segments'
            return error_message(err_msg, logger)
        if cna_dataframe['Start'].dtype.kind not in 'i': ## check that 'Start' is of type integer
            err_msg = '\'Start\' column of copy number segment file contains non-integer values'
            return error_message(err_msg, logger)
        if cna_dataframe['End'].dtype.kind not in 'i': ## check that 'End' is of type integer
            err_msg = '\'End\' column of copy number segment file contains non-integer values'
            return error_message(err_msg, logger)
        if cna_dataframe['Segment_Mean'].dtype.kind not in 'if': ## check that 'Segment_Mean' is of type integer/float
            err_msg = '\'Segment_Mean\' column of copy number segment file contains non-numerical values'
            return error_message(err_msg, logger)

        for rec in cna_reader:
            segment_start = int(rec['Start'])
            segment_end = int(rec['End'])
            segment_label = str(rec['Chromosome']) + ':' + str(rec['Start']) + '-' + str(rec['End'])
            if segment_end < segment_start: ## check that 'End' is never smaller than 'Start'
                err_msg = 'Detected wrongly formatted chromosomal segment - \'Start\' is greater than \'End\' (' + segment_label + ')'
                return error_message(err_msg, logger)
            if segment_end < 1 or segment_start < 1: ## check that 'Start' and 'End' are always positive
                err_msg = 'Detected wrongly formatted chromosomal segment - \'Start\' or \'End\' is less than or equal to zero (' + segment_label + ')'
                return error_message(err_msg, logger)

    logger.info(f'Copy number segment file ({cna_segment_file}) adheres to the correct format')
    return 0
示例#5
0
def is_valid_rna_expression(rna_exp_file, logger):
    """
    Check whether a tab-separated RNA expression file adheres to the
    expected format.

    Checks performed, in order:
      1. Columns 'Gene', 'TPM', 'Log2FC', 'PAdj' and 'DiffExp' are present
      2. The file contains at least one record
      3. 'Gene' and 'DiffExp' are object (string) columns; 'TPM', 'Log2FC'
         and 'PAdj' are numeric columns
      4. For every record, 'DiffExp' is one of 'over', 'under', 'NS', and
         'TPM' and 'PAdj' are non-negative numbers

    :param rna_exp_file: path to the RNA expression file
    :param logger: logger used for error/info messages
    :return: result of error_message() on the first violation, otherwise 0
    """
    required_columns = ('Gene', 'TPM', 'Log2FC', 'PAdj', 'DiffExp')
    permitted_diffexp = ('over', 'under', 'NS')

    def _as_float(value):
        # csv.DictReader yields strings; a value that cannot be parsed becomes
        # NaN so that the '>= 0' tests below reject it.
        try:
            return float(value)
        except (TypeError, ValueError):
            return float('nan')

    # The context manager guarantees the file handle is closed on every
    # return path (it used to be left open).
    with open(rna_exp_file, 'r') as rna_exp_handle:
        rna_exp_reader = csv.DictReader(rna_exp_handle, delimiter='\t')

        ## check that required columns are present
        # fieldnames is None for a completely empty file
        present_columns = rna_exp_reader.fieldnames or []
        if not all(column in present_columns for column in required_columns):
            err_msg = "RNA expression file (" + str(rna_exp_file) + ") is missing required column(s): 'Gene', 'TPM', 'Log2FC','PAdj', or 'DiffExp'\n. Column names present in file: " + str(rna_exp_reader.fieldnames)
            return error_message(err_msg, logger)

        # NOTE(review): read_csv is a pandas function, so `np` is presumably
        # bound to pandas by the file-level imports - confirm.
        rna_exp_dataframe = np.read_csv(rna_exp_file, sep="\t")
        if rna_exp_dataframe.empty is True:
            err_msg = 'RNA gene expression file is empty - contains NO gene expression estimates'
            return error_message(err_msg, logger)

        ## check column types: 'O' = object (string), 'if' = integer/float
        expected_kinds = (
            ('Gene', 'O'),
            ('TPM', 'if'),
            ('Log2FC', 'if'),
            ('PAdj', 'if'),
            ('DiffExp', 'O'),
        )
        for column, kinds in expected_kinds:
            column_dtype = rna_exp_dataframe[column].dtype
            if column_dtype.kind not in kinds:
                err_msg = "'" + column + "' column of RNA expression file cannot be of type '" + str(column_dtype) + "'"
                return error_message(err_msg, logger)

        for rec in rna_exp_reader:
            if rec['DiffExp'] not in permitted_diffexp: ## check that 'DiffExp' column harbors permitted values
                err_msg = "'DiffExp' column contains non-permitted values - only 'over','under', or 'NS' permitted. Value entered was " + str(rec['DiffExp'])
                return error_message(err_msg, logger)

            # Values are compared numerically; comparing the raw strings with
            # an integer raises a TypeError on Python 3.
            if not (_as_float(rec['TPM']) >= 0):
                err_msg = "'TPM' column cannot contain negative values - value was " + str(rec['TPM'])
                return error_message(err_msg, logger)
            if not (_as_float(rec['PAdj']) >= 0):
                err_msg = "'PAdj' column (adjusted p-value from differential expression testing) cannot contain negative values - value was " + str(rec['PAdj'])
                return error_message(err_msg, logger)

    logger.info('RNA expression file (' + str(rna_exp_file) + ') adheres to the correct format')
    return 0
示例#6
0
def validate_cpsr_input(pcgr_directory, input_vcf, custom_list_fname, preserved_info_tags, vcf_validation, genome_assembly, sample_id, virtual_panel_id, diagnostic_grade_only, output_dir, debug):
    """
    Run the input checks for a CPSR query and hand the VCF over to simplify_vcf().

    Steps, in order:
      1. If a custom gene list is given (custom_list_fname != 'None'), a BED
         track is established through get_valid_custom_genelist()
      2. If a query VCF is given (input_vcf != 'None'):
         a. VCF validation itself is skipped (only a log message is emitted)
         b. INFO tags of the query VCF must not coincide with PCGR tags
         c. INFO tags requested as preserved must be present in the query VCF
         d. The VCF must not contain more than one sample column
         e. The VCF is passed on to simplify_vcf()

    Returns -1 if a tag check reports -1, the result of error_message() for a
    multi-sample VCF, and 0 otherwise.
    """
    logger = utils.getlogger('cpsr-validate-input-arguments')

    custom_list_bed_fname = 'None'
    if custom_list_fname != 'None':
        logger.info('Establishing BED track with custom list of genes from panel 0')
        bed_basename = ''.join([sample_id, '.cpsr.', genome_assembly, '.custom_list.bed'])
        custom_list_bed_fname = os.path.join(output_dir, bed_basename)
        get_valid_custom_genelist(custom_list_fname, custom_list_bed_fname, pcgr_directory, genome_assembly, logger, debug)

    # Nothing more to do without a query VCF
    if input_vcf == 'None':
        return 0

    if vcf_validation == 1:
        skip_notice = 'Skipping validation of VCF file (deprecated as of Dec 2021)'
    else:
        skip_notice = 'Skipping validation of VCF file as provided by option --no_vcf_validate'
    logger.info(skip_notice)

    if check_existing_vcf_info_tags(input_vcf, pcgr_directory, genome_assembly, logger) == -1:
        return -1

    if preserved_info_tags != "None":
        if check_preserved_vcf_info_tags(input_vcf, preserved_info_tags, logger) == -1:
            return -1

    query_vcf = VCF(input_vcf)
    sample_names = query_vcf.samples
    if len(sample_names) > 1:
        err_msg = "Query VCF contains more than one sample column (" + ', '.join(sample_names) + ") - CPSR expects a germline VCF with a single sample column - exiting"
        return error_message(err_msg, logger)

    simplify_vcf(input_vcf, query_vcf, custom_list_bed_fname, pcgr_directory, genome_assembly, virtual_panel_id, sample_id, diagnostic_grade_only, output_dir, logger, debug)
    return 0
示例#7
0
def validate_panel_normal_vcf(vcf, logger):
    """
    Check that the header of the panel-of-normals VCF declares the
    'PANEL_OF_NORMALS' INFO tag with type 'Flag'.

    Returns 1 if the flag is declared, otherwise the result of
    error_message().
    """
    pon_vcf = VCF(vcf)
    flag_found = False

    for header_line in pon_vcf.header_iter():
        fields = header_line.info()
        if 'ID' not in fields.keys() or 'HeaderType' not in fields.keys():
            continue
        is_info_flag = fields['HeaderType'] == 'INFO' and fields['Type'] == 'Flag'
        if is_info_flag and fields['ID'] == 'PANEL_OF_NORMALS':
            logger.info('Found \'PANEL_OF_NORMALS\' INFO flag in the VCF header section of the of panel of normals VCF file')
            flag_found = True

    if not flag_found:
        err_msg = 'INFO flag \'PANEL_OF_NORMALS\' is missing from the panel of normal VCF header'
        return error_message(err_msg, logger)

    return 1
示例#8
0
def is_valid_rna_fusion(rna_fusion_file, logger):
    """
    Check whether a tab-separated RNA fusion transcript file adheres to the
    expected format.

    Checks performed, in order:
      1. Columns 'Gene1', 'Gene2' and 'Confidence' are present
      2. The file contains at least one fusion
      3. 'Gene1', 'Gene2' and 'Confidence' are object (string) columns
      4. For every record, 'Confidence' is one of 'high', 'medium', 'low'
      5. No 'Gene1'/'Gene2' combination occurs in more than one row

    :param rna_fusion_file: path to the RNA fusion file
    :param logger: logger used for error/info messages
    :return: result of error_message() on the first violation, otherwise 0
    """
    # The column check now uses the same names ('Gene1'/'Gene2') that the
    # rest of the function and the error message refer to; it used to test
    # for 'GeneA'/'GeneB'.
    required_columns = ('Gene1', 'Gene2', 'Confidence')
    permitted_confidence = ('high', 'medium', 'low')

    # The context manager guarantees the file handle is closed on every
    # return path (it used to be left open).
    with open(rna_fusion_file, 'r') as rna_fusion_handle:
        rna_fusion_reader = csv.DictReader(rna_fusion_handle, delimiter='\t')

        ## check that required columns are present
        # fieldnames is None for a completely empty file
        present_columns = rna_fusion_reader.fieldnames or []
        if not all(column in present_columns for column in required_columns):
            err_msg = "RNA fusion file (" + str(rna_fusion_file) + ") is missing required column(s): 'Gene1', 'Gene2', or  'Confidence'\n. Column names present in file: " + str(rna_fusion_reader.fieldnames)
            return error_message(err_msg, logger)

        # NOTE(review): read_csv is a pandas function, so `np` is presumably
        # bound to pandas by the file-level imports - confirm.
        rna_fusion_dataframe = np.read_csv(rna_fusion_file, sep="\t")
        if rna_fusion_dataframe.empty is True:
            err_msg = 'RNA fusion file is empty - contains NO fusions'
            return error_message(err_msg, logger)

        ## check that all required columns are of type object
        for column in required_columns:
            column_dtype = rna_fusion_dataframe[column].dtype
            if column_dtype.kind not in 'O':
                err_msg = "'" + column + "' column of RNA fusion file cannot be of type '" + str(column_dtype) + "'"
                return error_message(err_msg, logger)

        observed_variants = set()
        for rec in rna_fusion_reader:
            if rec['Confidence'] not in permitted_confidence: ## check that 'Confidence' column harbor permitted values
                err_msg = "Confidence column contains non-permitted values - only 'high','medium', or 'low' permitted. Value entered was " + str(rec['Confidence'])
                return error_message(err_msg, logger)

            variant_key = str(rec['Gene1']) + "_" + str(rec['Gene2'])
            if variant_key in observed_variants:
                err_msg = "Duplicate entry in RNA fusion variants: " + str(variant_key) + " is found in multiple rows"
                return error_message(err_msg, logger)
            observed_variants.add(variant_key)

    logger.info('RNA fusion file (' + str(rna_fusion_file) + ') adheres to the correct format')
    return 0
示例#9
0
def check_args(arg_dict):
    """
    Validate the PCGR command-line arguments collected in arg_dict.

    Problems are reported through error_message(), non-fatal issues through
    warn_message(). Some options are adjusted in place: 'estimate_msi_status'
    is switched off for targeted/tumor-only queries, and 'exclude_pon' is
    switched off in tumor-only mode when no panel-of-normals VCF is given.

    NOTE(review): none of the checks return after calling error_message(),
    so they presumably rely on error_message() terminating the process -
    confirm against its definition.

    :param arg_dict: dictionary of argument names -> values
    :return: None
    """

    logger = getlogger("pcgr-validate-arguments-input-a")
    # Check the existence of required arguments
    if arg_dict['pcgr_dir'] is None or not os.path.exists(
            arg_dict['pcgr_dir']):
        err_msg = f"Required argument '--pcgr_dir' does not exist ({arg_dict['pcgr_dir']})."
        error_message(err_msg, logger)

    if arg_dict['genome_assembly'] is None:
        err_msg = f"Required argument '--genome_assembly' has no/undefined value ({arg_dict['genome_assembly']})."
        error_message(err_msg, logger)

    if arg_dict['input_vcf'] is None:
        err_msg = f"Required argument '--input_vcf' does not exist ({arg_dict['input_vcf']})."
        error_message(err_msg, logger)

    if arg_dict['sample_id'] is None:
        err_msg = f"Required argument '--sample_id' has no/undefined value ({arg_dict['sample_id']})."
        error_message(err_msg, logger)

    # Identifiers of length <= 2 or > 35 are rejected; the message now states
    # the range that is actually accepted (3-35).
    if len(arg_dict['sample_id']) <= 2 or len(arg_dict['sample_id']) > 35:
        err_msg = f"Sample name identifier ('--sample_id' = {arg_dict['sample_id']}) must be between 3 and 35 characters long"
        error_message(err_msg, logger)

    # Optional arguments

    # check if input is cancer cell line, requires --tumor_only
    if arg_dict['cell_line'] and not arg_dict['tumor_only']:
        err_msg = 'Analysis of cell line (--cell_line) needs option --tumor_only'
        error_message(err_msg, logger)

    # check that tumor primary site/type is set correctly
    # (between 0 and the largest key of pcgr_vars.tsites)
    if arg_dict['tsite'] > max(
            pcgr_vars.tsites.keys()) or arg_dict['tsite'] < 0:
        err_msg = f"Tumor type code ('--tumor_site' = {arg_dict['tsite']}) must be within [0, {max(pcgr_vars.tsites.keys())}]"
        error_message(err_msg, logger)

    # check that tumor purity and tumor ploidy is set correctly
    if arg_dict['tumor_purity'] is not None:
        if not (arg_dict['tumor_purity'] > 0
                and arg_dict['tumor_purity'] <= 1):
            err_msg = f"Tumor purity value ('--tumor_purity' = {arg_dict['tumor_purity']}) must be within (0, 1]"
            error_message(err_msg, logger)

    if arg_dict['tumor_ploidy'] is not None:
        if not arg_dict['tumor_ploidy'] > 0:
            err_msg = f"Tumor ploidy value ('--tumor_ploidy' = {arg_dict['tumor_ploidy']}) must be > 0"
            error_message(err_msg, logger)

    # check that minimum/maximum depth/allelic fractions are set correctly
    if arg_dict['tumor_dp_min'] < 0:
        err_msg = f"Minimum depth tumor ('tumor_dp_min' = {arg_dict['tumor_dp_min']}) must be >= 0"
        error_message(err_msg, logger)

    if arg_dict['tumor_af_min'] < 0 or arg_dict['tumor_af_min'] > 1:
        err_msg = f"Minimum AF tumor ('tumor_af_min' = {arg_dict['tumor_af_min']}) must be within [0, 1]"
        error_message(err_msg, logger)

    if arg_dict['control_dp_min'] < 0:
        err_msg = f"Minimum depth control ('control_dp_min' = {arg_dict['control_dp_min']}) must be >= 0"
        error_message(err_msg, logger)

    if arg_dict['control_af_max'] < 0 or arg_dict['control_af_max'] > 1:
        err_msg = f"Maximum AF control ('control_af_max' = {arg_dict['control_af_max']}) must be within [0, 1]"
        error_message(err_msg, logger)

    # Check that coding target size region of sequencing assay is set correctly
    if arg_dict['target_size_mb'] < 0 or arg_dict['target_size_mb'] > 34:
        err_msg = f"Coding target size region in Mb ('--target_size_mb' = {arg_dict['target_size_mb']}) is not positive or larger than the likely maximum size of the coding human genome (34 Mb))"
        error_message(err_msg, logger)
    if arg_dict['target_size_mb'] < 1:
        warn_msg = f"Coding target size region in Mb ('--target_size_mb' = {arg_dict['target_size_mb']}) must be greater than 1 Mb for mutational burden estimate to be robust"
        warn_message(warn_msg, logger)
    if arg_dict['target_size_mb'] < 34 and arg_dict['assay'] != 'TARGETED':
        warn_msg = f"Coding target size region in Mb ('--target_size_mb' = {arg_dict['target_size_mb']}) is less than default for WES/WGS (34Mb), assay must be set to 'TARGETED'"
        warn_message(warn_msg, logger)

    # if assay is targeted or mode is Tumor-Only, MSI prediction will not be performed/switched off
    assay_type = 'Tumor-Control'
    if arg_dict['estimate_msi_status'] is True and (
            arg_dict['assay'] == 'TARGETED' or arg_dict['tumor_only'] is True):
        if arg_dict['tumor_only'] is True:
            assay_type = 'Tumor-Only'
        warn_msg = f"MSI status prediction can be applied for WGS/WES tumor-control assays only (query type: {arg_dict['assay']}|{assay_type}) - analysis will be omitted"
        warn_message(warn_msg, logger)
        arg_dict['estimate_msi_status'] = 0

    # minimum number of mutations required for mutational signature reconstruction:
    # a warning is emitted below 200 (recommended value), an error below 100
    if arg_dict['min_mutations_signatures'] < 200:
        warn_msg = f"Minimum number of mutations required for mutational signature analysis ('--min_mutations_signatures' = {arg_dict['min_mutations_signatures']}) is less than the recommended number (n = 200)"
        warn_message(warn_msg, logger)
        if arg_dict['min_mutations_signatures'] < 100:
            err_msg = f"Minimum number of mutations required for mutational signature analysis ('--min_mutations_signatures' = {arg_dict['min_mutations_signatures']}) must be >= 100"
            error_message(err_msg, logger)

    # if MSI status is to be estimated, mutational burden must be turned on
    if arg_dict['estimate_msi_status'] is True and arg_dict[
            'estimate_tmb'] is False:
        err_msg = "Prediction of MSI status ('--estimate_msi_status') requires mutational burden analysis ('--estimate_tmb')"
        error_message(err_msg, logger)

    if arg_dict['tumor_only'] is True:
        for t in [
                'exclude_likely_het_germline', 'exclude_likely_hom_germline'
        ]:
            if arg_dict[t]:
                if arg_dict['tumor_af_tag'] == "_NA_":
                    err_msg = f"Option '--{t}' requires '--tumor_af_tag' option to be set"
                    error_message(err_msg, logger)

        # Emit warning if panel-of-normals VCF is not present and exclude_pon is set
        if arg_dict['pon_vcf'] is None and arg_dict['exclude_pon'] is True:
            warn_msg = "Panel-of-normals VCF is NOT provided ('--pon_vcf') - exclusion of calls found in panel-of-normals ('--exclude_pon') will be ignored"
            warn_message(warn_msg, logger)
            arg_dict['exclude_pon'] = False

        # Emit warnings that mutational burden and mutational signatures are less accurate for assays with tumor-only data
        if arg_dict['estimate_tmb'] is True:
            warn_msg = "Estimation of mutational burden in tumor-only mode is suboptimal - results must be interpreted with caution"
            warn_message(warn_msg, logger)
        if arg_dict['estimate_signatures'] is True:
            warn_msg = "Estimation of mutational signatures in tumor-only mode is suboptimal - results must be interpreted with caution"
            warn_message(warn_msg, logger)

        # Emit errors when tumor-only filtering thresholds are not properly set
        for pop in ['eur', 'afr', 'amr', 'eas', 'sas', 'global']:
            tag = f'maf_onekg_{pop}'
            if arg_dict[tag]:
                if float(arg_dict[tag]) < 0 or float(arg_dict[tag]) > 1:
                    err_msg = f"MAF threshold (tumor-only germline filter) for 1000 Genomes Project (pop '{pop.upper()}') must be within the [0, 1] range, current value is {arg_dict[tag]}"
                    error_message(err_msg, logger)

        for pop in [
                'nfe', 'fin', 'amr', 'eas', 'sas', 'asj', 'oth', 'afr',
                'global'
        ]:
            tag = f'maf_gnomad_{pop}'
            if arg_dict[tag]:
                if float(arg_dict[tag]) < 0 or float(arg_dict[tag]) > 1:
                    err_msg = f"MAF threshold (tumor-only germline filter) for gnomAD (pop '{pop.upper()}') must be within the [0, 1] range, current value is {arg_dict[tag]}"
                    error_message(err_msg, logger)

    # Emit warning that mutational signature estimation is (likely) not optimal for small/targeted sequencing assays
    if arg_dict['estimate_signatures'] is True and arg_dict[
            'assay'] == 'TARGETED':
        warn_msg = "Estimation of mutational signatures ('--estimate_signatures') is not optimal for TARGETED sequencing assays - results must be interpreted with caution"
        warn_message(warn_msg, logger)

    # Check that log ratio thresholds for homozygous deletions and amplifications are properly set, and that segment overlap with transcripts are set appropriately
    if arg_dict['logr_homdel'] >= 0:
        err_msg = f"Log ratio for homozygous deletions ('--logr_homdel' = {arg_dict['logr_homdel']}) should be < 0"
        error_message(err_msg, logger)
    if arg_dict['logr_gain'] <= 0:
        err_msg = f"Log ratio for copy number gains/amplifications ('--logr_gain' = {arg_dict['logr_gain']}) should be > 0"
        error_message(err_msg, logger)
    if arg_dict['cna_overlap_pct'] > 100 or arg_dict['cna_overlap_pct'] <= 0:
        err_msg = f"Minimum percent overlap between copy number segment and gene transcript ('--cna_overlap_pct' = {arg_dict['cna_overlap_pct']}) must be within (0, 100]"
        error_message(err_msg, logger)

    # VEP options
    if arg_dict['vep_n_forks'] <= 0 or arg_dict['vep_n_forks'] > 4:
        # closing parenthesis and space after the value were missing
        err_msg = f"Number of forks that VEP can use during annotation ('--vep_n_forks' = {arg_dict['vep_n_forks']}) must be within (0, 4]"
        error_message(err_msg, logger)

    if arg_dict['vep_buffer_size'] <= 0 or arg_dict['vep_buffer_size'] > 30000:
        err_msg = f"Internal VEP buffer size, corresponding to the number of variants that are read in to memory simultaneously ('--vep_buffer_size' = {arg_dict['vep_buffer_size']}),  must be within (0, 30000]"
        error_message(err_msg, logger)

    # Check that VEP pick criteria is formatted correctly
    if arg_dict['vep_pick_order'] is not None:
        values = str(arg_dict['vep_pick_order']).split(',')
        permitted_sources = [
            'canonical', 'appris', 'tsl', 'biotype', 'ccds', 'rank', 'length',
            'mane'
        ]
        # Count how many of the supplied values are permitted sources
        num_permitted_sources = sum(1 for v in values if v in permitted_sources)

        if num_permitted_sources != 8:
            err_msg = (
                f"'--vep_pick_order' = {arg_dict['vep_pick_order']} is formatted incorrectly, should be "
                "a comma-separated string of the following values: canonical,appris,tsl,biotype,ccds,rank,length,mane"
            )
            error_message(err_msg, logger)
    return
示例#10
0
def verify_input_files_cpsr(arg_dict):
    """
    Check the input files and directories of a CPSR run and collect the
    paths needed downstream.

    Checks, in order: custom gene list exists (if given); query VCF exists,
    has a '.vcf'/'.vcf.gz' extension and a '.tbi' index when bgzipped, and
    the output VCF does not already exist unless 'force_overwrite' is set;
    the base, data and assembly-specific data directories exist; the
    RELEASE_NOTES file exists and mentions pcgr_vars.DB_VERSION.
    Problems are reported through error_message().

    :param arg_dict: dictionary of argument names -> values
    :return: dictionary with directory names and basenames of the inputs
             ('NA' for inputs that were not provided)
    """
    logger = getlogger('cpsr-validate-input-arguments-b')

    # create output folder (if not already exists)
    output_dir_full = utils.safe_makedir(
        os.path.abspath(arg_dict['output_dir']))

    ## custom gene list (optional input)
    input_customlist_basename = "NA"
    input_customlist_dir = "NA"
    custom_list = arg_dict['custom_list']
    if custom_list is not None:
        custom_list_abs = os.path.abspath(custom_list)
        if not os.path.exists(custom_list_abs):
            error_message(f"Input file ({custom_list}) does not exist", logger)

        input_customlist_basename = os.path.basename(str(custom_list))
        input_customlist_dir = os.path.dirname(custom_list_abs)

    ## query VCF
    input_vcf_basename = "NA"
    input_vcf_dir = "NA"
    query_vcf = arg_dict['input_vcf']
    if query_vcf is not None:
        query_vcf_abs = os.path.abspath(query_vcf)
        if not os.path.exists(query_vcf_abs):
            error_message(f"Input file ({query_vcf}) does not exist", logger)

        if not query_vcf_abs.endswith(('.vcf', '.vcf.gz')):
            error_message(
                f"VCF input file ({query_vcf_abs}) does not have the correct file extension (.vcf or .vcf.gz)",
                logger)

        ## a bgzipped VCF must be accompanied by its tabix index
        if query_vcf_abs.endswith('.vcf.gz'):
            tabix_index = os.path.abspath(query_vcf + '.tbi')
            if not os.path.exists(tabix_index):
                error_message(
                    "Tabix file (i.e. '.gz.tbi') is not present for the bgzipped VCF input file ("
                    + query_vcf_abs
                    + "). Please make sure your input VCF is properly compressed and indexed (bgzip + tabix)",
                    logger)

        input_vcf_basename = os.path.basename(str(query_vcf))
        input_vcf_dir = os.path.dirname(query_vcf_abs)

        ## refuse to overwrite existing output unless explicitly requested
        output_prefix = os.path.join(str(output_dir_full), str(arg_dict['sample_id']))
        output_vcf = f"{output_prefix}.cpsr.{arg_dict['genome_assembly']}.vcf.gz"
        if os.path.exists(output_vcf) and arg_dict['force_overwrite'] is False:
            error_message(
                f"Output files (e.g. {output_vcf}) already exist - please specify different sample_id or add option --force_overwrite",
                logger)

    ## base folder
    base_dir = os.path.abspath(arg_dict['pcgr_dir'])
    if not os.path.isdir(base_dir):
        error_message(f"Base directory ({base_dir}) does not exist", logger)

    ## data folder within the base folder
    db_dir = os.path.join(base_dir, 'data')
    if not os.path.isdir(db_dir):
        error_message(f"Data directory ({db_dir}) does not exist", logger)

    ## assembly-specific data folder
    db_assembly_dir = os.path.join(db_dir, arg_dict['genome_assembly'])
    if not os.path.isdir(db_assembly_dir):
        error_message(
            f"Data directory for the specified genome assembly ({db_assembly_dir}) does not exist",
            logger)

    ## RELEASE_NOTES must exist and mention the expected database version
    rel_notes_file = os.path.join(db_assembly_dir, 'RELEASE_NOTES')
    if not os.path.exists(rel_notes_file):
        error_message(
            'The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/cpsr for instructions)',
            logger)

    with open(rel_notes_file, 'r') as release_notes:
        bundle_is_compliant = any(
            pcgr_vars.DB_VERSION in line for line in release_notes)

    if not bundle_is_compliant:
        error_message(
            'The PCGR data bundle is not compliant with the software version - please download the latest software and data bundle (see https://github.com/sigven/cpsr for instructions)',
            logger)

    return {
        "input_vcf_dir": input_vcf_dir,
        "input_customlist_dir": input_customlist_dir,
        "db_dir": db_assembly_dir,
        "base_dir": base_dir,
        "output_dir": output_dir_full,
        "input_vcf_basename": input_vcf_basename,
        "input_customlist_basename": input_customlist_basename,
    }
示例#11
0
def check_args_cpsr(arg_dict):
    """
    Validate the CPSR command-line arguments collected in arg_dict.

    Each violated constraint is reported through error_message(); the one
    ignorable combination ('--diagnostic_grade_only' without PanelApp panels)
    goes through warn_message(). What those helpers do after logging is
    defined elsewhere in the project.

    Checks performed:
    - required arguments: input_vcf and pcgr_dir must exist on disk,
      genome_assembly and sample_id must be set, and sample_id must be
      longer than two characters
    - virtual_panel_id and custom_list: exactly one of them has to be given
      ("-1" appears to be the 'not provided' value of virtual_panel_id)
    - numeric ranges of maf_upper_threshold, vcfanno_n_proc, vep_n_forks and
      vep_buffer_size
    - panel identifiers: a single panel in 0 - 42, or comma-separated panels
      each in 1 - 42
    - vep_pick_order: exactly eight of its comma-separated entries must be
      permitted VEP pick criteria

    Side effect: arg_dict['vep_regulatory'] is set to True.
    Returns None.
    """

    logger = getlogger('cpsr-validate-input-arguments-a')
    arg_dict['vep_regulatory'] = True

    ## Required arguments
    ## Check that query VCF is set and exists
    if arg_dict['input_vcf'] is None or not os.path.exists(arg_dict['input_vcf']):
        err_msg = f"Required argument '--input_vcf' does not exist ({arg_dict['input_vcf']})."
        error_message(err_msg, logger)
    ## Check that PCGR directory (with data bundle) is provided and exists
    if arg_dict['pcgr_dir'] is None or not os.path.exists(arg_dict['pcgr_dir']):
        err_msg = f"Required argument '--pcgr_dir' does not exist ({arg_dict['pcgr_dir']})."
        error_message(err_msg, logger)
    ## Check that genome assembly is set
    if arg_dict['genome_assembly'] is None:
        err_msg = f"Required argument '--genome_assembly' has no/undefined value ({arg_dict['genome_assembly']})."
        error_message(err_msg, logger)
    ## Check that sample identifier is set and is of appropriate length (more than two characters)
    if arg_dict['sample_id'] is None:
        err_msg = f"Required argument '--sample_id' has no/undefined value ({arg_dict['sample_id']})."
        error_message(err_msg, logger)

    if len(arg_dict['sample_id']) <= 2:
        err_msg = f"Sample name identifier ('--sample_id') requires a name with more than two characters ({arg_dict['sample_id']})."
        error_message(err_msg, logger)

    ### Optional arguments
    ## Provide virtual_panel_id or a custom list from panel 0
    panel_unset = arg_dict['virtual_panel_id'] == "-1"
    if panel_unset and not arg_dict['custom_list']:
        err_msg = 'Provide valid virtual panel identifier(s) through --panel_id (0 - 42) or provide custom list of panel 0 genes (single column text file) through --custom_list'
        error_message(err_msg, logger)
    if arg_dict['custom_list'] and not panel_unset:
        err_msg = "Option --panel_id cannot be used in conjunction with --custom_list"
        error_message(err_msg, logger)
    if arg_dict['maf_upper_threshold'] <= 0 or arg_dict['maf_upper_threshold'] > 1:
        err_msg = f"MAF upper threshold must be greater than 0 and below 1, current value is {arg_dict['maf_upper_threshold']}"
        error_message(err_msg, logger)
    if arg_dict['vcfanno_n_proc'] <= 0 or arg_dict['vcfanno_n_proc'] > 15:
        err_msg = f"Number of processes that vcfanno can use during annotation must be above 0 and not more than 15, current value is {arg_dict['vcfanno_n_proc']}."
        error_message(err_msg, logger)

    ## Check that panel identifier(s) are set appropriately
    if not panel_unset and not arg_dict['custom_list']:
        if ',' not in arg_dict['virtual_panel_id']:
            # single panel: panel 0 (the superpanel) is allowed here
            if str(arg_dict['virtual_panel_id']).isdigit():
                panel_id = int(arg_dict['virtual_panel_id'])
                if panel_id < 0 or panel_id > 42:
                    err_msg = "A single panel chosen with '--panel_id' must be in the range 0 - 42"
                    error_message(err_msg, logger)
            else:
                err_msg = f"A single panel chosen with '--panel_id' must be a proper integer - not '{arg_dict['virtual_panel_id']}'"
                error_message(err_msg, logger)
        else:
            # several panels: each entry has to be an integer in 1 - 42 (panel 0 is not accepted in a list)
            for p in str(arg_dict['virtual_panel_id']).split(','):
                if str(p).isdigit():
                    panel_id = int(p)
                    if panel_id < 1 or panel_id > 42:
                        err_msg = "Multiple panels submitted as comma-separated string with '--panel_id' must take values in the range 1 - 42"
                        error_message(err_msg, logger)
                else:
                    err_msg = f"Multiple panels submitted as comma-separated string with '--panel_id' must contain proper integer values only - '{arg_dict['virtual_panel_id']}' contains non-integer entries."
                    error_message(err_msg, logger)

    if (arg_dict['custom_list'] or arg_dict['virtual_panel_id'] == "0") and arg_dict['diagnostic_grade_only']:
        warn_msg = "Option '--diagnostic_grade_only' applies ONLY to panel identifiers from Genomics England PanelApp - will be ignored"
        warn_message(warn_msg, logger)

    ## VEP options
    if arg_dict['vep_n_forks'] <= 0 or arg_dict['vep_n_forks'] > 4:
        err_msg = f"Number of forks that VEP can use during annotation must be above 0 and not more than 4, current value is {arg_dict['vep_n_forks']}"
        error_message(err_msg, logger)

    if arg_dict['vep_buffer_size'] <= 0 or arg_dict['vep_buffer_size'] > 30000:
        # f-string so that the offending value is actually interpolated into the message
        err_msg = f"Internal VEP buffer size, corresponding to the number of variants that are read in to memory simultaneously, must be above 0 and not more than 30,000, current value is {arg_dict['vep_buffer_size']}"
        error_message(err_msg, logger)

    ## Check that VEP pick criteria is formatted correctly
    if arg_dict['vep_pick_order'] is not None:
        permitted_sources = [
            'canonical', 'appris', 'tsl', 'biotype', 'ccds', 'rank', 'length',
            'mane'
        ]
        values = str(arg_dict['vep_pick_order']).split(',')
        # count the entries that are permitted criteria (unknown entries are simply not counted)
        num_permitted_sources = sum(1 for v in values if v in permitted_sources)

        if num_permitted_sources != 8:
            err_msg = "Option 'vep_pick_order' = " + str(arg_dict['vep_pick_order']) + " is formatted incorrectly, should be " + \
               "a comma-separated string of the following values: canonical,appris,tsl,biotype,ccds,rank,length,mane"
            error_message(err_msg, logger)
    return
示例#12
0
def verify_input_files(arg_dict):
    """
    Check the input files/directories referenced by arg_dict and the PCGR data bundle.

    1. Checks existence (and file extension) of the input files/dirs given in arg_dict
    2. Checks that the data bundle matches the software: the RELEASE_NOTES file of
       the chosen assembly must mention pcgr_vars.DB_VERSION

    Problems are reported through error_message() / warn_message(); what those
    helpers do after logging is defined elsewhere in the project.

    Side effects: the output directory is passed to utils.safe_makedir, and
    arg_dict['rna_fusion_tumor'] and arg_dict['rna_exp_tumor'] are set to None.

    Returns a dict holding the directory and basename of every input ('NA' when
    the input was not provided) plus 'db_dir' (assembly-specific data directory),
    'base_dir' and 'output_dir'.
    """
    logger = getlogger("pcgr-validate-arguments-input-b")

    # locations default to 'NA' and are only filled in for inputs that were provided
    input_vcf_dir = 'NA'
    input_cna_dir = 'NA'
    input_rna_fusion_dir = 'NA'
    input_cpsr_report_dir = 'NA'
    input_rna_expression_dir = 'NA'
    input_cna_plot_dir = 'NA'
    panel_normal_vcf_dir = 'NA'
    panel_normal_vcf_basename = 'NA'
    input_vcf_basename = 'NA'
    input_cna_basename = 'NA'
    input_rna_fusion_basename = 'NA'
    input_rna_expression_basename = 'NA'
    input_cpsr_report_basename = 'NA'
    input_cna_plot_basename = 'NA'

    # RNA inputs are reset here, so the two RNA checks further down are currently skipped
    arg_dict['rna_fusion_tumor'] = None
    arg_dict['rna_exp_tumor'] = None

    # check that either input vcf or cna segments exist
    if arg_dict['input_vcf'] is None and arg_dict['input_cna'] is None:
        err_msg = 'Please specifiy either a VCF input file (--input_vcf) or a copy number segment file (--input_cna)'
        error_message(err_msg, logger)

    # create output folder (if not already exists)
    output_dir_full = utils.safe_makedir(
        os.path.abspath(arg_dict['output_dir']))

    # check if panel of normal VCF exist
    if arg_dict["pon_vcf"] is not None:
        pon_vcf_path = os.path.abspath(arg_dict["pon_vcf"])
        if not os.path.exists(pon_vcf_path):
            err_msg = f'Input file ({arg_dict["pon_vcf"]}) does not exist'
            error_message(err_msg, logger)

        if not pon_vcf_path.endswith(".vcf.gz"):
            err_msg = f"Panel of normals VCF file ({pon_vcf_path}) does not have the correct file extension (.vcf.gz)"
            error_message(err_msg, logger)
        else:
            # check that tabix file exist if bgzipped files is given
            tabix_file = arg_dict["pon_vcf"] + ".tbi"
            if not os.path.exists(os.path.abspath(tabix_file)):
                err_msg = f"Tabix file (i.e. '.gz.tbi') is not present for the bgzipped panel of normal VCF file ({pon_vcf_path}). Please make sure your the VCF is properly compressed and indexed (bgzip + tabix)"
                error_message(err_msg, logger)

        if arg_dict["input_vcf"] is None:
            warn_msg = "Ignoring panel of normal VCF file, --input_vcf missing"
            warn_message(warn_msg, logger)
        else:
            panel_normal_vcf_basename = os.path.basename(str(arg_dict["pon_vcf"]))
            panel_normal_vcf_dir = os.path.dirname(pon_vcf_path)

    # check if input vcf exists
    if arg_dict["input_vcf"] is not None:
        input_vcf_path = os.path.abspath(arg_dict["input_vcf"])
        if not os.path.exists(input_vcf_path):
            err_msg = f'Input file ({arg_dict["input_vcf"]}) does not exist'
            error_message(err_msg, logger)

        if not input_vcf_path.endswith((".vcf", ".vcf.gz")):
            err_msg = f'VCF input file ({input_vcf_path}) does not have the correct file extension (.vcf or .vcf.gz)'
            error_message(err_msg, logger)

        # check that tabix file exists if bgzipped file is given
        if input_vcf_path.endswith(".vcf.gz"):
            tabix_file = arg_dict["input_vcf"] + ".tbi"
            if not os.path.exists(os.path.abspath(tabix_file)):
                err_msg = f"Tabix file (i.e. '.gz.tbi') is not present for the bgzipped VCF input file ({input_vcf_path}). Please make sure your input VCF is properly compressed and indexed (bgzip + tabix)"
                error_message(err_msg, logger)

        input_vcf_basename = os.path.basename(str(arg_dict["input_vcf"]))
        input_vcf_dir = os.path.dirname(input_vcf_path)

        # if output vcf exist and overwrite not set
        output_vcf = os.path.join(
            str(output_dir_full),
            f"{arg_dict['sample_id']}.pcgr_acmg.{arg_dict['genome_assembly']}.vcf.gz"
        )
        if os.path.exists(output_vcf) and arg_dict["force_overwrite"] is False:
            err_msg = f"Output files (e.g. {output_vcf}) already exist - please specify different sample_id or add option --force_overwrite"
            error_message(err_msg, logger)

    # NOTE: a CNA plot input (input_cna_plot) is not validated here; its dir/basename stay 'NA'

    # check if input cna segments exist
    if arg_dict["input_cna"] is not None:
        input_cna_path = os.path.abspath(arg_dict["input_cna"])
        if not os.path.exists(input_cna_path):
            err_msg = f'Input file ({arg_dict["input_cna"]}) does not exist'
            error_message(err_msg, logger)
        if not input_cna_path.endswith((".tsv", ".txt")):
            err_msg = f"CNA segment input file ({input_cna_path}) does not have the correct file extension (.tsv or .txt)"
            error_message(err_msg, logger)
        input_cna_basename = os.path.basename(str(arg_dict["input_cna"]))
        input_cna_dir = os.path.dirname(input_cna_path)

        # if output cna segments exist and overwrite not set
        output_cna_segments = os.path.join(
            str(output_dir_full), str(arg_dict["sample_id"])
        ) + f".pcgr_acmg.{arg_dict['genome_assembly']}.cna_segments.tsv.gz"
        if os.path.exists(output_cna_segments) and arg_dict["force_overwrite"] is False:
            err_msg = f"Output files (e.g. {output_cna_segments}) already exist - please specify different sample_id or add option --force_overwrite"
            error_message(err_msg, logger)

    # check if input rna fusion variants exist
    if arg_dict["rna_fusion_tumor"] is not None:
        rna_fusion_path = os.path.abspath(arg_dict["rna_fusion_tumor"])
        if not os.path.exists(rna_fusion_path):
            err_msg = f'Input file ({arg_dict["rna_fusion_tumor"]}) does not exist'
            error_message(err_msg, logger)
        if not rna_fusion_path.endswith((".tsv", ".txt")):
            err_msg = f"RNA fusion variants file ({rna_fusion_path}) does not have the correct file extension (.tsv or .txt)"
            error_message(err_msg, logger)
        input_rna_fusion_basename = os.path.basename(str(arg_dict["rna_fusion_tumor"]))
        input_rna_fusion_dir = os.path.dirname(rna_fusion_path)

    # check if input rna expression exist
    if arg_dict["rna_exp_tumor"] is not None:
        rna_exp_path = os.path.abspath(arg_dict["rna_exp_tumor"])
        if not os.path.exists(rna_exp_path):
            err_msg = f'Input file ({arg_dict["rna_exp_tumor"]}) does not exist'
            error_message(err_msg, logger)
        if not rna_exp_path.endswith((".tsv", ".txt")):
            err_msg = f"RNA gene expression file ({rna_exp_path}) does not have the correct file extension (.tsv or .txt)"
            error_message(err_msg, logger)
        input_rna_expression_basename = os.path.basename(str(arg_dict["rna_exp_tumor"]))
        input_rna_expression_dir = os.path.dirname(rna_exp_path)

    # check if CPSR report exists
    if arg_dict["cpsr_report"] is not None:
        cpsr_report_path = os.path.abspath(arg_dict["cpsr_report"])
        if not os.path.exists(cpsr_report_path):
            err_msg = f'Input file ({arg_dict["cpsr_report"]}) does not exist'
            error_message(err_msg, logger)
        if not cpsr_report_path.endswith(".json.gz"):
            err_msg = f"CPSR report file ({cpsr_report_path}) does not have the correct file extension (.json.gz)"
            error_message(err_msg, logger)
        input_cpsr_report_basename = os.path.basename(str(arg_dict["cpsr_report"]))
        input_cpsr_report_dir = os.path.dirname(cpsr_report_path)

    # check the existence of base folder
    base_dir = os.path.abspath(arg_dict["pcgr_dir"])
    if not os.path.isdir(base_dir):
        err_msg = f"Base directory ({base_dir}) does not exist"
        error_message(err_msg, logger)

    # check the existence of data folder within the base folder
    db_dir = os.path.join(base_dir, "data")
    if not os.path.isdir(db_dir):
        err_msg = f"Data directory ({db_dir}) does not exist"
        error_message(err_msg, logger)

    # check the existence of specified assembly data folder within the base folder
    db_assembly_dir = os.path.join(db_dir, arg_dict["genome_assembly"])
    if not os.path.isdir(db_assembly_dir):
        err_msg = f"Data directory for the specified genome assembly ({db_assembly_dir}) does not exist"
        error_message(err_msg, logger)

    # check the existence of RELEASE_NOTES (starting from 0.4.0)
    rel_notes_file = os.path.join(db_assembly_dir, "RELEASE_NOTES")
    if not os.path.exists(rel_notes_file):
        err_msg = "The PCGR data bundle is outdated - please download the latest data bundle (see github.com/sigven/pcgr for instructions)"
        error_message(err_msg, logger)

    # the bundle is compliant when any line of RELEASE_NOTES mentions the expected DB version;
    # the context manager closes the file even if reading fails
    with open(rel_notes_file, "r") as f_rel_not:
        compliant_data_bundle = any(pcgr_vars.DB_VERSION in line for line in f_rel_not)

    if not compliant_data_bundle:
        err_msg = "The PCGR data bundle is not compliant with the software version - please download the latest software and data bundle (see https://github.com/sigven/pcgr for instructions)"
        error_message(err_msg, logger)

    pcgr_paths = {
        "input_vcf_dir": input_vcf_dir,
        "input_cna_dir": input_cna_dir,
        "input_rna_fusion_dir": input_rna_fusion_dir,
        "input_rna_expression_dir": input_rna_expression_dir,
        "input_cpsr_report_dir": input_cpsr_report_dir,
        "input_cna_plot_dir": input_cna_plot_dir,
        "panel_normal_vcf_dir": panel_normal_vcf_dir,
        "db_dir": db_assembly_dir,
        "base_dir": base_dir,
        "output_dir": output_dir_full,
        "panel_normal_vcf_basename": panel_normal_vcf_basename,
        "input_vcf_basename": input_vcf_basename,
        "input_cna_basename": input_cna_basename,
        "input_rna_fusion_basename": input_rna_fusion_basename,
        "input_rna_expression_basename": input_rna_expression_basename,
        "input_cpsr_report_basename": input_cpsr_report_basename,
        "input_cna_plot_basename": input_cna_plot_basename,
    }

    return pcgr_paths
示例#13
0
def extend_vcf_annotations(query_vcf, pcgr_db_dir, logger, pon_annotation, regulatory_annotation, cpsr, debug):
    """
    Extend the INFO column of a VEP/vcfanno-annotated VCF and write the result to
    '<query_vcf minus .vcf/.vcf.gz>.annotated.vcf', which is then compressed and indexed.

    Tags appended per record:
    1. Elements of the VEP consequence block chosen for the record (for CPSR the block index
       comes from annoutils.get_correct_cpg_transcript, otherwise the first block is used)
    2. VEP_ALL_CSQ, when the parsed CSQ result provides it
    3. REGULATORY_ANNOTATION, when regulatory_annotation == 1
    4. Variant effect predictions, for records carrying a DBNSFP INFO entry

    The INFO tag definitions come from 'pcgr_infotags.tsv' (replaced by 'cpsr_infotags.tsv' when
    cpsr is True) in pcgr_db_dir. PANEL_OF_NORMALS* / REGULATORY_* tags are left out of the header
    when the respective annotation is switched off (pon_annotation / regulatory_annotation == 0).

    Records without a CSQ INFO entry are not written; they are counted and (up to 100) printed.
    The 'debug' argument is accepted but not used - subprocesses always run with debug=False.
    """

    ## read VEP and PCGR tags to be appended to VCF file (the CPSR set replaces the PCGR set)
    vcf_infotags_meta = annoutils.read_infotag_file(os.path.join(pcgr_db_dir, 'pcgr_infotags.tsv'))
    if cpsr is True:
        vcf_infotags_meta = annoutils.read_infotag_file(os.path.join(pcgr_db_dir, 'cpsr_infotags.tsv'))
    pcgr_onco_xref_map = annoutils.read_genexref_namemap(
        os.path.join(pcgr_db_dir, 'pcgr_onco_xref', 'pcgr_onco_xref_namemap.tsv'))

    out_vcf = re.sub(r'\.vcf(\.gz){0,}$','.annotated.vcf',query_vcf)

    vep_dbnsfp_meta = annoutils.vep_dbnsfp_meta_vcf(query_vcf, vcf_infotags_meta)
    dbnsfp_prediction_algorithms = vep_dbnsfp_meta['dbnsfp_prediction_algorithms']
    vep_csq_fields_map = vep_dbnsfp_meta['vep_csq_fieldmap']
    vcf = cyvcf2.VCF(query_vcf)

    ## A tag family is only left out when its own switch is 0 and the other switch is 0 or 1;
    ## any other combination of values adds every tag to the header
    pon_is_binary = pon_annotation == 0 or pon_annotation == 1
    regulatory_is_binary = regulatory_annotation == 0 or regulatory_annotation == 1
    omit_pon_tags = pon_annotation == 0 and regulatory_is_binary
    omit_regulatory_tags = regulatory_annotation == 0 and pon_is_binary

    for tag in sorted(vcf_infotags_meta):
        if omit_pon_tags and tag.startswith('PANEL_OF_NORMALS'):
            continue
        if omit_regulatory_tags and tag.startswith('REGULATORY_'):
            continue
        tag_meta = vcf_infotags_meta[tag]
        vcf.add_info_to_header({
            'ID': tag,
            'Description': str(tag_meta['description']),
            'Type': str(tag_meta['type']),
            'Number': str(tag_meta['number']),
        })

    ## the writer is created after the header has been extended
    w = cyvcf2.Writer(out_vcf, vcf)

    ## header ID -> declared Type, for every header line that carries ID, HeaderType and Type
    vcf_info_element_types = {}
    for header_line in vcf.header_iter():
        element = header_line.info()
        if 'ID' in element and 'HeaderType' in element and 'Type' in element:
            vcf_info_element_types[str(element['ID'])] = str(element['Type'])

    current_chrom = None
    n_annotated_on_chrom = 0
    vars_no_csq = []
    for rec in vcf:
        chrom = str(rec.CHROM)
        if current_chrom is None:
            current_chrom = chrom
            n_annotated_on_chrom = 0
        elif chrom != current_chrom:
            ## chromosome changed: report the finished one and restart the counter
            logger.info(f"Completed summary of functional annotations for {n_annotated_on_chrom} variants on chr{current_chrom}")
            current_chrom = chrom
            n_annotated_on_chrom = 0

        if rec.INFO.get('CSQ') is None:
            ## no VEP consequence: remember the variant and leave it out of the output
            vars_no_csq.append(f"g.{rec.CHROM}:{rec.start + 1}{rec.REF}>{','.join(rec.ALT)}")
            continue

        n_annotated_on_chrom += 1
        pcgr_onco_xref = annoutils.make_transcript_xref_map(rec, pcgr_onco_xref_map, xref_tag = "PCGR_ONCO_XREF")

        if regulatory_annotation == 1:
            csq_all = annoutils.parse_vep_csq(rec, pcgr_onco_xref, vep_csq_fields_map, logger, pick_only = False, csq_identifier = 'CSQ')
            if 'vep_block' in csq_all:
                rec.INFO['REGULATORY_ANNOTATION'] = annoutils.map_regulatory_variant_annotations(csq_all['vep_block'])

        csq_pick = annoutils.parse_vep_csq(rec, pcgr_onco_xref, vep_csq_fields_map, logger, pick_only = True, csq_identifier = 'CSQ')
        if 'vep_all_csq' in csq_pick:
            rec.INFO['VEP_ALL_CSQ'] = ','.join(csq_pick['vep_all_csq'])
        if 'vep_block' in csq_pick:
            csq_blocks = csq_pick['vep_block']
            chosen_idx = annoutils.get_correct_cpg_transcript(csq_blocks) if cpsr is True else 0
            chosen_block = csq_blocks[chosen_idx]
            for key in chosen_block:
                if key not in vcf_info_element_types:
                    continue
                if vcf_info_element_types[key] == "Flag" and chosen_block[key] == "1":
                    rec.INFO[key] = True
                elif chosen_block[key] is not None:
                    rec.INFO[key] = chosen_block[key]

        if rec.INFO.get('DBNSFP') is not None:
            annoutils.map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms)

        w.write_record(rec)

    if vars_no_csq:
        logger.warning(f"There were {len(vars_no_csq)} records with no CSQ tag from VEP (was --vep_no_intergenic flag set?). Skipping them and showing (up to) the first 100:")
        print('----')
        print(', '.join(vars_no_csq[:100]))
        print('----')
    w.close()
    if current_chrom is not None:
        ## summary for the last chromosome seen
        logger.info(f"Completed summary of functional annotations for {n_annotated_on_chrom} variants on chr{current_chrom}")
    vcf.close()

    if os.path.exists(out_vcf) and os.path.getsize(out_vcf) > 0:
        check_subprocess(logger, f'bgzip -f {out_vcf}', debug=False)
        check_subprocess(logger, f'tabix -f -p vcf {out_vcf}.gz', debug=False)
        annotated_vcf = f'{out_vcf}.gz'
        annoutils.write_pass_vcf(annotated_vcf, logger)
    else:
        ## output missing or empty
        error_message('No remaining PASS variants found in query VCF - exiting and skipping STEP 4', logger)
示例#14
0
def get_valid_custom_genelist(genelist_fname, genelist_bed_fname, pcgr_dir, genome_assembly, logger, debug):
    """
    Function that checks whether the custom genelist contains valid entries from the complete
    exploratory track (CPSR superpanel, panel 0), and builds a sorted target BED file for them.

    genelist_fname is read as a single-column text file of Ensembl gene identifiers (ENSG...).
    The first entry that is not a well-formed identifier is reported through error_message().
    Well-formed identifiers missing from the superpanel TSV are ignored with a warning.
    If no identifier is left, the process exits with status 1.

    The BED file genelist_bed_fname is assembled with shell pipelines (bgzip/egrep/sort, run through
    check_subprocess) from the superpanel BED track: ACMG_SF30-tagged regions, regions matching
    'rs[0-9]{3,}|', and the regions of every valid custom gene. The intermediate
    '<genelist_bed_fname>.tmp_unsorted' file is removed afterwards unless debug is set.

    Returns 0 on success (or whatever error_message() returns for a malformed identifier).
    """
    import sys  # local import: sys.exit replaces the interactive-only exit() helper

    virtual_panels_dir = os.path.join(pcgr_dir, "data", genome_assembly, "virtual_panels")
    superpanel_track_bed = os.path.join(virtual_panels_dir, "0." + genome_assembly + ".bed.gz")
    superpanel_track_tsv = os.path.join(virtual_panels_dir, "cpsr_superpanel." + genome_assembly + ".tsv")
    genelist_bed_fname_unsorted = genelist_bed_fname + '.tmp_unsorted'

    ## Read the custom list line by line (one identifier per line, blank lines skipped);
    ## a dict keeps the identifiers unique and in file order
    customlist_identifiers = {}
    with open(genelist_fname, 'r') as genelist_file:
        for line in genelist_file:
            entry = line.rstrip('\r\n')
            if not entry:
                continue
            if not re.match(r'^ENSG[0-9]{1,}$', entry.rstrip()):
                err_msg = "Custom list of genes from CPSR superpanel (panel 0) should be provided as Ensembl gene identifiers, '" + entry + "' is not a valid identifier"
                return error_message(err_msg, logger)
            customlist_identifiers[entry.strip()] = 1

    ## Ensembl gene identifier -> gene symbol for every gene of the superpanel
    with open(superpanel_track_tsv, 'r') as superpanel_file:
        superpanel_identifiers_all = {
            row['ensembl_gene_id']: row['symbol']
            for row in csv.DictReader(superpanel_file, delimiter='\t')
        }

    valid_custom_identifiers = []
    valid_custom_symbols = []
    for g in customlist_identifiers:
        if g in superpanel_identifiers_all:
            valid_custom_identifiers.append(g)
            valid_custom_symbols.append(superpanel_identifiers_all[g])
        else:
            logger.warning("Ignoring custom-provided gene identifier (" + str(g) + ") NOT found in CPSR superpanel (panel 0)")
            logger.warning("Choose only Ensembl gene identifiers from this set in data bundle: data/" + str(genome_assembly) + "/virtual_panels/cpsr_superpanel." + str(genome_assembly) + ".tsv")
    all_valid_custom_geneset = ', '.join(sorted(valid_custom_symbols))

    logger.info('Detected n = ' + str(len(valid_custom_identifiers)) + ' valid targets in custom-provided gene list file (--custom_list)):')
    logger.info(all_valid_custom_geneset)

    if not valid_custom_identifiers:
        logger.info('')
        logger.info("NO valid gene identifiers from panel 0 in custom-provided genelist - exiting")
        logger.info('')
        sys.exit(1)

    ## The egrep patterns are raw literals: '\|' is a literal pipe for egrep, and would be an
    ## invalid escape sequence in a regular Python string
    superpanel_stream = 'bgzip -dc ' + str(superpanel_track_bed)
    unsorted_target = str(genelist_bed_fname_unsorted)

    ## Add secondary findings genes to target BED ('>' starts the unsorted file afresh)
    cmd_secondary_regions_bed = superpanel_stream + r" | egrep '\|ACMG_SF30\|' > " + unsorted_target
    check_subprocess(logger, cmd_secondary_regions_bed, debug)

    ## Add GWAS hits to target BED
    cmd_gwas_regions_bed = superpanel_stream + r" | egrep 'rs[0-9]{3,}\|' >> " + unsorted_target
    check_subprocess(logger, cmd_gwas_regions_bed, debug)

    ## Add custom set genes to target BED
    logger.info('Creating BED file with custom target genes: ' + str(genelist_bed_fname))
    for g in valid_custom_identifiers:
        cmd_target_regions_bed = superpanel_stream + r" | egrep '\|" + g + r"\|' >> " + unsorted_target
        check_subprocess(logger, cmd_target_regions_bed, debug)

    ## Sort regions in target BED: numeric chromosomes first, then X/Y/M
    if os.path.exists(genelist_bed_fname_unsorted) and os.stat(genelist_bed_fname_unsorted).st_size != 0:
        cmd_sort_custom_bed1 = f"egrep '^[0-9]' {genelist_bed_fname_unsorted} | sort -k1,1n -k2,2n -k3,3n > {genelist_bed_fname}"
        cmd_sort_custom_bed2 = f"egrep -v '^[0-9]' {genelist_bed_fname_unsorted} | egrep '^[XYM]' | sort -k1,1 -k2,2n -k3,3n >> {genelist_bed_fname}"

        check_subprocess(logger, cmd_sort_custom_bed1, debug)
        check_subprocess(logger, cmd_sort_custom_bed2, debug)
        if not debug:
            utils.remove(str(genelist_bed_fname_unsorted))

    return 0
示例#15
0
def check_format_ad_dp_tags(vcf,
                           tumor_dp_tag,
                           tumor_af_tag,
                           control_dp_tag,
                           control_af_tag,
                           call_conf_tag,
                           exclude_hom_germline,
                           exclude_het_germline,
                           tumor_only,
                           logger):

    """
    Function that checks whether the INFO tags specified for depth/allelic fraction/call confidence
    are correctly formatted in the VCF header (i.e. Type)

    Each user-specified tag name is first passed to annoutils.detect_reserved_info_tag, then every
    INFO line of the VCF header is compared against the tag names. Depth tags must be declared with
    Type=Integer, allelic fraction tags with Type=Float, and the call confidence tag with Type=String.

    :param vcf: object providing header_iter(); each yielded element must provide info() returning a
                dictionary with (at least) the keys 'ID', 'HeaderType' and 'Type' for INFO lines
                (presumably a cyvcf2 VCF object - confirm against caller)
    :param tumor_dp_tag: name of INFO tag holding tumor sequencing depth
    :param tumor_af_tag: name of INFO tag holding tumor allelic fraction
    :param control_dp_tag: name of INFO tag holding normal/control sequencing depth
    :param control_af_tag: name of INFO tag holding normal/control allelic fraction
    :param call_conf_tag: name of INFO tag holding variant call confidence
    :param exclude_hom_germline: if True (and tumor_only == 1), warn when tumor_af_tag is not found
    :param exclude_het_germline: if True (and tumor_only == 1), warn when tumor_af_tag is not found
    :param tumor_only: compared against 1 to decide whether the germline filtering warnings apply
    :param logger: logger used for info/warning messages
    :return: the return value of error_message() for the first tag declared with a wrong Type,
             otherwise 0. Tags that are missing from the header only trigger warnings; a tag
             given as '_NA_' does not trigger the "could not find" warning.
    """

    # (parameter name, tag value, description used in messages, required header Type),
    # listed in the order in which they are matched against each header line
    tag_specs = (
        ('tumor_dp_tag', tumor_dp_tag, 'tumor variant sequencing depth', 'Integer'),
        ('tumor_af_tag', tumor_af_tag, 'tumor variant allelic fraction', 'Float'),
        ('control_dp_tag', control_dp_tag, 'normal/control variant sequencing depth', 'Integer'),
        ('control_af_tag', control_af_tag, 'normal/control allelic fraction', 'Float'),
        ('call_conf_tag', call_conf_tag, 'variant call confidence', 'String'),
    )
    tag_values = {name: tag for name, tag, _, _ in tag_specs}
    found = {name: False for name in tag_values}

    # Return values are ignored here; how a reserved name is handled is decided
    # inside annoutils.detect_reserved_info_tag
    annoutils.detect_reserved_info_tag(tumor_dp_tag, 'tumor_dp_tag', logger)
    annoutils.detect_reserved_info_tag(control_dp_tag, 'control_dp_tag', logger)
    annoutils.detect_reserved_info_tag(tumor_af_tag, 'tumor_af_tag', logger)
    annoutils.detect_reserved_info_tag(control_af_tag, 'control_af_tag', logger)
    annoutils.detect_reserved_info_tag(call_conf_tag, 'call_conf_tag', logger)

    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' not in header_element or 'HeaderType' not in header_element:
            continue
        if header_element['HeaderType'] != 'INFO':
            continue

        # No 'break' after a match: the same header ID is checked against every
        # tag, so two parameters pointing to the same INFO tag are both handled
        for name, tag, description, expected_type in tag_specs:
            if header_element['ID'] != tag:
                continue
            found_type = header_element['Type']
            if found_type != expected_type:
                err_msg = (
                    f'INFO tag for {description} ({name} {tag}) is not correctly specified '
                    f'in input VCF (Type={found_type}), should be Type={expected_type}'
                )
                return error_message(err_msg, logger)
            logger.info(f'Found INFO tag for {description} ({name} {tag}) in input VCF')
            found[name] = True

    ## Warn about specified tags that are absent from the VCF header
    for name in ('call_conf_tag', 'tumor_dp_tag', 'tumor_af_tag', 'control_dp_tag', 'control_af_tag'):
        if tag_values[name] != '_NA_' and not found[name]:
            logger.warning(f"Could not find the specified {name} ('{tag_values[name]}') in INFO column of input VCF")

    ## Germline filtering in tumor-only mode depends on the tumor allelic fraction
    if tumor_only == 1 and not found['tumor_af_tag']:
        for exclude_germline, zygosity in ((exclude_hom_germline, 'homozygous'),
                                           (exclude_het_germline, 'heterozygous')):
            if exclude_germline is True:
                logger.warning(
                    f"Could not find the specified tumor_af_tag ('{tumor_af_tag}') in INFO column of input VCF"
                    f" - filtering of {zygosity} germline variants in tumor-only mode will be ignored"
                )

    ## Depth and allelic fraction are only useful as a pair - warn if exactly one of them was found
    for dp_name, af_name in (('tumor_dp_tag', 'tumor_af_tag'),
                             ('control_dp_tag', 'control_af_tag')):
        if found[dp_name] != found[af_name]:
            missing = af_name if found[dp_name] else dp_name
            logger.warning(
                f"BOTH '{dp_name}' AND '{af_name}' need to be specified for use in tumor report"
                f" ('{missing}' is missing)"
            )

    ## if filtering turned on for AF-based tumor-only filtering, return error if TVAF not defined
    ## (not implemented - a missing tumor_af_tag currently only results in the warnings above)

    return 0