示例#1
0
def validate_gvanno_input(gvanno_directory, input_vcf, configuration_file,
                          genome_assembly):
    """
    Validate the query VCF handed to gvanno and prepare it for annotation.

    The workflow configuration is always read first. If ``input_vcf`` is the
    literal string ``'None'`` nothing else happens and 0 is returned.
    Otherwise the following steps run in order:

    1. ``is_valid_vcf`` - only when ``config_options['other']['vcf_validation']``
       is truthy; otherwise a message is logged that validation is skipped.
    2. ``check_existing_vcf_info_tags`` on the query VCF.
    3. The VCF is opened with ``VCF(...)`` and passed to ``simplify_vcf``.

    Per the original author's notes (the helpers themselves are not visible
    here): step 1 checks VCF v4.2 formatting with EBIvariation/vcf-validator,
    step 2 checks that no INFO tags in the query VCF coincide with those
    generated by gvanno, and step 3 decomposes multi-allelic sites
    (vt decompose), strips genotype data, and sorts + indexes the result
    (bgzip + tabix).

    Returns:
        0 when all executed steps pass (or there is no VCF), -1 as soon as
        one of the checks reports -1.
    """
    log = annoutils.getlogger('gvanno-validate-input')
    settings = annoutils.read_config_options(
        configuration_file,
        gvanno_directory,
        genome_assembly,
        log,
        wflow='gvanno',
    )

    # No query VCF supplied ('None' is passed as a string): nothing to check.
    if input_vcf == 'None':
        return 0

    if settings['other']['vcf_validation']:
        if is_valid_vcf(input_vcf, log) == -1:
            return -1
    else:
        log.info(
            'Skipping validation of VCF file - as defined in configuration file (vcf_validation = false)'
        )

    if check_existing_vcf_info_tags(input_vcf, gvanno_directory,
                                    genome_assembly, log) == -1:
        return -1

    query_vcf = VCF(input_vcf)
    simplify_vcf(input_vcf, query_vcf, log)

    return 0
示例#2
0
def validate_pcgr_input(pcgr_directory, input_vcf, input_cna,
                        configuration_file, panel_normal_vcf, vcf_validation,
                        tumor_only, genome_assembly, output_dir):
    """
    Validate the input files to PCGR (query VCF, panel-of-normals VCF and
    copy number segment file) and prepare the query VCF for annotation.

    Inputs that are not provided are passed as the literal string 'None'
    and are skipped. The steps executed, in order:

    1. Warn if no panel-of-normals VCF is given while ``tumor_only == 1`` and
       ``config_options['tumor_only']['exclude_pon']`` is True.
    2. If a query VCF is given:
       a. ``is_valid_vcf`` - only when ``vcf_validation == 1``; otherwise a
          message is logged that validation is skipped.
       b. ``check_existing_vcf_info_tags`` on the query VCF.
       c. ``check_format_ad_dp_tags`` on the opened VCF.
       d. ``simplify_vcf`` on the opened VCF.
    3. If a panel-of-normals VCF is given: ``validate_panel_normal_vcf``.
    4. If a copy number file is given: ``is_valid_cna``.

    Per the original author's notes (the helpers themselves are not visible
    here): 2a checks VCF v4.2 formatting with EBIvariation/vcf-validator,
    2b checks that no INFO tags in the query VCF coincide with those
    generated by PCGR, 2c checks that the configured tumor/normal coverage
    and allelic depth columns are found in the VCF, 2d decomposes
    multi-allelic sites (vt decompose), strips genotype data, and sorts +
    indexes the result (bgzip + tabix), 3 checks for the PANEL_OF_NORMALS
    INFO tag in the header, and 4 checks required columns, data types and
    ranges of the copy number segment file.

    Returns:
        0 when all executed steps pass, -1 as soon as one of the checks
        reports -1.
    """
    logger = annoutils.getlogger('pcgr-validate-input')
    config_options = annoutils.read_config_options(
        configuration_file, pcgr_directory, genome_assembly, logger,
        wflow='pcgr')

    if (panel_normal_vcf == "None" and tumor_only == 1
            and config_options['tumor_only']['exclude_pon'] is True):
        # Logger.warn is a deprecated alias; Logger.warning is the supported
        # spelling (assumes annoutils.getlogger returns a standard
        # logging.Logger - TODO confirm).
        logger.warning('Panel-of-normals VCF is not present - exclusion of calls found in panel-of-normals will be ignored')

    if not input_vcf == 'None':
        if vcf_validation == 1:
            valid_vcf = is_valid_vcf(input_vcf, output_dir, logger)
            if valid_vcf == -1:
                return -1
        else:
            logger.info('Skipping validation of VCF file - as provided by option --no_vcf_validate')

        tag_check = check_existing_vcf_info_tags(
            input_vcf, pcgr_directory, genome_assembly, logger)
        if tag_check == -1:
            return -1

        vcf = VCF(input_vcf)
        allelic_support_check = check_format_ad_dp_tags(
            vcf, pcgr_directory, config_options, tumor_only, logger)
        if allelic_support_check == -1:
            return -1

        simplify_vcf(input_vcf, vcf, output_dir, logger)

    if not panel_normal_vcf == "None":
        valid_panel_normals = validate_panel_normal_vcf(panel_normal_vcf, logger)
        if valid_panel_normals == -1:
            return -1

    if not input_cna == 'None':
        valid_cna = is_valid_cna(input_cna, logger)
        if valid_cna == -1:
            return -1

    return 0
示例#3
0
def validate_pcgr_input(pcgr_directory, input_vcf, input_cna,
                        configuration_file, genome_assembly, output_dir):
    """
    Validate the input files to PCGR (query VCF and copy number segment
    file) and prepare the query VCF for annotation.

    Inputs that are not provided are passed as the literal string 'None'
    and are skipped. The steps executed, in order:

    1. If a query VCF is given:
       a. ``is_valid_vcf`` - only when
          ``config_options['other']['vcf_validation']`` is truthy; otherwise
          a message is logged that validation is skipped.
       b. ``check_existing_vcf_info_tags`` on the query VCF.
       c. ``check_format_ad_dp_tags`` on the opened VCF.
       d. ``simplify_vcf`` on the opened VCF.
    2. If a copy number file is given: ``is_valid_cna``.

    Per the original author's notes (the helpers themselves are not visible
    here): 1a checks VCF v4.2 formatting with EBIvariation/vcf-validator,
    1b checks that no INFO tags in the query VCF coincide with those
    generated by PCGR, 1c checks that the configured tumor/normal coverage
    and allelic depth columns are found in the VCF, 1d decomposes
    multi-allelic sites (vt decompose), strips genotype data, and sorts +
    indexes the result (bgzip + tabix), and 2 checks required columns, data
    types and ranges of the copy number segment file.

    Returns:
        -1 as soon as one of the VCF checks reports -1. If a copy number
        file is given, the return value of ``is_valid_cna`` is passed
        through unchanged. Otherwise 0.
    """
    log = annoutils.getlogger('pcgr-validate-input')
    settings = annoutils.read_config_options(
        configuration_file,
        pcgr_directory,
        genome_assembly,
        log,
        wflow='pcgr',
    )

    if input_vcf != 'None':
        if settings['other']['vcf_validation']:
            if is_valid_vcf(input_vcf, output_dir, log) == -1:
                return -1
        else:
            log.info(
                'Skipping validation of VCF file - as defined in configuration file (vcf_validation = false)'
            )

        if check_existing_vcf_info_tags(input_vcf, pcgr_directory,
                                        genome_assembly, log) == -1:
            return -1

        query_vcf = VCF(input_vcf)
        if check_format_ad_dp_tags(query_vcf, pcgr_directory, settings,
                                   log) == -1:
            return -1

        simplify_vcf(input_vcf, query_vcf, output_dir, log)

    # No copy number file supplied: all executed checks passed.
    if input_cna == 'None':
        return 0

    # The CNA validator's result is the function's result, whatever it is.
    return is_valid_cna(input_cna, log)