示例#1
0
def _filter_column_list(column_spec_list, potential_col_list,
                        column_spec_filename):

    actual_column_list = []
    glossary_fields = OrderedDict()
    for i, column_regex in enumerate(column_spec_list):
        no_columns_found = True
        for (column_name, vcf_tag) in potential_col_list.items():
            column_exists = column_name in actual_column_list
            regex_matches = re.match("^" + column_regex + "$", column_name)
            if regex_matches and not column_exists:
                glossary_fields[vcf_tag] = 1
                actual_column_list.append(column_name)
                no_columns_found = False

        if no_columns_found:
            logger.warning(UNUSED_REGEX_WARNING_FORMAT, column_regex,
                           column_spec_filename, i + 1)

    if not actual_column_list:
        raise utils.JQException(
            "The selected_columns_file [{}] would "
            "exclude all input columns. Review "
            "inputs/usage and try again.", column_spec_filename)
    glossary_fields = [x for x in glossary_fields.keys() if x]
    return actual_column_list, glossary_fields
示例#2
0
    def _get_new_column_header(self, vcf_reader):
        """Returns a standardized column header.

        MuTect sample headers include the name of input alignment, which is
        nice, but doesn't match up with the sample names reported in Strelka
        or VarScan. To fix this, we replace with NORMAL and TUMOR using the
        MuTect metadata command line to replace them correctly."""
        mutect_dict = self._build_mutect_dict(vcf_reader.metaheaders)

        new_header_list = []
        required_keys = set([self._NORMAL_SAMPLE_KEY, self._TUMOR_SAMPLE_KEY])
        mutect_keys = set(mutect_dict.keys())

        if not required_keys.issubset(mutect_keys):
            raise utils.JQException("Unable to determine normal "
                                    "and tumor sample ordering "
                                    "based on MuTect metaheader.")

        for field_name in vcf_reader.column_header.split("\t"):
            if field_name == mutect_dict[self._NORMAL_SAMPLE_KEY]:
                field_name = "NORMAL"
            elif field_name == mutect_dict[self._TUMOR_SAMPLE_KEY]:
                field_name = "TUMOR"
            new_header_list.append(field_name)

        return "\t".join(new_header_list)
示例#3
0
 def _determine_source_tag(metaheaders):
     for header in metaheaders:
         if header.startswith('##FORMAT=<ID=AF,'):
             return 'AF'
         if header.startswith('##FORMAT=<ID=FA,'):
             return 'FA'
     msg = ('could not determine the correct allele frequency '
            'FORMAT tag in the source MuTect file')
     raise utils.JQException(msg)
示例#4
0
 def _raise_invalid_filter_exception(invalid_filter_files):
     """Raises a JQException describing the invalid filter files.

     The message reports the total number of files and the ``file_name`` of
     at most the first five; any further files are summarized as a count.

     Raises:
         utils.JQException: always.
     """
     listed_files = invalid_filter_files[:5]
     unlisted_files = invalid_filter_files[5:]

     omitted_files = ""
     if unlisted_files:
         omitted_files = "...({} file(s) omitted)".format(len(unlisted_files))

     first_five_fnames = [invalid.file_name for invalid in listed_files]
     raise utils.JQException(
         "The [{}] input files [{}{}] match "
         "high-confidence file names, but the "
         "file header is invalid or missing. "
         "Review inputs and try again.", len(invalid_filter_files),
         first_five_fnames, omitted_files)
示例#5
0
def _get_non_null_values(record, sample, tag_type):
    """Returns the non-null values of the sample's tags matching tag_type.

    Args:
        record: object exposing ``sample_tag_values``, indexable by sample.
        sample: key of the sample to inspect.
        tag_type: tag type forwarded to
            ``common_tags.AbstractJacquardTag.get_matching_tags``.

    Returns:
        A list of the matching tag values, skipping the "." placeholder.

    Raises:
        utils.JQException: if a KeyError is raised while looking up the
            sample or collecting its matching tags.
    """
    try:
        desired_tags = common_tags.AbstractJacquardTag\
                       .get_matching_tags(record.sample_tag_values[sample],
                                          tag_type)
        return [value for value in desired_tags.values() if value != "."]
    except KeyError:
        # JQException formats its message template with the extra arguments,
        # so the sample is passed as an argument rather than pre-formatted;
        # a sample name containing braces would otherwise be re-interpreted
        # as a format template.
        raise utils.JQException("Sample [{}] was not recognized", sample)
示例#6
0
    def __init__(self, variant_caller_abbrev, tag_type, description):
        """Sets the tag type, tag id and metaheader of this tag.

        The tag id is ``JQ_<variant_caller_abbrev>_<tag_type.abbreviation>``.
        The metaheader is ``AbstractJacquardTag.FORMAT`` filled with the tag
        type's metaheader type, the tag id, the tag type's VCF number and VCF
        type, and the description.

        Raises:
            utils.JQException: if the description contains a double quote.
        """
        if '"' in description:
            raise utils.JQException(("Metaheader descriptions cannot contain "
                                     "double quotes: [{}]"), description)

        self.tag_type = tag_type
        self.tag_id = "JQ_{}_{}".format(variant_caller_abbrev,
                                        tag_type.abbreviation)

        metaheader_fields = (tag_type.metaheader_type,
                             self.tag_id,
                             tag_type.vcf_number,
                             tag_type.vcf_type,
                             description)
        self.metaheader = AbstractJacquardTag.FORMAT.format(*metaheader_fields)
示例#7
0
    def _validate_vcf_readers(prefix_by_patients):
        number_of_files = set()
        for file_names in prefix_by_patients.values():
            if len(file_names) == 1:
                for file_name in file_names:
                    if re.search("snvs", file_name):
                        msg = "Strelka VCF [{}] has no indels file."
                        logger.error(msg, file_name)
                    elif re.search("indels", file_name):
                        msg = "Strelka VCF [{}] has no snvs file."
                        logger.error(msg, file_name)
            number_of_files.add(len(file_names))

        if len(number_of_files) > 1:
            msg = ("Some Strelka VCFs were missing either a snvs or indels "
                   "file. Review inputs/command options and try again.")
            raise utils.JQException(msg)
示例#8
0
    def add_tag_values(self, record):
        """Adds this tag's per-sample summary value to the record.

        For every sample in ``record.sample_tag_values`` the non-null values
        of the somatic tags are collected and aggregated with ``_count``; a
        sample without such values is given ".". The resulting mapping of
        sample to value is stored on the record under ``self.tag_id``.

        Raises:
            utils.JQException: if the aggregated value of a sample is falsy.
        """
        summary_by_sample = {}
        for sample in record.sample_tag_values:
            somatic_values = _get_non_null_values(record, sample,
                                                  common_tags.SOMATIC_TAG)
            if somatic_values:
                summary = _aggregate_numeric_values(somatic_values, _count)
            else:
                summary = "."

            if not summary:
                msg = "Error summarizing values {} at record [{}:{} {}]"
                raise utils.JQException(msg, list(somatic_values),
                                        record.chrom, record.pos, sample)

            summary_by_sample[sample] = summary

        record.add_sample_tag_value(self.tag_id, summary_by_sample)
示例#9
0
    def _somatic_status(vcf_record, sample):
        tag_values = vcf_record.sample_tag_values[sample]
        try:
            gt = tag_values["GT"]
        except KeyError:
            msg_fmt = ('Cannot assign somatic status using FilterMutectCalls '
                       'when sample GT absent: '
                       '(CHROM:POS:REF:ALT={}:{}:{}:{})')
            msg = msg_fmt.format(vcf_record.chrom,
                                 vcf_record.pos,
                                 vcf_record.ref,
                                 vcf_record.alt)
            raise utils.JQException(msg)

        if gt == "0/0" or gt == '0|0':
            return "0"
        else:
            return "1"
示例#10
0
    def _get_snv_genotype(sample_genotype, ref, alt):
        sample_allele1, sample_allele2 = list(sample_genotype)
        alleles = [ref]
        alleles.extend(alt.split(","))

        chrom_a = None
        chrom_b = None

        for i, allele in enumerate(alleles):
            if sample_allele1 == allele:
                chrom_a = str(i)
            if sample_allele2 == allele:
                chrom_b = str(i)

        if chrom_a and chrom_b:
            return "/".join(sorted([chrom_a, chrom_b]))
        else:
            raise utils.JQException("Unable to determine Genotype")
示例#11
0
def _build_format_tags(format_tag_regex, vcf_readers):
    retained_tags = set()
    regexes_used = set()

    for vcf_reader in vcf_readers:
        for tag_regex in format_tag_regex:
            for original_tag, new_tag in list(vcf_reader.format_tags.items()):
                if re.match(tag_regex + "$", original_tag):
                    retained_tags.add(new_tag)
                    regexes_used.add(tag_regex)

    if len(retained_tags) == 0:
        msg = ("The specified format tag regex [{}] would exclude all format "
               "tags. Review inputs/usage and try again")
        raise utils.JQException(msg, format_tag_regex)

    unused_regexes = set(format_tag_regex).difference(regexes_used)
    if unused_regexes:
        for unused_regex in unused_regexes:
            msg = ("In the specified list of regexes {}, the regex [{}] does "
                   "not match any format tags; this expression may be "
                   "irrelevant.")
            logger.warning(msg, format_tag_regex, unused_regex)
    return sorted(list(retained_tags))
示例#12
0
 def _get_indel_genotype(sample_genotype):
     indel_mapping_values = {"ref": "0/0", "het": "0/1", "hom": "1/1"}
     if sample_genotype in indel_mapping_values:
         return indel_mapping_values[sample_genotype]
     else:
         raise utils.JQException("Unable to determine Genotype")
示例#13
0
 def test_init(self):
     """JQException fills its message template with the extra arguments."""
     actual = utils.JQException("msg:{}, {}", "bar", [1, 2, 3])
     self.assertIsInstance(actual, Exception)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(actual.args[0], "msg:bar, [1, 2, 3]")