示例#1
0
    def _infer_annotation_type_info_fields(self, variant, infos,
                                           defined_headers):
        # type: (vcfio.Variant, Dict[str, Info], vcf_header_io.VcfHeader) -> None
        """Adds one inferred-type `Info` entry to `infos` per annotation name.

        For every field in `self._annotation_fields_to_infer` that is present
        in `variant.info`, the annotation names are parsed from the field's
        header description and the annotation values are parsed from the
        variant. All values found for one annotation name are merged into a
        single type, and an `Info` carrying that type is stored in `infos`
        under the key produced by `get_inferred_annotation_type_header_key`.

        Args:
          variant: variant object whose `info` mapping is inspected.
          infos: dict of (info_key, `Info`); it is updated in place.
          defined_headers: header fields defined in header section of VCF
            files.

        Raises:
          ValueError: if, for some field, the number of annotation names and
            the number of values of each annotation are not all equal.
        """
        type_resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            resolve_always=True)

        def _require_equal_lengths(names, values):
            """Raises ValueError unless `names` and every item share a length."""
            seen_lengths = set(map(len, values))
            seen_lengths.add(len(names))
            if len(seen_lengths) == 1:
                return
            raise ValueError(
                ('Annotation lists have inconsistent lengths: {}.\nnames={}\n'
                 'values={}').format(seen_lengths, names, values))

        def _merge_value_types(column):
            """Folds the types of the non-empty values of `column` into one."""
            merged = None
            for value in column:
                if value:
                    merged = type_resolver.resolve_attribute_conflict(
                        _HeaderKeyConstants.TYPE, merged,
                        self._get_field_type(value))
                    if merged == _HeaderTypeConstants.STRING:
                        # Stop scanning the remaining values once the merged
                        # type is STRING.
                        break
            # Stays None when every value in the column was empty.
            return merged

        for field in self._annotation_fields_to_infer:
            if field not in variant.info:
                continue
            field_header = defined_headers.infos[field]
            annotation_names = annotation_parser.extract_annotation_names(
                field_header[_HeaderKeyConstants.DESC])
            # First element (ALT) is ignored, since its type is hard-coded as
            # string.
            per_annotation_values = []
            for annotation in variant.info[field]:
                parsed = annotation_parser.extract_annotation_list_with_alt(
                    annotation)
                per_annotation_values.append(parsed[1:])
            _require_equal_lengths(annotation_names, per_annotation_values)
            # Transpose: each tuple now holds every value seen for one name.
            columns = zip(*per_annotation_values)
            for name, column in zip(annotation_names, columns):
                merged_type = _merge_value_types(column)
                key_id = get_inferred_annotation_type_header_key(field, name)
                infos[key_id] = Info(
                    key_id,
                    1,  # field count
                    merged_type,
                    'Inferred type field for annotation {}.'.format(name),
                    '',  # UNKNOWN_SOURCE
                    '')  # UNKNOWN_VERSION
示例#2
0
 def _gen_annotation_name_key_pairs(self, annot_field):
     #  type: (str) -> (str, str)
     """Yields (annotation name, inferred-type header key) pairs for a field.

     The annotation names are parsed from the description stored for
     `annot_field` in `self._header_fields.infos`.

     Args:
       annot_field: name of the INFO field whose annotation names are listed.

     Yields:
       `(name, type_key)` tuples, where `type_key` is the value returned by
       `infer_headers_util.get_inferred_annotation_type_header_key` for
       `annot_field` and `name`.
     """
     field_header = self._header_fields.infos[annot_field]
     description = field_header[_HeaderKeyConstants.DESC]
     for annotation_name in annotation_parser.extract_annotation_names(
             description):
         yield (annotation_name,
                infer_headers_util.get_inferred_annotation_type_header_key(
                    annot_field, annotation_name))
示例#3
0
    def __init__(
            self,
            annotation_fields,  # type: List[str]
            header_fields,  # type: vcf_header_io.VcfHeader
            counter_factory,  # type: metrics_util.CounterFactoryInterface
            use_allele_num,  # type: bool
            minimal_match,  # type: bool
            infer_annotation_types,  # type: bool
    ):
        # type: (...) -> None
        """Creates an instance for adding annotations to `ProcessedVariant`s.

        Note this class is intended to be an auxiliary for
        ProcessedVariantFactory and is used for creating annotation related
        parts of a `ProcessedVariant` object. So it is an implementation detail
        and not part of the public API.

        Args:
          annotation_fields: The list of INFO field names that store variant
            annotations. The format of how annotations are stored and their
            names are extracted from header_fields. A falsy value (e.g. None)
            is treated as an empty list.
          header_fields: The VCF header information.
          counter_factory: Used to create the counters held by this instance.
          use_allele_num: Stored on the instance; not otherwise used in this
            constructor.
          minimal_match: Stored on the instance; not otherwise used in this
            constructor.
          infer_annotation_types: If set, then warnings will be provided if
            header fields fail to contain Info type lines for annotation
            fields.

        Raises:
          ValueError: if an entry of `annotation_fields` is not present in
            `header_fields.infos`.
        """
        self._header_fields = header_fields
        names_by_field = {}  # type: Dict[str, List[str]]
        self._annotation_names_map = names_by_field
        for field in annotation_fields or []:
            if field in header_fields.infos:
                field_header = header_fields.infos[field]
                names_by_field[field] = (
                    annotation_parser.extract_annotation_names(
                        field_header[_HeaderKeyConstants.DESC]))
            else:
                raise ValueError(
                    '{} INFO not found in the header'.format(field))

        # One counter per ALT-matching / allele-number outcome.
        new_counter = counter_factory.create_counter
        self._alt_match_counter = new_counter(
            _CounterEnum.ANNOTATION_ALT_MATCH.value)
        self._alt_minimal_ambiguous_counter = new_counter(
            _CounterEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value)
        self._alt_mismatch_counter = new_counter(
            _CounterEnum.ANNOTATION_ALT_MISMATCH.value)
        self._allele_num_missing_counter = new_counter(
            _CounterEnum.ALLELE_NUM_MISSING.value)
        self._allele_num_incorrect_counter = new_counter(
            _CounterEnum.ALLELE_NUM_INCORRECT.value)

        self._use_allele_num = use_allele_num
        self._minimal_match = minimal_match
        self._infer_annotation_types = infer_annotation_types
 def test_extract_annotation_names_error(self):
     """Checks that a dash-separated description raises ValueError."""
     dash_separated_desc = 'some desc-Consequence-IMPACT-SYMBOL-Gene'
     expect_value_error = self.assertRaisesRegexp(
         ValueError, 'Expected at least one.*')
     with expect_value_error:
         annotation_parser.extract_annotation_names(dash_separated_desc)
 def test_extract_annotation_names(self):
     """Checks the names parsed from a '|'-separated description."""
     expected_names = ['Consequence', 'IMPACT', 'SYMBOL', 'Gene']
     parsed_names = annotation_parser.extract_annotation_names(
         'some desc|Consequence|IMPACT|SYMBOL|Gene')
     self.assertEqual(parsed_names, expected_names)
示例#6
0
    def create_alt_bases_field_schema(self):
        # type: () -> bigquery.TableFieldSchema
        """Returns the alternate_bases record compatible with this factory.

        Depending on how this class is set up to split INFO fields among
        alternate bases, this function produces a compatible alternate_bases
        record and returns it which can be added to a bigquery schema by the
        caller.

        The record always contains the ALT string sub-field. When
        `self._split_alternate_allele_info_fields` is set, it also gets one
        nullable sub-field per header INFO field whose count equals the
        alternate-allele count. Finally, one repeated sub-record is added per
        entry of `self._annotation_field_set`, holding the annotation ALT plus
        one nullable string sub-field per annotation name.

        Returns:
          The populated `bigquery.TableFieldSchema` for alternate bases.

        Raises:
          ValueError: if an annotation field is not present in
            `self._header_fields.infos`.
        """
        alternate_bases_record = bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
            type=bigquery_util.TableFieldConstants.TYPE_RECORD,
            mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
            description='One record for each alternate base (if any).')
        alternate_bases_record.fields.append(
            bigquery.TableFieldSchema(
                name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES_ALT,
                type=bigquery_util.TableFieldConstants.TYPE_STRING,
                mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                description='Alternate base.'))

        if self._split_alternate_allele_info_fields:
            # The count value to compare against does not change per field, so
            # look it up once.
            alt_allele_count = vcf.parser.field_counts[
                _FIELD_COUNT_ALTERNATE_ALLELE]
            # `items()` is used instead of `iteritems()`, which only exists on
            # Python 2 dicts; `items()` iterates the same pairs on Python 2
            # and 3.
            for key, field in self._header_fields.infos.items():
                if field[_HeaderKeyConstants.NUM] != alt_allele_count:
                    continue
                alternate_bases_record.fields.append(
                    bigquery.TableFieldSchema(
                        name=_BigQuerySchemaSanitizer.
                        get_sanitized_field_name(key),
                        type=bigquery_util.get_bigquery_type_from_vcf_type(
                            field[_HeaderKeyConstants.TYPE]),
                        mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                        description=_BigQuerySchemaSanitizer.
                        get_sanitized_string(
                            field[_HeaderKeyConstants.DESC])))

        for annot_field in self._annotation_field_set:
            if annot_field not in self._header_fields.infos:
                raise ValueError(
                    'Annotation field {} not found'.format(annot_field))
            annotation_names = annotation_parser.extract_annotation_names(
                self._header_fields.infos[annot_field][
                    _HeaderKeyConstants.DESC])
            annotation_descs = descriptions.VEP_DESCRIPTIONS
            annotation_record = bigquery.TableFieldSchema(
                name=_BigQuerySchemaSanitizer.get_sanitized_field_name(
                    annot_field),
                type=bigquery_util.TableFieldConstants.TYPE_RECORD,
                mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
                description='List of {} annotations for this alternate.'.
                format(annot_field))
            annotation_record.fields.append(
                bigquery.TableFieldSchema(
                    name=annotation_parser.ANNOTATION_ALT,
                    type=bigquery_util.TableFieldConstants.TYPE_STRING,
                    mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                    description='The ALT part of the annotation field.'))
            for annotation_name in annotation_names:
                annotation_record.fields.append(
                    bigquery.TableFieldSchema(
                        name=_BigQuerySchemaSanitizer.get_sanitized_field_name(
                            annotation_name),
                        type=bigquery_util.TableFieldConstants.TYPE_STRING,
                        mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                        # Names without a known description get an empty one.
                        description=annotation_descs.get(annotation_name, '')))
            alternate_bases_record.fields.append(annotation_record)
        return alternate_bases_record