Python get_bigquery_type_from_vcf_type 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: gcp_variant_transforms.libs.bigquery_util

메소드/함수: get_bigquery_type_from_vcf_type

hotexamples.com에서의 예제들: 6

Python get_bigquery_type_from_vcf_type - 6개의 예제가 발견되었습니다. 아래는 오픈소스 프로젝트에서 추출한 Python gcp_variant_transforms.libs.bigquery_util.get_bigquery_type_from_vcf_type의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가하면 예제의 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

 def test_get_bigquery_type_from_vcf_type(self):
   """Checks the VCF type to BigQuery type mapping and its error case."""
   field_types = bigquery_util.TableFieldConstants
   to_bigquery_type = bigquery_util.get_bigquery_type_from_vcf_type
   # Pairs of (VCF type name, BigQuery type it is expected to map to).
   expectations = [
       ('integer', field_types.TYPE_INTEGER),
       ('string', field_types.TYPE_STRING),
       ('character', field_types.TYPE_STRING),
       ('float', field_types.TYPE_FLOAT),
       ('flag', field_types.TYPE_BOOLEAN),
   ]
   for vcf_type, expected_type in expectations:
     self.assertEqual(expected_type, to_bigquery_type(vcf_type))
   # A type name that is not one of the above is expected to be rejected.
   with self.assertRaises(ValueError):
     to_bigquery_type('DUMMY')

예제 #2

파일 보기

def generate_schema_from_header_fields(
    header_fields,  # type: vcf_header_io.VcfHeader
    proc_variant_factory,  # type: processed_variant.ProcessedVariantFactory
    variant_merger=None  # type: variant_merge_strategy.VariantMergeStrategy
):
    # type: (...) -> bigquery.TableSchema
    """Returns a ``TableSchema`` for the BigQuery table storing variants.

    The schema is assembled in this order: the fixed variant columns, the
    alternate bases record supplied by ``proc_variant_factory``, the names,
    quality and filter columns, a repeated calls record with one sub-field
    per FORMAT header entry, and finally one column per INFO header entry.

    Args:
      header_fields: Representative header fields for all variants.
      proc_variant_factory: The factory class that knows how to convert
        Variant instances to ProcessedVariant. As a side effect it also knows
        how to modify BigQuery schema based on the ProcessedVariants that it
        generates. The latter functionality is what is needed here.
      variant_merger: The strategy used for merging variants (if any). Some
        strategies may change the schema, which is why this may be needed
        here.

    Returns:
      The populated ``bigquery.TableSchema``.
    """
    field_types = bigquery_util.TableFieldConstants
    column_keys = bigquery_util.ColumnKeyConstants

    schema = bigquery.TableSchema()
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.REFERENCE_NAME,
            type=field_types.TYPE_STRING,
            mode=field_types.MODE_NULLABLE,
            description='Reference name.'))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.START_POSITION,
            type=field_types.TYPE_INTEGER,
            mode=field_types.MODE_NULLABLE,
            description=(
                'Start position (0-based). Corresponds to the first base '
                'of the string of reference bases.')))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.END_POSITION,
            type=field_types.TYPE_INTEGER,
            mode=field_types.MODE_NULLABLE,
            description=(
                'End position (0-based). Corresponds to the first base '
                'after the last base in the reference allele.')))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.REFERENCE_BASES,
            type=field_types.TYPE_STRING,
            mode=field_types.MODE_NULLABLE,
            description='Reference bases.'))

    # The factory decides the layout of the alternate bases record.
    schema.fields.append(proc_variant_factory.create_alt_bases_field_schema())

    schema.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.NAMES,
            type=field_types.TYPE_STRING,
            mode=field_types.MODE_REPEATED,
            description='Variant names (e.g. RefSNP ID).'))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.QUALITY,
            type=field_types.TYPE_FLOAT,
            mode=field_types.MODE_NULLABLE,
            description=(
                'Phred-scaled quality score (-10log10 prob(call is wrong)). '
                'Higher values imply better quality.')))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.FILTER,
            type=field_types.TYPE_STRING,
            mode=field_types.MODE_REPEATED,
            description=(
                'List of failed filters (if any) or "PASS" indicating the '
                'variant has passed all filters.')))

    # Add calls.
    calls_record = bigquery.TableFieldSchema(
        name=column_keys.CALLS,
        type=field_types.TYPE_RECORD,
        mode=field_types.MODE_REPEATED,
        description='One record for each call.')
    calls_record.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.CALLS_NAME,
            type=field_types.TYPE_STRING,
            mode=field_types.MODE_NULLABLE,
            description='Name of the call.'))
    calls_record.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.CALLS_GENOTYPE,
            type=field_types.TYPE_INTEGER,
            mode=field_types.MODE_REPEATED,
            description=(
                'Genotype of the call. "-1" is used in cases where the '
                'genotype is not called.')))
    calls_record.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.CALLS_PHASESET,
            type=field_types.TYPE_STRING,
            mode=field_types.MODE_NULLABLE,
            description=(
                'Phaseset of the call (if any). "*" is used in cases where '
                'the genotype is phased, but no phase set ("PS" in FORMAT) '
                'was specified.')))
    # ``items()`` instead of the Python 2-only ``iteritems()`` so that the
    # loop runs on both Python 2 and Python 3.
    for key, field in header_fields.formats.items():
        # GT and PS are already included in 'genotype' and 'phaseset' fields.
        if key in (vcfio.GENOTYPE_FORMAT_KEY, vcfio.PHASESET_FORMAT_KEY):
            continue
        calls_record.fields.append(
            bigquery.TableFieldSchema(
                name=bigquery_sanitizer.SchemaSanitizer.
                get_sanitized_field_name(key),
                type=bigquery_util.get_bigquery_type_from_vcf_type(
                    field[_HeaderKeyConstants.TYPE]),
                mode=bigquery_util.get_bigquery_mode_from_vcf_num(
                    field[_HeaderKeyConstants.NUM]),
                description=bigquery_sanitizer.SchemaSanitizer.
                get_sanitized_string(field[_HeaderKeyConstants.DESC])))
    schema.fields.append(calls_record)

    # Add info fields. The keys that became columns are collected so the
    # merge strategy can be told which INFO fields exist in the schema.
    info_keys = set()
    for key, field in header_fields.infos.items():
        # END info is already included by modifying the end_position.
        if (key == vcfio.END_INFO_KEY
                or proc_variant_factory.info_is_in_alt_bases(key)):
            continue
        schema.fields.append(
            bigquery.TableFieldSchema(
                name=bigquery_sanitizer.SchemaSanitizer.
                get_sanitized_field_name(key),
                type=bigquery_util.get_bigquery_type_from_vcf_type(
                    field[_HeaderKeyConstants.TYPE]),
                mode=bigquery_util.get_bigquery_mode_from_vcf_num(
                    field[_HeaderKeyConstants.NUM]),
                description=bigquery_sanitizer.SchemaSanitizer.
                get_sanitized_string(field[_HeaderKeyConstants.DESC])))
        info_keys.add(key)
    if variant_merger:
        variant_merger.modify_bigquery_schema(schema, info_keys)
    return schema

예제 #3

파일 보기

파일: schema_converter.py 프로젝트: wongfay0207/gcp-variant-transforms

def generate_schema_from_header_fields(
        header_fields,  # type: vcf_header_io.VcfHeader
        proc_variant_factory,  # type: processed_variant.ProcessedVariantFactory
        variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
        use_1_based_coordinate=False,  # type: bool
        include_call_name=False,  # type: bool
        move_hom_ref_calls=False  # type: bool
):
    # type: (...) -> bigquery.TableSchema
    """Builds the ``TableSchema`` of the BigQuery table that stores variants.

    Args:
      header_fields: Representative header fields for all variants.
      proc_variant_factory: Factory that converts Variant instances to
        ProcessedVariant. Here it supplies the alternate bases record and
        tells which INFO keys must not become top-level columns.
      variant_merger: Optional merge strategy. When given, it gets the chance
        to modify the finished schema.
      use_1_based_coordinate: If True the start position is described as
        1-based, otherwise as 0-based.
      include_call_name: If True a call name field is added right after the
        sample ID in each call record built here.
      move_hom_ref_calls: If True an extra repeated record named by
        ``ColumnKeyConstants.HOM_REF_CALLS`` is added before the calls
        record.

    Returns:
      The populated ``bigquery.TableSchema``.
    """
    schema = bigquery.TableSchema()
    column = bigquery_util.ColumnKeyConstants
    kind = bigquery_util.TableFieldConstants

    def build_field(name, field_type, mode, description):
        """Creates one ``TableFieldSchema`` from its four attributes."""
        return bigquery.TableFieldSchema(
            name=name, type=field_type, mode=mode, description=description)

    sample_id_description = (
        'Unique ID (type INT64) assigned to each sample. Table with '
        '`__sample_info` suffix contains the mapping of sample names '
        '(as read from VCF header) to these assigned IDs.')
    call_name_description = (
        'Name of the call (sample names in the VCF Header line).')

    def add_sample_identity(record):
        """Appends the sample ID (and optionally call name) to ``record``."""
        record.fields.append(
            build_field(column.CALLS_SAMPLE_ID, kind.TYPE_INTEGER,
                        kind.MODE_NULLABLE, sample_id_description))
        if include_call_name:
            record.fields.append(
                build_field(column.CALLS_NAME, kind.TYPE_STRING,
                            kind.MODE_NULLABLE, call_name_description))

    def header_entry_to_field(key, entry):
        """Turns one FORMAT/INFO header entry into a sanitized field."""
        return build_field(
            bigquery_sanitizer.SchemaSanitizer.get_sanitized_field_name(key),
            bigquery_util.get_bigquery_type_from_vcf_type(
                entry[_HeaderKeyConstants.TYPE]),
            bigquery_util.get_bigquery_mode_from_vcf_num(
                entry[_HeaderKeyConstants.NUM]),
            bigquery_sanitizer.SchemaSanitizer.get_sanitized_string(
                entry[_HeaderKeyConstants.DESC]))

    # Fixed variant columns.
    schema.fields.append(
        build_field(column.REFERENCE_NAME, kind.TYPE_STRING,
                    kind.MODE_NULLABLE, 'Reference name.'))

    if use_1_based_coordinate:
        coordinate = '1-based'
    else:
        coordinate = '0-based'
    start_description = (
        'Start position ({}). Corresponds to the first base '
        'of the string of reference bases.').format(coordinate)
    schema.fields.append(
        build_field(column.START_POSITION, kind.TYPE_INTEGER,
                    kind.MODE_NULLABLE, start_description))
    schema.fields.append(
        build_field(column.END_POSITION, kind.TYPE_INTEGER,
                    kind.MODE_NULLABLE,
                    'End position. Corresponds to the first base '
                    'after the last base in the reference allele.'))
    schema.fields.append(
        build_field(column.REFERENCE_BASES, kind.TYPE_STRING,
                    kind.MODE_NULLABLE, 'Reference bases.'))
    # The factory decides the layout of the alternate bases record.
    schema.fields.append(proc_variant_factory.create_alt_bases_field_schema())

    schema.fields.append(
        build_field(column.NAMES, kind.TYPE_STRING, kind.MODE_REPEATED,
                    'Variant names (e.g. RefSNP ID).'))
    schema.fields.append(
        build_field(column.QUALITY, kind.TYPE_FLOAT, kind.MODE_NULLABLE,
                    'Phred-scaled quality score (-10log10 prob(call is wrong)). '
                    'Higher values imply better quality.'))
    schema.fields.append(
        build_field(column.FILTER, kind.TYPE_STRING, kind.MODE_REPEATED,
                    'List of failed filters (if any) or "PASS" indicating the '
                    'variant has passed all filters.'))

    # Optional record for the calls listed under HOM_REF_CALLS; it only
    # carries the sample identity fields.
    if move_hom_ref_calls:
        hom_ref_record = build_field(
            column.HOM_REF_CALLS, kind.TYPE_RECORD, kind.MODE_REPEATED,
            'One record for each homogeneous call.')
        add_sample_identity(hom_ref_record)
        schema.fields.append(hom_ref_record)

    # Calls record: sample identity, genotype, phaseset, then FORMAT fields.
    calls = build_field(column.CALLS, kind.TYPE_RECORD, kind.MODE_REPEATED,
                        'One record for each call.')
    add_sample_identity(calls)
    calls.fields.append(
        build_field(column.CALLS_GENOTYPE, kind.TYPE_INTEGER,
                    kind.MODE_REPEATED,
                    'Genotype of the call. "-1" is used in cases where the '
                    'genotype is not called.'))
    calls.fields.append(
        build_field(column.CALLS_PHASESET, kind.TYPE_STRING,
                    kind.MODE_NULLABLE,
                    'Phaseset of the call (if any). "*" is used in cases where '
                    'the genotype is phased, but no phase set ("PS" in FORMAT) '
                    'was specified.'))
    for key, entry in header_fields.formats.items():
        # GT and PS already have dedicated 'genotype' and 'phaseset' fields.
        if key not in (vcfio.GENOTYPE_FORMAT_KEY, vcfio.PHASESET_FORMAT_KEY):
            calls.fields.append(header_entry_to_field(key, entry))
    schema.fields.append(calls)

    # INFO columns. The keys that became columns are collected for the merger.
    info_keys = set()
    annotation_type_keys = set(
        proc_variant_factory.gen_annotation_info_type_keys())
    for key, entry in header_fields.infos.items():
        # Skipped: END (already reflected in end_position), INFO fields that
        # live inside the alternate bases record, and INFO fields that exist
        # only to indicate the type of an annotation field.
        wanted = (key != vcfio.END_INFO_KEY
                  and not proc_variant_factory.info_is_in_alt_bases(key)
                  and key not in annotation_type_keys)
        if wanted:
            schema.fields.append(header_entry_to_field(key, entry))
            info_keys.add(key)

    if not variant_merger:
        return schema
    variant_merger.modify_bigquery_schema(schema, info_keys)
    return schema

예제 #4

파일 보기

    def create_alt_bases_field_schema(self):
        # type: () -> bigquery.TableFieldSchema
        """Builds the alternate_bases record matching this factory's setup.

        The record always carries the alternate base string. When the factory
        splits INFO fields among alternate bases, every INFO header field for
        which ``_is_num_a`` holds is added as a nullable sub-field. Each
        configured annotation field adds a repeated sub-record holding the
        ALT part of the annotation plus one nullable field per annotation
        name.

        Returns:
          The record, which the caller can add to a BigQuery schema.

        Raises:
          ValueError: If an annotation field has no INFO header entry.
        """
        columns = bigquery_util.ColumnKeyConstants
        constants = bigquery_util.TableFieldConstants

        alt_record = bigquery.TableFieldSchema(
            name=columns.ALTERNATE_BASES,
            type=constants.TYPE_RECORD,
            mode=constants.MODE_REPEATED,
            description='One record for each alternate base (if any).')
        alt_base_field = bigquery.TableFieldSchema(
            name=columns.ALTERNATE_BASES_ALT,
            type=constants.TYPE_STRING,
            mode=constants.MODE_NULLABLE,
            description='Alternate base.')
        alt_record.fields.append(alt_base_field)

        if self._split_alternate_allele_info_fields:
            for info_key, info in self._header_fields.infos.items():
                if not self._is_num_a(info[_HeaderKeyConstants.NUM]):
                    continue
                per_alt_field = bigquery.TableFieldSchema(
                    name=_BigQuerySchemaSanitizer.get_sanitized_field_name(
                        info_key),
                    type=bigquery_util.get_bigquery_type_from_vcf_type(
                        info[_HeaderKeyConstants.TYPE]),
                    mode=constants.MODE_NULLABLE,
                    description=_BigQuerySchemaSanitizer.get_sanitized_string(
                        info[_HeaderKeyConstants.DESC]))
                alt_record.fields.append(per_alt_field)

        for annot_field in self._annotation_field_set:
            if annot_field not in self._header_fields.infos:
                raise ValueError(
                    'Annotation field {} not found'.format(annot_field))
            known_descriptions = descriptions.VEP_DESCRIPTIONS
            annotation_record = bigquery.TableFieldSchema(
                name=_BigQuerySchemaSanitizer.get_sanitized_field_name(
                    annot_field),
                type=constants.TYPE_RECORD,
                mode=constants.MODE_REPEATED,
                description=(
                    'List of {} annotations for this alternate.'.format(
                        annot_field)))
            alt_part_field = bigquery.TableFieldSchema(
                name=annotation_parser.ANNOTATION_ALT,
                type=constants.TYPE_STRING,
                mode=constants.MODE_NULLABLE,
                description='The ALT part of the annotation field.')
            annotation_record.fields.append(alt_part_field)

            name_key_pairs = self._gen_annotation_name_key_pairs(annot_field)
            for annotation_name, type_key in name_key_pairs:
                header_infos = self._header_fields.infos
                if type_key not in header_infos:
                    # No header entry declares the type: fall back to string.
                    vcf_type = vcf_header_io.VcfHeaderFieldTypeConstants.STRING
                    if self._infer_annotation_types:
                        logging.warning(
                            'Annotation field %s has no corresponding header '
                            'field with id %s to specify type. Using type %s '
                            'instead.', annotation_name, type_key, vcf_type)
                else:
                    type_entry = header_infos[type_key]
                    vcf_type = type_entry[
                        vcf_header_io.VcfParserHeaderKeyConstants.TYPE]
                annotation_record.fields.append(
                    bigquery.TableFieldSchema(
                        name=_BigQuerySchemaSanitizer.get_sanitized_field_name(
                            annotation_name),
                        type=bigquery_util.get_bigquery_type_from_vcf_type(
                            vcf_type),
                        mode=constants.MODE_NULLABLE,
                        description=known_descriptions.get(
                            annotation_name, '')))
            alt_record.fields.append(annotation_record)
        return alt_record

예제 #5

파일 보기

파일: processed_variant.py 프로젝트: mhsaul/gcp-variant-transforms

    def create_alt_bases_field_schema(self):
        # type: () -> bigquery.TableFieldSchema
        """Returns the alternate_bases record compatible with this factory.

        Depending on how this class is set up to split INFO fields among
        alternate bases, this function produces a compatible alternate_bases
        record and returns it which can be added to a bigquery schema by the
        caller.

        Each configured annotation field adds a repeated sub-record with the
        ALT part of the annotation, an optional ambiguity flag (only when
        ``_minimal_match`` is set) and one nullable string field per
        annotation name extracted from the header description.

        Returns:
          The alternate_bases ``TableFieldSchema`` record.

        Raises:
          ValueError: If an annotation field has no INFO header entry.
        """
        alternate_bases_record = bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
            type=bigquery_util.TableFieldConstants.TYPE_RECORD,
            mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
            description='One record for each alternate base (if any).')
        alternate_bases_record.fields.append(
            bigquery.TableFieldSchema(
                name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES_ALT,
                type=bigquery_util.TableFieldConstants.TYPE_STRING,
                mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                description='Alternate base.'))
        if self._split_alternate_allele_info_fields:
            # ``items()`` instead of the Python 2-only ``iteritems()`` so that
            # the loop runs on both Python 2 and Python 3.
            for key, field in self._header_fields.infos.items():
                # Only INFO fields whose count equals the alternate-allele
                # field count are moved under the alternate bases record.
                if field.num == vcf.parser.field_counts[
                        _FIELD_COUNT_ALTERNATE_ALLELE]:
                    alternate_bases_record.fields.append(
                        bigquery.TableFieldSchema(
                            name=bigquery_util.
                            get_bigquery_sanitized_field_name(key),
                            type=bigquery_util.get_bigquery_type_from_vcf_type(
                                field.type),
                            mode=bigquery_util.TableFieldConstants.
                            MODE_NULLABLE,
                            description=bigquery_util.
                            get_bigquery_sanitized_field(field.desc)))

        for annot_field in self._annotation_field_set:
            if annot_field not in self._header_fields.infos:
                raise ValueError(
                    'Annotation field {} not found'.format(annot_field))
            # The annotation names come from the header description text.
            annotation_names = _extract_annotation_names(
                self._header_fields.infos[annot_field].desc)
            annotation_record = bigquery.TableFieldSchema(
                name=bigquery_util.get_bigquery_sanitized_field(annot_field),
                type=bigquery_util.TableFieldConstants.TYPE_RECORD,
                mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
                description='List of {} annotations for this alternate.'.
                format(annot_field))
            annotation_record.fields.append(
                bigquery.TableFieldSchema(
                    name=_ANNOTATION_ALT,
                    type=bigquery_util.TableFieldConstants.TYPE_STRING,
                    mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                    description='The ALT part of the annotation field.'))
            if self._minimal_match:
                annotation_record.fields.append(
                    bigquery.TableFieldSchema(
                        name=_ANNOTATION_ALT_AMBIGUOUS,
                        type=bigquery_util.TableFieldConstants.TYPE_BOOLEAN,
                        mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                        description=
                        'Whether the annotation ALT matching was ambiguous.'))
            for annotation_name in annotation_names:
                annotation_record.fields.append(
                    bigquery.TableFieldSchema(
                        name=bigquery_util.get_bigquery_sanitized_field(
                            annotation_name),
                        type=bigquery_util.TableFieldConstants.TYPE_STRING,
                        mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                        # TODO(bashir2): Add descriptions of well known annotations, e.g.,
                        # from VEP.
                        description=''))
            alternate_bases_record.fields.append(annotation_record)
        return alternate_bases_record

예제 #6

파일 보기

    def create_alt_bases_field_schema(self):
        # type: () -> bigquery.TableFieldSchema
        """Returns the alternate_bases record compatible with this factory.

        Depending on how this class is set up to split INFO fields among
        alternate bases, this function produces a compatible alternate_bases
        record and returns it which can be added to a bigquery schema by the
        caller.

        Each configured annotation field adds a repeated sub-record with the
        ALT part of the annotation and one nullable string field per
        annotation name extracted from the header description. Field
        descriptions are looked up in ``descriptions.VEP_DESCRIPTIONS`` and
        default to the empty string.

        Returns:
          The alternate_bases ``TableFieldSchema`` record.

        Raises:
          ValueError: If an annotation field has no INFO header entry.
        """
        alternate_bases_record = bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
            type=bigquery_util.TableFieldConstants.TYPE_RECORD,
            mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
            description='One record for each alternate base (if any).')
        alternate_bases_record.fields.append(
            bigquery.TableFieldSchema(
                name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES_ALT,
                type=bigquery_util.TableFieldConstants.TYPE_STRING,
                mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                description='Alternate base.'))
        if self._split_alternate_allele_info_fields:
            # ``items()`` instead of the Python 2-only ``iteritems()`` so that
            # the loop runs on both Python 2 and Python 3.
            for key, field in self._header_fields.infos.items():
                # Only INFO fields whose count equals the alternate-allele
                # field count are moved under the alternate bases record.
                if (field[_HeaderKeyConstants.NUM] == vcf.parser.
                        field_counts[_FIELD_COUNT_ALTERNATE_ALLELE]):
                    alternate_bases_record.fields.append(
                        bigquery.TableFieldSchema(
                            name=_BigQuerySchemaSanitizer.
                            get_sanitized_field_name(key),
                            type=bigquery_util.get_bigquery_type_from_vcf_type(
                                field[_HeaderKeyConstants.TYPE]),
                            mode=bigquery_util.TableFieldConstants.
                            MODE_NULLABLE,
                            description=_BigQuerySchemaSanitizer.
                            get_sanitized_string(
                                field[_HeaderKeyConstants.DESC])))

        for annot_field in self._annotation_field_set:
            if annot_field not in self._header_fields.infos:
                raise ValueError(
                    'Annotation field {} not found'.format(annot_field))
            # The annotation names come from the header description text.
            annotation_names = annotation_parser.extract_annotation_names(
                self._header_fields.infos[annot_field][
                    _HeaderKeyConstants.DESC])
            annotation_descs = descriptions.VEP_DESCRIPTIONS
            annotation_record = bigquery.TableFieldSchema(
                name=_BigQuerySchemaSanitizer.get_sanitized_field_name(
                    annot_field),
                type=bigquery_util.TableFieldConstants.TYPE_RECORD,
                mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
                description='List of {} annotations for this alternate.'.
                format(annot_field))
            annotation_record.fields.append(
                bigquery.TableFieldSchema(
                    name=annotation_parser.ANNOTATION_ALT,
                    type=bigquery_util.TableFieldConstants.TYPE_STRING,
                    mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                    description='The ALT part of the annotation field.'))
            for annotation_name in annotation_names:
                annotation_record.fields.append(
                    bigquery.TableFieldSchema(
                        name=_BigQuerySchemaSanitizer.get_sanitized_field_name(
                            annotation_name),
                        type=bigquery_util.TableFieldConstants.TYPE_STRING,
                        mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                        # Names without a known description get ''.
                        description=annotation_descs.get(annotation_name, '')))
            alternate_bases_record.fields.append(annotation_record)
        return alternate_bases_record