def _validate_reserved_field_mode(field_schema, reserved_definition):
    """Checks that a schema field's mode agrees with its reserved definition.

    Args:
      field_schema: Schema field whose ``mode`` and ``name`` are read. A falsy
        ``mode`` is treated as ``MODE_NULLABLE``.
      reserved_definition: Reserved field definition whose ``num`` is
        converted to the expected BigQuery mode.

    Raises:
      ValueError: If the field's mode differs from the mode derived from the
        reserved definition.
    """
    actual_mode = field_schema.mode
    if not actual_mode:
        # An unset mode falls back to NULLABLE before comparing.
        actual_mode = bigquery_util.TableFieldConstants.MODE_NULLABLE
    expected_mode = bigquery_util.get_bigquery_mode_from_vcf_num(
        reserved_definition.num)
    if actual_mode != expected_mode:
        template = (
            'The mode of field {} is different from the VCF spec: {} vs {}.')
        raise ValueError(
            template.format(field_schema.name, actual_mode, expected_mode))
# Example #2
0
def generate_schema_from_header_fields(
    header_fields,  # type: vcf_header_io.VcfHeader
    proc_variant_factory,  # type: processed_variant.ProcessedVariantFactory
    variant_merger=None  # type: variant_merge_strategy.VariantMergeStrategy
):
    # type: (...) -> bigquery.TableSchema
    """Returns a ``TableSchema`` for the BigQuery table storing variants.

    The schema consists of the fixed variant columns, the alternate bases
    field supplied by ``proc_variant_factory``, a repeated calls record with
    one sub-field per FORMAT header entry (except GT and PS), and one column
    per INFO header entry (except END and entries that the factory reports as
    part of the alternate bases record).

    Args:
      header_fields: Representative header fields for all variants. Its
        ``formats`` and ``infos`` mappings are iterated.
      proc_variant_factory: The factory class that knows how to convert
        Variant instances to ProcessedVariant. As a side effect it also knows
        how to modify BigQuery schema based on the ProcessedVariants that it
        generates. The latter functionality is what is needed here.
      variant_merger: The strategy used for merging variants (if any). Some
        strategies may change the schema, which is why this may be needed
        here.

    Returns:
      The populated ``bigquery.TableSchema``.
    """
    schema = bigquery.TableSchema()
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.REFERENCE_NAME,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description='Reference name.'))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.START_POSITION,
            type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description=(
                'Start position (0-based). Corresponds to the first base '
                'of the string of reference bases.')))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.END_POSITION,
            type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description=(
                'End position (0-based). Corresponds to the first base '
                'after the last base in the reference allele.')))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.REFERENCE_BASES,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description='Reference bases.'))

    # The alternate bases field is defined by the factory.
    schema.fields.append(proc_variant_factory.create_alt_bases_field_schema())

    schema.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.NAMES,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
            description='Variant names (e.g. RefSNP ID).'))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.QUALITY,
            type=bigquery_util.TableFieldConstants.TYPE_FLOAT,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description=(
                'Phred-scaled quality score (-10log10 prob(call is wrong)). '
                'Higher values imply better quality.')))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.FILTER,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
            description=(
                'List of failed filters (if any) or "PASS" indicating the '
                'variant has passed all filters.')))

    # Add calls.
    calls_record = bigquery.TableFieldSchema(
        name=bigquery_util.ColumnKeyConstants.CALLS,
        type=bigquery_util.TableFieldConstants.TYPE_RECORD,
        mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
        description='One record for each call.')
    calls_record.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.CALLS_NAME,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description='Name of the call.'))
    calls_record.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.CALLS_GENOTYPE,
            type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
            mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
            description=(
                'Genotype of the call. "-1" is used in cases where the '
                'genotype is not called.')))
    calls_record.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.CALLS_PHASESET,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description=(
                'Phaseset of the call (if any). "*" is used in cases where '
                'the genotype is phased, but no phase set ("PS" in FORMAT) '
                'was specified.')))
    # ``items()`` rather than ``iteritems()``: the latter does not exist on
    # Python 3 dictionaries, while ``items()`` works on both Python 2 and 3.
    for key, field in header_fields.formats.items():
        # GT and PS are already included in 'genotype' and 'phaseset' fields.
        if key in (vcfio.GENOTYPE_FORMAT_KEY, vcfio.PHASESET_FORMAT_KEY):
            continue
        calls_record.fields.append(
            bigquery.TableFieldSchema(
                name=bigquery_sanitizer.SchemaSanitizer.
                get_sanitized_field_name(key),
                type=bigquery_util.get_bigquery_type_from_vcf_type(
                    field[_HeaderKeyConstants.TYPE]),
                mode=bigquery_util.get_bigquery_mode_from_vcf_num(
                    field[_HeaderKeyConstants.NUM]),
                description=bigquery_sanitizer.SchemaSanitizer.
                get_sanitized_string(field[_HeaderKeyConstants.DESC])))
    schema.fields.append(calls_record)

    # Add info fields.
    info_keys = set()
    for key, field in header_fields.infos.items():
        # END info is already included by modifying the end_position.
        if (key == vcfio.END_INFO_KEY
                or proc_variant_factory.info_is_in_alt_bases(key)):
            continue
        schema.fields.append(
            bigquery.TableFieldSchema(
                name=bigquery_sanitizer.SchemaSanitizer.
                get_sanitized_field_name(key),
                type=bigquery_util.get_bigquery_type_from_vcf_type(
                    field[_HeaderKeyConstants.TYPE]),
                mode=bigquery_util.get_bigquery_mode_from_vcf_num(
                    field[_HeaderKeyConstants.NUM]),
                description=bigquery_sanitizer.SchemaSanitizer.
                get_sanitized_string(field[_HeaderKeyConstants.DESC])))
        info_keys.add(key)
    if variant_merger:
        # The merge strategy receives the keys of the INFO columns added above.
        variant_merger.modify_bigquery_schema(schema, info_keys)
    return schema
def generate_schema_from_header_fields(
        header_fields,  # type: vcf_header_io.VcfHeader
        proc_variant_factory,  # type: processed_variant.ProcessedVariantFactory
        variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
        use_1_based_coordinate=False,  # type: bool
        include_call_name=False,  # type: bool
        move_hom_ref_calls=False  # type: bool
):
    # type: (...) -> bigquery.TableSchema
    """Builds the ``TableSchema`` of the BigQuery table that stores variants.

    Args:
      header_fields: Representative header fields for all variants. Its
        ``formats`` and ``infos`` mappings are iterated.
      proc_variant_factory: Factory that converts Variant instances to
        ProcessedVariant. Here it supplies the alternate bases field schema,
        reports which INFO keys live inside the alternate bases record, and
        yields the annotation info type keys to leave out of the schema.
      variant_merger: The strategy used for merging variants (if any). When
        given, it is allowed to modify the finished schema.
      use_1_based_coordinate: If True the start position is described as
        1-based, otherwise as 0-based.
      include_call_name: If True a call name field is added to each calls
        record, right after the sample ID field.
      move_hom_ref_calls: If True an additional repeated record for hom-ref
        calls is added ahead of the regular calls record.

    Returns:
      The populated ``bigquery.TableSchema``.
    """
    column_keys = bigquery_util.ColumnKeyConstants
    field_constants = bigquery_util.TableFieldConstants

    sample_id_description = (
        'Unique ID (type INT64) assigned to each sample. Table with '
        '`__sample_info` suffix contains the mapping of sample names '
        '(as read from VCF header) to these assigned IDs.')
    call_name_description = (
        'Name of the call (sample names in the VCF Header line).')

    def _new_field(name, field_type, mode, description):
        # Thin wrapper so every column below is declared on a single call.
        return bigquery.TableFieldSchema(
            name=name, type=field_type, mode=mode, description=description)

    def _new_field_from_header(header_key, header_field):
        # Translates one FORMAT/INFO header entry into a schema field.
        sanitizer = bigquery_sanitizer.SchemaSanitizer
        return _new_field(
            sanitizer.get_sanitized_field_name(header_key),
            bigquery_util.get_bigquery_type_from_vcf_type(
                header_field[_HeaderKeyConstants.TYPE]),
            bigquery_util.get_bigquery_mode_from_vcf_num(
                header_field[_HeaderKeyConstants.NUM]),
            sanitizer.get_sanitized_string(
                header_field[_HeaderKeyConstants.DESC]))

    def _append_sample_fields(record):
        # Sample ID is always present; the call name is opt-in.
        record.fields.append(
            _new_field(column_keys.CALLS_SAMPLE_ID,
                       field_constants.TYPE_INTEGER,
                       field_constants.MODE_NULLABLE,
                       sample_id_description))
        if include_call_name:
            record.fields.append(
                _new_field(column_keys.CALLS_NAME,
                           field_constants.TYPE_STRING,
                           field_constants.MODE_NULLABLE,
                           call_name_description))

    schema = bigquery.TableSchema()
    top_level = schema.fields

    # Fixed variant columns.
    top_level.append(
        _new_field(column_keys.REFERENCE_NAME,
                   field_constants.TYPE_STRING,
                   field_constants.MODE_NULLABLE,
                   'Reference name.'))

    coordinate = '0-based'
    if use_1_based_coordinate:
        coordinate = '1-based'
    top_level.append(
        _new_field(column_keys.START_POSITION,
                   field_constants.TYPE_INTEGER,
                   field_constants.MODE_NULLABLE,
                   ('Start position ({}). Corresponds to the first base '
                    'of the string of reference bases.').format(coordinate)))
    top_level.append(
        _new_field(column_keys.END_POSITION,
                   field_constants.TYPE_INTEGER,
                   field_constants.MODE_NULLABLE,
                   'End position. Corresponds to the first base '
                   'after the last base in the reference allele.'))
    top_level.append(
        _new_field(column_keys.REFERENCE_BASES,
                   field_constants.TYPE_STRING,
                   field_constants.MODE_NULLABLE,
                   'Reference bases.'))
    top_level.append(proc_variant_factory.create_alt_bases_field_schema())
    top_level.append(
        _new_field(column_keys.NAMES,
                   field_constants.TYPE_STRING,
                   field_constants.MODE_REPEATED,
                   'Variant names (e.g. RefSNP ID).'))
    top_level.append(
        _new_field(column_keys.QUALITY,
                   field_constants.TYPE_FLOAT,
                   field_constants.MODE_NULLABLE,
                   'Phred-scaled quality score (-10log10 prob(call is wrong)). '
                   'Higher values imply better quality.'))
    top_level.append(
        _new_field(column_keys.FILTER,
                   field_constants.TYPE_STRING,
                   field_constants.MODE_REPEATED,
                   'List of failed filters (if any) or "PASS" indicating the '
                   'variant has passed all filters.'))

    # Optional hom-ref calls record; it only carries the sample fields.
    if move_hom_ref_calls:
        hom_ref_record = _new_field(column_keys.HOM_REF_CALLS,
                                    field_constants.TYPE_RECORD,
                                    field_constants.MODE_REPEATED,
                                    'One record for each homogeneous call.')
        _append_sample_fields(hom_ref_record)
        top_level.append(hom_ref_record)

    # Regular calls record.
    calls = _new_field(column_keys.CALLS,
                       field_constants.TYPE_RECORD,
                       field_constants.MODE_REPEATED,
                       'One record for each call.')
    _append_sample_fields(calls)
    calls.fields.append(
        _new_field(column_keys.CALLS_GENOTYPE,
                   field_constants.TYPE_INTEGER,
                   field_constants.MODE_REPEATED,
                   'Genotype of the call. "-1" is used in cases where the '
                   'genotype is not called.'))
    calls.fields.append(
        _new_field(column_keys.CALLS_PHASESET,
                   field_constants.TYPE_STRING,
                   field_constants.MODE_NULLABLE,
                   'Phaseset of the call (if any). "*" is used in cases where '
                   'the genotype is phased, but no phase set ("PS" in FORMAT) '
                   'was specified.'))
    for format_key, format_field in header_fields.formats.items():
        # GT and PS are already covered by the genotype and phaseset fields.
        if format_key not in (vcfio.GENOTYPE_FORMAT_KEY,
                              vcfio.PHASESET_FORMAT_KEY):
            calls.fields.append(
                _new_field_from_header(format_key, format_field))
    top_level.append(calls)

    # INFO columns.
    info_keys = set()
    annotation_type_keys = set(
        proc_variant_factory.gen_annotation_info_type_keys())
    for info_key, info_field in header_fields.infos.items():
        # END is already reflected in end_position. Annotation info type keys
        # only describe the type of annotation fields, so they get no column.
        is_excluded = (info_key == vcfio.END_INFO_KEY
                       or proc_variant_factory.info_is_in_alt_bases(info_key)
                       or info_key in annotation_type_keys)
        if not is_excluded:
            top_level.append(_new_field_from_header(info_key, info_field))
            info_keys.add(info_key)

    if variant_merger:
        variant_merger.modify_bigquery_schema(schema, info_keys)
    return schema