def test_get_bigquery_sanitized_field_name(self):
    """Checks sanitized names for plain, underscore-led and digit-led keys."""
    # Each pair is (raw key, expected sanitized BigQuery field name).
    cases = (
        ('AA', 'AA'),
        ('_AA', 'field__AA'),
        ('1A1A', 'field_1A1A'),
    )
    for raw_key, expected_name in cases:
        actual_name = bigquery_util.get_bigquery_sanitized_field_name(raw_key)
        self.assertEqual(expected_name, actual_name)
Пример #2
0
def _get_call_record(call):
    """A helper method for ``get_rows_from_variant`` to get a call as JSON.

    Args:
      call (``VariantCall``): Variant call to convert.

    Returns:
      A tuple ``(call_record, is_empty)``:
        call_record (dict): BigQuery call value holding the sanitized call
          name, the phaseset, the genotype (``[]`` when unset) and one entry
          per non-``None`` value found in ``call.info``.
        is_empty (bool): True only if the genotype is unset or consists
          solely of ``vcfio.MISSING_GENOTYPE_VALUE`` and every non-``None``
          info value is reported as empty by ``_is_empty_field``.
    """
    call_record = {
        bigquery_util.ColumnKeyConstants.CALLS_NAME:
        bigquery_util.get_bigquery_sanitized_field(call.name),
        bigquery_util.ColumnKeyConstants.CALLS_PHASESET:
        call.phaseset,
        bigquery_util.ColumnKeyConstants.CALLS_GENOTYPE:
        call.genotype or []
    }
    # A call starts out "empty" when it has no genotype at all or when every
    # genotype entry is the missing-value marker.
    is_empty = (not call.genotype or
                set(call.genotype) == {vcfio.MISSING_GENOTYPE_VALUE})
    # ``items()`` is available on both Python 2 and Python 3 dicts, whereas
    # ``iteritems()`` only exists on Python 2.
    for key, field in call.info.items():
        if field is None:
            continue
        sanitized = bigquery_util.get_bigquery_sanitized_field(field)
        field_name = bigquery_util.get_bigquery_sanitized_field_name(key)
        call_record[field_name] = sanitized
        is_empty = is_empty and _is_empty_field(sanitized)
    return call_record, is_empty
Пример #3
0
 def _get_bigquery_field_entry(
         self,
         key,  # type: str
         data,  # type: Union[Any, List[Any]]
         schema_descriptor,  # type: bigquery_schema_descriptor.SchemaDescriptor
         allow_incompatible_records,  # type: bool
 ):
     # type: (...) -> (str, Any)
     """Builds the ``(field name, field value)`` pair for one keyed value.

     Args:
       key: Raw key; it is turned into a sanitized BigQuery field name.
       data: Value stored under ``key``. ``None`` short-circuits to
         ``(None, None)``.
       schema_descriptor: Used to check that the sanitized field exists and
         to fetch its descriptor.
       allow_incompatible_records: If true, the resolved value is returned
         even when it was reported as incompatible with the schema.

     Returns:
       ``(None, None)`` when ``data`` is ``None``; otherwise the sanitized
       field name and the value returned by
       ``_check_and_resolve_schema_compatibility``.

     Raises:
       ValueError: If the schema has no simple field with the sanitized name,
         or if the value is incompatible with the schema and
         ``allow_incompatible_records`` is false.
     """
     if data is None:
         return None, None

     sanitized_name = bigquery_util.get_bigquery_sanitized_field_name(key)
     if not schema_descriptor.has_simple_field(sanitized_name):
         missing_field_message = (
             'BigQuery schema has no such field: {}.\n'
             'This can happen if the field is not defined in '
             'the VCF headers, or is not inferred automatically. '
             'Retry pipeline with --infer_undefined_headers.'
         ).format(sanitized_name)
         raise ValueError(missing_field_message)

     sanitized_value = bigquery_util.get_bigquery_sanitized_field(data)
     descriptor = schema_descriptor.get_field_descriptor(sanitized_name)
     resolution = self._check_and_resolve_schema_compatibility(
         descriptor, sanitized_value)
     resolved_value, is_compatible = resolution

     # Guard clause: reject the value unless it is compatible or the caller
     # explicitly tolerates incompatible records.
     if not (is_compatible or allow_incompatible_records):
         mismatch_message = (
             'Value and schema do not match for field {}. '
             'Value: {} Schema: {}.'
         ).format(sanitized_name, sanitized_value, descriptor)
         raise ValueError(mismatch_message)
     return sanitized_name, resolved_value
Пример #4
0
def _get_base_row_from_variant(variant):
    # type: (processed_variant.ProcessedVariant) -> Dict[str, Any]
    """A helper method for ``get_rows_from_variant`` to get row without calls.

    Args:
      variant: The processed variant to convert.

    Returns:
      A dict keyed by BigQuery column name. It always holds the reference
      name, start/end positions, reference bases, a list with one record per
      entry of ``variant.alternate_data_list`` and an empty calls list. Names
      and filters are included only when non-empty, quality only when it is
      not ``None``, and one sanitized entry is added per non-``None`` value in
      ``variant.non_alt_info``.
    """
    row = {
        bigquery_util.ColumnKeyConstants.REFERENCE_NAME:
        variant.reference_name,
        bigquery_util.ColumnKeyConstants.START_POSITION: variant.start,
        bigquery_util.ColumnKeyConstants.END_POSITION: variant.end,
        bigquery_util.ColumnKeyConstants.REFERENCE_BASES:
        variant.reference_bases
    }  # type: Dict[str, Any]
    if variant.names:
        row[bigquery_util.ColumnKeyConstants.NAMES] = (
            bigquery_util.get_bigquery_sanitized_field(variant.names))
    if variant.quality is not None:
        row[bigquery_util.ColumnKeyConstants.QUALITY] = variant.quality
    if variant.filters:
        row[bigquery_util.ColumnKeyConstants.FILTER] = (
            bigquery_util.get_bigquery_sanitized_field(variant.filters))
    # Add alternate bases.
    row[bigquery_util.ColumnKeyConstants.ALTERNATE_BASES] = []
    for alt in variant.alternate_data_list:
        alt_record = {
            bigquery_util.ColumnKeyConstants.ALTERNATE_BASES_ALT:
            alt.alternate_bases
        }
        # ``items()`` works on both Python 2 and Python 3 dicts, whereas
        # ``iteritems()`` is Python 2 only. Only the key is sanitized here;
        # the value is stored as-is.
        for key, data in alt.info.items():
            alt_record[bigquery_util.get_bigquery_sanitized_field_name(
                key)] = data
        row[bigquery_util.ColumnKeyConstants.ALTERNATE_BASES].append(
            alt_record)
    # Add info.
    for key, data in variant.non_alt_info.items():
        if data is not None:
            row[bigquery_util.get_bigquery_sanitized_field_name(key)] = (
                bigquery_util.get_bigquery_sanitized_field(data))
    # Set calls to empty for now (will be filled later).
    row[bigquery_util.ColumnKeyConstants.CALLS] = []
    return row
Пример #5
0
def _get_base_row_from_variant(
    variant,  # type: processed_variant.ProcessedVariant
    schema_descriptor,  # type: bigquery_schema_descriptor.SchemaDescriptor
    conflict_resolver=None,
    # type: vcf_field_conflict_resolver.ConflictResolver
    allow_incompatible_records=False  # type: bool
):
    # type: (...) -> Dict[str, Any]
    """A helper method for ``get_rows_from_variant`` to get row without calls.

    Args:
      variant: The processed variant to convert.
      schema_descriptor: Forwarded to ``_get_bigquery_field_entry`` for every
        non-``None`` value in ``variant.non_alt_info``.
      conflict_resolver: Forwarded to ``_get_bigquery_field_entry``.
      allow_incompatible_records: Forwarded to ``_get_bigquery_field_entry``.

    Returns:
      A dict keyed by BigQuery column name. It always holds the reference
      name, start/end positions, reference bases, a list with one record per
      entry of ``variant.alternate_data_list`` and an empty calls list. Names
      and filters are included only when non-empty, quality only when it is
      not ``None``, and one entry is added per non-``None`` value in
      ``variant.non_alt_info`` as produced by ``_get_bigquery_field_entry``.
    """
    row = {
        bigquery_util.ColumnKeyConstants.REFERENCE_NAME:
        variant.reference_name,
        bigquery_util.ColumnKeyConstants.START_POSITION: variant.start,
        bigquery_util.ColumnKeyConstants.END_POSITION: variant.end,
        bigquery_util.ColumnKeyConstants.REFERENCE_BASES:
        variant.reference_bases
    }  # type: Dict[str, Any]
    if variant.names:
        row[bigquery_util.ColumnKeyConstants.NAMES] = (
            bigquery_util.get_bigquery_sanitized_field(variant.names))
    if variant.quality is not None:
        row[bigquery_util.ColumnKeyConstants.QUALITY] = variant.quality
    if variant.filters:
        row[bigquery_util.ColumnKeyConstants.FILTER] = (
            bigquery_util.get_bigquery_sanitized_field(variant.filters))
    # Add alternate bases.
    row[bigquery_util.ColumnKeyConstants.ALTERNATE_BASES] = []
    for alt in variant.alternate_data_list:
        alt_record = {
            bigquery_util.ColumnKeyConstants.ALTERNATE_BASES_ALT:
            alt.alternate_bases
        }
        # ``items()`` works on both Python 2 and Python 3 dicts, whereas
        # ``iteritems()`` is Python 2 only. Only the key is sanitized here;
        # the value is stored as-is.
        for key, data in alt.info.items():
            alt_record[bigquery_util.get_bigquery_sanitized_field_name(
                key)] = data
        row[bigquery_util.ColumnKeyConstants.ALTERNATE_BASES].append(
            alt_record)
    # Add info.
    for key, data in variant.non_alt_info.items():
        if data is not None:
            field_name, field_data = _get_bigquery_field_entry(
                key, data, schema_descriptor, conflict_resolver,
                allow_incompatible_records)
            row[field_name] = field_data

    # Set calls to empty for now (will be filled later).
    row[bigquery_util.ColumnKeyConstants.CALLS] = []
    return row
Пример #6
0
    def _get_base_row_from_variant(self, variant, allow_incompatible_records):
        # type: (processed_variant.ProcessedVariant, bool) -> Dict[str, Any]
        """Builds the BigQuery row for ``variant`` with an empty calls list.

        Args:
          variant: The processed variant to convert.
          allow_incompatible_records: Forwarded to
            ``self._get_bigquery_field_entry`` for every non-``None`` value
            in ``variant.non_alt_info``.

        Returns:
          A dict keyed by BigQuery column name. It always holds the reference
          name, start/end positions, reference bases, a list with one record
          per entry of ``variant.alternate_data_list`` and an empty calls
          list. Names and filters are included only when non-empty, quality
          only when it is not ``None``, and one entry is added per
          non-``None`` value in ``variant.non_alt_info``.
        """
        row = {
            bigquery_util.ColumnKeyConstants.REFERENCE_NAME:
            variant.reference_name,
            bigquery_util.ColumnKeyConstants.START_POSITION:
            variant.start,
            bigquery_util.ColumnKeyConstants.END_POSITION:
            variant.end,
            bigquery_util.ColumnKeyConstants.REFERENCE_BASES:
            variant.reference_bases
        }  # type: Dict[str, Any]
        if variant.names:
            row[bigquery_util.ColumnKeyConstants.NAMES] = (
                bigquery_util.get_bigquery_sanitized_field(variant.names))
        if variant.quality is not None:
            row[bigquery_util.ColumnKeyConstants.QUALITY] = variant.quality
        if variant.filters:
            row[bigquery_util.ColumnKeyConstants.FILTER] = (
                bigquery_util.get_bigquery_sanitized_field(variant.filters))
        # Add alternate bases.
        row[bigquery_util.ColumnKeyConstants.ALTERNATE_BASES] = []
        for alt in variant.alternate_data_list:
            alt_record = {
                bigquery_util.ColumnKeyConstants.ALTERNATE_BASES_ALT:
                alt.alternate_bases
            }
            # ``items()`` works on both Python 2 and Python 3 dicts, whereas
            # ``iteritems()`` is Python 2 only. Only the key is sanitized
            # here; the value is stored as-is.
            for key, data in alt.info.items():
                alt_record[bigquery_util.get_bigquery_sanitized_field_name(
                    key)] = data
            row[bigquery_util.ColumnKeyConstants.ALTERNATE_BASES].append(
                alt_record)
        # Add info.
        for key, data in variant.non_alt_info.items():
            if data is not None:
                field_name, field_data = self._get_bigquery_field_entry(
                    key, data, self._schema_descriptor,
                    allow_incompatible_records)
                row[field_name] = field_data

        # Set calls to empty for now (will be filled later).
        row[bigquery_util.ColumnKeyConstants.CALLS] = []
        return row
Пример #7
0
def _create_header_field_schema(key, field):
    """Returns the ``TableFieldSchema`` for one header-defined VCF field.

    The name is the sanitized ``key``; type, mode and description are derived
    from ``field.type``, ``field.num`` and ``field.desc`` respectively.
    """
    return bigquery.TableFieldSchema(
        name=bigquery_util.get_bigquery_sanitized_field_name(key),
        type=bigquery_util.get_bigquery_type_from_vcf_type(field.type),
        mode=_get_bigquery_mode_from_vcf_num(field.num),
        description=bigquery_util.get_bigquery_sanitized_field(field.desc))


def generate_schema_from_header_fields(
    header_fields,  # type: vcf_header_parser.HeaderFields
    proc_variant_factory,  # type: processed_variant.ProcessedVariantFactory
    variant_merger=None  # type: variant_merge_strategy.VariantMergeStrategy
):
    """Returns a ``TableSchema`` for the BigQuery table storing variants.

    Args:
      header_fields: A `namedtuple` containing representative header fields
        for all variant records. This specifies custom INFO and FORMAT fields
        in the VCF file(s).
      proc_variant_factory: The factory class that knows how to convert
        Variant instances to ProcessedVariant. As a side effect it also knows
        how to modify BigQuery schema based on the ProcessedVariants that it
        generates. The latter functionality is what is needed here.
      variant_merger: The strategy used for merging variants (if any). Some
        strategies may change the schema, which is why this may be needed
        here.

    Returns:
      The schema with, in order: the fixed variant columns, the alternate
      bases field created by ``proc_variant_factory``, names, quality and
      filter, the repeated calls record (including one sub-field per FORMAT
      header other than GT and PS) and one field per INFO header that is
      neither END nor reported by the factory as living in alternate bases.
    """
    column_keys = bigquery_util.ColumnKeyConstants
    field_constants = bigquery_util.TableFieldConstants

    schema = bigquery.TableSchema()
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.REFERENCE_NAME,
            type=field_constants.TYPE_STRING,
            mode=field_constants.MODE_NULLABLE,
            description='Reference name.'))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.START_POSITION,
            type=field_constants.TYPE_INTEGER,
            mode=field_constants.MODE_NULLABLE,
            description=(
                'Start position (0-based). Corresponds to the first base '
                'of the string of reference bases.')))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.END_POSITION,
            type=field_constants.TYPE_INTEGER,
            mode=field_constants.MODE_NULLABLE,
            description=(
                'End position (0-based). Corresponds to the first base '
                'after the last base in the reference allele.')))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.REFERENCE_BASES,
            type=field_constants.TYPE_STRING,
            mode=field_constants.MODE_NULLABLE,
            description='Reference bases.'))

    schema.fields.append(proc_variant_factory.create_alt_bases_field_schema())

    schema.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.NAMES,
            type=field_constants.TYPE_STRING,
            mode=field_constants.MODE_REPEATED,
            description='Variant names (e.g. RefSNP ID).'))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.QUALITY,
            type=field_constants.TYPE_FLOAT,
            mode=field_constants.MODE_NULLABLE,
            description=(
                'Phred-scaled quality score (-10log10 prob(call is wrong)). '
                'Higher values imply better quality.')))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.FILTER,
            type=field_constants.TYPE_STRING,
            mode=field_constants.MODE_REPEATED,
            description=(
                'List of failed filters (if any) or "PASS" indicating the '
                'variant has passed all filters.')))

    # Add calls.
    calls_record = bigquery.TableFieldSchema(
        name=column_keys.CALLS,
        type=field_constants.TYPE_RECORD,
        mode=field_constants.MODE_REPEATED,
        description='One record for each call.')
    calls_record.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.CALLS_NAME,
            type=field_constants.TYPE_STRING,
            mode=field_constants.MODE_NULLABLE,
            description='Name of the call.'))
    calls_record.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.CALLS_GENOTYPE,
            type=field_constants.TYPE_INTEGER,
            mode=field_constants.MODE_REPEATED,
            description=(
                'Genotype of the call. "-1" is used in cases where the '
                'genotype is not called.')))
    calls_record.fields.append(
        bigquery.TableFieldSchema(
            name=column_keys.CALLS_PHASESET,
            type=field_constants.TYPE_STRING,
            mode=field_constants.MODE_NULLABLE,
            description=(
                'Phaseset of the call (if any). "*" is used in cases where '
                'the genotype is phased, but no phase set ("PS" in FORMAT) '
                'was specified.')))
    # ``items()`` works on both Python 2 and Python 3 dicts, whereas
    # ``iteritems()`` is Python 2 only.
    for key, field in header_fields.formats.items():
        # GT and PS are already included in 'genotype' and 'phaseset' fields.
        if key in (vcfio.GENOTYPE_FORMAT_KEY, vcfio.PHASESET_FORMAT_KEY):
            continue
        calls_record.fields.append(_create_header_field_schema(key, field))
    schema.fields.append(calls_record)

    # Add info fields.
    info_keys = set()
    for key, field in header_fields.infos.items():
        # END info is already included by modifying the end_position.
        if (key == vcfio.END_INFO_KEY
                or proc_variant_factory.info_is_in_alt_bases(key)):
            continue
        schema.fields.append(_create_header_field_schema(key, field))
        info_keys.add(key)
    # The merge strategy (if any) gets a chance to adjust the schema, given
    # the raw INFO keys that were added as top-level fields.
    if variant_merger:
        variant_merger.modify_bigquery_schema(schema, info_keys)
    return schema