def test_get_bigquery_sanitized_field_name(self):
    """Checks field-name sanitization against a table of known cases.

    The expectations encode that a name starting with a letter is returned
    unchanged, while names starting with an underscore or a digit come back
    prefixed with ``field_``.
    """
    # (raw field name, expected sanitized name)
    cases = (
        ('AA', 'AA'),
        ('_AA', 'field__AA'),
        ('1A1A', 'field_1A1A'),
    )
    for raw_name, expected_name in cases:
        self.assertEqual(
            expected_name,
            bigquery_util.get_bigquery_sanitized_field_name(raw_name))
def _get_call_record(call):
    """Converts a variant call into a BigQuery call record.

    A helper method for ``get_rows_from_variant``.

    Args:
      call (``VariantCall``): Variant call to convert.

    Returns:
      A ``(call_record, is_empty)`` tuple. ``call_record`` is a dict holding
      the sanitized call name, the phaseset, the genotype (an empty list when
      the call has none) and one sanitized entry per non-None info value.
      ``is_empty`` is truthy only when the genotype is missing or made up
      solely of ``vcfio.MISSING_GENOTYPE_VALUE`` and ``_is_empty_field``
      accepts every sanitized info value.
    """
    column_keys = bigquery_util.ColumnKeyConstants

    call_record = {}
    call_record[column_keys.CALLS_NAME] = (
        bigquery_util.get_bigquery_sanitized_field(call.name))
    call_record[column_keys.CALLS_PHASESET] = call.phaseset
    call_record[column_keys.CALLS_GENOTYPE] = call.genotype or []

    # A call starts out "empty" when it carries no genotype at all, or only
    # the missing-genotype marker.
    only_missing_marker = {vcfio.MISSING_GENOTYPE_VALUE}
    is_empty = not call.genotype or set(call.genotype) == only_missing_marker

    for key, field in call.info.iteritems():
        if field is None:
            continue
        sanitized = bigquery_util.get_bigquery_sanitized_field(field)
        call_record[
            bigquery_util.get_bigquery_sanitized_field_name(key)] = sanitized
        # Once one non-empty value has been seen, later values are no longer
        # inspected.
        if is_empty:
            is_empty = _is_empty_field(sanitized)

    return call_record, is_empty
def _get_bigquery_field_entry( self, key, # type: str data, # type: Union[Any, List[Any]] schema_descriptor, # type: bigquery_schema_descriptor.SchemaDescriptor allow_incompatible_records, # type: bool ): # type: (...) -> (str, Any) if data is None: return None, None field_name = bigquery_util.get_bigquery_sanitized_field_name(key) if not schema_descriptor.has_simple_field(field_name): raise ValueError( 'BigQuery schema has no such field: {}.\n' 'This can happen if the field is not defined in ' 'the VCF headers, or is not inferred automatically. ' 'Retry pipeline with --infer_undefined_headers.'.format( field_name)) sanitized_field_data = bigquery_util.get_bigquery_sanitized_field(data) field_schema = schema_descriptor.get_field_descriptor(field_name) field_data, is_compatible = self._check_and_resolve_schema_compatibility( field_schema, sanitized_field_data) if is_compatible or allow_incompatible_records: return field_name, field_data else: raise ValueError('Value and schema do not match for field {}. ' 'Value: {} Schema: {}.'.format( field_name, sanitized_field_data, field_schema))
def _get_base_row_from_variant(variant):
    # type: (processed_variant.ProcessedVariant) -> Dict[str, Any]
    """Builds the BigQuery row for ``variant`` with an empty calls list.

    A helper method for ``get_rows_from_variant``.

    Args:
      variant: Processed variant supplying the reference name, positions,
        reference bases, names, quality, filters, alternate data and non-alt
        info.

    Returns:
      A dict keyed by BigQuery column name. Names and filters are included
      only when non-empty, quality only when it is not ``None``, and non-alt
      info entries only when their value is not ``None``. The calls column is
      an empty list.
    """
    column_keys = bigquery_util.ColumnKeyConstants

    row = {}  # type: Dict[str, Any]
    row[column_keys.REFERENCE_NAME] = variant.reference_name
    row[column_keys.START_POSITION] = variant.start
    row[column_keys.END_POSITION] = variant.end
    row[column_keys.REFERENCE_BASES] = variant.reference_bases

    if variant.names:
        row[column_keys.NAMES] = bigquery_util.get_bigquery_sanitized_field(
            variant.names)
    if variant.quality is not None:
        row[column_keys.QUALITY] = variant.quality
    if variant.filters:
        row[column_keys.FILTER] = bigquery_util.get_bigquery_sanitized_field(
            variant.filters)

    # One record per alternate: the alternate bases plus its info entries.
    # NOTE(review): alt info values are stored as-is here, whereas non-alt
    # info values below are sanitized -- confirm this is intended.
    alternate_records = []
    for alt in variant.alternate_data_list:
        alt_record = {column_keys.ALTERNATE_BASES_ALT: alt.alternate_bases}
        alt_record.update(
            (bigquery_util.get_bigquery_sanitized_field_name(key), data)
            for key, data in alt.info.iteritems())
        alternate_records.append(alt_record)
    row[column_keys.ALTERNATE_BASES] = alternate_records

    # Info entries that are not attached to a specific alternate.
    for key, data in variant.non_alt_info.iteritems():
        if data is None:
            continue
        row[bigquery_util.get_bigquery_sanitized_field_name(key)] = (
            bigquery_util.get_bigquery_sanitized_field(data))

    # Calls are filled in later by the caller.
    row[column_keys.CALLS] = []
    return row
def _get_base_row_from_variant(
    variant,  # type: processed_variant.ProcessedVariant
    schema_descriptor,  # type: bigquery_schema_descriptor.SchemaDescriptor
    conflict_resolver=None,  # type: vcf_field_conflict_resolver.ConflictResolver
    allow_incompatible_records=False  # type: bool
    ):
    # type: (...) -> Dict[str, Any]
    """Builds the BigQuery row for ``variant`` with an empty calls list.

    A helper method for ``get_rows_from_variant``.

    Args:
      variant: Processed variant supplying the reference name, positions,
        reference bases, names, quality, filters, alternate data and non-alt
        info.
      schema_descriptor: Forwarded to ``_get_bigquery_field_entry`` for every
        non-alt info entry.
      conflict_resolver: Forwarded to ``_get_bigquery_field_entry``.
      allow_incompatible_records: Forwarded to ``_get_bigquery_field_entry``.

    Returns:
      A dict keyed by BigQuery column name. Names and filters are included
      only when non-empty, quality only when it is not ``None``, and non-alt
      info entries only when their value is not ``None``. The calls column is
      an empty list.
    """
    column_keys = bigquery_util.ColumnKeyConstants

    row = {}  # type: Dict[str, Any]
    row[column_keys.REFERENCE_NAME] = variant.reference_name
    row[column_keys.START_POSITION] = variant.start
    row[column_keys.END_POSITION] = variant.end
    row[column_keys.REFERENCE_BASES] = variant.reference_bases

    if variant.names:
        row[column_keys.NAMES] = bigquery_util.get_bigquery_sanitized_field(
            variant.names)
    if variant.quality is not None:
        row[column_keys.QUALITY] = variant.quality
    if variant.filters:
        row[column_keys.FILTER] = bigquery_util.get_bigquery_sanitized_field(
            variant.filters)

    # One record per alternate: the alternate bases plus its info entries,
    # whose values are stored without further conversion.
    alternate_records = []
    for alt in variant.alternate_data_list:
        alt_record = {column_keys.ALTERNATE_BASES_ALT: alt.alternate_bases}
        alt_record.update(
            (bigquery_util.get_bigquery_sanitized_field_name(key), data)
            for key, data in alt.info.iteritems())
        alternate_records.append(alt_record)
    row[column_keys.ALTERNATE_BASES] = alternate_records

    # Non-alt info entries are converted by the field-entry helper.
    for key, data in variant.non_alt_info.iteritems():
        if data is None:
            continue
        field_name, field_data = _get_bigquery_field_entry(
            key, data, schema_descriptor, conflict_resolver,
            allow_incompatible_records)
        row[field_name] = field_data

    # Calls are filled in later by the caller.
    row[column_keys.CALLS] = []
    return row
def _get_base_row_from_variant(self, variant, allow_incompatible_records):
    # type: (processed_variant.ProcessedVariant, bool) -> Dict[str, Any]
    """Builds the BigQuery row for ``variant`` with an empty calls list.

    Args:
      variant: Processed variant supplying the reference name, positions,
        reference bases, names, quality, filters, alternate data and non-alt
        info.
      allow_incompatible_records: Forwarded to
        ``self._get_bigquery_field_entry`` for every non-alt info entry.

    Returns:
      A dict keyed by BigQuery column name. Names and filters are included
      only when non-empty, quality only when it is not ``None``, and non-alt
      info entries only when their value is not ``None``. The calls column is
      an empty list.
    """
    column_keys = bigquery_util.ColumnKeyConstants

    row = {}  # type: Dict[str, Any]
    row[column_keys.REFERENCE_NAME] = variant.reference_name
    row[column_keys.START_POSITION] = variant.start
    row[column_keys.END_POSITION] = variant.end
    row[column_keys.REFERENCE_BASES] = variant.reference_bases

    if variant.names:
        row[column_keys.NAMES] = bigquery_util.get_bigquery_sanitized_field(
            variant.names)
    if variant.quality is not None:
        row[column_keys.QUALITY] = variant.quality
    if variant.filters:
        row[column_keys.FILTER] = bigquery_util.get_bigquery_sanitized_field(
            variant.filters)

    # One record per alternate: the alternate bases plus its info entries,
    # whose values are stored without further conversion.
    alternate_records = []
    for alt in variant.alternate_data_list:
        alt_record = {column_keys.ALTERNATE_BASES_ALT: alt.alternate_bases}
        alt_record.update(
            (bigquery_util.get_bigquery_sanitized_field_name(key), data)
            for key, data in alt.info.iteritems())
        alternate_records.append(alt_record)
    row[column_keys.ALTERNATE_BASES] = alternate_records

    # Non-alt info entries are converted against this instance's schema
    # descriptor.
    for key, data in variant.non_alt_info.iteritems():
        if data is None:
            continue
        field_name, field_data = self._get_bigquery_field_entry(
            key, data, self._schema_descriptor, allow_incompatible_records)
        row[field_name] = field_data

    # Calls are filled in later by the caller.
    row[column_keys.CALLS] = []
    return row
def generate_schema_from_header_fields(
    header_fields,  # type: vcf_header_parser.HeaderFields
    proc_variant_factory,  # type: processed_variant.ProcessedVariantFactory
    variant_merger=None  # type: variant_merge_strategy.VariantMergeStrategy
    ):
    """Builds the ``TableSchema`` of the BigQuery table that stores variants.

    Args:
      header_fields: A `namedtuple` containing representative header fields
        for all variant records. It specifies the custom INFO and FORMAT
        fields in the VCF file(s).
      proc_variant_factory: The factory class that knows how to convert
        Variant instances to ProcessedVariant. As a side effect it also knows
        how to modify the BigQuery schema based on the ProcessedVariants it
        generates; only that schema functionality is used here.
      variant_merger: The strategy used for merging variants (if any). Some
        strategies may change the schema, which is why it may be needed here.

    Returns:
      The schema, whose fields are appended in this order: reference name,
      start position, end position, reference bases, the alternate-bases
      field supplied by ``proc_variant_factory``, names, quality, filter, the
      repeated calls record, and then one field per INFO header that is
      neither END nor reported by the factory as living in alternate bases.
    """
    column_keys = bigquery_util.ColumnKeyConstants
    field_constants = bigquery_util.TableFieldConstants

    def _new_field(name, field_type, mode, description):
        """Creates a ``TableFieldSchema`` from positional attributes."""
        return bigquery.TableFieldSchema(
            name=name, type=field_type, mode=mode, description=description)

    def _new_header_field(header_key, header_field):
        """Creates the ``TableFieldSchema`` for one VCF header entry."""
        return _new_field(
            bigquery_util.get_bigquery_sanitized_field_name(header_key),
            bigquery_util.get_bigquery_type_from_vcf_type(header_field.type),
            _get_bigquery_mode_from_vcf_num(header_field.num),
            bigquery_util.get_bigquery_sanitized_field(header_field.desc))

    schema = bigquery.TableSchema()

    # Fixed columns that precede the alternate-bases field.
    leading_columns = (
        (column_keys.REFERENCE_NAME,
         field_constants.TYPE_STRING,
         field_constants.MODE_NULLABLE,
         'Reference name.'),
        (column_keys.START_POSITION,
         field_constants.TYPE_INTEGER,
         field_constants.MODE_NULLABLE,
         ('Start position (0-based). Corresponds to the first base '
          'of the string of reference bases.')),
        (column_keys.END_POSITION,
         field_constants.TYPE_INTEGER,
         field_constants.MODE_NULLABLE,
         ('End position (0-based). Corresponds to the first base '
          'after the last base in the reference allele.')),
        (column_keys.REFERENCE_BASES,
         field_constants.TYPE_STRING,
         field_constants.MODE_NULLABLE,
         'Reference bases.'),
    )
    for name, field_type, mode, description in leading_columns:
        schema.fields.append(_new_field(name, field_type, mode, description))

    schema.fields.append(proc_variant_factory.create_alt_bases_field_schema())

    # Fixed columns that follow the alternate-bases field.
    trailing_columns = (
        (column_keys.NAMES,
         field_constants.TYPE_STRING,
         field_constants.MODE_REPEATED,
         'Variant names (e.g. RefSNP ID).'),
        (column_keys.QUALITY,
         field_constants.TYPE_FLOAT,
         field_constants.MODE_NULLABLE,
         ('Phred-scaled quality score (-10log10 prob(call is wrong)). '
          'Higher values imply better quality.')),
        (column_keys.FILTER,
         field_constants.TYPE_STRING,
         field_constants.MODE_REPEATED,
         ('List of failed filters (if any) or "PASS" indicating the '
          'variant has passed all filters.')),
    )
    for name, field_type, mode, description in trailing_columns:
        schema.fields.append(_new_field(name, field_type, mode, description))

    # Calls: a repeated record with three fixed sub-fields followed by one
    # sub-field per FORMAT header.
    calls_record = _new_field(
        column_keys.CALLS,
        field_constants.TYPE_RECORD,
        field_constants.MODE_REPEATED,
        'One record for each call.')
    fixed_call_columns = (
        (column_keys.CALLS_NAME,
         field_constants.TYPE_STRING,
         field_constants.MODE_NULLABLE,
         'Name of the call.'),
        (column_keys.CALLS_GENOTYPE,
         field_constants.TYPE_INTEGER,
         field_constants.MODE_REPEATED,
         ('Genotype of the call. "-1" is used in cases where the '
          'genotype is not called.')),
        (column_keys.CALLS_PHASESET,
         field_constants.TYPE_STRING,
         field_constants.MODE_NULLABLE,
         ('Phaseset of the call (if any). "*" is used in cases where '
          'the genotype is phased, but no phase set ("PS" in FORMAT) '
          'was specified.')),
    )
    for name, field_type, mode, description in fixed_call_columns:
        calls_record.fields.append(
            _new_field(name, field_type, mode, description))

    # GT and PS are already covered by the 'genotype' and 'phaseset' fields.
    formats_already_covered = (
        vcfio.GENOTYPE_FORMAT_KEY, vcfio.PHASESET_FORMAT_KEY)
    for key, field in header_fields.formats.iteritems():
        if key not in formats_already_covered:
            calls_record.fields.append(_new_header_field(key, field))
    schema.fields.append(calls_record)

    # Info fields. END is skipped because it is already reflected in the end
    # position; keys the factory places in alternate bases are skipped too.
    info_keys = set()
    for key, field in header_fields.infos.iteritems():
        if (key != vcfio.END_INFO_KEY and
                not proc_variant_factory.info_is_in_alt_bases(key)):
            schema.fields.append(_new_header_field(key, field))
            info_keys.add(key)

    if variant_merger:
        variant_merger.modify_bigquery_schema(schema, info_keys)
    return schema