def test_get_bigquery_type_from_vcf_type(self):
    """Checks the VCF type -> BigQuery type conversion and its error path."""
    field_consts = bigquery_util.TableFieldConstants
    convert = bigquery_util.get_bigquery_type_from_vcf_type

    # (expected BigQuery type, VCF type string) pairs, checked in order.
    expectations = (
        (field_consts.TYPE_INTEGER, 'integer'),
        (field_consts.TYPE_STRING, 'string'),
        (field_consts.TYPE_STRING, 'character'),
        (field_consts.TYPE_FLOAT, 'float'),
        (field_consts.TYPE_BOOLEAN, 'flag'),
    )
    for expected_type, vcf_type in expectations:
        self.assertEqual(expected_type, convert(vcf_type))

    # A VCF type outside the list above must raise ValueError.
    with self.assertRaises(ValueError):
        convert('DUMMY')
def generate_schema_from_header_fields(
        header_fields,  # type: vcf_header_io.VcfHeader
        proc_variant_factory,  # type: processed_variant.ProcessedVariantFactory
        variant_merger=None  # type: variant_merge_strategy.VariantMergeStrategy
):
    # type: (...) -> bigquery.TableSchema
    """Returns a ``TableSchema`` for the BigQuery table storing variants.

    The schema is built in a fixed order: reference name, start position,
    end position, reference bases, the alternate bases record (supplied by
    ``proc_variant_factory``), names, quality, filter, the repeated calls
    record, and finally one column per INFO header field.

    Args:
      header_fields: Representative header fields for all variants.
      proc_variant_factory: The factory class that knows how to convert
        Variant instances to ProcessedVariant. As a side effect it also knows
        how to modify BigQuery schema based on the ProcessedVariants that it
        generates. The latter functionality is what is needed here.
      variant_merger: The strategy used for merging variants (if any). Some
        strategies may change the schema, which is why this may be needed
        here.

    Returns:
      The populated ``bigquery.TableSchema``. If ``variant_merger`` is given,
      the schema is returned after ``variant_merger.modify_bigquery_schema``
      has been applied to it.
    """
    schema = bigquery.TableSchema()
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.REFERENCE_NAME,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description='Reference name.'))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.START_POSITION,
            type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description=(
                'Start position (0-based). Corresponds to the first base '
                'of the string of reference bases.')))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.END_POSITION,
            type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description=(
                'End position (0-based). Corresponds to the first base '
                'after the last base in the reference allele.')))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.REFERENCE_BASES,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description='Reference bases.'))
    # The layout of the alternate bases record depends on the factory setup.
    schema.fields.append(proc_variant_factory.create_alt_bases_field_schema())
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.NAMES,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
            description='Variant names (e.g. RefSNP ID).'))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.QUALITY,
            type=bigquery_util.TableFieldConstants.TYPE_FLOAT,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description=(
                'Phred-scaled quality score (-10log10 prob(call is wrong)). '
                'Higher values imply better quality.')))
    schema.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.FILTER,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
            description=(
                'List of failed filters (if any) or "PASS" indicating the '
                'variant has passed all filters.')))

    # Add calls.
    calls_record = bigquery.TableFieldSchema(
        name=bigquery_util.ColumnKeyConstants.CALLS,
        type=bigquery_util.TableFieldConstants.TYPE_RECORD,
        mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
        description='One record for each call.')
    calls_record.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.CALLS_NAME,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description='Name of the call.'))
    calls_record.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.CALLS_GENOTYPE,
            type=bigquery_util.TableFieldConstants.TYPE_INTEGER,
            mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
            description=(
                'Genotype of the call. "-1" is used in cases where the '
                'genotype is not called.')))
    calls_record.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.CALLS_PHASESET,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description=(
                'Phaseset of the call (if any). "*" is used in cases where '
                'the genotype is phased, but no phase set ("PS" in FORMAT) '
                'was specified.')))
    # items() rather than the Python-2-only iteritems(), so the loop works on
    # both Python 2 and Python 3.
    for key, field in header_fields.formats.items():
        # GT and PS are already included in 'genotype' and 'phaseset' fields.
        if key in (vcfio.GENOTYPE_FORMAT_KEY, vcfio.PHASESET_FORMAT_KEY):
            continue
        calls_record.fields.append(
            bigquery.TableFieldSchema(
                name=bigquery_sanitizer.SchemaSanitizer.
                get_sanitized_field_name(key),
                type=bigquery_util.get_bigquery_type_from_vcf_type(
                    field[_HeaderKeyConstants.TYPE]),
                mode=bigquery_util.get_bigquery_mode_from_vcf_num(
                    field[_HeaderKeyConstants.NUM]),
                description=bigquery_sanitizer.SchemaSanitizer.
                get_sanitized_string(field[_HeaderKeyConstants.DESC])))
    schema.fields.append(calls_record)

    # Add info fields.
    info_keys = set()
    for key, field in header_fields.infos.items():
        # END info is already included by modifying the end_position.
        if (key == vcfio.END_INFO_KEY or
                proc_variant_factory.info_is_in_alt_bases(key)):
            continue
        schema.fields.append(
            bigquery.TableFieldSchema(
                name=bigquery_sanitizer.SchemaSanitizer.
                get_sanitized_field_name(key),
                type=bigquery_util.get_bigquery_type_from_vcf_type(
                    field[_HeaderKeyConstants.TYPE]),
                mode=bigquery_util.get_bigquery_mode_from_vcf_num(
                    field[_HeaderKeyConstants.NUM]),
                description=bigquery_sanitizer.SchemaSanitizer.
                get_sanitized_string(field[_HeaderKeyConstants.DESC])))
        # Only the INFO keys that became top-level columns are handed to the
        # merger below.
        info_keys.add(key)
    if variant_merger:
        variant_merger.modify_bigquery_schema(schema, info_keys)
    return schema
def generate_schema_from_header_fields(
        header_fields,  # type: vcf_header_io.VcfHeader
        proc_variant_factory,  # type: processed_variant.ProcessedVariantFactory
        variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
        use_1_based_coordinate=False,  # type: bool
        include_call_name=False,  # type: bool
        move_hom_ref_calls=False  # type: bool
):
    # type: (...) -> bigquery.TableSchema
    """Builds the ``TableSchema`` of the BigQuery table that stores variants.

    Args:
      header_fields: Representative header fields for all variants.
      proc_variant_factory: Factory that converts Variant instances to
        ProcessedVariant. Here it supplies the alternate bases record and
        decides which INFO keys are excluded from the top-level columns.
      variant_merger: The strategy used for merging variants (if any). When
        given, its ``modify_bigquery_schema`` is applied to the schema last.
      use_1_based_coordinate: If True the start position is described as
        1-based, otherwise as 0-based.
      include_call_name: If True a call name column is added next to the
        sample id column of every call record.
      move_hom_ref_calls: If True an additional repeated record column for
        hom-ref calls is placed right before the calls column.

    Returns:
      The fully populated ``bigquery.TableSchema``.
    """
    column_keys = bigquery_util.ColumnKeyConstants
    field_consts = bigquery_util.TableFieldConstants

    def make_field(name, field_type, mode, description):
        # Thin keyword wrapper around the TableFieldSchema constructor.
        return bigquery.TableFieldSchema(
            name=name, type=field_type, mode=mode, description=description)

    def add_sample_identity_fields(record):
        # Sample id (always) and call name (optional) are shared by the
        # hom-ref calls record and the regular calls record.
        record.fields.append(make_field(
            column_keys.CALLS_SAMPLE_ID,
            field_consts.TYPE_INTEGER,
            field_consts.MODE_NULLABLE,
            'Unique ID (type INT64) assigned to each sample. Table with '
            '`__sample_info` suffix contains the mapping of sample names '
            '(as read from VCF header) to these assigned IDs.'))
        if include_call_name:
            record.fields.append(make_field(
                column_keys.CALLS_NAME,
                field_consts.TYPE_STRING,
                field_consts.MODE_NULLABLE,
                'Name of the call (sample names in the VCF Header line).'))

    def field_from_header_entry(entry_key, entry):
        # Translates one FORMAT/INFO header entry into a column definition.
        sanitized_name = (
            bigquery_sanitizer.SchemaSanitizer.get_sanitized_field_name(
                entry_key))
        bq_type = bigquery_util.get_bigquery_type_from_vcf_type(
            entry[_HeaderKeyConstants.TYPE])
        bq_mode = bigquery_util.get_bigquery_mode_from_vcf_num(
            entry[_HeaderKeyConstants.NUM])
        sanitized_desc = (
            bigquery_sanitizer.SchemaSanitizer.get_sanitized_string(
                entry[_HeaderKeyConstants.DESC]))
        return make_field(sanitized_name, bq_type, bq_mode, sanitized_desc)

    schema = bigquery.TableSchema()

    # Fixed, variant-level columns.
    schema.fields.append(make_field(
        column_keys.REFERENCE_NAME,
        field_consts.TYPE_STRING,
        field_consts.MODE_NULLABLE,
        'Reference name.'))
    if use_1_based_coordinate:
        coordinate = '1-based'
    else:
        coordinate = '0-based'
    start_description = (
        'Start position ({}). Corresponds to the first base '
        'of the string of reference bases.').format(coordinate)
    schema.fields.append(make_field(
        column_keys.START_POSITION,
        field_consts.TYPE_INTEGER,
        field_consts.MODE_NULLABLE,
        start_description))
    schema.fields.append(make_field(
        column_keys.END_POSITION,
        field_consts.TYPE_INTEGER,
        field_consts.MODE_NULLABLE,
        'End position. Corresponds to the first base '
        'after the last base in the reference allele.'))
    schema.fields.append(make_field(
        column_keys.REFERENCE_BASES,
        field_consts.TYPE_STRING,
        field_consts.MODE_NULLABLE,
        'Reference bases.'))
    schema.fields.append(proc_variant_factory.create_alt_bases_field_schema())
    schema.fields.append(make_field(
        column_keys.NAMES,
        field_consts.TYPE_STRING,
        field_consts.MODE_REPEATED,
        'Variant names (e.g. RefSNP ID).'))
    schema.fields.append(make_field(
        column_keys.QUALITY,
        field_consts.TYPE_FLOAT,
        field_consts.MODE_NULLABLE,
        'Phred-scaled quality score (-10log10 prob(call is wrong)). '
        'Higher values imply better quality.'))
    schema.fields.append(make_field(
        column_keys.FILTER,
        field_consts.TYPE_STRING,
        field_consts.MODE_REPEATED,
        'List of failed filters (if any) or "PASS" indicating the '
        'variant has passed all filters.'))

    # Optional record holding the hom-ref calls.
    if move_hom_ref_calls:
        hom_ref_record = make_field(
            column_keys.HOM_REF_CALLS,
            field_consts.TYPE_RECORD,
            field_consts.MODE_REPEATED,
            'One record for each homogeneous call.')
        add_sample_identity_fields(hom_ref_record)
        schema.fields.append(hom_ref_record)

    # Calls record: identity columns, genotype, phaseset, then FORMAT fields.
    call_record = make_field(
        column_keys.CALLS,
        field_consts.TYPE_RECORD,
        field_consts.MODE_REPEATED,
        'One record for each call.')
    add_sample_identity_fields(call_record)
    call_record.fields.append(make_field(
        column_keys.CALLS_GENOTYPE,
        field_consts.TYPE_INTEGER,
        field_consts.MODE_REPEATED,
        'Genotype of the call. "-1" is used in cases where the '
        'genotype is not called.'))
    call_record.fields.append(make_field(
        column_keys.CALLS_PHASESET,
        field_consts.TYPE_STRING,
        field_consts.MODE_NULLABLE,
        'Phaseset of the call (if any). "*" is used in cases where '
        'the genotype is phased, but no phase set ("PS" in FORMAT) '
        'was specified.'))
    for format_key, format_entry in header_fields.formats.items():
        # GT and PS are already covered by the genotype and phaseset columns.
        if format_key in (vcfio.GENOTYPE_FORMAT_KEY,
                          vcfio.PHASESET_FORMAT_KEY):
            continue
        call_record.fields.append(
            field_from_header_entry(format_key, format_entry))
    schema.fields.append(call_record)

    # INFO columns.
    info_keys = set()
    annotation_info_type_keys_set = set(
        proc_variant_factory.gen_annotation_info_type_keys())
    for info_key, info_entry in header_fields.infos.items():
        # END is already represented through the end position column.
        if info_key == vcfio.END_INFO_KEY:
            continue
        # INFO fields that live inside the alternate bases record are not
        # repeated at the top level.
        if proc_variant_factory.info_is_in_alt_bases(info_key):
            continue
        # Info type fields exist only to indicate the type of corresponding
        # annotation fields, and should not be added to the schema.
        if info_key in annotation_info_type_keys_set:
            continue
        schema.fields.append(field_from_header_entry(info_key, info_entry))
        info_keys.add(info_key)

    if variant_merger:
        variant_merger.modify_bigquery_schema(schema, info_keys)
    return schema
def create_alt_bases_field_schema(self):
    # type: () -> bigquery.TableFieldSchema
    """Builds the ``alternate_bases`` record that matches this factory.

    The record always contains the alternate base itself. When the factory
    splits INFO fields among alternate bases, every INFO field accepted by
    ``self._is_num_a`` is added as a nullable sub-field. One repeated
    sub-record is added for each configured annotation field.

    Returns:
      The ``alternate_bases`` record, ready to be added to a BigQuery schema
      by the caller.

    Raises:
      ValueError: If a configured annotation field is missing from the
        header INFO fields.
    """
    column_keys = bigquery_util.ColumnKeyConstants
    field_consts = bigquery_util.TableFieldConstants

    alt_record = bigquery.TableFieldSchema(
        name=column_keys.ALTERNATE_BASES,
        type=field_consts.TYPE_RECORD,
        mode=field_consts.MODE_REPEATED,
        description='One record for each alternate base (if any).')
    alt_base_field = bigquery.TableFieldSchema(
        name=column_keys.ALTERNATE_BASES_ALT,
        type=field_consts.TYPE_STRING,
        mode=field_consts.MODE_NULLABLE,
        description='Alternate base.')
    alt_record.fields.append(alt_base_field)

    if self._split_alternate_allele_info_fields:
        for info_key, info_entry in self._header_fields.infos.items():
            if not self._is_num_a(info_entry[_HeaderKeyConstants.NUM]):
                continue
            sanitized_name = _BigQuerySchemaSanitizer.get_sanitized_field_name(
                info_key)
            bq_type = bigquery_util.get_bigquery_type_from_vcf_type(
                info_entry[_HeaderKeyConstants.TYPE])
            sanitized_desc = _BigQuerySchemaSanitizer.get_sanitized_string(
                info_entry[_HeaderKeyConstants.DESC])
            alt_record.fields.append(
                bigquery.TableFieldSchema(
                    name=sanitized_name,
                    type=bq_type,
                    mode=field_consts.MODE_NULLABLE,
                    description=sanitized_desc))

    for annot_field in self._annotation_field_set:
        header_infos = self._header_fields.infos
        if annot_field not in header_infos:
            raise ValueError(
                'Annotation field {} not found'.format(annot_field))
        annotation_descs = descriptions.VEP_DESCRIPTIONS

        record_description = (
            'List of {} annotations for this alternate.'.format(annot_field))
        annot_record = bigquery.TableFieldSchema(
            name=_BigQuerySchemaSanitizer.get_sanitized_field_name(
                annot_field),
            type=field_consts.TYPE_RECORD,
            mode=field_consts.MODE_REPEATED,
            description=record_description)
        annot_record.fields.append(
            bigquery.TableFieldSchema(
                name=annotation_parser.ANNOTATION_ALT,
                type=field_consts.TYPE_STRING,
                mode=field_consts.MODE_NULLABLE,
                description='The ALT part of the annotation field.'))

        name_key_pairs = self._gen_annotation_name_key_pairs(annot_field)
        for annotation_name, type_key in name_key_pairs:
            if type_key not in header_infos:
                # No header entry declares the type: fall back to string and
                # warn only when annotation types are being inferred.
                vcf_type = vcf_header_io.VcfHeaderFieldTypeConstants.STRING
                if self._infer_annotation_types:
                    logging.warning(
                        ('Annotation field %s has no corresponding header '
                         'field with id %s to specify type. Using type %s '
                         'instead.'),
                        annotation_name, type_key, vcf_type)
            else:
                vcf_type = header_infos[type_key][
                    vcf_header_io.VcfParserHeaderKeyConstants.TYPE]
            annot_record.fields.append(
                bigquery.TableFieldSchema(
                    name=_BigQuerySchemaSanitizer.get_sanitized_field_name(
                        annotation_name),
                    type=bigquery_util.get_bigquery_type_from_vcf_type(
                        vcf_type),
                    mode=field_consts.MODE_NULLABLE,
                    description=annotation_descs.get(annotation_name, '')))
        alt_record.fields.append(annot_record)
    return alt_record
def create_alt_bases_field_schema(self):
    # type: () -> bigquery.TableFieldSchema
    """Returns the alternate_bases record compatible with this factory.

    Depending on how this class is set up to split INFO fields among
    alternate bases, this function produces a compatible alternate_bases
    record and returns it which can be added to a bigquery schema by the
    caller.

    Returns:
      The repeated ``alternate_bases`` record. It holds the alternate base,
      the per-alternate INFO fields (only when INFO splitting is enabled) and
      one repeated sub-record per configured annotation field.

    Raises:
      ValueError: If a configured annotation field is not present in the
        header INFO fields.
    """
    alternate_bases_record = bigquery.TableFieldSchema(
        name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
        type=bigquery_util.TableFieldConstants.TYPE_RECORD,
        mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
        description='One record for each alternate base (if any).')
    alternate_bases_record.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES_ALT,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description='Alternate base.'))
    if self._split_alternate_allele_info_fields:
        # items() rather than the Python-2-only iteritems(), so the loop works
        # on both Python 2 and Python 3.
        for key, field in self._header_fields.infos.items():
            # Only INFO fields whose count is the alternate-allele count are
            # moved under the alternate bases record.
            if field.num == vcf.parser.field_counts[
                    _FIELD_COUNT_ALTERNATE_ALLELE]:
                alternate_bases_record.fields.append(
                    bigquery.TableFieldSchema(
                        name=bigquery_util.
                        get_bigquery_sanitized_field_name(key),
                        type=bigquery_util.get_bigquery_type_from_vcf_type(
                            field.type),
                        mode=bigquery_util.TableFieldConstants.
                        MODE_NULLABLE,
                        description=bigquery_util.
                        get_bigquery_sanitized_field(field.desc)))

    for annot_field in self._annotation_field_set:
        if annot_field not in self._header_fields.infos:
            raise ValueError(
                'Annotation field {} not found'.format(annot_field))
        # The annotation sub-field names are extracted from the description
        # of the annotation's INFO header entry.
        annotation_names = _extract_annotation_names(
            self._header_fields.infos[annot_field].desc)
        annotation_record = bigquery.TableFieldSchema(
            name=bigquery_util.get_bigquery_sanitized_field(annot_field),
            type=bigquery_util.TableFieldConstants.TYPE_RECORD,
            mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
            description='List of {} annotations for this alternate.'.
            format(annot_field))
        annotation_record.fields.append(
            bigquery.TableFieldSchema(
                name=_ANNOTATION_ALT,
                type=bigquery_util.TableFieldConstants.TYPE_STRING,
                mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                description='The ALT part of the annotation field.'))
        if self._minimal_match:
            # Extra boolean column that is only present when minimal matching
            # is enabled on this factory.
            annotation_record.fields.append(
                bigquery.TableFieldSchema(
                    name=_ANNOTATION_ALT_AMBIGUOUS,
                    type=bigquery_util.TableFieldConstants.TYPE_BOOLEAN,
                    mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                    description=
                    'Whether the annotation ALT matching was ambiguous.'))
        for annotation_name in annotation_names:
            annotation_record.fields.append(
                bigquery.TableFieldSchema(
                    name=bigquery_util.get_bigquery_sanitized_field(
                        annotation_name),
                    type=bigquery_util.TableFieldConstants.TYPE_STRING,
                    mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                    # TODO(bashir2): Add descriptions of well known
                    # annotations, e.g., from VEP.
                    description=''))
        alternate_bases_record.fields.append(annotation_record)
    return alternate_bases_record
def create_alt_bases_field_schema(self):
    # type: () -> bigquery.TableFieldSchema
    """Returns the alternate_bases record compatible with this factory.

    Depending on how this class is set up to split INFO fields among
    alternate bases, this function produces a compatible alternate_bases
    record and returns it which can be added to a bigquery schema by the
    caller.

    Returns:
      The repeated ``alternate_bases`` record. It holds the alternate base,
      the per-alternate INFO fields (only when INFO splitting is enabled) and
      one repeated sub-record per configured annotation field.

    Raises:
      ValueError: If a configured annotation field is not present in the
        header INFO fields.
    """
    alternate_bases_record = bigquery.TableFieldSchema(
        name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES,
        type=bigquery_util.TableFieldConstants.TYPE_RECORD,
        mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
        description='One record for each alternate base (if any).')
    alternate_bases_record.fields.append(
        bigquery.TableFieldSchema(
            name=bigquery_util.ColumnKeyConstants.ALTERNATE_BASES_ALT,
            type=bigquery_util.TableFieldConstants.TYPE_STRING,
            mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
            description='Alternate base.'))
    if self._split_alternate_allele_info_fields:
        # items() rather than the Python-2-only iteritems(), so the loop works
        # on both Python 2 and Python 3.
        for key, field in self._header_fields.infos.items():
            # Only INFO fields whose count is the alternate-allele count are
            # moved under the alternate bases record.
            if (field[_HeaderKeyConstants.NUM] == vcf.parser.
                    field_counts[_FIELD_COUNT_ALTERNATE_ALLELE]):
                alternate_bases_record.fields.append(
                    bigquery.TableFieldSchema(
                        name=_BigQuerySchemaSanitizer.
                        get_sanitized_field_name(key),
                        type=bigquery_util.get_bigquery_type_from_vcf_type(
                            field[_HeaderKeyConstants.TYPE]),
                        mode=bigquery_util.TableFieldConstants.
                        MODE_NULLABLE,
                        description=_BigQuerySchemaSanitizer.
                        get_sanitized_string(
                            field[_HeaderKeyConstants.DESC])))

    for annot_field in self._annotation_field_set:
        if annot_field not in self._header_fields.infos:
            raise ValueError(
                'Annotation field {} not found'.format(annot_field))
        # The annotation sub-field names are extracted from the description
        # of the annotation's INFO header entry.
        annotation_names = annotation_parser.extract_annotation_names(
            self._header_fields.infos[annot_field][
                _HeaderKeyConstants.DESC])
        annotation_descs = descriptions.VEP_DESCRIPTIONS
        annotation_record = bigquery.TableFieldSchema(
            name=_BigQuerySchemaSanitizer.get_sanitized_field_name(
                annot_field),
            type=bigquery_util.TableFieldConstants.TYPE_RECORD,
            mode=bigquery_util.TableFieldConstants.MODE_REPEATED,
            description='List of {} annotations for this alternate.'.
            format(annot_field))
        annotation_record.fields.append(
            bigquery.TableFieldSchema(
                name=annotation_parser.ANNOTATION_ALT,
                type=bigquery_util.TableFieldConstants.TYPE_STRING,
                mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                description='The ALT part of the annotation field.'))
        for annotation_name in annotation_names:
            annotation_record.fields.append(
                bigquery.TableFieldSchema(
                    name=_BigQuerySchemaSanitizer.get_sanitized_field_name(
                        annotation_name),
                    type=bigquery_util.TableFieldConstants.TYPE_STRING,
                    mode=bigquery_util.TableFieldConstants.MODE_NULLABLE,
                    # Annotations without a known description get an empty
                    # one.
                    description=annotation_descs.get(annotation_name, '')))
        alternate_bases_record.fields.append(annotation_record)
    return alternate_bases_record