Exemplo n.º 1
0
    def __init__(
        self,
        output_path,  # type: str
        schema,  # type: bigquery.TableSchema
        allow_incompatible_records=False,  # type: bool
        omit_empty_sample_calls=False,  # type: bool
        null_numeric_value_replacement=None  # type: int
    ):
        # type: (...) -> None
        """Initializes the transform.

        Args:
          output_path: The path under which output Avro files are generated.
          schema: Schema of the table to be generated.
          allow_incompatible_records: If true, field values are cast to the
            BigQuery schema if there is a mismatch.
          omit_empty_sample_calls: If true, samples that don't have a given
            call will be omitted.
          null_numeric_value_replacement: The value to use instead of null for
            numeric (float/int/long) lists. For instance, [0, None, 1] will
            become [0, `null_numeric_value_replacement`, 1]. If not set, the
            value will be set to
            bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
        """
        # Plain flag/path storage first; none of these depend on the schema.
        self._output_path = output_path
        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls

        # The table schema is converted to its JSON Avro form, then parsed.
        avro_schema_json = (
            schema_converter.convert_table_schema_to_json_avro_schema(schema))
        self._avro_schema = avro.schema.parse(avro_schema_json)

        # The resolver is told to always resolve when incompatible records
        # are allowed.
        descriptor = bigquery_schema_descriptor.SchemaDescriptor(schema)
        resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            resolve_always=allow_incompatible_records)
        self._bigquery_row_generator = (
            bigquery_row_generator.VariantCallRowGenerator(
                descriptor, resolver, null_numeric_value_replacement))
Exemplo n.º 2
0
    def setUp(self):
        """Builds the descriptor, resolver and row generator used by tests."""
        descriptor = bigquery_schema_descriptor.SchemaDescriptor(
            self._get_table_schema())
        resolver = vcf_field_conflict_resolver.FieldConflictResolver()

        self._schema_descriptor = descriptor
        self._conflict_resolver = resolver
        # The generator shares the very same descriptor/resolver instances
        # that are exposed on the test case.
        self._row_generator = bigquery_row_generator.VariantCallRowGenerator(
            descriptor, resolver)
Exemplo n.º 3
0
    def setUp(self):
        """Builds the descriptor, resolver and row generator used by tests."""
        descriptor = bigquery_schema_descriptor.SchemaDescriptor(
            _get_table_schema())
        resolver = vcf_field_conflict_resolver.FieldConflictResolver()

        self._schema_descriptor = descriptor
        self._conflict_resolver = resolver
        # The generator shares the very same descriptor/resolver instances
        # that are exposed on the test case.
        self._row_generator = bigquery_vcf_data_converter.BigQueryRowGenerator(
            descriptor, resolver)
  def __init__(
    self,
    output_table,  # type: str
    header_fields,  # type: vcf_header_io.VcfHeader
    variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
    proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
    append=False,  # type: bool
    update_schema_on_append=False,  # type: bool
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False,  # type: bool
    num_bigquery_write_shards=1  # type: int
  ):
    # type: (...) -> None
    """Initializes the transform.

    Args:
      output_table: Full path of the output BigQuery table.
      header_fields: Representative header fields for all variants. This is
        needed for dynamically generating the schema.
      variant_merger: The strategy used for merging variants (if any). Some
        strategies may change the schema, which is why this may be needed here.
      proc_var_factory: The factory class that knows how to convert Variant
        instances to ProcessedVariant. As a side effect it also knows how to
        modify BigQuery schema based on the ProcessedVariants that it generates.
        The latter functionality is what is needed here.
      append: If true, existing records in output_table will not be
        overwritten. New records will be appended to those that already exist.
      update_schema_on_append: If true, BigQuery schema will be updated by
        combining the existing schema and the new schema if they are compatible.
      allow_incompatible_records: If true, field values are cast to the
        BigQuery schema if there is a mismatch.
      omit_empty_sample_calls: If true, samples that don't have a given call
        will be omitted.
      num_bigquery_write_shards: If > 1, we will limit number of sources which
        are used for writing to the output BigQuery table.
    """
    # Store the plain constructor arguments.
    self._output_table = output_table
    self._header_fields = header_fields
    self._variant_merger = variant_merger
    self._proc_var_factory = proc_var_factory
    self._append = append
    self._allow_incompatible_records = allow_incompatible_records
    self._omit_empty_sample_calls = omit_empty_sample_calls
    self._num_bigquery_write_shards = num_bigquery_write_shards

    # The table schema is derived from the header fields.
    table_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields, proc_var_factory, variant_merger)
    self._schema = table_schema

    # Resolver makes extra effort to resolve conflict when flag
    # allow_incompatible_records is set.
    descriptor = bigquery_schema_descriptor.SchemaDescriptor(table_schema)
    resolver = vcf_field_conflict_resolver.FieldConflictResolver(
        resolve_always=allow_incompatible_records)
    self._bigquery_row_generator = bigquery_row_generator.BigQueryRowGenerator(
        descriptor, resolver)

    # Must run last: it is invoked only after every attribute above is set.
    if update_schema_on_append:
      self._update_bigquery_schema_on_append()
Exemplo n.º 5
0
    def __init__(
        self,
        output_table,  # type: str
        schema,  # type: bigquery.TableSchema
        append=False,  # type: bool
        allow_incompatible_records=False,  # type: bool
        omit_empty_sample_calls=False,  # type: bool
        num_bigquery_write_shards=1,  # type: int
        null_numeric_value_replacement=None,  # type: int
        include_call_name=False,  # type: bool
        move_hom_ref_calls=False  # type: bool
    ):
        # type: (...) -> None
        """Initializes the transform.

        Args:
          output_table: Full path of the output BigQuery table.
          schema: Schema of the table to be generated.
          append: If true, existing records in output_table will not be
            overwritten. New records will be appended to those that already
            exist.
          allow_incompatible_records: If true, field values are cast to the
            BigQuery schema if there is a mismatch.
          omit_empty_sample_calls: If true, samples that don't have a given
            call will be omitted.
          num_bigquery_write_shards: If > 1, we will limit number of sources
            which are used for writing to the output BigQuery table.
          null_numeric_value_replacement: The value to use instead of null for
            numeric (float/int/long) lists. For instance, [0, None, 1] will
            become [0, `null_numeric_value_replacement`, 1]. If not set, the
            value will be set to
            bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
          include_call_name: If true, sample name will be included in addition
            to sample ID.
          move_hom_ref_calls: If true, filter out 0 GT data out of call list
            and add the call name to a hom_ref_calls column.
        """
        # Store the plain constructor arguments.
        self._output_table = output_table
        self._append = append
        self._schema = schema
        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls
        self._num_bigquery_write_shards = num_bigquery_write_shards

        # Resolver makes extra effort to resolve conflict when flag
        # allow_incompatible_records is set.
        descriptor = bigquery_schema_descriptor.SchemaDescriptor(schema)
        resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            resolve_always=allow_incompatible_records)
        self._bigquery_row_generator = (
            bigquery_row_generator.VariantCallRowGenerator(
                descriptor,
                resolver,
                null_numeric_value_replacement,
                include_call_name,
                move_hom_ref_calls))
    def __init__(
        self,
        output_path,  # type: str
        header_fields,  # type: vcf_header_io.VcfHeader
        proc_var_factory,  # type: processed_variant.ProcessedVariantFactory
        variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
        allow_incompatible_records=False,  # type: bool
        omit_empty_sample_calls=False,  # type: bool
        null_numeric_value_replacement=None  # type: int
    ):
        # type: (...) -> None
        """Initializes the transform.

        Args:
          output_path: The path under which output Avro files are generated.
          header_fields: Representative header fields for all variants. This
            is needed for dynamically generating the schema.
          proc_var_factory: The factory class that knows how to convert
            Variant instances to ProcessedVariant. As a side effect it also
            knows how to modify BigQuery schema based on the ProcessedVariants
            that it generates. The latter functionality is what is needed
            here.
          variant_merger: The strategy used for merging variants (if any).
            Some strategies may change the schema, which is why this may be
            needed here.
          allow_incompatible_records: If true, field values are cast to the
            BigQuery schema if there is a mismatch.
          omit_empty_sample_calls: If true, samples that don't have a given
            call will be omitted.
          null_numeric_value_replacement: The value to use instead of null for
            numeric (float/int/long) lists. For instance, [0, None, 1] will
            become [0, `null_numeric_value_replacement`, 1]. If not set, the
            value will be set to
            bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
        """
        # Store the plain constructor arguments.
        self._output_path = output_path
        self._proc_var_factory = proc_var_factory
        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls

        # One table schema feeds both the Avro schema and the row generator.
        table_schema = schema_converter.generate_schema_from_header_fields(
            header_fields, proc_var_factory, variant_merger)
        avro_schema_json = (
            schema_converter.convert_table_schema_to_json_avro_schema(
                table_schema))
        self._avro_schema = avro.schema.parse(avro_schema_json)

        # The resolver is told to always resolve when incompatible records
        # are allowed.
        descriptor = bigquery_schema_descriptor.SchemaDescriptor(table_schema)
        resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            resolve_always=allow_incompatible_records)
        self._bigquery_row_generator = (
            bigquery_row_generator.VariantCallRowGenerator(
                descriptor, resolver, null_numeric_value_replacement))
Exemplo n.º 7
0
  def __init__(
    self,
    output_table,  # type: str
    header_fields,  # type: vcf_header_parser.HeaderFields
    variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
    proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
    append=False,  # type: bool
    allow_incompatible_records=False,  # type: bool
    omit_empty_sample_calls=False  # type: bool
  ):
    # type: (...) -> None
    """Initializes the transform.

    Args:
      output_table: Full path of the output BigQuery table.
      header_fields: A `namedtuple` containing representative header fields for
        all variants. This is needed for dynamically generating the schema.
      variant_merger: The strategy used for merging variants (if any). Some
        strategies may change the schema, which is why this may be needed here.
      proc_var_factory: The factory class that knows how to convert Variant
        instances to ProcessedVariant. As a side effect it also knows how to
        modify BigQuery schema based on the ProcessedVariants that it generates.
        The latter functionality is what is needed here.
      append: If true, existing records in output_table will not be
        overwritten. New records will be appended to those that already exist.
      allow_incompatible_records: Flag that is only stored on the instance by
        this constructor. Presumably it lets records that mismatch the schema
        through - confirm against where it is read.
      omit_empty_sample_calls: If true, samples that don't have a given call
        will be omitted.
    """
    # Store the plain constructor arguments.
    self._output_table = output_table
    self._header_fields = header_fields
    self._variant_merger = variant_merger
    self._proc_var_factory = proc_var_factory
    self._append = append
    self._allow_incompatible_records = allow_incompatible_records
    self._omit_empty_sample_calls = omit_empty_sample_calls

    # The table schema is derived from the header fields; its descriptor is
    # built from that same schema object.
    table_schema = bigquery_vcf_schema.generate_schema_from_header_fields(
        header_fields, proc_var_factory, variant_merger)
    self._schema = table_schema
    self._schema_descriptor = bigquery_schema_descriptor.SchemaDescriptor(
        table_schema)
 def _get_schema_descriptor(self):
     """Returns a new SchemaDescriptor wrapping this test's table schema."""
     make_descriptor = bigquery_schema_descriptor.SchemaDescriptor
     table_schema = self._get_table_schema()
     return make_descriptor(table_schema)
 def setUp(self):
     """Builds the schema descriptor and conflict resolver used by tests."""
     descriptor = bigquery_schema_descriptor.SchemaDescriptor(
         self._get_table_schema())
     resolver = vcf_field_conflict_resolver.FieldConflictResolver()

     self._schema_descriptor = descriptor
     self._conflict_resolver = resolver
Exemplo n.º 10
0
    def __init__(
        self,
        output_table,  # type: str
        header_fields,  # type: vcf_header_io.VcfHeader
        variant_merger=None,  # type: variant_merge_strategy.VariantMergeStrategy
        proc_var_factory=None,  # type: processed_variant.ProcessedVariantFactory
        # TODO(bashir2): proc_var_factory is a required argument and if `None`
        # is supplied this will fail in schema generation.
        append=False,  # type: bool
        update_schema_on_append=False,  # type: bool
        allow_incompatible_records=False,  # type: bool
        omit_empty_sample_calls=False,  # type: bool
        num_bigquery_write_shards=1,  # type: int
        null_numeric_value_replacement=None  # type: int
    ):
        # type: (...) -> None
        """Initializes the transform.

        Args:
          output_table: Full path of the output BigQuery table.
          header_fields: Representative header fields for all variants. This
            is needed for dynamically generating the schema.
          variant_merger: The strategy used for merging variants (if any).
            Some strategies may change the schema, which is why this may be
            needed here.
          proc_var_factory: The factory class that knows how to convert
            Variant instances to ProcessedVariant. As a side effect it also
            knows how to modify BigQuery schema based on the ProcessedVariants
            that it generates. The latter functionality is what is needed
            here.
          append: If true, existing records in output_table will not be
            overwritten. New records will be appended to those that already
            exist.
          update_schema_on_append: If true, BigQuery schema will be updated by
            combining the existing schema and the new schema if they are
            compatible.
          allow_incompatible_records: If true, field values are cast to the
            BigQuery schema if there is a mismatch.
          omit_empty_sample_calls: If true, samples that don't have a given
            call will be omitted.
          num_bigquery_write_shards: If > 1, we will limit number of sources
            which are used for writing to the output BigQuery table.
          null_numeric_value_replacement: The value to use instead of null for
            numeric (float/int/long) lists. For instance, [0, None, 1] will
            become [0, `null_numeric_value_replacement`, 1]. If not set, the
            value will be set to
            bigquery_util._DEFAULT_NULL_NUMERIC_VALUE_REPLACEMENT.
        """
        # Store the plain constructor arguments.
        self._output_table = output_table
        self._header_fields = header_fields
        self._variant_merger = variant_merger
        self._proc_var_factory = proc_var_factory
        self._append = append
        self._allow_incompatible_records = allow_incompatible_records
        self._omit_empty_sample_calls = omit_empty_sample_calls
        self._num_bigquery_write_shards = num_bigquery_write_shards

        # The table schema is derived from the header fields.
        table_schema = schema_converter.generate_schema_from_header_fields(
            header_fields, proc_var_factory, variant_merger)
        self._schema = table_schema

        # Resolver makes extra effort to resolve conflict when flag
        # allow_incompatible_records is set.
        descriptor = bigquery_schema_descriptor.SchemaDescriptor(table_schema)
        resolver = vcf_field_conflict_resolver.FieldConflictResolver(
            resolve_always=allow_incompatible_records)
        self._bigquery_row_generator = (
            bigquery_row_generator.VariantCallRowGenerator(
                descriptor, resolver, null_numeric_value_replacement))

        # Schema update is requested only on demand and runs after the
        # generator has been built.
        if update_schema_on_append:
            bigquery_util.update_bigquery_schema_on_append(
                table_schema.fields, output_table)
Exemplo n.º 11
0
    def test_all_fields_with_hom_ref(self):
        """Checks the generated row when hom-ref calls get their own column.

        The generator is created with both `include_call_name` and
        `move_hom_ref_calls` enabled. Exactly one row is expected, listing
        Sample2 and Sample3 under HOM_REF_CALLS and Sample1 under CALLS.
        """
        descriptor = bigquery_schema_descriptor.SchemaDescriptor(
            _get_table_schema(move_hom_ref_calls=True))
        resolver = vcf_field_conflict_resolver.FieldConflictResolver()

        variant_info = {
            'IFR': [0.1, 0.2],
            'IFR2': [0.2, 0.3],
            'IS': 'some data',
            'ISR': ['data1', 'data2'],
        }
        variant = vcfio.Variant(
            reference_name='chr19',
            start=11,
            end=12,
            reference_bases='C',
            alternate_bases=['A', 'TT'],
            names=['rs1', 'rs2'],
            quality=2,
            filters=['PASS'],
            info=variant_info,
            hom_ref_calls=[
                ('Sample2', hash_name('Sample2')),
                ('Sample3', hash_name('Sample3')),
            ],
            calls=[
                vcfio.VariantCall(
                    sample_id=hash_name('Sample1'),
                    name='Sample1',
                    genotype=[0, 1],
                    phaseset='*',
                    info={'GQ': 20, 'FIR': [10, 20]}),
            ])
        header_num_dict = {'IFR': 'A', 'IFR2': 'A', 'IS': '1', 'ISR': '2'}

        # 'IFR' and 'IFR2' are expected per alternate base, while 'IS' and
        # 'ISR' are expected at the top level of the row.
        expected = {
            ColumnKeyConstants.REFERENCE_NAME: 'chr19',
            ColumnKeyConstants.START_POSITION: 11,
            ColumnKeyConstants.END_POSITION: 12,
            ColumnKeyConstants.REFERENCE_BASES: 'C',
            ColumnKeyConstants.ALTERNATE_BASES: [
                {
                    ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A',
                    'IFR': 0.1,
                    'IFR2': 0.2,
                },
                {
                    ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT',
                    'IFR': 0.2,
                    'IFR2': 0.3,
                },
            ],
            ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
            ColumnKeyConstants.QUALITY: 2,
            ColumnKeyConstants.FILTER: ['PASS'],
            ColumnKeyConstants.HOM_REF_CALLS: [
                {
                    ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample2'),
                    ColumnKeyConstants.CALLS_NAME: 'Sample2',
                },
                {
                    ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample3'),
                    ColumnKeyConstants.CALLS_NAME: 'Sample3',
                },
            ],
            ColumnKeyConstants.CALLS: [
                {
                    ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
                    ColumnKeyConstants.CALLS_NAME: 'Sample1',
                    ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
                    ColumnKeyConstants.CALLS_PHASESET: '*',
                    'GQ': 20,
                    'FIR': [10, 20],
                },
            ],
            'IS': 'some data',
            'ISR': ['data1', 'data2'],
        }

        processed = _get_processed_variant(variant, header_num_dict)
        generator = bigquery_row_generator.VariantCallRowGenerator(
            descriptor,
            resolver,
            include_call_name=True,
            move_hom_ref_calls=True)
        actual_rows = list(generator.get_rows(processed))
        self.assertEqual([expected], actual_rows)