示例#1
0
class ReadFromBGZF(beam.PTransform):
  """Reads variants from BGZF."""

  def __init__(self,
               input_files,
               representative_header_lines,
               allow_malformed_records,
               pre_infer_headers,
               sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,
               use_1_based_coordinate=False
              ):
    # type: (List[str], List[str], bool, bool, int, bool) -> None
    """Initializes the transform.

    Args:
      input_files: The BGZF file paths to read from.
      representative_header_lines: Header definitions to be used for parsing
        VCF files.
      allow_malformed_records: If true, malformed records from VCF files will be
        returned as `MalformedVcfRecord` instead of failing the pipeline.
      pre_infer_headers: If true, drop headers and make sure PySam returns the
        exact data for variants and calls, without type matching.
      sample_name_encoding: Specifies how sample_name is encoded, mainly to
        deal with the same sample_name being used across multiple VCF files.
      use_1_based_coordinate: Specifies whether the coordinates should be
        stored in BQ using 0 based exclusive (default) or 1 based inclusive
        coordinates.
    """
    self._input_files = input_files
    self._representative_header_lines = representative_header_lines
    self._allow_malformed_records = allow_malformed_records
    self._pre_infer_headers = pre_infer_headers
    self._sample_name_encoding = sample_name_encoding
    self._use_1_based_coordinate = use_1_based_coordinate

  def _read_records(self, file_path_and_block_tuple):
    # type: (Tuple[str, Block]) -> Iterable[Variant]
    """Reads records from `file_path` in `block`.

    Args:
      file_path_and_block_tuple: A `(file_path, block)` pair. It is accepted
        as one argument and unpacked below because tuple parameters in a
        function signature are a syntax error on Python 3 (PEP 3113).

    Yields:
      Each record produced by the `vcf_parser.PySamParser` built for the block.
    """
    file_path, block = file_path_and_block_tuple
    record_iterator = vcf_parser.PySamParser(
        file_path,
        block,
        filesystems.CompressionTypes.GZIP,
        self._allow_malformed_records,
        representative_header_lines=self._representative_header_lines,
        splittable_bgzf=True,
        pre_infer_headers=self._pre_infer_headers,
        sample_name_encoding=self._sample_name_encoding,
        use_1_based_coordinate=self._use_1_based_coordinate)

    # Re-yield so this method is itself a generator over the parser's records.
    for record in record_iterator:
      yield record
示例#2
0
    def _read_records(self, file_path_and_block_tuple):
        # type: (Tuple[str, Block]) -> Iterable(Variant)
        """Yields the records parsed from one block of a BGZF file.

        Args:
          file_path_and_block_tuple: A two-element `(file_path, block)` pair
            naming the file and the block within it to parse.

        Yields:
          Every item produced by iterating the `vcf_parser.PySamParser`
          constructed for that file and block.
        """
        file_path, block = file_path_and_block_tuple

        # Resolve the call pieces up front, in the same order the parser
        # arguments are listed, then assemble the keyword options.
        parser_factory = vcf_parser.PySamParser
        compression = filesystems.CompressionTypes.GZIP
        allow_malformed = self._allow_malformed_records
        parser_options = {
            'representative_header_lines': self._representative_header_lines,
            'splittable_bgzf': True,
            'pre_infer_headers': self._pre_infer_headers,
            'sample_name_encoding': self._sample_name_encoding,
            'use_1_based_coordinate': self._use_1_based_coordinate,
        }
        parser = parser_factory(
            file_path, block, compression, allow_malformed, **parser_options)

        for parsed_record in parser:
            yield parsed_record
示例#3
0
  def read_records(self,
                   file_name,  # type: str
                   range_tracker  # type: range_trackers.OffsetRangeTracker
                  ):
    # type: (...) -> Iterable[MalformedVcfRecord]
    """Yields the records parsed from `file_name` within `range_tracker`.

    Builds a `vcf_parser.PySamParser` from this instance's settings and
    re-yields each item it produces, so callers receive a plain generator.
    """
    # Resolve the call pieces up front, in the same order the parser
    # arguments are listed, then assemble the keyword options.
    parser_factory = vcf_parser.PySamParser
    compression = self._compression_type
    allow_malformed = self._allow_malformed_records
    parser_options = {
        'file_pattern': self._pattern,
        'representative_header_lines': self._representative_header_lines,
        'pre_infer_headers': self._pre_infer_headers,
        'sample_name_encoding': self._sample_name_encoding,
        'use_1_based_coordinate': self._use_1_based_coordinate,
        'buffer_size': self._buffer_size,
        'skip_header_lines': 0,
    }
    parser = parser_factory(
        file_name, range_tracker, compression, allow_malformed,
        **parser_options)

    # Wrap the parser's iterator in a generator to abstract its behavior.
    for parsed_record in parser:
      yield parsed_record