Example #1
    def test_sharded_key_coder(self):
        key_and_coders = [(b'', b'\x00', coders.BytesCoder()),
                          (b'key', b'\x03key', coders.BytesCoder()),
                          ('key', b'\x03\x6b\x65\x79', coders.StrUtf8Coder()),
                          (('k', 1), b'\x01\x6b\x01',
                           coders.TupleCoder(
                               (coders.StrUtf8Coder(), coders.VarIntCoder())))]

        for key, bytes_repr, key_coder in key_and_coders:
            coder = coders.ShardedKeyCoder(key_coder)
            # Verify cloud object representation
            self.assertEqual(
                {
                    '@type': 'kind:sharded_key',
                    'component_encodings': [key_coder.as_cloud_object()]
                }, coder.as_cloud_object())
            self.assertEqual(b'\x00' + bytes_repr,
                             coder.encode(ShardedKey(key, b'')))
            self.assertEqual(b'\x03123' + bytes_repr,
                             coder.encode(ShardedKey(key, b'123')))

            # Test unnested
            self.check_coder(coder, ShardedKey(key, b''))
            self.check_coder(coder, ShardedKey(key, b'123'))

            for other_key, _, other_key_coder in key_and_coders:
                other_coder = coders.ShardedKeyCoder(other_key_coder)
                # Test nested
                self.check_coder(
                    coders.TupleCoder((coder, other_coder)),
                    (ShardedKey(key, b''), ShardedKey(other_key, b'')))
                self.check_coder(
                    coders.TupleCoder((coder, other_coder)),
                    (ShardedKey(key, b'123'), ShardedKey(other_key, b'')))
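
A minimal round-trip sketch of the prefixing checked above (a sketch, assuming the import paths used by Beam's own test suite):

from apache_beam.coders import coders
from apache_beam.utils.sharded_key import ShardedKey

coder = coders.ShardedKeyCoder(coders.StrUtf8Coder())
encoded = coder.encode(ShardedKey('key', b'123'))
# The length-prefixed shard id is written first, then the encoded key.
assert encoded == b'\x03123' + b'\x03key'
assert coder.decode(encoded) == ShardedKey('key', b'123')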
Example #2
  def __init__(self,
               file_name,  # type: str
               range_tracker,  # type: range_trackers.OffsetRangeTracker
               file_pattern,  # type: str
               compression_type,  # type: str
               allow_malformed_records,  # type: bool
               representative_header_lines=None,  # type: List[str]
               splittable_bgzf=False,  # type: bool
               **kwargs  # type: **str
              ):
    # type: (...) -> None
    # If `representative_header_lines` is given, header lines in `file_name`
    # are ignored; refer to _process_header_lines() logic.
    self._representative_header_lines = representative_header_lines
    self._file_name = file_name
    self._allow_malformed_records = allow_malformed_records

    if splittable_bgzf:
      text_source = bgzf.BGZFBlockSource(
          file_name,
          range_tracker,
          representative_header_lines,
          compression_type,
          header_processor_fns=(
              lambda x: not x.strip() or x.startswith('#'),
              self._process_header_lines),
          **kwargs)
    elif compression_type == filesystems.CompressionTypes.GZIP:
      text_source = bgzf.BGZFSource(
          file_pattern,
          0,  # min_bundle_size
          compression_type,
          True,  # strip_trailing_newlines
          coders.StrUtf8Coder(),  # coder
          validate=False,
          header_processor_fns=(
              lambda x: not x.strip() or x.startswith('#'),
              self._process_header_lines),
          **kwargs)
    else:
      text_source = textio._TextSource(
          file_pattern,
          0,  # min_bundle_size
          compression_type,
          True,  # strip_trailing_newlines
          coders.StrUtf8Coder(),  # coder
          validate=False,
          header_processor_fns=(
              lambda x: not x.strip() or x.startswith('#'),
              self._process_header_lines),
          **kwargs)

    self._text_lines = text_source.read_records(self._file_name,
                                                range_tracker)
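
The header_processor_fns argument passed above is a (matcher, handler) pair: the first callable decides whether a line belongs to the header, the second consumes the collected header lines. A standalone check of the matcher used here:

# The same predicate as above, in isolation: blank lines and lines
# starting with '#' count as header lines.
is_header = lambda x: not x.strip() or x.startswith('#')

assert is_header('##fileformat=VCFv4.2')
assert is_header('   ')
assert not is_header('chr1\t100\trs1\tA\tT\t50\tPASS\t.')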
Example #3
    def test_sharded_key_coder(self):
        key_and_coders = [(b'', b'\x00', coders.BytesCoder()),
                          (b'key', b'\x03key', coders.BytesCoder()),
                          ('key', b'\x03\x6b\x65\x79', coders.StrUtf8Coder()),
                          (('k', 1), b'\x01\x6b\x01',
                           coders.TupleCoder(
                               (coders.StrUtf8Coder(), coders.VarIntCoder())))]

        for key, bytes_repr, key_coder in key_and_coders:
            coder = coders.ShardedKeyCoder(key_coder)
            # Verify cloud object representation
            self.assertEqual(
                {
                    '@type': 'kind:sharded_key',
                    'component_encodings': [key_coder.as_cloud_object()]
                }, coder.as_cloud_object())

            # Test str repr
            self.assertEqual('%s' % coder, 'ShardedKeyCoder[%s]' % key_coder)

            self.assertEqual(b'\x00' + bytes_repr,
                             coder.encode(ShardedKey(key, b'')))
            self.assertEqual(b'\x03123' + bytes_repr,
                             coder.encode(ShardedKey(key, b'123')))

            # Test unnested
            self.check_coder(coder, ShardedKey(key, b''))
            self.check_coder(coder, ShardedKey(key, b'123'))

            # Test type hints
            self.assertTrue(
                isinstance(coder.to_type_hint(),
                           sharded_key_type.ShardedKeyTypeConstraint))
            key_type = coder.to_type_hint().key_type
            if isinstance(key_type, typehints.TupleConstraint):
                self.assertEqual(key_type.tuple_types,
                                 (type(key[0]), type(key[1])))
            else:
                self.assertEqual(key_type, type(key))
            self.assertEqual(
                coders.ShardedKeyCoder.from_type_hint(
                    coder.to_type_hint(), typecoders.CoderRegistry()), coder)

            for other_key, _, other_key_coder in key_and_coders:
                other_coder = coders.ShardedKeyCoder(other_key_coder)
                # Test nested
                self.check_coder(
                    coders.TupleCoder((coder, other_coder)),
                    (ShardedKey(key, b''), ShardedKey(other_key, b'')))
                self.check_coder(
                    coders.TupleCoder((coder, other_coder)),
                    (ShardedKey(key, b'123'), ShardedKey(other_key, b'')))
Example #4
    def test_deduplication_with_event_time(self):
        deduplicate_duration = 60
        with self.create_pipeline() as p:
            test_stream = (
                TestStream(coder=coders.StrUtf8Coder())
                .with_output_types(str)
                .advance_watermark_to(0)
                .add_elements([
                    window.TimestampedValue('k1', 0),
                    window.TimestampedValue('k2', 20),
                    window.TimestampedValue('k3', 30)
                ])
                .advance_watermark_to(30)
                .add_elements([
                    window.TimestampedValue('k1', 40),
                    window.TimestampedValue('k2', 50),
                    window.TimestampedValue('k3', 60)
                ])
                .advance_watermark_to(deduplicate_duration)
                .add_elements([window.TimestampedValue('k1', 70)])
                .advance_watermark_to_infinity())
            res = (p
                   | test_stream
                   | deduplicate.Deduplicate(
                       event_time_duration=Duration(deduplicate_duration))
                   | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))

            assert_that(
                res,
                equal_to([('k1', Timestamp(0)), ('k2', Timestamp(20)),
                          ('k3', Timestamp(30)), ('k1', Timestamp(70))]))
Example #5
        def __init__(self, file_name, range_tracker, file_pattern,
                     compression_type, allow_malformed_records, **kwargs):
            self._header_lines = []
            self._last_record = None
            self._file_name = file_name
            self._allow_malformed_records = allow_malformed_records

            text_source = _TextSource(
                file_pattern,
                0,  # min_bundle_size
                compression_type,
                True,  # strip_trailing_newlines
                coders.StrUtf8Coder(),  # coder
                validate=False,
                header_processor_fns=(lambda x: x.startswith('#'),
                                      self._store_header_lines),
                **kwargs)

            self._text_lines = text_source.read_records(
                self._file_name, range_tracker)
            try:
                self._vcf_reader = vcf.Reader(fsock=self._create_generator())
            except SyntaxError as e:
                raise ValueError('Invalid VCF header in %s: %s' %
                                 (self._file_name, str(e)))
Example #6
    def __init__(self,
                 file_name,
                 range_tracker,
                 file_pattern,
                 compression_type,
                 allow_malformed_records,
                 **kwargs):
      self._header_lines = []
      self._last_record = None
      self._file_name = file_name
      self._allow_malformed_records = allow_malformed_records

      text_source = TextSource(
          file_pattern,
          0,  # min_bundle_size
          compression_type,
          True,  # strip_trailing_newlines
          coders.StrUtf8Coder(),  # coder
          validate=False,
          header_processor_fns=(lambda x: x.startswith('#'),
                                self._store_header_lines),
          **kwargs)

      self._text_lines = text_source.read_records(self._file_name,
                                                  range_tracker)
      try:
        self._vcf_reader = vcf.Reader(fsock=self._create_generator())
      except SyntaxError:
        # Throw the exception inside the generator to ensure the file is
        # properly closed (it's opened inside TextSource.read_records).
        self._text_lines.throw(
            ValueError('An exception was raised when reading header from VCF '
                       'file %s: %s' % (self._file_name,
                                        traceback.format_exc())))
Example #7
 def test_map_coder(self):
     self.check_coder(
         coders.MapCoder(coders.VarIntCoder(), coders.StrUtf8Coder()),
         {1: "one", 300: "three hundred"},
         {},
         {i: str(i) for i in range(5000)})
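
A minimal encode/decode round trip for the same MapCoder (a sketch; the import follows the test module's convention):

from apache_beam.coders import coders

map_coder = coders.MapCoder(coders.VarIntCoder(), coders.StrUtf8Coder())
data = {1: 'one', 300: 'three hundred'}
# The dict round-trips; entry order is not part of the coder's contract.
assert map_coder.decode(map_coder.encode(data)) == data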
Example #8
 def test_tuple_coder(self):
     kv_coder = coders.TupleCoder(
         (coders.VarIntCoder(), coders.BytesCoder()))
     # Verify cloud object representation
     self.assertEqual(
         {
             '@type':
             'kind:pair',
             'is_pair_like':
             True,
             'component_encodings': [
                 coders.VarIntCoder().as_cloud_object(),
                 coders.BytesCoder().as_cloud_object()
             ],
         }, kv_coder.as_cloud_object())
     # Test binary representation
     self.assertEqual(b'\x04abc', kv_coder.encode((4, b'abc')))
     # Test unnested
     self.check_coder(kv_coder, (1, b'a'), (-2, b'a' * 100),
                      (300, b'abc\0' * 5))
     # Test nested
     self.check_coder(
         coders.TupleCoder((coders.TupleCoder(
             (coders.PickleCoder(), coders.VarIntCoder())),
                            coders.StrUtf8Coder())), ((1, 2), 'a'),
         ((-2, 5), u'a\u0101' * 100), ((300, 1), 'abc\0' * 5))
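
The binary assertion above follows from how TupleCoder nests its components: every component except the last is written in the nested (length-delimited) context, while the last one uses the outer context. A sketch under the same imports:

from apache_beam.coders import coders

kv_coder = coders.TupleCoder((coders.VarIntCoder(), coders.BytesCoder()))
encoded = kv_coder.encode((4, b'abc'))
# b'\x04' is the varint 4; the trailing bytes component is written in the
# outer context, so it carries no length prefix.
assert encoded == b'\x04abc'
assert kv_coder.decode(encoded) == (4, b'abc')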
Example #9
 def to_runner_api(self, context):
     # type: (PipelineContext) -> beam_runner_api_pb2.TimerFamilySpec
     return beam_runner_api_pb2.TimerFamilySpec(
         time_domain=TimeDomain.to_runner_api(self.time_domain),
         timer_family_coder_id=context.coders.get_id(
             coders._TimerCoder(coders.StrUtf8Coder(),
                                coders.GlobalWindowCoder())))
Example #10
    def __init__(
            self,
            min_bundle_size=0,
            desired_bundle_size=DEFAULT_DESIRED_BUNDLE_SIZE,
            compression_type=CompressionTypes.AUTO,
            strip_trailing_newlines=True,
            coder=coders.StrUtf8Coder(),  # type: coders.Coder
            skip_header_lines=0,
            with_filename=False,
            delimiter=None,
            escapechar=None,
            **kwargs):
        """Initialize the ``ReadAllFromText`` transform.

    Args:
      min_bundle_size: Minimum size of bundles that should be generated when
        splitting this source into bundles. See ``FileBasedSource`` for more
        details.
      desired_bundle_size: Desired size of bundles that should be generated when
        splitting this source into bundles. See ``FileBasedSource`` for more
        details.
      compression_type: Used to handle compressed input files. Typical value
        is ``CompressionTypes.AUTO``, in which case the underlying file_path's
        extension will be used to detect the compression.
      strip_trailing_newlines: Indicates whether this source should remove
        the newline char in each line it reads before decoding that line.
      validate: flag to verify that the files exist during pipeline
        creation time.
      skip_header_lines: Number of header lines to skip. The same number is
        skipped from each source file. Must be 0 or higher. A large number of
        skipped lines might impact performance.
      coder: Coder used to decode each line.
      with_filename: If True, returns a Key Value with the key being the file
        name and the value being the actual data. If False, it only returns
        the data.
      delimiter (bytes): Optional. A delimiter to split records.
        Must not self-overlap, because self-overlapping delimiters cause
        ambiguous parsing.
      escapechar (bytes): Optional. A single byte to escape the record
        delimiter; it can also escape itself.
    """
        super().__init__(**kwargs)
        source_from_file = partial(
            _create_text_source,
            min_bundle_size=min_bundle_size,
            compression_type=compression_type,
            strip_trailing_newlines=strip_trailing_newlines,
            coder=coder,
            skip_header_lines=skip_header_lines,
            delimiter=delimiter,
            escapechar=escapechar)
        self._desired_bundle_size = desired_bundle_size
        self._min_bundle_size = min_bundle_size
        self._compression_type = compression_type
        self._read_all_files = ReadAllFiles(True, compression_type,
                                            desired_bundle_size,
                                            min_bundle_size, source_from_file,
                                            with_filename)
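
A hedged usage sketch for the transform defined above: ReadAllFromText consumes a PCollection of file patterns and emits the matched files' lines (the bucket path below is hypothetical):

import apache_beam as beam
from apache_beam.io.textio import ReadAllFromText

with beam.Pipeline() as p:
    lines = (
        p
        | beam.Create(['gs://my-bucket/logs/*.txt'])  # hypothetical pattern
        | ReadAllFromText())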
Example #11
  def test_param_windowed_value_coder(self):
    from apache_beam.transforms.window import IntervalWindow
    from apache_beam.utils.windowed_value import PaneInfo
    wv = windowed_value.create(
        b'',
        # Milliseconds to microseconds
        1000 * 1000,
        (IntervalWindow(11, 21),),
        PaneInfo(True, False, 1, 2, 3))
    windowed_value_coder = coders.WindowedValueCoder(
        coders.BytesCoder(), coders.IntervalWindowCoder())
    payload = windowed_value_coder.encode(wv)
    coder = coders.ParamWindowedValueCoder(
        payload, [coders.VarIntCoder(), coders.IntervalWindowCoder()])

    # Test binary representation
    self.assertEqual(b'\x01',
                     coder.encode(window.GlobalWindows.windowed_value(1)))

    # Test unnested
    self.check_coder(
        coders.ParamWindowedValueCoder(
            payload, [coders.VarIntCoder(), coders.IntervalWindowCoder()]),
        windowed_value.WindowedValue(
            3,
            1,
            (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)),
        windowed_value.WindowedValue(
            1,
            1,
            (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)))

    # Test nested
    self.check_coder(
        coders.TupleCoder((
            coders.ParamWindowedValueCoder(
                payload, [
                    coders.FloatCoder(),
                    coders.IntervalWindowCoder()]),
            coders.ParamWindowedValueCoder(
                payload, [
                    coders.StrUtf8Coder(),
                    coders.IntervalWindowCoder()]))),
        (windowed_value.WindowedValue(
            1.5,
            1,
            (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)),
         windowed_value.WindowedValue(
             "abc",
             1,
             (window.IntervalWindow(11, 21),),
             PaneInfo(True, False, 1, 2, 3))))
Example #12
 def __init__(self,
              file_pattern,
              min_bundle_size=0,
              compression_type=filesystem.CompressionTypes.AUTO,
              strip_trailing_newlines=True,
              coder=coders.StrUtf8Coder(),
              buffer_size=DEFAULT_READ_BUFFER_SIZE,
              validate=True):
      super().__init__(file_pattern, min_bundle_size, compression_type,
                       strip_trailing_newlines, coder, buffer_size,
                       validate, 1)
Example #13
    def __init__(
            self,
            file_pattern=None,
            min_bundle_size=0,
            compression_type=CompressionTypes.AUTO,
            strip_trailing_newlines=True,
            coder=coders.StrUtf8Coder(),  # type: coders.Coder
            validate=True,
            skip_header_lines=0,
            delimiter=None,
            escapechar=None,
            **kwargs):
        """Initialize the :class:`ReadFromText` transform.

    Args:
      file_pattern (str): The file path to read from as a local file path or a
        GCS ``gs://`` path. The path can contain glob characters
        (``*``, ``?``, and ``[...]`` sets).
      min_bundle_size (int): Minimum size of bundles that should be generated
        when splitting this source into bundles. See
        :class:`~apache_beam.io.filebasedsource.FileBasedSource` for more
        details.
      compression_type (str): Used to handle compressed input files.
        Typical value is :attr:`CompressionTypes.AUTO
        <apache_beam.io.filesystem.CompressionTypes.AUTO>`, in which case the
        underlying file_path's extension will be used to detect the compression.
      strip_trailing_newlines (bool): Indicates whether this source should
        remove the newline char in each line it reads before decoding that line.
      validate (bool): flag to verify that the files exist during pipeline
        creation time.
      skip_header_lines (int): Number of header lines to skip. The same number
        is skipped from each source file. Must be 0 or higher. A large number
        of skipped lines might impact performance.
      coder (~apache_beam.coders.coders.Coder): Coder used to decode each line.
      delimiter (bytes): Optional. A delimiter to split records.
        Must not self-overlap, because self-overlapping delimiters cause
        ambiguous parsing.
      escapechar (bytes): Optional. A single byte to escape the record
        delimiter; it can also escape itself.
    """

        super().__init__(**kwargs)
        self._source = self._source_class(file_pattern,
                                          min_bundle_size,
                                          compression_type,
                                          strip_trailing_newlines,
                                          coder,
                                          validate=validate,
                                          skip_header_lines=skip_header_lines,
                                          delimiter=delimiter,
                                          escapechar=escapechar)
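
For comparison, a minimal pipeline sketch using the transform (the input path is hypothetical):

import apache_beam as beam

with beam.Pipeline() as p:
    # ReadFromText yields one str element per line, decoded by the default
    # coders.StrUtf8Coder().
    lines = p | beam.io.ReadFromText('gs://my-bucket/input.txt')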
Example #14
 def test_map_coder(self):
     values = [
         {1: "one", 300: "three hundred"},
         {},
         {i: str(i) for i in range(5000)},
     ]
     map_coder = coders.MapCoder(coders.VarIntCoder(),
                                 coders.StrUtf8Coder())
     self.check_coder(map_coder, *values)
     self.check_coder(map_coder.as_deterministic_coder("label"), *values)
Example #15
  def __init__(
      self,
      min_bundle_size=0,
      desired_bundle_size=DEFAULT_DESIRED_BUNDLE_SIZE,
      compression_type=CompressionTypes.AUTO,
      strip_trailing_newlines=True,
      coder=coders.StrUtf8Coder(),  # type: coders.Coder
      skip_header_lines=0,
      **kwargs):
    """Initialize the ``ReadAllFromText`` transform.

    Args:
      min_bundle_size: Minimum size of bundles that should be generated when
        splitting this source into bundles. See ``FileBasedSource`` for more
        details.
      desired_bundle_size: Desired size of bundles that should be generated when
        splitting this source into bundles. See ``FileBasedSource`` for more
        details.
      compression_type: Used to handle compressed input files. Typical value
        is ``CompressionTypes.AUTO``, in which case the underlying file_path's
        extension will be used to detect the compression.
      strip_trailing_newlines: Indicates whether this source should remove
        the newline char in each line it reads before decoding that line.
      validate: flag to verify that the files exist during pipeline
        creation time.
      skip_header_lines: Number of header lines to skip. The same number is
        skipped from each source file. Must be 0 or higher. A large number of
        skipped lines might impact performance.
      coder: Coder used to decode each line.
    """
    super(ReadAllFromText, self).__init__(**kwargs)
    source_from_file = partial(
        _create_text_source,
        min_bundle_size=min_bundle_size,
        compression_type=compression_type,
        strip_trailing_newlines=strip_trailing_newlines,
        coder=coder,
        skip_header_lines=skip_header_lines)
    self._desired_bundle_size = desired_bundle_size
    self._min_bundle_size = min_bundle_size
    self._compression_type = compression_type
    self._read_all_files = ReadAllFiles(
        True,
        compression_type,
        desired_bundle_size,
        min_bundle_size,
        source_from_file)
Example #16
  def test_windowed_value_coder(self):
    coder = coders.WindowedValueCoder(
        coders.VarIntCoder(), coders.GlobalWindowCoder())
    # Verify cloud object representation
    self.assertEqual({
        '@type': 'kind:windowed_value',
        'is_wrapper': True,
        'component_encodings': [
            coders.VarIntCoder().as_cloud_object(),
            coders.GlobalWindowCoder().as_cloud_object(),
        ],
    },
                     coder.as_cloud_object())
    # Test binary representation
    self.assertEqual(
        b'\x7f\xdf;dZ\x1c\xac\t\x00\x00\x00\x01\x0f\x01',
        coder.encode(window.GlobalWindows.windowed_value(1)))

    # Test decoding large timestamp
    self.assertEqual(
        coder.decode(b'\x7f\xdf;dZ\x1c\xac\x08\x00\x00\x00\x01\x0f\x00'),
        windowed_value.create(0, MIN_TIMESTAMP.micros, (GlobalWindow(), )))

    # Test unnested
    self.check_coder(
        coders.WindowedValueCoder(coders.VarIntCoder()),
        windowed_value.WindowedValue(3, -100, ()),
        windowed_value.WindowedValue(-1, 100, (1, 2, 3)))

    # Test Global Window
    self.check_coder(
        coders.WindowedValueCoder(
            coders.VarIntCoder(), coders.GlobalWindowCoder()),
        window.GlobalWindows.windowed_value(1))

    # Test nested
    self.check_coder(
        coders.TupleCoder((
            coders.WindowedValueCoder(coders.FloatCoder()),
            coders.WindowedValueCoder(coders.StrUtf8Coder()))),
        (
            windowed_value.WindowedValue(1.5, 0, ()),
            windowed_value.WindowedValue("abc", 10, ('window', ))))
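
A small round-trip sketch for the same coder pair, asserting only the fields the encoding preserves (note that WindowedValueCoder stores timestamps at millisecond precision):

from apache_beam.coders import coders
from apache_beam.transforms import window

coder = coders.WindowedValueCoder(
    coders.VarIntCoder(), coders.GlobalWindowCoder())
decoded = coder.decode(coder.encode(window.GlobalWindows.windowed_value(1)))
assert decoded.value == 1
assert decoded.windows == (window.GlobalWindow(),)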
Example #17
 def test_timer_coder(self):
     self.check_coder(
         coders._TimerCoder(coders.StrUtf8Coder(),
                            coders.GlobalWindowCoder()),
         *[
             userstate.Timer(user_key="key",
                             dynamic_timer_tag="tag",
                             windows=(GlobalWindow(), ),
                             clear_bit=True,
                             fire_timestamp=None,
                             hold_timestamp=None,
                             paneinfo=None),
             userstate.Timer(user_key="key",
                             dynamic_timer_tag="tag",
                             windows=(GlobalWindow(), ),
                             clear_bit=False,
                             fire_timestamp=timestamp.Timestamp.of(123),
                             hold_timestamp=timestamp.Timestamp.of(456),
                             paneinfo=windowed_value.PANE_INFO_UNKNOWN)
         ])
Example #18
 def __init__(self,
              file_name,
              block,
              header_lines,
              compression_type,
              header_processor_fns,
              strip_trailing_newlines=True,
              min_bundle_size=0,
              coder=coders.StrUtf8Coder(),
              validate=True):
     """A source for reading BGZF Block."""
     super(BGZFBlockSource,
           self).__init__(file_name,
                          min_bundle_size,
                          compression_type,
                          strip_trailing_newlines,
                          coder,
                          validate=validate,
                          header_processor_fns=header_processor_fns)
     self._block = block
     self._header_lines = header_lines
Example #19
  def test_deduplication_in_different_windows(self):
    with self.create_pipeline() as p:
      test_stream = (
          TestStream(
              coder=coders.StrUtf8Coder()).advance_watermark_to(0).add_elements(
                  [
                      window.TimestampedValue('k1', 0),
                      window.TimestampedValue('k2', 10),
                      window.TimestampedValue('k3', 20),
                      window.TimestampedValue('k1', 30),
                      window.TimestampedValue('k2', 40),
                      window.TimestampedValue('k3', 50),
                      window.TimestampedValue('k4', 60),
                      window.TimestampedValue('k5', 70),
                      window.TimestampedValue('k6', 80)
                  ]).advance_watermark_to_infinity())

      res = (
          p
          | test_stream
          | beam.WindowInto(window.FixedWindows(30))
          | deduplicate.Deduplicate(processing_time_duration=10 * 60)
          | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))
      # Deduplication should happen per window.
      expect_unique_keys_per_window = {
          window.IntervalWindow(0, 30): [('k1', Timestamp(0)),
                                         ('k2', Timestamp(10)),
                                         ('k3', Timestamp(20))],
          window.IntervalWindow(30, 60): [('k1', Timestamp(30)),
                                          ('k2', Timestamp(40)),
                                          ('k3', Timestamp(50))],
          window.IntervalWindow(60, 90): [('k4', Timestamp(60)),
                                          ('k5', Timestamp(70)),
                                          ('k6', Timestamp(80))],
      }
      assert_that(
          res,
          equal_to_per_window(expect_unique_keys_per_window),
          use_global_window=False,
          label='assert per window')
Example #20
    def __init__(self,
                 file_pattern=None,
                 min_bundle_size=0,
                 compression_type=CompressionTypes.AUTO,
                 strip_trailing_newlines=True,
                 coder=coders.StrUtf8Coder(),
                 validate=True,
                 skip_header_lines=0,
                 **kwargs):
        """Initialize the ``ReadFromText`` transform.

    Args:
      file_pattern: The file path to read from as a local file path or a GCS
        ``gs://`` path. The path can contain glob characters
        (``*``, ``?``, and ``[...]`` sets).
      min_bundle_size: Minimum size of bundles that should be generated when
        splitting this source into bundles. See ``FileBasedSource`` for more
        details.
      compression_type: Used to handle compressed input files. Typical value
        is ``CompressionTypes.AUTO``, in which case the underlying file_path's
        extension will be used to detect the compression.
      strip_trailing_newlines: Indicates whether this source should remove
        the newline char in each line it reads before decoding that line.
      validate: flag to verify that the files exist during pipeline
        creation time.
      skip_header_lines: Number of header lines to skip. The same number is
        skipped from each source file. Must be 0 or higher. A large number of
        skipped lines might impact performance.
      coder: Coder used to decode each line.
    """

        super(ReadFromText, self).__init__(**kwargs)
        self._source = _TextSource(file_pattern,
                                   min_bundle_size,
                                   compression_type,
                                   strip_trailing_newlines,
                                   coder,
                                   validate=validate,
                                   skip_header_lines=skip_header_lines)
Example #21
    def test_nested_observables(self):
        class FakeObservableIterator(observable.ObservableMixin):
            def __iter__(self):
                return iter([1, 2, 3])

        # Coder for elements from the observable iterator.
        elem_coder = coders.VarIntCoder()
        iter_coder = coders.TupleSequenceCoder(elem_coder)

        # Test nested WindowedValue observable.
        coder = coders.WindowedValueCoder(iter_coder)
        observ = FakeObservableIterator()
        value = windowed_value.WindowedValue(observ, 0, ())
        self.assertEqual(
            coder.get_impl().get_estimated_size_and_observables(value)[1],
            [(observ, elem_coder.get_impl())])

        # Test nested tuple observable.
        coder = coders.TupleCoder((coders.StrUtf8Coder(), iter_coder))
        value = (u'123', observ)
        self.assertEqual(
            coder.get_impl().get_estimated_size_and_observables(value)[1],
            [(observ, elem_coder.get_impl())])
Example #22
        def __init__(
                self,
                file_name,  # type: str
                range_tracker,  # type: range_trackers.OffsetRangeTracker
                file_pattern,  # type: str
                compression_type,  # type: str
                allow_malformed_records,  # type: bool
                representative_header_lines=None,  # type: List[str]
                **kwargs  # type: **str
        ):
            # type: (...) -> None
            # If `representative_header_lines` is given, header lines in `file_name`
            # are ignored.
            self._header_lines = []
            self._representative_header_lines = representative_header_lines
            self._last_record = None
            self._file_name = file_name
            self._allow_malformed_records = allow_malformed_records

            text_source = textio._TextSource(
                file_pattern,
                0,  # min_bundle_size
                compression_type,
                True,  # strip_trailing_newlines
                coders.StrUtf8Coder(),  # coder
                validate=False,
                header_processor_fns=(lambda x: x.startswith('#'),
                                      self._store_header_lines),
                **kwargs)

            self._text_lines = text_source.read_records(
                self._file_name, range_tracker)
            try:
                self._vcf_reader = vcf.Reader(fsock=self._create_generator())
            except SyntaxError as e:
                raise ValueError('Invalid VCF header in %s: %s' %
                                 (self._file_name, str(e)))
Example #23
 def test_utf8_coder(self):
     self.check_coder(coders.StrUtf8Coder(), 'a', u'ab\u00FF', u'\u0101\0')
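
Finally, a minimal sketch of what the coder under test does: a plain UTF-8 round trip (the byte values are standard UTF-8):

from apache_beam.coders import coders

coder = coders.StrUtf8Coder()
encoded = coder.encode(u'ab\u00FF')
# u'\u00FF' encodes to the two UTF-8 bytes 0xC3 0xBF.
assert encoded == b'ab\xc3\xbf'
assert coder.decode(encoded) == u'ab\u00FF'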