Пример #1
0
  def expand(self, pbegin):
    """Expand this transform starting from the pipeline's ``PBegin``.

    Records ``pbegin.pipeline`` on ``self.pipeline``. If the pipeline's
    ``DebugOptions`` list the ``'beam_fn_api'`` experiment, the read is
    built as ``Impulse -> FlatMap(split) -> Reshuffle -> FlatMap(read)``;
    otherwise a bare ``PCollection`` on the pipeline is returned.

    Args:
      pbegin: the ``pvalue.PBegin`` this transform is applied to.

    Returns:
      The resulting ``PCollection``.
    """
    from apache_beam.options.pipeline_options import DebugOptions
    from apache_beam.transforms import util

    assert isinstance(pbegin, pvalue.PBegin)
    self.pipeline = pbegin.pipeline

    experiments = self.pipeline._options.view_as(DebugOptions).experiments
    if not (experiments and 'beam_fn_api' in experiments):
      # Treat Read itself as a primitive.
      return pvalue.PCollection(self.pipeline)

    bounded_source = self.source

    def split_source(unused_impulse):
      # Pick a chunk size that grows with the square root of the estimated
      # size: 1MB = 1 shard, 1GB = 32 shards, 1TB = 1000 shards,
      # 1PB = 32k shards.
      estimated_size = bounded_source.estimate_size()
      if not estimated_size:
        # No usable estimate: fall back to 64mb chunks.
        return bounded_source.split(64 << 20)
      return bounded_source.split(
          max(1 << 20, 1000 * int(math.sqrt(estimated_size))))

    def read_split(split):
      # Read only the range covered by this split.
      tracker = split.source.get_range_tracker(
          split.start_position, split.stop_position)
      return split.source.read(tracker)

    return (
        pbegin
        | core.Impulse()
        | 'Split' >> core.FlatMap(split_source)
        | util.Reshuffle()
        | 'ReadSplits' >> core.FlatMap(read_split))
Пример #2
0
 def expand(self, pbegin):
   """Build the read as Impulse -> split -> Reshuffle -> read-each-split.

   ``source``, ``Read``, ``core`` and ``util`` are not defined in this
   method; they are resolved from the surrounding scope at call time.

   Args:
     pbegin: the pipeline start value the chain is applied to.

   Returns:
     The result of applying the four-step chain to ``pbegin``.
   """
   def split_source(unused_impulse):
     # The chunk size is derived from the source's own size estimate.
     chunk_size = Read.get_desired_chunk_size(source.estimate_size())
     return source.split(chunk_size)

   def read_split(split):
     # Read only the range covered by this split.
     tracker = split.source.get_range_tracker(
         split.start_position, split.stop_position)
     return split.source.read(tracker)

   return (
       pbegin
       | core.Impulse()
       | 'Split' >> core.FlatMap(split_source)
       | util.Reshuffle()
       | 'ReadSplits' >> core.FlatMap(read_split))
Пример #3
0
    def expand(self, pbegin):
        """Expand this transform starting from the pipeline's ``PBegin``.

        Records ``pbegin.pipeline`` on ``self.pipeline``. If the pipeline's
        ``DebugOptions`` list the ``'beam_fn_api'`` experiment, the read is
        built as ``Impulse -> FlatMap(split) -> Reshuffle -> FlatMap(read)``;
        otherwise a bare ``PCollection`` on the pipeline is returned.

        Args:
            pbegin: the ``pvalue.PBegin`` this transform is applied to.

        Returns:
            The resulting ``PCollection``.
        """
        from apache_beam.options.pipeline_options import DebugOptions
        from apache_beam.transforms import util

        assert isinstance(pbegin, pvalue.PBegin)
        self.pipeline = pbegin.pipeline

        debug_options = self.pipeline._options.view_as(DebugOptions)
        if not (debug_options.experiments
                and 'beam_fn_api' in debug_options.experiments):
            # Treat Read itself as a primitive.
            return pvalue.PCollection(self.pipeline)

        source = self.source

        def split_source(unused_impulse):
            # Imported here so the function carries its own dependency when
            # it is shipped to wherever the FlatMap executes.
            import math

            # The argument to split() is a desired bundle size in bytes, not
            # a number of splits; passing a fixed 1000 asked for ~1KB
            # bundles. Size the bundles from the source's estimate instead:
            # 1MB = 1 shard, 1GB = 32 shards, 1TB = 1000 shards,
            # 1PB = 32k shards.
            total_size = source.estimate_size()
            if total_size:
                chunk_size = max(1 << 20, 1000 * int(math.sqrt(total_size)))
            else:
                # No usable estimate: fall back to 64mb bundles.
                chunk_size = 64 << 20
            return source.split(chunk_size)

        def read_split(split):
            # Read only the range covered by this split.
            tracker = split.source.get_range_tracker(
                split.start_position, split.stop_position)
            return split.source.read(tracker)

        return (
            pbegin
            | core.Impulse()
            | 'Split' >> core.FlatMap(split_source)
            | util.Reshuffle()
            | 'ReadSplits' >> core.FlatMap(read_split))