示例#1
0
    def deferred_status(self):
        # type: () -> Optional[Tuple[Any, Duration]]
        """Return the residual stored by ``defer_remainder()`` and its delay.

        When a residual is stored, ``self._deferred_timestamp`` is overwritten
        in place before being returned:

        * a falsy value (such as ``None``) is replaced by ``Duration()``;
        * a ``Timestamp`` is replaced by the difference between that
          timestamp and ``Timestamp.now()``;
        * a ``Duration`` is reduced by the time elapsed since
          ``self._timestamp``.

        NOTE(review): because the stored value is overwritten, a second call
        applies the adjustment again to the already-adjusted value; confirm
        callers invoke this once per checkpoint.

        Returns:
          ``(deferred_residual, time_delay)`` if a residual is stored,
          otherwise ``None``.
        """
        # Nothing was deferred: there is no residual to report.
        if not self._deferred_residual:
            return None

        requested = self._deferred_timestamp
        if not requested:
            # No resume time was requested; use an empty Duration.
            self._deferred_timestamp = Duration()
        elif isinstance(requested, Timestamp):
            # Absolute resume time: convert to a delta relative to now.
            self._deferred_timestamp = requested - Timestamp.now()
        elif isinstance(requested, Duration):
            # Relative delay: subtract the time already spent since
            # self._timestamp was recorded.
            self._deferred_timestamp -= (Timestamp.now() - self._timestamp)
        return self._deferred_residual, self._deferred_timestamp
示例#2
0
    def defer_remainder(self, deferred_time=None):
        """Self-checkpoint the current restriction with an expected resume time.

        A self-checkpoint can happen while elements are being processed. When
        executing a ``DoFn.process()``, you may want to stop processing an
        element and resume later, for example because the current element has
        been processed for quite a long time or because you want outputs from
        other elements first. ``defer_remainder()`` can be called per element
        if needed.

        Args:
          deferred_time: A relative ``Duration`` that indicates the ideal time
            gap between now and resuming, or an absolute ``Timestamp`` for the
            resuming execution time. If ``deferred_time`` is None, the
            deferred work will be executed as soon as possible.

        Raises:
          ValueError: if ``deferred_time`` is truthy but is neither a
            ``Duration`` nor a ``Timestamp``.
        """
        # Validate first so that a bad argument leaves the object's state
        # (self._timestamp, self._deferred_timestamp) untouched.
        if deferred_time and not isinstance(deferred_time,
                                            (Duration, Timestamp)):
            raise ValueError(
                'The timestamp of defer_remainder() should be a '
                'Duration or a Timestamp, or None.')

        with self._lock:
            # Record current time for calculating deferred_time later.
            self._timestamp = Timestamp.now()
            self._deferred_timestamp = deferred_time
            # Split at fraction 0; when a split is produced, its second
            # element is kept as the deferred residual.
            checkpoint = self.try_split(0)
            if checkpoint:
                _, self._deferred_residual = checkpoint
示例#3
0
文件: fileio_test.py 项目: mahak/beam
    def test_match_updated_files(self):
        """Check MatchContinuously output when ``match_updated_files=True``.

        Every matched element triggers a rewrite of the 'extra' file through
        the downstream ``beam.Map`` step; the expected output lists the
        pre-existing temp file once and the 'extra' path twice.
        """
        tempdir = '%s%s' % (self._new_tempdir(), os.sep)
        extra_path = FileSystems.join(tempdir, 'extra')

        def _create_extra_file(element):
            # Create the 'extra' file and pass the matched path through.
            FileSystems.create(extra_path).close()
            return element.path

        # Two files exist before the pipeline runs: a temp file and 'extra'.
        expected_paths = [self._create_temp_file(dir=tempdir)]
        FileSystems.create(extra_path).close()

        # 'extra' is expected twice -- presumably once for the initial match
        # and once after it is rewritten mid-pipeline.
        expected_paths.extend([extra_path, extra_path])

        interval = 0.2
        start = Timestamp.now()
        stop = start + interval + 0.1

        with TestPipeline() as p:
            matcher = fileio.MatchContinuously(
                file_pattern=FileSystems.join(tempdir, '*'),
                interval=interval,
                start_timestamp=start,
                stop_timestamp=stop,
                match_updated_files=True)
            matched_paths = p | matcher | beam.Map(_create_extra_file)

            assert_that(matched_paths, equal_to(expected_paths))
示例#4
0
 def _get_message_iter(self):
   """Endlessly poll the Spark server, yielding messages and state updates.

   Each poll yields a ``JobMessage`` first when the status response carries
   a 'message' key, and then always yields a ``(state, timestamp)`` tuple.
   Entries are appended to ``self._message_history`` only when
   ``set_state`` reports a change, so the history is de-duplicated while the
   values yielded by this iterator may repeat.

   The pause between polls is multiplied by 1.2 before every sleep and is
   capped at 60 seconds.
   """
   poll_delay = 1.0
   next_ix = 0
   while True:
     status = self._get_spark_status()
     state = self._get_beam_state(status)
     polled_at = Timestamp.now()
     message = None
     if 'message' in status:
       # A message attached to a FAILED job is reported as an error.
       if state == beam_job_api_pb2.JobState.FAILED:
         importance = (
             beam_job_api_pb2.JobMessage.MessageImportance.JOB_MESSAGE_ERROR)
       else:
         importance = (
             beam_job_api_pb2.JobMessage.MessageImportance.JOB_MESSAGE_BASIC)
       message = beam_job_api_pb2.JobMessage(
           message_id='message%d' % next_ix,
           time=str(int(polled_at)),
           importance=importance,
           message_text=status['message'])
       yield message
       next_ix += 1
       # TODO(BEAM-8983) In the event of a failure, query
       #  additional info from Spark master and/or workers.
     changed_at = self.set_state(state)
     if changed_at is not None:
       # Only record history when set_state reported a transition.
       if message:
         self._message_history.append(message)
       self._message_history.append((state, changed_at))
     yield state, polled_at
     poll_delay = min(60, poll_delay * 1.2)
     time.sleep(poll_delay)
示例#5
0
 def __init__(self, job_id, job_name, pipeline, options):
     """Store the job's identity, pipeline and options.

     The state history starts with a single ``(STOPPED, now)`` entry, where
     ``now`` is ``Timestamp.now()`` at construction time.
     """
     self._job_id = job_id
     self._job_name = job_name
     self._pipeline_proto = pipeline
     self._pipeline_options = options
     initial_entry = (beam_job_api_pb2.JobState.STOPPED, Timestamp.now())
     self._state_history = [initial_entry]
示例#6
0
    def __init__(self,
                 file_pattern,
                 interval=360.0,
                 has_deduplication=True,
                 start_timestamp=None,
                 stop_timestamp=MAX_TIMESTAMP,
                 match_updated_files=False,
                 apply_windowing=False):
        """Initializes a MatchContinuously transform.

        Args:
          file_pattern: The file path to read from.
          interval: Interval at which to check for files in seconds.
          has_deduplication: Whether files already read are discarded or not.
          start_timestamp: Timestamp for start file checking. If None (the
            default), ``Timestamp.now()`` at construction time is used.
          stop_timestamp: Timestamp after which no more files will be checked.
          match_updated_files: (When has_deduplication is set to True) whether
            match file with timestamp changes.
          apply_windowing: Whether each element should be assigned to
            individual window. If false, all elements will reside in global
            window.
        """
        # A ``Timestamp.now()`` default in the signature is evaluated only
        # once, when the function is defined, so every later instance would
        # share that stale start time. Resolve the default per call instead.
        if start_timestamp is None:
            start_timestamp = Timestamp.now()

        self.file_pattern = file_pattern
        self.interval = interval
        self.has_deduplication = has_deduplication
        self.start_ts = start_timestamp
        self.stop_ts = stop_timestamp
        self.match_upd = match_updated_files
        self.apply_windowing = apply_windowing
示例#7
0
文件: fileio_test.py 项目: mahak/beam
    def test_without_deduplication(self):
        """Check MatchContinuously output when ``has_deduplication=False``.

        The pre-existing file is expected twice because, without
        de-duplication, it is matched on every interval; the 'extra' file
        created by the downstream ``beam.Map`` step is expected once.
        """
        # The matching window is fixed before any file is created.
        interval = 0.2
        start = Timestamp.now()
        stop = start + interval + 0.1

        tempdir = '%s%s' % (self._new_tempdir(), os.sep)

        # Create a file to be matched before pipeline starts
        existing_path = self._create_temp_file(dir=tempdir)
        extra_path = FileSystems.join(tempdir, 'extra')
        # Pre-existing file twice (once per interval), then the file that is
        # created mid-pipeline.
        expected_paths = [existing_path, existing_path, extra_path]

        def _create_extra_file(element):
            # Create the 'extra' file and pass the matched path through.
            FileSystems.create(extra_path).close()
            return element.path

        with TestPipeline() as p:
            matcher = fileio.MatchContinuously(
                file_pattern=FileSystems.join(tempdir, '*'),
                interval=interval,
                has_deduplication=False,
                start_timestamp=start,
                stop_timestamp=stop)
            matched_paths = p | matcher | beam.Map(_create_extra_file)

            assert_that(matched_paths, equal_to(expected_paths))
 def __init__(self,
              job_id,  # type: str
              job_name,  # type: Optional[str]
              pipeline,  # type: beam_runner_api_pb2.Pipeline
              options  # type: struct_pb2.Struct
             ):
   """Store the job's identity, pipeline proto and pipeline options.

   The state history is seeded with one ``(STOPPED, Timestamp.now())`` entry.
   """
   self._job_id = job_id
   self._job_name = job_name
   self._pipeline_proto = pipeline
   self._pipeline_options = options
   initial_entry = (beam_job_api_pb2.JobState.STOPPED, Timestamp.now())
   self._state_history = [initial_entry]
示例#9
0
    def set_state(self, new_state):
        """Record ``new_state`` in the state history if it is a transition.

        :param new_state: int
          latest state enum
        :return: Timestamp or None
          the timestamp recorded for the transition if the state differs
          from the most recent history entry, else None
        """
        latest_state = self._state_history[-1][0]
        # Same state as the last entry: nothing to record.
        if new_state == latest_state:
            return None
        changed_at = Timestamp.now()
        self._state_history.append((new_state, changed_at))
        return changed_at
示例#10
0
 def __init__(self,
              start_timestamp=None,
              stop_timestamp=MAX_TIMESTAMP,
              fire_interval=360.0,
              apply_windowing=False):
     '''
 :param start_timestamp: Timestamp for first element. If None (the
   default), ``Timestamp.now()`` at construction time is used.
 :param stop_timestamp: Timestamp after which no elements will be output.
 :param fire_interval: Interval at which to output elements.
 :param apply_windowing: Whether each element should be assigned to
   individual window. If false, all elements will reside in global window.
 '''
     # A ``Timestamp.now()`` default in the signature is evaluated only once,
     # when the function is defined, so every later instance would share that
     # stale start time. Resolve the default per call instead.
     if start_timestamp is None:
         start_timestamp = Timestamp.now()

     self.start_ts = start_timestamp
     self.stop_ts = stop_timestamp
     self.interval = fire_interval
     self.apply_windowing = apply_windowing
示例#11
0
    def __init__(self,
                 file_pattern,
                 interval=360.0,
                 has_deduplication=True,
                 start_timestamp=None,
                 stop_timestamp=MAX_TIMESTAMP):
        """Initializes a MatchContinuously transform.

        Args:
          file_pattern: The file path to read from.
          interval: Interval at which to check for files in seconds.
          has_deduplication: Whether files already read are discarded or not.
          start_timestamp: Timestamp for start file checking. If None (the
            default), ``Timestamp.now()`` at construction time is used.
          stop_timestamp: Timestamp after which no more files will be checked.
        """
        # A ``Timestamp.now()`` default in the signature is evaluated only
        # once, when the function is defined, so every later instance would
        # share that stale start time. Resolve the default per call instead.
        if start_timestamp is None:
            start_timestamp = Timestamp.now()

        self.file_pattern = file_pattern
        self.interval = interval
        self.has_deduplication = has_deduplication
        self.start_ts = start_timestamp
        self.stop_ts = stop_timestamp
    def process(self,
                element,
                batch=DoFn.StateParam(BATCH),
                batchSize=DoFn.StateParam(BATCH_SIZE),
                flushTimer=DoFn.TimerParam(FLUSH_TIMER),
                endOfTime=DoFn.TimerParam(EOW_TIMER)):
        """Add ``element[1]`` to the batch, flushing once it is full.

        When the stored batch size is falsy (no count yet), the count starts
        at 1 and both timers are set: ``flushTimer`` to now plus
        ``self.maxWaitTime * 1000`` microseconds, and ``endOfTime`` to the
        global window's maximum timestamp. Otherwise the count is simply
        incremented.

        Returns:
          The result of ``self.flush(batch, batchSize)`` once the count
          reaches ``self.batchSize``; otherwise ``None``.
        """
        from apache_beam.transforms.window import GlobalWindow
        from apache_beam.utils.timestamp import Duration, Timestamp

        buffered = batchSize.read()
        if buffered:
            buffered += 1
        else:
            # First element of a new batch: arm both timers.
            buffered = 1
            max_wait = Duration(micros=self.maxWaitTime * 1000)
            flushTimer.set(Timestamp.now() + max_wait)
            endOfTime.set(GlobalWindow().max_timestamp())
        batchSize.write(buffered)
        batch.add(element[1])
        if buffered >= self.batchSize:
            return self.flush(batch, batchSize)
示例#13
0
 def test_now(self):
     """``Timestamp.now()`` must return a ``Timestamp`` instance."""
     current = Timestamp.now()
     is_timestamp = isinstance(current, Timestamp)
     self.assertTrue(is_timestamp)
示例#14
0
 def current_watermark(self):
     """Advance the stored timestamp to the wall clock and return it.

     The stored value is replaced by the larger of itself and
     ``Timestamp.now()``, so the returned watermark never moves backwards
     even when the clock reports an earlier time.
     """
     previous = self._timestamp
     self._timestamp = max(previous, Timestamp.now())
     return self._timestamp
示例#15
0
 def __init__(self, timestamp=None):
     """Store ``timestamp``, or ``Timestamp.now()`` when it is falsy."""
     if timestamp:
         self._timestamp = timestamp
     else:
         self._timestamp = Timestamp.now()
 def test_advance_watermark_with_incorrect_sys_clock(self):
   """An estimator seeded ahead of the clock keeps its seeded timestamp.

   The estimator is created with ``Timestamp.now() + Duration(100)``; both
   the current watermark and the estimator state must equal that value.
   """
   ahead_of_clock = Timestamp.now() + Duration(100)
   estimator = WalltimeWatermarkEstimator(ahead_of_clock)
   self.assertEqual(estimator.current_watermark(), ahead_of_clock)
   self.assertEqual(estimator.get_estimator_state(), ahead_of_clock)
 def test_observe_timestamp(self):
   """Observing ``Timestamp(10)`` twice must not change the watermark.

   The estimator is seeded with ``Timestamp.now() + Duration(10)`` and is
   expected to still report that value after both observations.
   """
   seeded = Timestamp.now() + Duration(10)
   estimator = WalltimeWatermarkEstimator(seeded)
   for _ in range(2):
     estimator.observe_timestamp(Timestamp(10))
   self.assertEqual(estimator.current_watermark(), seeded)
 def test_initialization(self, mock_timestamp):
   """A default-constructed estimator starts from the (mocked) current time.

   ``mock_timestamp`` is presumably a patch of ``Timestamp.now`` injected by
   a decorator outside this block -- TODO confirm. It is configured to
   return a fixed value of ``Timestamp.now() - Duration(10)``.
   """
   frozen_now = Timestamp.now() - Duration(10)

   def _return_frozen_now():
     return frozen_now

   mock_timestamp.side_effect = _return_frozen_now
   estimator = WalltimeWatermarkEstimator()
   self.assertIsInstance(estimator, WatermarkEstimator)
   self.assertEqual(estimator.get_estimator_state(), frozen_now)