def test_proto_conversion(self):
   """Round-trip a message through its proto string and compare the fields."""
   payload = 'data'
   attrs = {'k1': 'v1', 'k2': 'v2'}
   serialized = PubsubMessage(payload, attrs)._to_proto_str()
   restored = PubsubMessage._from_proto_str(serialized)
   self.assertEqual(restored.data, payload)
   self.assertEqual(restored.attributes, attrs)
示例#2
0
    def test_read_messages_timestamp_attribute_missing(self, mock_pubsub):
        """Read a message whose configured timestamp attribute is absent.

        The expected element timestamp is the message's publish time.
        """
        payload = b'data'
        attrs = {}
        ack_id = 'ack_id'
        # Seconds + nanoseconds below denote the same instant as publish_time.
        publish_secs, publish_nanos = 1520861821, 234567000
        publish_time = '2018-03-12T13:37:01.234567Z'
        response_message = test_utils.PullResponseMessage(
            payload, attrs, publish_secs, publish_nanos, ack_id)
        pull_response = test_utils.create_pull_response([response_message])
        expected_elements = [
            TestWindowedValue(
                PubsubMessage(payload, attrs),
                timestamp.Timestamp.from_rfc3339(publish_time),
                [window.GlobalWindow()]),
        ]
        client = mock_pubsub.return_value
        client.pull.return_value = pull_response

        options = PipelineOptions([])
        options.view_as(StandardOptions).streaming = True
        p = TestPipeline(options=options)
        pcoll = p | ReadFromPubSub(
            'projects/fakeprj/topics/a_topic',
            None,
            None,
            with_attributes=True,
            timestamp_attribute='nonexistent')
        assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
        p.run()

        expected_ack = mock.call(mock.ANY, [ack_id])
        client.acknowledge.assert_has_calls([expected_ack])
示例#3
0
  def test_read_messages_timestamp_attribute_milli_success(self, mock_pubsub):
    """Read a message whose timestamp attribute is an integer millisecond value."""
    payload = b'data'
    attrs = {'time': '1337'}
    ack_id = 'ack_id'
    publish_secs, publish_nanos = 1520861821, 234567000
    response_message = test_utils.PullResponseMessage(
        payload, attrs, publish_secs, publish_nanos, ack_id)
    pull_response = test_utils.create_pull_response([response_message])
    # The attribute is in milliseconds; Timestamp is given microseconds.
    expected_micros = int(attrs['time']) * 1000
    expected_elements = [
        TestWindowedValue(
            PubsubMessage(payload, attrs),
            timestamp.Timestamp(micros=expected_micros),
            [window.GlobalWindow()]),
    ]
    client = mock_pubsub.return_value
    client.pull.return_value = pull_response

    options = PipelineOptions([])
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    pcoll = p | ReadFromPubSub(
        'projects/fakeprj/topics/a_topic',
        None,
        None,
        with_attributes=True,
        timestamp_attribute='time')
    assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
    p.run()

    client.acknowledge.assert_has_calls([mock.call(mock.ANY, [ack_id])])
    client.api.transport.channel.close.assert_has_calls([mock.call()])
示例#4
0
    def test_write_messages_unsupported_features(self, mock_pubsub):
        """Check that WriteToPubSub rejects ``id_label`` and ``timestamp_attribute``.

        Each unsupported keyword is exercised in its own fresh streaming
        pipeline, and running that pipeline must raise NotImplementedError
        with a message naming the offending option.
        """
        data = b'data'
        attributes = {'key': 'value'}
        payloads = [PubsubMessage(data, attributes)]

        unsupported_cases = [
            ({'id_label': 'a_label'}, r'id_label is not supported'),
            ({'timestamp_attribute': 'timestamp'},
             r'timestamp_attribute is not supported'),
        ]
        for sink_kwargs, expected_regex in unsupported_cases:
            options = PipelineOptions([])
            options.view_as(StandardOptions).streaming = True
            p = TestPipeline(options=options)
            _ = (p
                 | Create(payloads)
                 | WriteToPubSub('projects/fakeprj/topics/a_topic',
                                 **sink_kwargs))
            # assertRaisesRegex replaces the deprecated assertRaisesRegexp
            # alias, which no longer exists in Python 3.12+.
            with self.assertRaisesRegex(NotImplementedError, expected_regex):
                p.run()
示例#5
0
    def _wait_for_messages(self, subscription, expected_num, timeout):
        """Pull from ``subscription`` until enough messages arrive or time is up.

        Args:
          subscription: object providing ``full_name``,
            ``pull(max_messages=...)`` (iterated as ``(ack_id, message)``
            pairs) and ``acknowledge(ack_ids)``.
          expected_num: number of collected messages that ends the wait early.
          timeout: maximum number of seconds to keep polling.

        Returns:
          The list of messages collected so far. Entries are ``message.data``
          when ``self.with_attributes`` is false, otherwise ``PubsubMessage``
          objects with every attribute in ``self.strip_attributes`` removed.
        """
        logging.debug('Start pulling messages from %s', subscription.full_name)
        total_messages = []
        start_time = time.time()
        while time.time() - start_time <= timeout:
            pulled = subscription.pull(max_messages=MAX_MESSAGES_IN_ONE_PULL)
            for ack_id, message in pulled:
                if self.with_attributes:
                    msg = PubsubMessage._from_message(message)
                    if self.strip_attributes:
                        for attr in self.strip_attributes:
                            try:
                                del msg.attributes[attr]
                            except KeyError:
                                # Record a sentinel value in place of the
                                # attribute that should have been present.
                                msg.attributes[attr] = (
                                    'PubSubMessageMatcher error: '
                                    'expected attribute not found.')
                    total_messages.append(msg)
                else:
                    total_messages.append(message.data)

                # Acknowledge in both modes. The data-only branch used to
                # `continue` past this call, leaving those messages unacked.
                subscription.acknowledge([ack_id])
            if len(total_messages) >= expected_num:
                return total_messages
            time.sleep(1)

        logging.error('Timeout after %d sec. Received %d messages from %s.',
                      timeout, len(total_messages), subscription.full_name)
        return total_messages
示例#6
0
 def test_message_matcher_attributes_success(self, mock_get_sub, unsued_mock):
   """The matcher accepts a pulled message whose attributes equal the expected ones."""
   expected = [PubsubMessage(b'a', {'k': 'v'})]
   self.init_matcher(expected_msg=expected, with_attributes=True)
   subscription = mock_get_sub.return_value
   pulled = create_pull_response([PullResponseMessage(b'a', {'k': 'v'})])
   subscription.pull.side_effect = [pulled]
   hc_assert_that(self.mock_presult, self.pubsub_matcher)
   self.assertEqual(subscription.pull.call_count, 1)
   self.assertEqual(subscription.acknowledge.call_count, 1)
示例#7
0
 def test_message_matcher_attributes_fail(self, mock_get_sub, unsued_mock):
     """The matcher fails when a pulled message carries an unexpected attribute.

     The expected message has no attributes, while the pulled one has
     ``k=v``; the assertion error must mention 'Unexpected'.
     """
     self.init_matcher(with_attributes=True)
     self.pubsub_matcher.expected_msg = [PubsubMessage('a', {})]
     mock_sub = mock_get_sub.return_value
     msg_a = pubsub.message.Message(b'a', 'unused_id')
     msg_a.attributes['k'] = 'v'  # Unexpected.
     mock_sub.pull.side_effect = [[(1, msg_a)]]
     # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
     # which no longer exists in Python 3.12+.
     with self.assertRaisesRegex(AssertionError, r'Unexpected'):
         hc_assert_that(self.mock_presult, self.pubsub_matcher)
     self.assertEqual(mock_sub.pull.call_count, 1)
示例#8
0
 def test_message_matcher_attributes_success(self, mock_get_sub,
                                             unsued_mock):
     """The matcher accepts a pulled message with the expected attributes."""
     self.init_matcher(with_attributes=True)
     expected = PubsubMessage('a', {'k': 'v'})
     self.pubsub_matcher.expected_msg = [expected]
     subscription = mock_get_sub.return_value
     pulled_message = pubsub.message.Message(b'a', 'unused_id')
     pulled_message.attributes['k'] = 'v'
     # One pull that yields a single (ack_id, message) pair.
     single_pull = [(1, pulled_message)]
     subscription.pull.side_effect = [single_pull]
     hc_assert_that(self.mock_presult, self.pubsub_matcher)
     self.assertEqual(subscription.pull.call_count, 1)
示例#9
0
 def test_write_to_pubsub_with_attributes(self):
   """write_to_pubsub publishes data plus attributes and waits on the result."""
   publisher = mock.Mock()
   topic_path = "project/fakeproj/topics/faketopic"
   payload = b'data'
   attrs = {'key': 'value'}
   outgoing = PubsubMessage(payload, attrs)
   utils.write_to_pubsub(
       publisher, topic_path, [outgoing], with_attributes=True)
   expected_calls = [
       mock.call(topic_path, payload, **attrs),
       mock.call().result(),
   ]
   publisher.publish.assert_has_calls(expected_calls)
    def _inject_numbers(self, topic, num_messages):
        """Publish ``num_messages`` copies of a fixed test message to PubSub.

        Every published message has the payload ``b'conall_0'`` and a
        ``timestamp`` attribute of ``'1608051184000'``; it is sent through
        ``self.pub_client`` to ``self.input_topic.name``.

        Args:
          topic: not used by this method; messages always go to
            ``self.input_topic``. NOTE(review): confirm whether callers
            expect ``topic`` to be honored.
          num_messages: number of messages to publish.
        """
        # The message is identical on every iteration, so build it once.
        msg = PubsubMessage(b'conall_0', {'timestamp': '1608051184000'})
        for _ in range(num_messages):
            self.pub_client.publish(self.input_topic.name, msg.data,
                                    **msg.attributes)
示例#11
0
 def test_message_matcher_attributes_fail(self, mock_get_sub, unsued_mock):
     """The matcher fails when the pulled message has an attribute not expected."""
     expected = [PubsubMessage(b'a', {})]
     self.init_matcher(expected_msg=expected, with_attributes=True)
     subscription = mock_get_sub.return_value
     # The pulled message carries attribute 'k', which is not expected.
     pulled = create_pull_response([PullResponseMessage(b'a', {'k': 'v'})])
     subscription.pull.side_effect = [pulled]
     with self.assertRaisesRegex(AssertionError, r'Unexpected'):
         hc_assert_that(self.mock_presult, self.pubsub_matcher)
     self.assertEqual(subscription.pull.call_count, 1)
     self.assertEqual(subscription.acknowledge.call_count, 1)
示例#12
0
 def test_message_matcher_strip_fail(self, mock_get_sub, unsued_mock):
     """The matcher fails when an attribute listed for stripping is missing.

     ``strip_attributes`` names 'id' and 'timestamp', but the pulled message
     only has 'id'; the assertion error must mention 'Stripped attributes'.
     """
     self.init_matcher(with_attributes=True,
                       strip_attributes=['id', 'timestamp'])
     self.pubsub_matcher.expected_msg = [PubsubMessage('a', {'k': 'v'})]
     mock_sub = mock_get_sub.return_value
     # msg_a is missing attribute 'timestamp'.
     msg_a = pubsub.message.Message(b'a', 'unused_id')
     msg_a.attributes['id'] = 'foo'
     msg_a.attributes['k'] = 'v'
     mock_sub.pull.side_effect = [[(1, msg_a)]]
     # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
     # which no longer exists in Python 3.12+.
     with self.assertRaisesRegex(AssertionError, r'Stripped attributes'):
         hc_assert_that(self.mock_presult, self.pubsub_matcher)
     self.assertEqual(mock_sub.pull.call_count, 1)
示例#13
0
 def test_message_matcher_strip_fail(self, mock_get_sub, unsued_mock):
   """The matcher fails when a to-be-stripped attribute is absent from the message."""
   expected = [PubsubMessage(b'a', {'k': 'v'})]
   self.init_matcher(
       expected_msg=expected,
       with_attributes=True,
       strip_attributes=['id', 'timestamp'])
   subscription = mock_get_sub.return_value
   # The pulled message has 'id' but lacks 'timestamp'.
   incomplete = PullResponseMessage(b'a', {'id': 'foo', 'k': 'v'})
   subscription.pull.side_effect = [create_pull_response([incomplete])]
   with self.assertRaisesRegex(AssertionError, r'Stripped attributes'):
     hc_assert_that(self.mock_presult, self.pubsub_matcher)
   self.assertEqual(subscription.pull.call_count, 1)
   self.assertEqual(subscription.acknowledge.call_count, 1)
示例#14
0
    def test_write_messages_with_attributes_success(self, mock_pubsub):
        """WriteToPubSub publishes the payload together with its attributes."""
        payload = b'data'
        attrs = {'key': 'value'}
        messages = [PubsubMessage(payload, attrs)]

        options = PipelineOptions([])
        options.view_as(StandardOptions).streaming = True
        # Leaving the context manager runs the pipeline.
        with TestPipeline(options=options) as p:
            _ = (
                p
                | Create(messages)
                | WriteToPubSub(
                    'projects/fakeprj/topics/a_topic', with_attributes=True))

        expected_publish = mock.call(mock.ANY, payload, **attrs)
        mock_pubsub.return_value.publish.assert_has_calls([expected_publish])
示例#15
0
      def _get_element(message):
        """Return ``(timestamp, PubsubMessage)`` for a pulled message.

        When ``timestamp_attribute`` is set and present on the message, its
        value is parsed as RFC 3339 first and as integer milliseconds second;
        otherwise ``message.service_timestamp`` is parsed as RFC 3339.

        Raises:
          ValueError: if the attribute value fits neither format.
        """
        pubsub_message = PubsubMessage._from_message(message)
        if (not timestamp_attribute or
            timestamp_attribute not in pubsub_message.attributes):
          fallback = Timestamp.from_rfc3339(message.service_timestamp)
          return fallback, pubsub_message

        raw_value = pubsub_message.attributes[timestamp_attribute]
        try:
          event_time = Timestamp.from_rfc3339(raw_value)
        except ValueError:
          # Not RFC 3339: interpret the value as milliseconds.
          try:
            event_time = Timestamp(micros=int(raw_value) * 1000)
          except ValueError as e:
            raise ValueError('Bad timestamp value: %s' % e)
        return event_time, pubsub_message
示例#16
0
  def test_write_messages_with_attributes_success(self, mock_pubsub):
    """Write one message with attributes through a fake Pub/Sub client."""
    payload = 'data'
    attrs = {'key': 'value'}
    messages = [PubsubMessage(payload, attrs)]
    # The fake client is told which [data, attributes] pairs to expect.
    expected_writes = [[payload, attrs]]
    mock_pubsub.Client = functools.partial(
        FakePubsubClient, messages_write=expected_writes)

    p = TestPipeline()
    p.options.view_as(StandardOptions).streaming = True
    sink = WriteToPubSub(
        'projects/fakeprj/topics/a_topic', with_attributes=True)
    _ = p | Create(messages) | sink
    p.run()
示例#17
0
    def test_read_messages_success(self, mock_pubsub):
        """ReadMessagesFromPubSub yields the message served by the fake client."""
        payload = 'payload'
        message_id = 'message_id'
        attrs = {'attribute': 'value'}
        served = [pubsub.message.Message(payload, message_id, attrs)]
        expected = [PubsubMessage(payload, message_id, attrs, None)]

        mock_pubsub.Client = functools.partial(FakePubsubClient, served)
        mock_pubsub.subscription.AutoAck = FakeAutoAck

        p = TestPipeline()
        p.options.view_as(StandardOptions).streaming = True
        pcoll = p | ReadMessagesFromPubSub(
            'projects/fakeprj/topics/a_topic', None, 'a_label')
        assert_that(pcoll, equal_to(expected))
        p.run()
示例#18
0
 def test_read_from_pubsub_with_attributes(self):
     """read_from_pubsub returns a PubsubMessage and acknowledges it once."""
     subscriber = mock.Mock()
     subscription_path = "project/fakeproj/subscriptions/fakesub"
     payload = b'data'
     ack_id = 'ack_id'
     attrs = {'key': 'value'}
     expected = PubsubMessage(payload, attrs)
     response_message = test_utils.PullResponseMessage(
         payload, attrs, ack_id=ack_id)
     subscriber.pull.return_value = test_utils.create_pull_response(
         [response_message])
     output = utils.read_from_pubsub(
         subscriber,
         subscription_path,
         with_attributes=True,
         number_of_elements=1)
     self.assertEqual([expected], output)
     subscriber.acknowledge.assert_called_once_with(
         subscription_path, [ack_id])
    def _get_element(message):
      """Return ``(timestamp, PubsubMessage)`` for a pulled message.

      When ``timestamp_attribute`` is set and present on the message, its
      value is parsed as RFC 3339 first and as integer milliseconds second;
      otherwise the timestamp is built from ``message.publish_time``.

      Raises:
        ValueError: if the attribute value fits neither format.
      """
      pubsub_message = PubsubMessage._from_message(message)
      if (not timestamp_attribute or
          timestamp_attribute not in pubsub_message.attributes):
        published = message.publish_time
        # Nanoseconds are reduced to whole microseconds.
        fallback = Timestamp(published.seconds, published.nanos // 1000)
        return fallback, pubsub_message

      raw_value = pubsub_message.attributes[timestamp_attribute]
      try:
        event_time = Timestamp.from_rfc3339(raw_value)
      except ValueError:
        # Not RFC 3339: interpret the value as milliseconds.
        try:
          event_time = Timestamp(micros=int(raw_value) * 1000)
        except ValueError as e:
          raise ValueError('Bad timestamp value: %s' % e)
      return event_time, pubsub_message
示例#20
0
    def _wait_for_messages(self, expected_num, timeout):
        """Poll ``self.sub_name`` until enough messages arrive or time is up.

        Args:
          expected_num: number of collected messages that ends polling early.
          timeout: maximum number of seconds to keep polling.

        Returns:
          A ``(messages, details)`` pair. ``messages`` holds ``msg.data`` when
          ``self.with_attributes`` is false, otherwise the ``PubsubMessage``
          objects after ``self.strip_attributes`` were removed. ``details``
          holds one tuple per received message.
        """
        collected = []
        collected_details = []

        subscriber = pubsub.SubscriberClient()
        began = time.time()
        while time.time() - began <= timeout:
            response = subscriber.pull(
                subscription=self.sub_name,
                max_messages=self.max_messages_in_one_pull,
                timeout=self.pull_timeout)
            for received in response.received_messages:
                msg = PubsubMessage._from_message(received.message)
                # NOTE(review): msg.attributes appears twice in this tuple;
                # kept as-is to preserve behavior -- confirm the intent.
                details = (msg.data, msg.attributes, msg.attributes,
                           msg.publish_time, msg.ordering_key)
                if self.with_attributes:
                    if self.strip_attributes:
                        for attr in self.strip_attributes:
                            try:
                                del msg.attributes[attr]
                            except KeyError:
                                # Record a sentinel value for the attribute
                                # that should have been present.
                                msg.attributes[attr] = (
                                    'PubSubMessageMatcher error: '
                                    'expected attribute not found.')
                    collected.append(msg)
                else:
                    collected.append(msg.data)
                collected_details.append(details)

            ack_ids = [
                received.ack_id for received in response.received_messages
            ]
            if ack_ids:
                subscriber.acknowledge(subscription=self.sub_name,
                                       ack_ids=ack_ids)
            if len(collected) >= expected_num:
                break
            time.sleep(self.sleep_time)

        timed_out = time.time() - began > timeout
        if timed_out:
            _LOGGER.error(
                'Timeout after %d sec. Received %d messages from %s.', timeout,
                len(collected), self.sub_name)
        return collected, collected_details
      def _get_element(message):
        """Return ``(timestamp, PubsubMessage)`` for a pulled message.

        Without a ``timestamp_attribute`` the timestamp comes from
        ``message.service_timestamp`` (RFC 3339). With one, the attribute must
        exist; its value is parsed as RFC 3339 first and as integer
        milliseconds second.

        Raises:
          KeyError: if ``timestamp_attribute`` is set but not on the message.
          ValueError: if the attribute value fits neither format.
        """
        pubsub_message = PubsubMessage._from_message(message)
        if not timestamp_attribute:
          fallback = Timestamp.from_rfc3339(message.service_timestamp)
          return fallback, pubsub_message

        try:
          raw_value = pubsub_message.attributes[timestamp_attribute]
        except KeyError as e:
          raise KeyError('Timestamp attribute not found: %s' % e)
        try:
          event_time = Timestamp.from_rfc3339(raw_value)
        except ValueError:
          # Not RFC 3339: interpret the value as milliseconds.
          try:
            event_time = Timestamp(micros=int(raw_value) * 1000)
          except ValueError as e:
            raise ValueError('Bad timestamp value: %s' % e)
        return event_time, pubsub_message
示例#22
0
    def _get_element(message):
      """Return ``(timestamp, PubsubMessage)`` for a pulled message.

      When ``timestamp_attribute`` is set and present on the message, its
      value is parsed as integer milliseconds first and as RFC 3339 second;
      otherwise ``message.publish_time`` is converted via
      ``Timestamp.from_utc_datetime``.

      Raises:
        ValueError: if the attribute value fits neither format, or if the
          message has no publish time when one is needed.
      """
      pubsub_message = PubsubMessage._from_message(message)
      if (not timestamp_attribute or
          timestamp_attribute not in pubsub_message.attributes):
        if message.publish_time is None:
          raise ValueError('No publish time present in message: %s' % message)
        fallback = Timestamp.from_utc_datetime(message.publish_time)
        return fallback, pubsub_message

      raw_value = pubsub_message.attributes[timestamp_attribute]
      try:
        event_time = Timestamp(micros=int(raw_value) * 1000)
      except ValueError:
        # Not an integer: interpret the value as RFC 3339.
        try:
          event_time = Timestamp.from_rfc3339(raw_value)
        except ValueError as e:
          raise ValueError('Bad timestamp value: %s' % e)
      return event_time, pubsub_message
示例#23
0
    def test_read_from_pubsub_many(self):
        """read_from_pubsub gathers 100 messages served in batches of 33."""
        response_size = 33
        number_of_elements = 100
        subscriber = mock.Mock()
        subscription_path = "project/fakeproj/subscriptions/fakesub"
        indices = range(number_of_elements)
        data_list = ['data {}'.format(i).encode("utf-8") for i in indices]
        attributes_list = [{'key': 'value {}'.format(i)} for i in indices]
        ack_ids = ['ack_id_{}'.format(i) for i in indices]
        messages = [
            PubsubMessage(payload, attrs)
            for payload, attrs in zip(data_list, attributes_list)
        ]
        response_messages = [
            test_utils.PullResponseMessage(payload, attrs, ack_id=ack_id)
            for payload, attrs, ack_id in zip(
                data_list, attributes_list, ack_ids)
        ]

        class SequentialPullResponse(object):
            """Callable that serves the next slice of messages on every call."""

            def __init__(self, response_messages, response_size):
                self.response_messages = response_messages
                self.response_size = response_size
                self._index = 0

            def __call__(self, *args, **kwargs):
                begin = self._index
                end = begin + self.response_size
                self._index = end
                return test_utils.create_pull_response(
                    self.response_messages[begin:end])

        subscriber.pull.side_effect = SequentialPullResponse(
            response_messages, response_size)
        output = utils.read_from_pubsub(
            subscriber,
            subscription_path,
            with_attributes=True,
            number_of_elements=number_of_elements)
        self.assertEqual(messages, output)
        self._assert_ack_ids_equal(subscriber, ack_ids)
示例#24
0
  def _test_read_messages_success(self, mock_pubsub):
    """Read one message with attributes and check its windowed value."""
    payload = 'payload'
    message_id = 'message_id'
    publish_time = '2018-03-12T13:37:01.234567Z'
    attrs = {'key': 'value'}
    client_message = create_client_message(
        payload, message_id, attrs, publish_time)
    served = [client_message]
    expected_value = TestWindowedValue(
        PubsubMessage(payload, attrs),
        timestamp.Timestamp(1520861821.234567),
        [window.GlobalWindow()])
    expected = [expected_value]

    mock_pubsub.Client = functools.partial(FakePubsubClient, served)
    mock_pubsub.subscription.AutoAck = FakeAutoAck

    p = TestPipeline()
    p.options.view_as(StandardOptions).streaming = True
    pcoll = p | ReadFromPubSub(
        'projects/fakeprj/topics/a_topic',
        None,
        'a_label',
        with_attributes=True)
    assert_that(pcoll, equal_to(expected), reify_windows=True)
    p.run()
示例#25
0
    def _wait_for_messages(self, expected_num, timeout):
        """Poll ``self.sub_name`` until enough messages arrive or time is up.

        Args:
          expected_num: number of collected messages that ends polling early.
          timeout: maximum number of seconds to keep polling.

        Returns:
          The messages collected so far. When ``self.with_attributes`` is
          false these are the payloads (bytes payloads decoded as UTF-8);
          otherwise they are ``PubsubMessage`` objects with
          ``self.strip_attributes`` removed.
        """
        collected = []

        subscriber = pubsub.SubscriberClient()
        began = time.time()
        while time.time() - began <= timeout:
            response = subscriber.pull(
                self.sub_name,
                max_messages=MAX_MESSAGES_IN_ONE_PULL,
                return_immediately=True)
            for received in response.received_messages:
                msg = PubsubMessage._from_message(received.message)
                if self.with_attributes:
                    if self.strip_attributes:
                        for attr in self.strip_attributes:
                            try:
                                del msg.attributes[attr]
                            except KeyError:
                                # Record a sentinel value for the attribute
                                # that should have been present.
                                msg.attributes[attr] = (
                                    'PubSubMessageMatcher error: '
                                    'expected attribute not found.')
                    collected.append(msg)
                else:
                    if isinstance(msg.data, bytes):
                        msg.data = msg.data.decode('utf-8')
                    collected.append(msg.data)

            ack_ids = [
                received.ack_id for received in response.received_messages
            ]
            if ack_ids:
                subscriber.acknowledge(self.sub_name, ack_ids)
            if len(collected) >= expected_num:
                break
            time.sleep(1)

        timed_out = time.time() - began > timeout
        if timed_out:
            logging.error(
                'Timeout after %d sec. Received %d messages from %s.', timeout,
                len(collected), self.sub_name)
        return collected
  def _wait_for_messages(self, expected_num, timeout):
    """Poll ``self.sub_name`` until enough messages arrive or time is up.

    Args:
      expected_num: number of collected messages that ends polling early.
      timeout: maximum number of seconds to keep polling.

    Returns:
      The messages collected so far: ``msg.data`` values when
      ``self.with_attributes`` is false, otherwise ``PubsubMessage`` objects
      with ``self.strip_attributes`` removed.
    """
    collected = []

    subscriber = pubsub.SubscriberClient()
    began = time.time()
    while time.time() - began <= timeout:
      response = subscriber.pull(
          self.sub_name,
          max_messages=MAX_MESSAGES_IN_ONE_PULL,
          return_immediately=True)
      for received in response.received_messages:
        msg = PubsubMessage._from_message(received.message)
        if self.with_attributes:
          if self.strip_attributes:
            for attr in self.strip_attributes:
              try:
                del msg.attributes[attr]
              except KeyError:
                # Record a sentinel value for the attribute that should
                # have been present.
                msg.attributes[attr] = ('PubSubMessageMatcher error: '
                                        'expected attribute not found.')
          collected.append(msg)
        else:
          collected.append(msg.data)

      ack_ids = [received.ack_id for received in response.received_messages]
      if ack_ids:
        subscriber.acknowledge(self.sub_name, ack_ids)
      if len(collected) >= expected_num:
        break
      time.sleep(1)

    timed_out = time.time() - began > timeout
    if timed_out:
      logging.error('Timeout after %d sec. Received %d messages from %s.',
                    timeout, len(collected), self.sub_name)
    return collected
示例#27
0
class PubSubIntegrationTest(unittest.TestCase):
    """Integration test that runs the Pub/Sub IT pipeline end to end.

    Each test creates uniquely named input/output topics and subscriptions,
    publishes INPUT_MESSAGES to the input topic, runs the pipeline, and
    verifies the output subscription with a PubSubMessageMatcher.
    """

    # Attribute names passed to the pipeline as id_label and
    # timestamp_attribute, and stripped by the message matcher.
    ID_LABEL = 'id'
    TIMESTAMP_ATTRIBUTE = 'timestamp'
    # Messages published to the input topic before the pipeline starts.
    INPUT_MESSAGES = [
        # Use ID_LABEL attribute to deduplicate messages with the same ID.
        PubsubMessage('data001', {ID_LABEL: 'foo'}),
        PubsubMessage('data001', {ID_LABEL: 'foo'}),
        PubsubMessage('data001', {ID_LABEL: 'foo'}),
        # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the IT
        # pipeline writes back the timestamp of each element (as reported by
        # Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
        PubsubMessage('data002', {
            TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
        }),
    ]
    # Messages the matcher expects on the output subscription.
    EXPECTED_OUTPUT_MESSAGES = [
        PubsubMessage('data001-seen', {'processed': 'IT'}),
        PubsubMessage(
            'data002-seen', {
                TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                'processed': 'IT',
            }),
    ]

    def setUp(self):
        """Create per-test topics and subscriptions suffixed with a UUID."""
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.project = self.test_pipeline.get_option('project')
        self.uuid = str(uuid.uuid4())

        # Set up PubSub environment.
        from google.cloud import pubsub
        self.pubsub_client = pubsub.Client(project=self.project)
        self.input_topic = self.pubsub_client.topic(INPUT_TOPIC + self.uuid)
        self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC + self.uuid)
        self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid)
        self.output_sub = self.output_topic.subscription(OUTPUT_SUB +
                                                         self.uuid)

        # Topics are created and waited for before their subscriptions.
        self.input_topic.create()
        self.output_topic.create()
        test_utils.wait_for_topics_created(
            [self.input_topic, self.output_topic])
        self.input_sub.create()
        self.output_sub.create()

    def tearDown(self):
        """Clean up the subscriptions first, then the topics."""
        test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
        test_utils.cleanup_topics([self.input_topic, self.output_topic])

    def _test_streaming(self, with_attributes):
        """Runs IT pipeline with message verifier.

        Args:
          with_attributes: False - Reads and writes message data only.
            True - Reads and writes message data and attributes. Also
            verifies id_label and timestamp_attribute features.
        """
        # Build expected dataset.
        # Set extra options to the pipeline for test purpose
        state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
        expected_messages = self.EXPECTED_OUTPUT_MESSAGES
        if not with_attributes:
            # In data-only mode, compare payloads rather than full messages.
            expected_messages = [
                pubsub_msg.data for pubsub_msg in expected_messages
            ]
        pubsub_msg_verifier = PubSubMessageMatcher(
            self.project,
            OUTPUT_SUB + self.uuid,
            expected_messages,
            timeout=MESSAGE_MATCHER_TIMEOUT_S,
            with_attributes=with_attributes,
            strip_attributes=[self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE])
        extra_opts = {
            'input_subscription': self.input_sub.full_name,
            'output_topic': self.output_topic.full_name,
            'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
            'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
        }

        # Generate input data and inject to PubSub.
        test_utils.wait_for_subscriptions_created([self.input_sub])
        for msg in self.INPUT_MESSAGES:
            self.input_topic.publish(msg.data, **msg.attributes)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        pubsub_it_pipeline.run_pipeline(
            argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
            with_attributes=with_attributes,
            id_label=self.ID_LABEL,
            timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

    @attr('IT')
    def test_streaming_data_only(self):
        """Run the streaming IT reading and writing message data only."""
        self._test_streaming(with_attributes=False)

    @attr('IT')
    def test_streaming_with_attributes(self):
        """Run the streaming IT reading and writing data plus attributes."""
        self._test_streaming(with_attributes=True)
示例#28
0
 def test_payload_valid(self):
     """Constructing a message with only data or only attributes must not raise."""
     valid_arguments = (
         ('', None),
         ('data', None),
         (None, {'k': 'v'}),
     )
     for data, attributes in valid_arguments:
         _ = PubsubMessage(data, attributes)
示例#29
0
 def _get_element(message):
   """Return a PubsubMessage when attributes are requested, else the raw data."""
   if not self.source.with_attributes:
     return message.data
   return PubsubMessage._from_message(message)
示例#30
0
 def test_payload_invalid(self):
   """A message with no data and no (or empty) attributes is rejected."""
   for empty_attributes in (None, {}):
     with self.assertRaisesRegex(ValueError, r'data.*attributes.*must be set'):
       _ = PubsubMessage(None, empty_attributes)
示例#31
0
文件: pubsub_test.py 项目: mahak/beam
 def test_payload_publish_invalid(self):
     """Serializing for publish rejects messages that exceed the checked limits.

     Covers the data size, attribute key length, attribute value length,
     attribute count and ordering key length checks.
     """
     def serialize_for_publish(*args, **kwargs):
         # Build the message and serialize it in one step, so that either
         # stage may raise inside the surrounding assertRaisesRegex block.
         PubsubMessage(*args, **kwargs)._to_proto_str(for_publish=True)

     with self.assertRaisesRegex(ValueError, r'data field.*10MB'):
         serialize_for_publish(b'0' * 1024 * 1024 * 11, None)
     with self.assertRaisesRegex(ValueError, 'attribute key'):
         serialize_for_publish(b'0', {'0' * 257: '0'})
     with self.assertRaisesRegex(ValueError, 'attribute value'):
         serialize_for_publish(b'0', {'0' * 100: '0' * 1025})
     with self.assertRaisesRegex(ValueError, '100 attributes'):
         too_many = {str(i): str(i) for i in range(0, 101)}
         serialize_for_publish(b'0', too_many)
     with self.assertRaisesRegex(ValueError, 'ordering key'):
         serialize_for_publish(b'0', None, ordering_key='0' * 1301)
class PubSubIntegrationTest(unittest.TestCase):
  """Integration test that round-trips messages through a Pub/Sub pipeline.

  Each test creates uniquely named input/output topics and subscriptions,
  publishes the ``INPUT_MESSAGES`` for the active runner to the input topic,
  runs ``pubsub_it_pipeline.run_pipeline`` and attaches a matcher comparing the
  output subscription against ``EXPECTED_OUTPUT_MESSAGES``.
  """

  ID_LABEL = 'id'
  TIMESTAMP_ATTRIBUTE = 'timestamp'
  # Messages published to the input topic, keyed by runner class name.
  INPUT_MESSAGES = {
      # TODO(BEAM-4275): DirectRunner doesn't support reading or writing
      # label_ids, nor writing timestamp attributes. Once these features exist,
      # TestDirectRunner and TestDataflowRunner should behave identically.
      'TestDirectRunner': [
          PubsubMessage(b'data001', {}),
          # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the
          # IT pipeline writes back the timestamp of each element (as reported
          # by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
          PubsubMessage(
              b'data002', {
                  TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
              }),
          PubsubMessage(b'data003\xab\xac', {}),
          PubsubMessage(
              b'data004\xab\xac', {
                  TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
              })
      ],
      'TestDataflowRunner': [
          # Use ID_LABEL attribute to deduplicate messages with the same ID.
          PubsubMessage(b'data001', {ID_LABEL: 'foo'}),
          PubsubMessage(b'data001', {ID_LABEL: 'foo'}),
          PubsubMessage(b'data001', {ID_LABEL: 'foo'}),
          # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the
          # IT pipeline writes back the timestamp of each element (as reported
          # by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
          PubsubMessage(
              b'data002', {
                  TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
              }),
          PubsubMessage(b'data003\xab\xac', {ID_LABEL: 'foo2'}),
          PubsubMessage(b'data003\xab\xac', {ID_LABEL: 'foo2'}),
          PubsubMessage(b'data003\xab\xac', {ID_LABEL: 'foo2'}),
          PubsubMessage(
              b'data004\xab\xac', {
                  TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
              })
      ],
  }
  # Messages the matcher expects on the output subscription, keyed by runner
  # class name.
  EXPECTED_OUTPUT_MESSAGES = {
      'TestDirectRunner': [
          PubsubMessage(b'data001-seen', {'processed': 'IT'}),
          PubsubMessage(
              b'data002-seen',
              {
                  TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
                  TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                  'processed': 'IT',
              }),
          PubsubMessage(b'data003\xab\xac-seen', {'processed': 'IT'}),
          PubsubMessage(
              b'data004\xab\xac-seen',
              {
                  TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
                  TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                  'processed': 'IT',
              })
      ],
      'TestDataflowRunner': [
          PubsubMessage(b'data001-seen', {'processed': 'IT'}),
          PubsubMessage(
              b'data002-seen',
              {
                  TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                  'processed': 'IT',
              }),
          PubsubMessage(b'data003\xab\xac-seen', {'processed': 'IT'}),
          PubsubMessage(
              b'data004\xab\xac-seen',
              {
                  TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                  'processed': 'IT',
              })
      ],
  }

  def setUp(self):
    """Creates the per-test pipeline, topics and subscriptions."""
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')
    # Unique suffix so concurrent test runs do not share Pub/Sub resources.
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pub_client = pubsub.PublisherClient()
    # Resource names are passed by keyword: older client releases accept them
    # either way, while 2.x releases treat the first positional argument as a
    # request object and would reject a bare path string.
    self.input_topic = self.pub_client.create_topic(
        name=self.pub_client.topic_path(
            self.project, INPUT_TOPIC + self.uuid))
    self.output_topic = self.pub_client.create_topic(
        name=self.pub_client.topic_path(
            self.project, OUTPUT_TOPIC + self.uuid))

    self.sub_client = pubsub.SubscriberClient()
    self.input_sub = self.sub_client.create_subscription(
        name=self.sub_client.subscription_path(
            self.project, INPUT_SUB + self.uuid),
        topic=self.input_topic.name)
    self.output_sub = self.sub_client.create_subscription(
        name=self.sub_client.subscription_path(
            self.project, OUTPUT_SUB + self.uuid),
        topic=self.output_topic.name)

  def tearDown(self):
    """Removes the subscriptions and topics created in setUp."""
    test_utils.cleanup_subscriptions(
        self.sub_client, [self.input_sub, self.output_sub])
    test_utils.cleanup_topics(
        self.pub_client, [self.input_topic, self.output_topic])

  def _test_streaming(self, with_attributes):
    """Runs IT pipeline with message verifier.

    Args:
      with_attributes: False - Reads and writes message data only.
        True - Reads and writes message data and attributes. Also verifies
        id_label and timestamp_attribute features.
    """
    # Set on_success_matcher to verify pipeline state and pubsub output. These
    # verifications run on a (remote) worker.

    # Expect the state to be RUNNING since a streaming pipeline is usually
    # never DONE. The test runner will cancel the pipeline after verification.
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    expected_messages = self.EXPECTED_OUTPUT_MESSAGES[self.runner_name]
    if not with_attributes:
      # Data-only mode compares raw payloads rather than PubsubMessage objects.
      expected_messages = [pubsub_msg.data for pubsub_msg in expected_messages]
    if self.runner_name == 'TestDirectRunner':
      strip_attributes = None
    else:
      strip_attributes = [self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE]
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project,
        self.output_sub.name,
        expected_messages,
        timeout=MESSAGE_MATCHER_TIMEOUT_S,
        with_attributes=with_attributes,
        strip_attributes=strip_attributes)
    extra_opts = {
        'input_subscription': self.input_sub.name,
        'output_topic': self.output_topic.name,
        'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
        'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
    }

    # Generate input data and inject to PubSub. publish() is asynchronous and
    # returns a future, so issue every publish first (letting the client batch
    # them) and then block on each future. This ensures the input has been
    # accepted by the service, and surfaces any publish error, before the
    # pipeline is started.
    publish_futures = [
        self.pub_client.publish(
            self.input_topic.name, msg.data, **msg.attributes)
        for msg in self.INPUT_MESSAGES[self.runner_name]
    ]
    for publish_future in publish_futures:
      publish_future.result()

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pubsub_it_pipeline.run_pipeline(
        argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
        with_attributes=with_attributes,
        id_label=self.ID_LABEL,
        timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

  @attr('IT')
  def test_streaming_data_only(self):
    """Runs the streaming integration check on message data only."""
    self._test_streaming(with_attributes=False)

  @attr('IT')
  def test_streaming_with_attributes(self):
    """Runs the streaming integration check on data and attributes."""
    self._test_streaming(with_attributes=True)