def test_klio_filter_force(global_force, mock_config):
    mock_config.job_config.data.outputs[0].force = global_force
    kmsg1 = klio_pb2.KlioMessage()
    kmsg1.metadata.force = True
    kmsg2 = klio_pb2.KlioMessage()
    kmsg2.metadata.force = False
    pcoll = [kmsg1.SerializeToString(), kmsg2.SerializeToString()]

    with test_pipeline.TestPipeline() as p:
        p | beam.Create(pcoll) | helpers.KlioFilterForce()

    actual_counters = p.result.metrics().query()["counters"]
    force_ctr = actual_counters[0]

    assert "KlioFilterForce" == force_ctr.key.metric.namespace
    assert "kmsg-process-force" == force_ctr.key.metric.name
    if global_force:
        assert 1 == len(actual_counters)
        assert 2 == force_ctr.committed
    else:
        assert 2 == len(actual_counters)
        skip_ctr = actual_counters[1]
        assert 1 == force_ctr.committed
        assert 1 == skip_ctr.committed
        assert "KlioFilterForce" == skip_ctr.key.metric.namespace
        assert "kmsg-skip-force" == skip_ctr.key.metric.name
def _assert_expected_msg(actual):
    actual_msg = klio_pb2.KlioMessage()
    actual_msg.ParseFromString(actual)

    expected_msg = klio_pb2.KlioMessage()
    expected_msg.data.element = b"2"
    assert actual_msg == expected_msg
def test_klio_filter_ping(global_ping, mock_config):
    mock_config.job_config.data.inputs[0].ping = global_ping
    kmsg1 = klio_pb2.KlioMessage()
    kmsg1.metadata.ping = True
    kmsg2 = klio_pb2.KlioMessage()
    kmsg2.metadata.ping = False
    pcoll = [kmsg1.SerializeToString(), kmsg2.SerializeToString()]

    with test_pipeline.TestPipeline() as p:
        p | beam.Create(pcoll) | helpers.KlioFilterPing()

    actual_counters = p.result.metrics().query()["counters"]
    pass_thru_ctr = actual_counters[0]

    assert "KlioFilterPing" == pass_thru_ctr.key.metric.namespace
    assert "kmsg-skip-ping" == pass_thru_ctr.key.metric.name
    if global_ping:
        assert 1 == len(actual_counters)
        assert 2 == pass_thru_ctr.committed
    else:
        assert 2 == len(actual_counters)
        process_ctr = actual_counters[1]
        assert 1 == pass_thru_ctr.committed
        assert 1 == process_ctr.committed
        assert "KlioFilterPing" == process_ctr.key.metric.namespace
        assert "kmsg-process-ping" == process_ctr.key.metric.name
def test_trigger_upstream_job(mock_config, mocker, caplog):
    mock_gcs_client = mocker.patch("klio.transforms._helpers.gcsio.GcsIO")
    mock_gcs_client.return_value.exists.return_value = False
    mock_pubsub_client = mocker.patch("google.cloud.pubsub.PublisherClient")

    kmsg = klio_pb2.KlioMessage()
    kmsg.data.element = b"does_not_exist"

    exp_current_job = klio_pb2.KlioJob()
    exp_current_job.job_name = "a-job"
    exp_current_job.gcp_project = "not-a-real-project"
    exp_upstream_job = klio_pb2.KlioJob()
    exp_upstream_job.job_name = "upstream-job"
    exp_upstream_job.gcp_project = "upstream-project"

    exp_kmsg = klio_pb2.KlioMessage()
    exp_kmsg.version = klio_pb2.Version.V2
    exp_kmsg.data.element = b"does_not_exist"
    exp_lmtd = exp_kmsg.metadata.intended_recipients.limited
    exp_lmtd.recipients.extend([exp_upstream_job, exp_current_job])
    exp_lmtd.trigger_children_of.CopyFrom(exp_current_job)

    options = pipeline_options.PipelineOptions([])
    options.view_as(pipeline_options.StandardOptions).streaming = True
    with test_pipeline.TestPipeline(options=options) as p:
        in_pcol = p | beam.Create([kmsg.SerializeToString()])
        input_data = in_pcol | helpers.KlioGcsCheckInputExists()
        _ = input_data.not_found | helpers.KlioTriggerUpstream(
            upstream_job_name="upstream-job",
            upstream_topic="projects/upstream-project/topics/does-not-exist",
        )

    mock_gcs_client.return_value.exists.assert_called_once_with(
        "gs://hopefully-this-bucket-doesnt-exist/does_not_exist"
    )
    mock_pubsub_client.return_value.publish.assert_called_once_with(
        mock_pubsub_client.return_value.topic_path.return_value,
        exp_kmsg.SerializeToString(),
    )

    actual_counters = p.result.metrics().query()["counters"]
    assert 2 == len(actual_counters)
    data_not_found_ctr = actual_counters[0]
    trigger_upstream_ctr = actual_counters[1]

    assert 1 == data_not_found_ctr.committed
    assert "KlioGcsCheckInputExists" == data_not_found_ctr.key.metric.namespace
    assert "kmsg-data-not-found-input" == data_not_found_ctr.key.metric.name
    assert 1 == trigger_upstream_ctr.committed
    assert "KlioTriggerUpstream" == trigger_upstream_ctr.key.metric.namespace
    assert "kmsg-trigger-upstream" == trigger_upstream_ctr.key.metric.name

    expected_log_msg = "Triggering upstream upstream-job for does_not_exist"
    for record in caplog.records:
        if expected_log_msg in record.message:
            assert True
            break
    else:
        assert False, "Expected log message not found"
def _generate_kmsg_with_payload(element):
    message = klio_pb2.KlioMessage()
    message.version = klio_pb2.Version.V2
    message.metadata.intended_recipients.anyone.SetInParent()
    message.data.element = bytes(str(element["entity_id"]), "utf-8")
    message.data.payload = bytes(json.dumps(element), "utf-8")
    return message.SerializeToString()
def _dump_to_klio_message(key, payload):
    kmsg = klio_pb2.KlioMessage()
    kmsg.data.element = key
    out = io.BytesIO()
    np.save(out, payload)
    kmsg.data.payload = out.getvalue()
    return kmsg.SerializeToString()
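# A hedged counterpart sketch (not in the source): how a payload written by
# _dump_to_klio_message above can be read back. The name
# _load_from_klio_message is hypothetical; other snippets below use helpers
# like _load_from_msg / _unpickle_from_klio_message that are defined
# elsewhere in the codebase.
import io

import numpy as np

from klio_core.proto import klio_pb2


def _load_from_klio_message(serialized_kmsg):
    kmsg = klio_pb2.KlioMessage()
    kmsg.ParseFromString(serialized_kmsg)
    # np.save wrote the array into the payload as .npy bytes; np.load reads
    # it back from an in-memory buffer.
    return np.load(io.BytesIO(kmsg.data.payload))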
def test_read_messages_timestamp_attribute_rfc3339_success(
    mocker, patch_sub_client, patch_msg_manager,
):
    exp_entity_id = "entity_id"
    kmsg = klio_pb2.KlioMessage()
    kmsg.data.element = bytes(exp_entity_id, "utf-8")
    data = kmsg.SerializeToString()

    attributes = {"time": "2018-03-12T13:37:01.234567Z"}
    publish_time_secs = 1337000000
    publish_time_nanos = 133700000
    ack_id = "ack_id"
    pull_response = beam_test_utils.create_pull_response([
        beam_test_utils.PullResponseMessage(
            data, attributes, publish_time_secs, publish_time_nanos, ack_id
        )
    ])
    pmsg = b_pubsub.PubsubMessage(data, attributes)
    expected_elements = [
        beam_testing_util.TestWindowedValue(
            pmsg,
            beam_utils.timestamp.Timestamp.from_rfc3339(attributes["time"]),
            [beam_transforms.window.GlobalWindow()],
        ),
    ]
    patch_sub_client.pull.return_value = pull_response

    options = pipeline_options.PipelineOptions([])
    options.view_as(pipeline_options.StandardOptions).streaming = True
    with beam_test_pipeline.TestPipeline(options=options) as p:
        pcoll = p | b_pubsub.ReadFromPubSub(
            "projects/fakeprj/topics/a_topic",
            None,
            None,
            with_attributes=True,
            timestamp_attribute="time",
        )
        # Check original functionality that was kept the same
        beam_testing_util.assert_that(
            pcoll,
            beam_testing_util.equal_to(expected_elements),
            reify_windows=True,
        )

    # Check overridden functionality:
    # 1. Check that auto-acking is skipped
    patch_sub_client.acknowledge.assert_not_called()
    # 2. Check that MessageManager daemon threads were started
    patch_msg_manager.assert_called_once_with(
        patch_sub_client.subscription_path()
    )
    # 3. Check that messages were added to the MessageManager
    patch_msg_manager.return_value.add.assert_called_once_with(ack_id, pmsg)
    # 4. Check that one message is handled at a time, instead of the
    #    original 10
    patch_sub_client.pull.assert_called_once_with(
        mocker.ANY, max_messages=1, return_immediately=True
    )

    patch_sub_client.api.transport.channel.close.assert_called_once_with()
def test_klio_drop(mock_config, caplog):
    kmsg = klio_pb2.KlioMessage()

    with test_pipeline.TestPipeline() as p:
        p | beam.Create([kmsg.SerializeToString()]) | helpers.KlioDrop()

    # beam produces 50+ log messages so let's just iterate and find what
    # we're looking for *shrug*
    for rec in caplog.records:
        if "Dropping KlioMessage" in rec.message:
            assert True
            break
    else:
        assert False, "Expected log message not found"

    actual_counters = p.result.metrics().query()["counters"]
    assert 3 == len(actual_counters)
    received_ctr = actual_counters[0]
    drop_ctr = actual_counters[1]
    success_ctr = actual_counters[2]

    assert 1 == received_ctr.committed
    assert "KlioDrop.process" == received_ctr.key.metric.namespace
    assert "kmsg-received" == received_ctr.key.metric.name
    assert 1 == drop_ctr.committed
    assert "KlioDrop" == drop_ctr.key.metric.namespace
    assert "kmsg-drop" == drop_ctr.key.metric.name
    assert 1 == success_ctr.committed
    assert "KlioDrop.process" == success_ctr.key.metric.namespace
    assert "kmsg-success" == success_ctr.key.metric.name
def assert_expected_klio_msg_from_avro_write(element):
    file_path_read = os.path.join(FIXTURE_PATH, "elements_text_file.txt")
    with open(file_path_read, "rb") as fr:
        expected_elements = fr.read().splitlines()

    message = klio_pb2.KlioMessage()
    message.ParseFromString(element)
    assert message.data.element in expected_elements
def test_read_messages_timestamp_attribute_fail_parse(patch_sub_client):
    exp_entity_id = "entity_id"
    kmsg = klio_pb2.KlioMessage()
    kmsg.data.element = bytes(exp_entity_id, "utf-8")
    data = kmsg.SerializeToString()

    attributes = {"time": "1337 unparseable"}
    publish_time_secs = 1520861821
    publish_time_nanos = 234567000
    ack_id = "ack_id"
    pull_response = beam_test_utils.create_pull_response([
        beam_test_utils.PullResponseMessage(
            data, attributes, publish_time_secs, publish_time_nanos, ack_id
        )
    ])
    patch_sub_client.pull.return_value = pull_response

    options = pipeline_options.PipelineOptions([])
    options.view_as(pipeline_options.StandardOptions).streaming = True
    p = beam_test_pipeline.TestPipeline(options=options)
    _ = p | b_pubsub.ReadFromPubSub(
        "projects/fakeprj/topics/a_topic",
        None,
        None,
        with_attributes=True,
        timestamp_attribute="time",
    )
    with pytest.raises(ValueError, match=r"parse"):
        p.run()

    patch_sub_client.acknowledge.assert_not_called()
    patch_sub_client.api.transport.channel.close.assert_called_with()
def _convert_raw_pubsub_message(ack_id, pmessage):
    # TODO: either use klio.message.serializer.to_klio_message, or
    # figure out how to handle when a parsed_message can't be parsed
    # into a KlioMessage (will need to somehow get the klio context)
    kmsg = klio_pb2.KlioMessage()
    kmsg.ParseFromString(pmessage.data)
    entity_id = kmsg.data.element.decode("utf-8")
    psk_msg = PubSubKlioMessage(ack_id, entity_id)
    return psk_msg
def read_records(self, file_name, range_tracker):
    records = super(_KlioFastAvroSource, self).read_records(
        file_name=file_name, range_tracker=range_tracker
    )
    for record in records:
        message = klio_pb2.KlioMessage()
        message.version = klio_pb2.Version.V2
        message.metadata.intended_recipients.anyone.SetInParent()
        message.data.element = bytes(json.dumps(record).encode("utf-8"))
        yield message.SerializeToString()
def read_records(self, file_name, range_tracker):
    records = super(_KlioReadFromTextSource, self).read_records(
        file_name, range_tracker
    )
    for record in records:
        record_as_bytes = record.encode("utf-8")
        message = klio_pb2.KlioMessage()
        message.version = klio_pb2.Version.V2
        message.metadata.intended_recipients.anyone.SetInParent()
        message.data.element = record_as_bytes
        yield message.SerializeToString()
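# Hedged illustration (not in the source): every value yielded by
# read_records above is a serialized KlioMessage whose element holds one
# utf-8 encoded line of the input file. This helper is hypothetical and only
# demonstrates the expected shape of a yielded record.
from klio_core.proto import klio_pb2


def _describe_yielded_record(serialized):
    message = klio_pb2.KlioMessage()
    message.ParseFromString(serialized)
    assert message.version == klio_pb2.Version.V2
    # one utf-8 encoded line of the source text file
    return message.data.element.decode("utf-8")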
def write_record(self, file_handle, encoded_element):
    """Write a single encoded record.

    Args:
        file_handle (file-like object): handle to the open sink file
            in the configured output data location.
        encoded_element (bytes): a serialized KlioMessage.
    """
    message = klio_pb2.KlioMessage()
    message.ParseFromString(encoded_element)
    record = message.data.element
    super(_KlioTextSink, self).write_encoded_record(file_handle, record)
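# Hedged sketch (not in the source): the sink above is the inverse of the
# text source's read_records - it unwraps serialized KlioMessage bytes back
# to the raw element before delegating the write to write_encoded_record.
# A minimal round-trip under that assumption:
from klio_core.proto import klio_pb2

message = klio_pb2.KlioMessage()
message.data.element = b"one line of output"
encoded_element = message.SerializeToString()
# _KlioTextSink.write_record would extract b"one line of output" from
# encoded_element before writing it out.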
def subtract_filter_from_full(key_pair):
    key, pair_data = key_pair
    full = _unpickle_from_klio_message(pair_data["full"][0])
    nn_filter = _unpickle_from_klio_message(pair_data["nnfilter"][0])
    net = full - nn_filter
    payload = pickle.dumps(net)

    kmsg = klio_pb2.KlioMessage()
    kmsg.data.element = key
    kmsg.data.payload = payload
    return (key, kmsg.SerializeToString())
def test_klio_debug(mock_config):
    kmsg = klio_pb2.KlioMessage()

    with test_pipeline.TestPipeline() as p:
        p | beam.Create([kmsg.SerializeToString()]) | helpers.KlioDebugMessage()

    actual_counters = p.result.metrics().query()["counters"]
    assert 1 == len(actual_counters)
    assert 1 == actual_counters[0].committed
    assert "KlioDebugMessage" == actual_counters[0].key.metric.namespace
    assert "kmsg-debug" == actual_counters[0].key.metric.name
def assert_audit(actual):
    job = klio_pb2.KlioJob()
    job.job_name = "a-job"
    job.gcp_project = "not-a-real-project"
    audit_log_item = klio_pb2.KlioJobAuditLogItem()
    audit_log_item.klio_job.CopyFrom(job)

    exp_msg = klio_pb2.KlioMessage()
    exp_msg.version = klio_pb2.Version.V2
    exp_msg.metadata.job_audit_log.extend([audit_log_item])
    expected = exp_msg.SerializeToString()

    assert expected == actual
    return actual
def test_trigger_upstream_job(mock_config, mocker, capsys):
    mock_gcs_client = mocker.patch("klio.transforms._helpers.gcsio.GcsIO")
    mock_gcs_client.return_value.exists.return_value = False
    mock_pubsub_client = mocker.patch("google.cloud.pubsub.PublisherClient")

    kmsg = klio_pb2.KlioMessage()
    kmsg.data.element = b"does_not_exist"

    exp_current_job = klio_pb2.KlioJob()
    exp_current_job.job_name = "a-job"
    exp_current_job.gcp_project = "not-a-real-project"
    exp_upstream_job = klio_pb2.KlioJob()
    exp_upstream_job.job_name = "upstream-job"
    exp_upstream_job.gcp_project = "upstream-project"

    exp_kmsg = klio_pb2.KlioMessage()
    exp_kmsg.version = klio_pb2.Version.V2
    exp_kmsg.data.element = b"does_not_exist"
    exp_lmtd = exp_kmsg.metadata.intended_recipients.limited
    exp_lmtd.recipients.extend([exp_upstream_job, exp_current_job])
    exp_lmtd.trigger_children_of.CopyFrom(exp_current_job)

    options = pipeline_options.PipelineOptions([])
    options.view_as(pipeline_options.StandardOptions).streaming = True
    with test_pipeline.TestPipeline(options=options) as p:
        in_pcol = p | beam.Create([kmsg.SerializeToString()])
        input_data = in_pcol | helpers.KlioGcsCheckInputExists()
        _ = input_data.not_found | helpers.KlioTriggerUpstream(
            upstream_job_name="upstream-job",
            upstream_topic="projects/upstream-project/topics/does-not-exist",
        )

    mock_gcs_client.return_value.exists.assert_called_once_with(
        "gs://hopefully-this-bucket-doesnt-exist/does_not_exist"
    )
    mock_pubsub_client.return_value.publish.assert_called_once_with(
        mock_pubsub_client.return_value.topic_path.return_value,
        exp_kmsg.SerializeToString(),
    )
def to_klio_message(incoming_message, kconfig=None, logger=None):
    """Deserialize ``bytes`` into a :ref:`KlioMessage <klio-message>`.

    .. tip::

        Set ``job_config.allow_non_klio_messages`` to ``True`` in
        ``klio-job.yaml`` in order to process non-``KlioMessages`` as
        regular ``bytes``. This function will create a new ``KlioMessage``
        and set the incoming ``bytes`` to ``KlioMessage.data.element``.

    Args:
        incoming_message (bytes): Incoming bytes to parse into a \
            ``KlioMessage``.
        kconfig (klio_core.config.KlioConfig): the current job's
            configuration.
        logger (logging.Logger): the logger associated with the Klio job.

    Returns:
        klio_core.proto.klio_pb2.KlioMessage: a ``KlioMessage``.

    Raises:
        klio_core.proto.klio_pb2._message.DecodeError: if the incoming
            message can not be parsed into a ``KlioMessage`` and
            ``job_config.allow_non_klio_messages`` in ``klio-job.yaml``
            is set to ``False``.
    """
    # TODO: when making a generic de/ser func, be sure to assert
    # kconfig and logger exist
    parsed_message = klio_pb2.KlioMessage()
    try:
        parsed_message.ParseFromString(incoming_message)
    except klio_pb2._message.DecodeError as e:
        if kconfig.job_config.allow_non_klio_messages:
            # We are assuming we have been given "raw" data that is not in
            # the form of a serialized KlioMessage.
            parsed_message.data.element = incoming_message
            # Default recipients to anyone - we can't know who the
            # appropriate recipient is when it's not a real Klio message.
            parsed_message.metadata.intended_recipients.anyone.SetInParent()
            parsed_message.version = klio_pb2.Version.V2
        else:
            logger.error(
                "Can not parse incoming message. To support non-Klio "
                "messages, add `job_config.allow_non_klio_messages = true` "
                "in the job's `klio-job.yaml` file."
            )
            raise e

    parsed_message = _handle_msg_compat(parsed_message)
    return parsed_message
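# Hedged usage sketch (not in the source), assuming it runs in the same
# module as to_klio_message above (which also relies on _handle_msg_compat).
# `_fake_config` is a hypothetical stand-in for klio_core.config.KlioConfig
# exposing only the attribute this function touches.
import logging
import types

from klio_core.proto import klio_pb2

_fake_config = types.SimpleNamespace(
    job_config=types.SimpleNamespace(allow_non_klio_messages=True)
)
_logger = logging.getLogger("klio")

_msg = klio_pb2.KlioMessage()
_msg.version = klio_pb2.Version.V2
_msg.metadata.intended_recipients.anyone.SetInParent()
_msg.data.element = b"some-entity-id"

# A serialized KlioMessage parses back with its element intact; raw bytes
# that fail to parse would instead be wrapped into a new KlioMessage,
# because allow_non_klio_messages is True above.
_parsed = to_klio_message(_msg.SerializeToString(), _fake_config, _logger)
assert _parsed.data.element == b"some-entity-id"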
def subtract_filter_from_full(key_pair):
    # key_pair looks like:
    # (element, {
    #     "full": [<serialized numpy array>],
    #     "nnfilter": [<serialized numpy array>],
    # })
    key, pair_data = key_pair
    full = _load_from_msg(pair_data["full"][0])
    nn_filter = _load_from_msg(pair_data["nnfilter"][0])
    net = full - nn_filter
    payload = pickle.dumps(net)

    kmsg = klio_pb2.KlioMessage()
    kmsg.data.element = key
    kmsg.data.payload = payload
    return (key, kmsg.SerializeToString())
def mark_done(kmsg_or_bytes):
    """Mark a KlioMessage as done and to be removed from handling.

    This method sets the PubSubKlioMessage.event object so that, in the
    next iteration of `MessageManager.manage`, the message is acknowledged
    and removed from further "babysitting".

    Args:
        kmsg_or_bytes (klio_pb2.KlioMessage or bytes): the KlioMessage
            (or a KlioMessage that has been serialized to bytes) to be
            marked as done.
    """
    kmsg = kmsg_or_bytes
    mm_logger = logging.getLogger("klio.gke_direct_runner.message_manager")

    # Wrap in a general try/except so this method returns cleanly, i.e. no
    # raised errors that may prevent the pipeline from consuming the next
    # available message.
    try:
        # TODO: either use klio.message.serializer.to_klio_message, or
        # figure out how to handle when a parsed_message can't be parsed
        # into a KlioMessage (will need to somehow get the klio context).
        if not isinstance(kmsg_or_bytes, klio_pb2.KlioMessage):
            kmsg = klio_pb2.KlioMessage()
            kmsg.ParseFromString(kmsg_or_bytes)

        entity_id = kmsg.data.element.decode("utf-8")

        # Removing the message from the ENTITY_ID_TO_ACK_ID dict tells the
        # MessageManager that this message is now ready to be acknowledged
        # and is no longer being worked upon.
        with MESSAGE_LOCK:
            msg = ENTITY_ID_TO_ACK_ID.pop(entity_id, None)

        if not msg:
            # NOTE: this logger exists as `self.mgr_logger`, but this
            # method needs to be a staticmethod so we don't unnecessarily
            # init the class just to mark a message as done.
            mm_logger.warning(f"Unable to acknowledge {entity_id}: Not found.")
    except Exception as e:
        # Catch all exceptions so the pipeline doesn't enter a weird state
        # because of an uncaught error.
        mm_logger.warning(
            f"Error occurred while trying to remove message {kmsg}: {e}",
            exc_info=True,
        )
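# Hedged usage sketch (not in the source): callers can hand mark_done either
# the KlioMessage itself or its serialized bytes; both resolve to the same
# entity id. For an id that is not currently tracked in ENTITY_ID_TO_ACK_ID,
# mark_done only logs a warning, so this snippet is safe to run standalone.
from klio_core.proto import klio_pb2

done_msg = klio_pb2.KlioMessage()
done_msg.data.element = b"finished-entity-id"

mark_done(done_msg)                      # as a KlioMessage
mark_done(done_msg.SerializeToString())  # or as serialized bytes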
def test_convert_raw_pubsub_message(mocker, monkeypatch, msg_manager):
    mock_event = mocker.Mock()
    monkeypatch.setattr(pmm.threading, "Event", mock_event)
    exp_message = pmm.PubSubKlioMessage("ack_id1", "kmsg_id1")

    kmsg = klio_pb2.KlioMessage()
    kmsg.data.element = b"kmsg_id1"
    kmsg_bytes = kmsg.SerializeToString()
    pmsg = beam_pubsub.PubsubMessage(data=kmsg_bytes, attributes={})

    act_message = msg_manager._convert_raw_pubsub_message("ack_id1", pmsg)

    # Compare class attributes (via __dict__) since we'd otherwise need to
    # implement __eq__ on the PubSubKlioMessage class, but doing so would
    # make it unhashable. That can be addressed, but this seems easier for
    # now.
    assert _compare_objects_dicts(exp_message, act_message)
def read_records(self, file_name, range_tracker):
    records = super(_KlioFastAvroSource, self).read_records(
        file_name=file_name, range_tracker=range_tracker
    )
    for record in records:
        message = klio_pb2.KlioMessage()
        message.version = klio_pb2.Version.V2
        message.metadata.intended_recipients.anyone.SetInParent()
        # If the record contains an "element" key, use its value as the
        # message element to handle event reading; otherwise, stuff the
        # whole record into the message element.
        message.data.element = (
            record["element"]
            if "element" in record
            else bytes(json.dumps(record).encode("utf-8"))
        )
        yield message.SerializeToString()
def test_process(klio_msg, expected_log_messages, caplog):
    helloklio_fn = transforms.LogKlioMessage()
    output = helloklio_fn.process(klio_msg.SerializeToString())

    row = {
        "entity_id": klio_msg.data.element.decode("utf-8"),
        "value": klio_msg.data.element.decode("utf-8"),
    }
    expected_kmsg = klio_pb2.KlioMessage()
    expected_kmsg.data.element = klio_msg.data.element
    expected_kmsg.data.payload = bytes(json.dumps(row), "utf-8")
    expected_kmsg.version = klio_pb2.Version.V2
    assert expected_kmsg.SerializeToString() == list(output)[0]

    for index, record in enumerate(caplog.records):
        assert "INFO" == record.levelname
        assert expected_log_messages[index] == record.message
def _generate_klio_message(self):
    message = klio_pb2.KlioMessage()
    message.version = klio_pb2.Version.V2
    message.metadata.intended_recipients.anyone.SetInParent()
    # TODO: this is where we should add (relevant) KlioMessage.metadata:
    # (1) One thing to figure out is the klio_pb2.KlioJob definition,
    #     particularly the JobInput definition, in light of KlioConfig v2.
    #     Once that's figured out, we should at least populate the job
    #     audit log.
    # (2) Another thing to figure out is force/ping. In streaming, messages
    #     are individually marked as force or ping when needed. However,
    #     users aren't able to tag individual messages generated from a row
    #     of BQ data as force/ping, and it's probably very difficult for us
    #     to provide a way to do that. So, should we allow users to at
    #     least globally set force/ping on their event input config in
    #     klio-job.yaml? Potentially.
    return message
def _expected_avro_kmsgs():
    expected_records = [
        {
            "username": "******",
            "tweet": "Rock: Nerf paper, scissors is fine.",
            "timestamp": 1366150681,
        },
        {
            "username": "******",
            "tweet": "Works as intended. Terran is IMBA.",
            "timestamp": 1366154481,
        },
    ]
    expected_kmsgs = []
    for record in expected_records:
        message = klio_pb2.KlioMessage()
        message.version = klio_pb2.Version.V2
        message.metadata.intended_recipients.anyone.SetInParent()
        message.data.element = bytes(json.dumps(record).encode("utf-8"))
        expected_kmsgs.append(message)
    return expected_kmsgs
def test_update_klio_log(mocker, monkeypatch, caplog, mock_config):
    mock_ts = mocker.Mock()
    monkeypatch.setattr(klio_pb2.KlioJobAuditLogItem, "timestamp", mock_ts)

    kmsg = klio_pb2.KlioMessage()
    kmsg.version = klio_pb2.Version.V2
    assert not kmsg.metadata.job_audit_log  # sanity check

    with test_pipeline.TestPipeline() as p:
        in_pcol = p | beam.Create([kmsg.SerializeToString()])
        act_pcol = in_pcol | helpers.KlioUpdateAuditLog()
        _ = act_pcol | beam.Map(assert_audit)

    exp_log = (
        "KlioMessage full audit log - Entity ID: - Path: not-a-real-project::"
        "a-job (current job)"
    )
    for rec in caplog.records:
        if exp_log in rec.message:
            assert True
            break
    else:
        assert False, "Expected debug audit log not found"
def test_process(klio_msg, expected_log_messages, caplog):
    helloklio_fn = transforms.LogKlioMessage()
    output = helloklio_fn.process(klio_msg.SerializeToString())

    row = {
        "entity_id": klio_msg.data.element.decode("utf-8"),
        "value": klio_msg.data.element.decode("utf-8"),
    }
    expected_kmsg = klio_pb2.KlioMessage()
    expected_kmsg.data.element = klio_msg.data.element
    expected_kmsg.data.payload = bytes(json.dumps(row), "utf-8")
    expected_kmsg.version = klio_pb2.Version.V2
    assert expected_kmsg.SerializeToString() == list(output)[0]

    # Logs may not all be available yet since some may be emitted on a
    # different thread, so wait a second.
    time.sleep(1)

    assert len(caplog.records) == len(expected_log_messages)
    for index, record in enumerate(caplog.records):
        expected_log_message = expected_log_messages[index]
        assert expected_log_message["level"] == record.levelname
        assert expected_log_message["message"] in record.message
def klio_msg():
    element = b"s0m3_tr4ck_1d"
    msg = klio_pb2.KlioMessage()
    msg.data.element = element
    msg.version = klio_pb2.Version.V2
    return msg
def assert_expected_klio_msg_from_file(element):
    message = klio_pb2.KlioMessage()
    message.ParseFromString(element)
    assert message.data.element is not None
    assert isinstance(message.data.element, bytes)