def test_trigger_upstream_job(mock_config, mocker, caplog):
    """Check the trigger-upstream path when the input check finds nothing.

    ``GcsIO.exists`` is patched to return ``False``, so the message is
    routed through ``not_found`` into ``KlioTriggerUpstream``. The test
    then verifies the GCS lookup, the published payload, the two metric
    counters and the presence of the expected log line.
    """
    gcs_cls = mocker.patch("klio.transforms._helpers.gcsio.GcsIO")
    gcs_cls.return_value.exists.return_value = False
    publisher_cls = mocker.patch("google.cloud.pubsub.PublisherClient")

    incoming = klio_pb2.KlioMessage()
    incoming.data.element = b"does_not_exist"

    current_job = klio_pb2.KlioJob(
        job_name="a-job", gcp_project="not-a-real-project"
    )
    upstream_job = klio_pb2.KlioJob(
        job_name="upstream-job", gcp_project="upstream-project"
    )

    # The message expected on the upstream topic: limited to the upstream
    # and current jobs, with children of the current job to be triggered.
    expected_msg = klio_pb2.KlioMessage()
    expected_msg.version = klio_pb2.Version.V2
    expected_msg.data.element = b"does_not_exist"
    limited = expected_msg.metadata.intended_recipients.limited
    limited.recipients.extend([upstream_job, current_job])
    limited.trigger_children_of.CopyFrom(current_job)

    options = pipeline_options.PipelineOptions([])
    options.view_as(pipeline_options.StandardOptions).streaming = True

    with test_pipeline.TestPipeline(options=options) as p:
        checked = (
            p
            | beam.Create([incoming.SerializeToString()])
            | helpers.KlioGcsCheckInputExists()
        )
        _ = checked.not_found | helpers.KlioTriggerUpstream(
            upstream_job_name="upstream-job",
            upstream_topic="projects/upstream-project/topics/does-not-exist",
        )

    # Everything below inspects the finished run, so it sits after the
    # ``with`` block.
    gcs_cls.return_value.exists.assert_called_once_with(
        "gs://hopefully-this-bucket-doesnt-exist/does_not_exist"
    )
    publisher = publisher_cls.return_value
    publisher.publish.assert_called_once_with(
        publisher.topic_path.return_value,
        expected_msg.SerializeToString(),
    )

    # (namespace, name) of each counter, in the order they are reported.
    expected_counters = [
        ("KlioGcsCheckInputExists", "kmsg-data-not-found-input"),
        ("KlioTriggerUpstream", "kmsg-trigger-upstream"),
    ]
    actual_counters = p.result.metrics().query()["counters"]
    assert len(actual_counters) == len(expected_counters)
    for counter, (namespace, name) in zip(actual_counters, expected_counters):
        assert counter.committed == 1
        assert counter.key.metric.namespace == namespace
        assert counter.key.metric.name == name

    expected_log_msg = "Triggering upstream upstream-job for does_not_exist"
    assert any(
        expected_log_msg in record.message for record in caplog.records
    ), "Expected log message not found"
def test_trigger_upstream_job(mock_config, mocker, capsys):
    """Check that a missing input causes a publish to the upstream topic.

    ``GcsIO.exists`` is patched to return ``False``; the test verifies the
    GCS path that was looked up and the exact serialized message handed to
    the patched Pub/Sub publisher.
    """
    gcs_cls = mocker.patch("klio.transforms._helpers.gcsio.GcsIO")
    gcs_cls.return_value.exists.return_value = False
    publisher_cls = mocker.patch("google.cloud.pubsub.PublisherClient")

    incoming = klio_pb2.KlioMessage()
    incoming.data.element = b"does_not_exist"

    current_job = klio_pb2.KlioJob(
        job_name="a-job", gcp_project="not-a-real-project"
    )
    upstream_job = klio_pb2.KlioJob(
        job_name="upstream-job", gcp_project="upstream-project"
    )

    # Expected outgoing message: recipients limited to the upstream and
    # current jobs, with ``trigger_children_of`` set to the current job.
    expected_msg = klio_pb2.KlioMessage()
    expected_msg.version = klio_pb2.Version.V2
    expected_msg.data.element = b"does_not_exist"
    limited = expected_msg.metadata.intended_recipients.limited
    limited.recipients.extend([upstream_job, current_job])
    limited.trigger_children_of.CopyFrom(current_job)

    options = pipeline_options.PipelineOptions([])
    options.view_as(pipeline_options.StandardOptions).streaming = True

    with test_pipeline.TestPipeline(options=options) as p:
        checked = (
            p
            | beam.Create([incoming.SerializeToString()])
            | helpers.KlioGcsCheckInputExists()
        )
        _ = checked.not_found | helpers.KlioTriggerUpstream(
            upstream_job_name="upstream-job",
            upstream_topic="projects/upstream-project/topics/does-not-exist",
        )

    # Mock expectations are checked after the ``with`` block has exited.
    gcs_cls.return_value.exists.assert_called_once_with(
        "gs://hopefully-this-bucket-doesnt-exist/does_not_exist"
    )
    publisher = publisher_cls.return_value
    publisher.publish.assert_called_once_with(
        publisher.topic_path.return_value,
        expected_msg.SerializeToString(),
    )
def assert_audit(actual):
    """Assert that ``actual`` is the expected serialized ``KlioMessage``.

    The expected message has version V2 and a single audit-log item whose
    job is ``a-job`` in project ``not-a-real-project``.

    Returns:
        ``actual``, unchanged, when the comparison holds.
    """
    audited_job = klio_pb2.KlioJob(
        job_name="a-job", gcp_project="not-a-real-project"
    )

    log_entry = klio_pb2.KlioJobAuditLogItem()
    log_entry.klio_job.CopyFrom(audited_job)

    expected_msg = klio_pb2.KlioMessage()
    expected_msg.version = klio_pb2.Version.V2
    expected_msg.metadata.job_audit_log.extend([log_entry])

    assert expected_msg.SerializeToString() == actual
    return actual
def _should_process(self, klio_message):
    """Tell whether ``klio_message`` should be handled by this job.

    Returns:
        ``True`` when ``metadata.downstream`` is empty, or when
        ``_helpers._job_in_jobs`` reports the current job among the
        downstream jobs. Otherwise an info line is logged and ``False``
        is returned.
    """
    downstream_jobs = klio_message.metadata.downstream

    if downstream_jobs:
        this_job = klio_pb2.KlioJob()
        this_job.ParseFromString(self._klio.job)

        if not _helpers._job_in_jobs(this_job, downstream_jobs):
            self._klio.logger.info(
                "Dropping KlioMessage - job not an intended recipient for "
                "message with entity_id {}.".format(
                    klio_message.data.entity_id
                )
            )
            return False

    # Either nothing is listed downstream (the message is in top-down mode
    # and should be handled) or this job is one of the listed jobs.
    return True
def _should_process(self, klio_message):
    """Tell whether ``klio_message`` should be handled by this job.

    The decision is driven by the ``recipients`` oneof of
    ``metadata.intended_recipients``:

    * ``anyone``: process.
    * unset: log a warning and drop.
    * ``limited``: process only if the current job is in
      ``limited.recipients``. If it is also the ``trigger_children_of``
      job, the message's intended recipients are switched to ``anyone``
      before returning.

    Returns:
        ``True`` to process the message, ``False`` to drop it.
    """
    intended = klio_message.metadata.intended_recipients

    # WhichOneof gives "anyone", "limited", or None when nothing is set.
    which = intended.WhichOneof("recipients")

    if which == "anyone":
        return True

    if which is None:
        # Open question carried over from the original author: is it safe
        # to treat an unset value in a v2 message as top-down? That is
        # expected to be the case for batch.
        self._klio.logger.warning(
            "Dropping KlioMessage - No 'intended_recipients' set in "
            "metadata of KlioMessage with element '{}'.".format(
                klio_message.data.element
            )
        )
        return False

    # Only "limited" is left at this point.
    this_job = klio_pb2.KlioJob()
    this_job.ParseFromString(self._klio.job)
    limited = intended.limited

    # Not addressed to this job: do not process.
    if not _helpers._job_in_jobs(this_job, limited.recipients):
        return False

    # This job is a recipient *and* the trigger_children_of job: the
    # message was originally top-down but was missing dependencies, so mark
    # it as meant for "anyone" again to signify top-down.
    if _helpers._job_in_jobs(this_job, [limited.trigger_children_of]):
        # 'anyone' is essentially an empty message and cannot simply be
        # assigned; SetInParent() is how it gets selected.
        # https://stackoverflow.com/a/29651069
        intended.anyone.SetInParent()

    return True
def _generate_current_job_object(self):
    """Build a ``KlioJob`` from ``self._klio.config``.

    Returns:
        A ``klio_pb2.KlioJob`` whose ``job_name`` is the configured job
        name and whose ``gcp_project`` is the pipeline options' project.
    """
    current_job = klio_pb2.KlioJob()
    config = self._klio.config
    current_job.job_name = config.job_name
    current_job.gcp_project = config.pipeline_options.project
    return current_job
def _generate_upstream_job_object(self):
    """Build a ``KlioJob`` for the upstream job.

    Returns:
        A ``klio_pb2.KlioJob`` populated from ``self.upstream_job_name``
        and ``self.upstream_gcp_project``.
    """
    job = klio_pb2.KlioJob()
    # Plain attribute assignment, so an invalid value fails loudly here.
    job.job_name = self.upstream_job_name
    job.gcp_project = self.upstream_gcp_project
    return job
def _create_klio_job_obj(self):
    """Return a serialized ``KlioJob`` built from ``self.config``.

    Returns:
        The result of ``SerializeToString()`` on a ``klio_pb2.KlioJob``
        carrying the configured job name and pipeline project.
    """
    job = klio_pb2.KlioJob()
    job.job_name = self.config.job_name
    job.gcp_project = self.config.pipeline_options.project
    return job.SerializeToString()
def get_other_job(self):
    """Return a ``KlioJob`` named ``other`` in ``not-a-real-project``."""
    return klio_pb2.KlioJob(
        job_name="other",
        gcp_project="not-a-real-project",
    )
def get_current_job(self):
    """Return a ``KlioJob`` named ``a-job`` in ``not-a-real-project``."""
    return klio_pb2.KlioJob(
        job_name="a-job",
        gcp_project="not-a-real-project",
    )