Exemplo n.º 1
0
def test_serde_idempotence(state_obj):
    """
    Round-trips a Checkpoint through its aspect form and expects equality.

    A Checkpoint wrapping ``state_obj`` is converted to a checkpoint aspect and
    then rebuilt from that aspect; the rebuilt object must compare equal to the
    one the test started with.
    """
    # Build the checkpoint that serves as the round-trip input.
    original = Checkpoint(
        job_name=test_job_name,
        pipeline_name=test_pipeline_name,
        platform_instance_id=test_platform_instance_id,
        run_id=test_run_id,
        config=test_source_config,
        state=state_obj,
    )

    # Serialize: the conversion must yield an aspect for this state.
    aspect = original.to_checkpoint_aspect(
        # fmt: off
        max_allowed_state_size=2**20
        # fmt: on
    )
    assert aspect is not None

    # Deserialize with the same state/config classes and compare to the input.
    round_tripped = Checkpoint.create_from_checkpoint_aspect(
        job_name=test_job_name,
        checkpoint_aspect=aspect,
        state_class=type(state_obj),
        config_class=MySQLConfig,
    )
    assert original == round_tripped
Exemplo n.º 2
0
def test_create_from_checkpoint_aspect(state_obj):
    """
    Tests the Checkpoint class API 'create_from_checkpoint_aspect' with the state_obj parameter as the state.

    A raw checkpoint aspect is assembled by hand from ``state_obj`` and the
    module-level test fixtures, converted into a Checkpoint, and compared with
    a Checkpoint constructed directly from the same inputs.
    """
    # Imported locally so the block does not depend on a module-level
    # `timezone` import being present.
    from datetime import timezone

    # 1. Construct the raw aspect object with the state
    checkpoint_state = IngestionCheckpointStateClass(
        formatVersion=state_obj.version,
        serde=state_obj.serde,
        payload=state_obj.to_bytes(),
    )
    # Use an aware UTC datetime: calling .timestamp() on the naive value
    # returned by utcnow() interprets it as local time, which skews the epoch
    # millis by the host's UTC offset (and utcnow() is deprecated since 3.12).
    now_millis = int(datetime.now(tz=timezone.utc).timestamp() * 1000)
    checkpoint_aspect = DatahubIngestionCheckpointClass(
        timestampMillis=now_millis,
        pipelineName=test_pipeline_name,
        platformInstanceId=test_platform_instance_id,
        config=test_source_config.json(),
        state=checkpoint_state,
        runId=test_run_id,
    )

    # 2. Create the checkpoint from the raw checkpoint aspect and validate.
    checkpoint_obj = Checkpoint.create_from_checkpoint_aspect(
        job_name=test_job_name,
        checkpoint_aspect=checkpoint_aspect,
        state_class=type(state_obj),
        config_class=MySQLConfig,
    )

    expected_checkpoint_obj = Checkpoint(
        job_name=test_job_name,
        pipeline_name=test_pipeline_name,
        platform_instance_id=test_platform_instance_id,
        run_id=test_run_id,
        config=test_source_config,
        state=state_obj,
    )
    assert checkpoint_obj == expected_checkpoint_obj
Exemplo n.º 3
0
 def create_checkpoint(self, job_id: JobId) -> Optional[Checkpoint]:
     """
     Build a fresh checkpoint, carrying an empty SQLAlchemy state, for the job.

     Returns None for any job other than the default ingestion job.
     """
     assert self.ctx.pipeline_name is not None

     # Guard clause: only the default ingestion job gets a checkpoint here.
     is_default_job = job_id == self.get_default_ingestion_job_id()
     if not is_default_job:
         return None

     return Checkpoint(
         job_name=job_id,
         pipeline_name=self.ctx.pipeline_name,
         platform_instance_id=self.get_platform_instance_id(),
         run_id=self.ctx.run_id,
         config=self.config,
         state=BaseSQLAlchemyCheckpointState(),
     )
Exemplo n.º 4
0
 def create_checkpoint(self, job_id: JobId) -> Optional[Checkpoint]:
     """
     Build a fresh checkpoint for the job whose usage state spans the
     configured start/end times.

     Returns None for any job other than the default ingestion job.
     """
     assert self.ctx.pipeline_name

     # Guard clause: only the default ingestion job gets a checkpoint here.
     is_default_job = job_id == self.get_default_ingestion_job_id()
     if not is_default_job:
         return None

     platform_instance_id = self.get_platform_instance_id()

     # The state stores the configured window as integer epoch milliseconds.
     window_start_ms = int(self.config.start_time.timestamp() * 1000)
     window_end_ms = int(self.config.end_time.timestamp() * 1000)
     usage_state = BaseUsageCheckpointState(
         begin_timestamp_millis=window_start_ms,
         end_timestamp_millis=window_end_ms,
     )

     return Checkpoint(
         job_name=job_id,
         pipeline_name=self.ctx.pipeline_name,
         platform_instance_id=platform_instance_id,
         run_id=self.ctx.run_id,
         config=self.config,
         state=usage_state,
     )
Exemplo n.º 5
0
 def _get_last_checkpoint(
     self, job_id: JobId, checkpoint_state_class: Type[CheckpointStateBase]
 ) -> Optional[Checkpoint]:
     """
     Template-method implementation that looks up the job's last checkpoint.

     Returns None when stateful ingestion is not configured; otherwise returns
     whatever Checkpoint.create_from_checkpoint_aspect builds from the latest
     aspect reported by the checkpointing state provider.
     """
     if not self.is_stateful_ingestion_configured():
         return None

     # Ask the state provider for the most recent checkpoint aspect of this job.
     latest_aspect = self.ingestion_checkpointing_state_provider.get_latest_checkpoint(  # type: ignore
         pipeline_name=self.ctx.pipeline_name,  # type: ignore
         platform_instance_id=self.get_platform_instance_id(),
         job_name=job_id,
     )

     # Promote the raw aspect to a first-class Checkpoint object.
     return Checkpoint.create_from_checkpoint_aspect(
         job_name=job_id,
         checkpoint_aspect=latest_aspect,
         config_class=self.source_config_type,
         state_class=checkpoint_state_class,
     )
Exemplo n.º 6
0
    def test_provider(self):
        """
        Commits two job checkpoints through the provider and verifies that the
        state read back reconstructs checkpoints equal to the originals.
        """
        # 1. Build one checkpoint per job, each with a different state type.
        # First job: BaseSQLAlchemyCheckpointState.
        sql_state = BaseSQLAlchemyCheckpointState()
        sql_checkpoint = Checkpoint(
            job_name=self.job_names[0],
            pipeline_name=self.pipeline_name,
            platform_instance_id=self.platform_instance_id,
            run_id=self.run_id,
            config=MySQLConfig(),
            state=sql_state,
        )
        # Second job: BaseUsageCheckpointState.
        usage_state = BaseUsageCheckpointState(
            begin_timestamp_millis=10, end_timestamp_millis=100
        )
        usage_checkpoint = Checkpoint(
            job_name=self.job_names[1],
            pipeline_name=self.pipeline_name,
            platform_instance_id=self.platform_instance_id,
            run_id=self.run_id,
            config=MySQLConfig(),
            state=usage_state,
        )

        # (job name, state object, checkpoint) for each job, in commit order.
        jobs = (
            (self.job_names[0], sql_state, sql_checkpoint),
            (self.job_names[1], usage_state, usage_checkpoint),
        )

        # 2. Hand the provider the state to commit.
        # NOTE: state_to_commit accepts only the aspect version of the checkpoint.
        self.provider.state_to_commit = {
            job_name: checkpoint.to_checkpoint_aspect(
                # fmt: off
                max_allowed_state_size=2**20
                # fmt: on
            )
            for job_name, _, checkpoint in jobs
        }

        # 3. Commit.
        # NOTE: Because of the monkey-patching, this lands in the in-memory self.mcps_emitted.
        self.provider.commit()
        self.assertTrue(self.provider.committed)

        # 4. Read back the last committed state; it must hold both jobs.
        # NOTE: Because of the monkey-patching, this reads from the in-memory self.mcps_emitted.
        last_state: Optional[CheckpointJobStatesMap] = self.provider.get_last_state(
            self.job_state_key
        )
        assert last_state is not None
        self.assertEqual(len(last_state), 2)

        # 5. Rebuild each job's checkpoint from the retrieved aspect and compare
        # it against the checkpoint that was originally committed.
        for job_name, state_obj, committed_checkpoint in jobs:
            self.assertIsNotNone(last_state[job_name])
            restored_checkpoint = Checkpoint.create_from_checkpoint_aspect(
                job_name=job_name,
                checkpoint_aspect=last_state[job_name],
                state_class=type(state_obj),
                config_class=type(committed_checkpoint.config),
            )
            self.assertEqual(restored_checkpoint, committed_checkpoint)